/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, struct file *, int, void *,
		    size_t, off_t, int);
static int	dofilewrite(struct thread *, struct file *, int,
		    const void *, size_t, off_t, int);
static void	doselwakeup(struct selinfo *, int);

/*
 * Read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
/*
 * MPSAFE
 */
int
read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
		error = dofileread(td, fp, uap->fd, uap->buf,
		    uap->nbyte, (off_t)-1, 0);
		fdrop(fp, td);
	}
	return (error);
}

/*
 * Pread system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
		error = ESPIPE;
	} else {
		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
		    uap->offset, FOF_OFFSET);
	}
	fdrop(fp, td);
	return (error);
}
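
/*
 * Both read() and pread() funnel into dofileread() below, which wraps
 * the caller's buffer in a single-segment iovec/uio pair before handing
 * it to the fileops layer.  Roughly, read(fd, buf, nbyte) is described
 * as (illustrative sketch only; the code below assigns field by field,
 * and pread() passes its explicit offset with FOF_OFFSET instead):
 *
 *	struct iovec aiov = { .iov_base = buf, .iov_len = nbyte };
 *	struct uio auio;
 *	auio.uio_iov = &aiov;		// one segment
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = -1;		// -1: use and advance f_offset
 *	auio.uio_resid = nbyte;		// bytes still to transfer
 *	auio.uio_rw = UIO_READ;
 *	auio.uio_segflg = UIO_USERSPACE;
 */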

/*
 * Code common for read and pread.
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec.
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
readv(td, uap)
	struct thread *td;
	struct readv_args *uap;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt;
	int error;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec.
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}

/*
 * Write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
/*
 * MPSAFE
 */
int
write(td, uap)
	struct thread *td;
	struct write_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
		    (off_t)-1, 0);
		fdrop(fp, td);
	} else {
		error = EBADF;	/* XXX this can't be right */
	}
	return (error);
}
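
/*
 * A note on the short-transfer convention used by dofileread() and
 * dofilewrite(): if the operation is interrupted after moving some
 * bytes, the error is suppressed and the caller sees a short count
 * instead.  A worked example with nbyte = 1000:
 *
 *	cnt = 1000;			// requested
 *	fo_read() returns EINTR;	// interrupted mid-transfer
 *	auio.uio_resid == 400;		// 400 bytes were never moved
 *	=> error = 0, td_retval[0] = 1000 - 400 = 600
 *
 * Only ERESTART, EINTR and EWOULDBLOCK are forgiven this way, and only
 * if uio_resid actually changed; an operation that fails before moving
 * any data still returns its error.
 */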

/*
 * Pwrite system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pwrite(td, uap)
	struct thread *td;
	struct pwrite_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
			error = ESPIPE;
		} else {
			error = dofilewrite(td, fp, uap->fd, uap->buf,
			    uap->nbyte, uap->offset, FOF_OFFSET);
		}
		fdrop(fp, td);
	} else {
		error = EBADF;	/* XXX this can't be right */
	}
	return (error);
}

/*
 * Code common for write and pwrite.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec and uio.
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
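
/*
 * Note on the EPIPE case above: for non-socket descriptors the kernel
 * posts SIGPIPE itself, so by default a writer on a broken pipe is
 * killed rather than seeing the error.  An illustrative userland
 * fragment (hypothetical pfd and handle_reader_gone(), not part of
 * this file):
 *
 *	signal(SIGPIPE, SIG_IGN);	// opt out of the default signal
 *	if (write(pfd, buf, len) == -1 && errno == EPIPE)
 *		handle_reader_gone();	// instead of dying on SIGPIPE
 *
 * Sockets are skipped here because the socket layer raises SIGPIPE on
 * its own.
 */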

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
writev(td, uap)
	struct thread *td;
	register struct writev_args *uap;
{
	struct file *fp;
	struct uio auio;
	register struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_write(td, uap->fd, &fp)) != 0)
		return (EBADF);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec and uio.
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
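
/*
 * The iov_len accumulation loop in readv() and writev() above guards
 * against overflowing the int-sized residual.  A worked example:
 *
 *	INT_MAX == 0x7fffffff
 *	iov[0].iov_len = 0x60000000;	// ok, resid = 0x60000000
 *	iov[1].iov_len = 0x20000000;	// 0x20000000 > INT_MAX - resid
 *					// => EINVAL before resid wraps
 *
 * Writing the test as "iov_len > INT_MAX - resid" rather than
 * "resid + iov_len > INT_MAX" keeps the comparison itself from
 * overflowing.
 */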

/*
 * Ioctl system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
		char stkbuf[STK_PARAMS];
		long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com & IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			*(caddr_t *)data = uap->data;
		}
	} else if ((com & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com & IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user; size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
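
/*
 * For reference, the layout of the ioctl command word decoded above
 * (see sys/ioccom.h):
 *
 *	bits 31..29	IOC_IN / IOC_OUT / IOC_VOID (transfer direction)
 *	bits 28..16	IOCPARM_LEN(com), size of the argument data
 *	bits 15..8	group (e.g. 'f' for the filio commands)
 *	bits 7..0	command number
 *
 * For example, FIONBIO is _IOW('f', 126, int): IOCPARM_LEN() yields
 * sizeof(int) and IOC_IN is set, so the code above copyin()s an int
 * into the stack buffer before calling fo_ioctl().
 */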

/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 */
struct mtx sellock;
struct cv selwait;
u_int nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
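
/*
 * The fd_set bit layout that selscan() below walks: descriptor fd
 * lives at word fd / NFDBITS, bit fd % NFDBITS, exactly matching the
 * obits update in the scan loop.  For example, taking NFDBITS == 32
 * to keep the arithmetic short, fd 69 is bit 5 of word 2:
 *
 *	ibits[msk][69 / 32] & ((fd_mask)1 << (69 % 32))
 *
 * (fd_mask is a long, so NFDBITS is 64 on LP64 platforms.)
 */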

static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
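
/*
 * Both kern_select() above and poll() below turn their deadline into
 * scheduler ticks with tvtohz(), clamped at 24 hours so the tick count
 * cannot overflow.  A rough example, assuming hz = 100 (a common
 * setting, not something this file fixes):
 *
 *	uap->timeout = 2500		// milliseconds
 *	atv = { 2 s, 500000 us }	// plus the current uptime
 *	timo = tvtohz(&ttv)		// ~251 ticks at hz = 100
 *
 * tvtohz() rounds up and adds a tick for the current partial tick, so
 * the wait is never shorter than requested.
 */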

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}

static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}
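
/*
 * The selinfo handshake, end to end: a driver's fo_poll routine calls
 * selrecord() when the descriptor is not ready, and its completion
 * path calls selwakeup() when the state changes.  A minimal sketch of
 * the driver side (hypothetical foo_poll/foo_data_ready/sc_rsel, not
 * part of this file):
 *
 *	static int
 *	foo_poll(struct file *fp, int events, struct ucred *cred,
 *	    struct thread *td)
 *	{
 *		if (foo_data_ready(sc))
 *			return (events & (POLLIN | POLLRDNORM));
 *		selrecord(td, &sc->sc_rsel);	// remember the selector
 *		return (0);
 *	}
 *
 *	... and later, from the interrupt or completion path:
 *	selwakeup(&sc->sc_rsel);
 *
 * selrecord() below owns the SI_COLL bookkeeping: when two threads
 * select on the same selinfo at once, the collision forces a broadcast
 * wakeup and a rescan (counted in nselcoll above).
 */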

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then
	 * leave it alone, as we have already pointed it at us and added
	 * it to our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == &selwait) {
		cv_waitq_remove(td);
		TD_CLR_SLEEPING(td);
		if (pri >= PRI_MIN && pri <= PRI_MAX && td->td_priority > pri)
			td->td_priority = pri;
		setrunnable(td);
	} else
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}