1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD$ 40 */ 41 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/sysproto.h> 47 #include <sys/filedesc.h> 48 #include <sys/filio.h> 49 #include <sys/fcntl.h> 50 #include <sys/file.h> 51 #include <sys/proc.h> 52 #include <sys/signalvar.h> 53 #include <sys/socketvar.h> 54 #include <sys/uio.h> 55 #include <sys/kernel.h> 56 #include <sys/limits.h> 57 #include <sys/malloc.h> 58 #include <sys/poll.h> 59 #include <sys/resourcevar.h> 60 #include <sys/selinfo.h> 61 #include <sys/syscallsubr.h> 62 #include <sys/sysctl.h> 63 #include <sys/sysent.h> 64 #include <sys/bio.h> 65 #include <sys/buf.h> 66 #include <sys/condvar.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 #include <vm/vm.h> 71 #include <vm/vm_page.h> 72 73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77 static int pollscan(struct thread *, struct pollfd *, u_int); 78 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 79 static int dofileread(struct thread *, struct file *, int, void *, 80 size_t, off_t, int); 81 static int dofilewrite(struct thread *, struct file *, int, 82 const void *, size_t, off_t, int); 83 84 /* 85 * Read system call. 86 */ 87 #ifndef _SYS_SYSPROTO_H_ 88 struct read_args { 89 int fd; 90 void *buf; 91 size_t nbyte; 92 }; 93 #endif 94 /* 95 * MPSAFE 96 */ 97 int 98 read(td, uap) 99 struct thread *td; 100 struct read_args *uap; 101 { 102 struct file *fp; 103 int error; 104 105 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 106 error = dofileread(td, fp, uap->fd, uap->buf, 107 uap->nbyte, (off_t)-1, 0); 108 fdrop(fp, td); 109 } 110 return(error); 111 } 112 113 /* 114 * Pread system call 115 */ 116 #ifndef _SYS_SYSPROTO_H_ 117 struct pread_args { 118 int fd; 119 void *buf; 120 size_t nbyte; 121 int pad; 122 off_t offset; 123 }; 124 #endif 125 /* 126 * MPSAFE 127 */ 128 int 129 pread(td, uap) 130 struct thread *td; 131 struct pread_args *uap; 132 { 133 struct file *fp; 134 int error; 135 136 if ((error = fget_read(td, uap->fd, &fp)) != 0) 137 return (error); 138 if (fp->f_type != DTYPE_VNODE) { 139 error = ESPIPE; 140 } else { 141 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 142 uap->offset, FOF_OFFSET); 143 } 144 fdrop(fp, td); 145 return(error); 146 } 147 148 /* 149 * Code common for read and pread 150 */ 151 static int 152 dofileread(td, fp, fd, buf, nbyte, offset, flags) 153 struct thread *td; 154 struct file *fp; 155 int fd, flags; 156 void *buf; 157 size_t nbyte; 158 off_t offset; 159 { 160 struct uio auio; 161 struct iovec aiov; 162 long cnt, error = 0; 163 #ifdef KTRACE 164 struct iovec ktriov; 165 struct uio ktruio; 166 int didktr = 0; 167 #endif 168 169 aiov.iov_base = buf; 170 aiov.iov_len = nbyte; 171 auio.uio_iov = &aiov; 172 auio.uio_iovcnt = 1; 173 auio.uio_offset = offset; 174 if (nbyte > INT_MAX) 175 return (EINVAL); 176 auio.uio_resid = nbyte; 177 auio.uio_rw = UIO_READ; 178 auio.uio_segflg = UIO_USERSPACE; 179 auio.uio_td = td; 180 #ifdef KTRACE 181 /* 182 * if tracing, save a copy of iovec 183 */ 184 if (KTRPOINT(td, KTR_GENIO)) { 185 ktriov = aiov; 186 ktruio = auio; 187 didktr = 1; 188 } 189 #endif 190 cnt = nbyte; 191 192 if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) { 193 if (auio.uio_resid != cnt && (error == ERESTART || 194 error == EINTR || error == EWOULDBLOCK)) 195 error = 0; 196 } 197 cnt -= auio.uio_resid; 198 #ifdef KTRACE 199 if (didktr && error == 0) { 200 ktruio.uio_iov = &ktriov; 201 ktruio.uio_resid = cnt; 202 ktrgenio(fd, UIO_READ, &ktruio, error); 203 } 204 #endif 205 td->td_retval[0] = cnt; 206 return (error); 207 } 208 209 /* 210 * Scatter read system call. 211 */ 212 #ifndef _SYS_SYSPROTO_H_ 213 struct readv_args { 214 int fd; 215 struct iovec *iovp; 216 u_int iovcnt; 217 }; 218 #endif 219 /* 220 * MPSAFE 221 */ 222 int 223 readv(td, uap) 224 struct thread *td; 225 struct readv_args *uap; 226 { 227 struct file *fp; 228 struct uio auio; 229 struct iovec *iov; 230 struct iovec *needfree; 231 struct iovec aiov[UIO_SMALLIOV]; 232 long i, cnt; 233 int error; 234 u_int iovlen; 235 #ifdef KTRACE 236 struct iovec *ktriov = NULL; 237 struct uio ktruio; 238 #endif 239 240 if ((error = fget_read(td, uap->fd, &fp)) != 0) 241 return (error); 242 needfree = NULL; 243 /* note: can't use iovlen until iovcnt is validated */ 244 iovlen = uap->iovcnt * sizeof (struct iovec); 245 if (uap->iovcnt > UIO_SMALLIOV) { 246 if (uap->iovcnt > UIO_MAXIOV) { 247 error = EINVAL; 248 goto done; 249 } 250 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 251 needfree = iov; 252 } else 253 iov = aiov; 254 auio.uio_iov = iov; 255 auio.uio_iovcnt = uap->iovcnt; 256 auio.uio_rw = UIO_READ; 257 auio.uio_segflg = UIO_USERSPACE; 258 auio.uio_td = td; 259 auio.uio_offset = -1; 260 if ((error = copyin(uap->iovp, iov, iovlen))) 261 goto done; 262 auio.uio_resid = 0; 263 for (i = 0; i < uap->iovcnt; i++) { 264 if (iov->iov_len > INT_MAX - auio.uio_resid) { 265 error = EINVAL; 266 goto done; 267 } 268 auio.uio_resid += iov->iov_len; 269 iov++; 270 } 271 #ifdef KTRACE 272 /* 273 * if tracing, save a copy of iovec 274 */ 275 if (KTRPOINT(td, KTR_GENIO)) { 276 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 277 bcopy(auio.uio_iov, ktriov, iovlen); 278 ktruio = auio; 279 } 280 #endif 281 cnt = auio.uio_resid; 282 if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) { 283 if (auio.uio_resid != cnt && (error == ERESTART || 284 error == EINTR || error == EWOULDBLOCK)) 285 error = 0; 286 } 287 cnt -= auio.uio_resid; 288 #ifdef KTRACE 289 if (ktriov != NULL) { 290 if (error == 0) { 291 ktruio.uio_iov = ktriov; 292 ktruio.uio_resid = cnt; 293 ktrgenio(uap->fd, UIO_READ, &ktruio, error); 294 } 295 FREE(ktriov, M_TEMP); 296 } 297 #endif 298 td->td_retval[0] = cnt; 299 done: 300 fdrop(fp, td); 301 if (needfree) 302 FREE(needfree, M_IOV); 303 return (error); 304 } 305 306 /* 307 * Write system call 308 */ 309 #ifndef _SYS_SYSPROTO_H_ 310 struct write_args { 311 int fd; 312 const void *buf; 313 size_t nbyte; 314 }; 315 #endif 316 /* 317 * MPSAFE 318 */ 319 int 320 write(td, uap) 321 struct thread *td; 322 struct write_args *uap; 323 { 324 struct file *fp; 325 int error; 326 327 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 328 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 329 (off_t)-1, 0); 330 fdrop(fp, td); 331 } else { 332 error = EBADF; /* XXX this can't be right */ 333 } 334 return(error); 335 } 336 337 /* 338 * Pwrite system call 339 */ 340 #ifndef _SYS_SYSPROTO_H_ 341 struct pwrite_args { 342 int fd; 343 const void *buf; 344 size_t nbyte; 345 int pad; 346 off_t offset; 347 }; 348 #endif 349 /* 350 * MPSAFE 351 */ 352 int 353 pwrite(td, uap) 354 struct thread *td; 355 struct pwrite_args *uap; 356 { 357 struct file *fp; 358 int error; 359 360 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 361 if (fp->f_type == DTYPE_VNODE) { 362 error = dofilewrite(td, fp, uap->fd, uap->buf, 363 uap->nbyte, uap->offset, FOF_OFFSET); 364 } else { 365 error = ESPIPE; 366 } 367 fdrop(fp, td); 368 } else { 369 error = EBADF; /* this can't be right */ 370 } 371 return(error); 372 } 373 374 static int 375 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 376 struct thread *td; 377 struct file *fp; 378 int fd, flags; 379 const void *buf; 380 size_t nbyte; 381 off_t offset; 382 { 383 struct uio auio; 384 struct iovec aiov; 385 long cnt, error = 0; 386 #ifdef KTRACE 387 struct iovec ktriov; 388 struct uio ktruio; 389 int didktr = 0; 390 #endif 391 392 aiov.iov_base = (void *)(uintptr_t)buf; 393 aiov.iov_len = nbyte; 394 auio.uio_iov = &aiov; 395 auio.uio_iovcnt = 1; 396 auio.uio_offset = offset; 397 if (nbyte > INT_MAX) 398 return (EINVAL); 399 auio.uio_resid = nbyte; 400 auio.uio_rw = UIO_WRITE; 401 auio.uio_segflg = UIO_USERSPACE; 402 auio.uio_td = td; 403 #ifdef KTRACE 404 /* 405 * if tracing, save a copy of iovec and uio 406 */ 407 if (KTRPOINT(td, KTR_GENIO)) { 408 ktriov = aiov; 409 ktruio = auio; 410 didktr = 1; 411 } 412 #endif 413 cnt = nbyte; 414 if (fp->f_type == DTYPE_VNODE) 415 bwillwrite(); 416 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 417 if (auio.uio_resid != cnt && (error == ERESTART || 418 error == EINTR || error == EWOULDBLOCK)) 419 error = 0; 420 /* Socket layer is responsible for issuing SIGPIPE. */ 421 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 422 PROC_LOCK(td->td_proc); 423 psignal(td->td_proc, SIGPIPE); 424 PROC_UNLOCK(td->td_proc); 425 } 426 } 427 cnt -= auio.uio_resid; 428 #ifdef KTRACE 429 if (didktr && error == 0) { 430 ktruio.uio_iov = &ktriov; 431 ktruio.uio_resid = cnt; 432 ktrgenio(fd, UIO_WRITE, &ktruio, error); 433 } 434 #endif 435 td->td_retval[0] = cnt; 436 return (error); 437 } 438 439 /* 440 * Gather write system call 441 */ 442 #ifndef _SYS_SYSPROTO_H_ 443 struct writev_args { 444 int fd; 445 struct iovec *iovp; 446 u_int iovcnt; 447 }; 448 #endif 449 /* 450 * MPSAFE 451 */ 452 int 453 writev(td, uap) 454 struct thread *td; 455 register struct writev_args *uap; 456 { 457 struct file *fp; 458 struct uio auio; 459 register struct iovec *iov; 460 struct iovec *needfree; 461 struct iovec aiov[UIO_SMALLIOV]; 462 long i, cnt, error = 0; 463 u_int iovlen; 464 #ifdef KTRACE 465 struct iovec *ktriov = NULL; 466 struct uio ktruio; 467 #endif 468 469 mtx_lock(&Giant); 470 if ((error = fget_write(td, uap->fd, &fp)) != 0) { 471 error = EBADF; 472 goto done2; 473 } 474 /* note: can't use iovlen until iovcnt is validated */ 475 iovlen = uap->iovcnt * sizeof (struct iovec); 476 if (uap->iovcnt > UIO_SMALLIOV) { 477 if (uap->iovcnt > UIO_MAXIOV) { 478 needfree = NULL; 479 error = EINVAL; 480 goto done; 481 } 482 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 483 needfree = iov; 484 } else { 485 iov = aiov; 486 needfree = NULL; 487 } 488 auio.uio_iov = iov; 489 auio.uio_iovcnt = uap->iovcnt; 490 auio.uio_rw = UIO_WRITE; 491 auio.uio_segflg = UIO_USERSPACE; 492 auio.uio_td = td; 493 auio.uio_offset = -1; 494 if ((error = copyin(uap->iovp, iov, iovlen))) 495 goto done; 496 auio.uio_resid = 0; 497 for (i = 0; i < uap->iovcnt; i++) { 498 if (iov->iov_len > INT_MAX - auio.uio_resid) { 499 error = EINVAL; 500 goto done; 501 } 502 auio.uio_resid += iov->iov_len; 503 iov++; 504 } 505 #ifdef KTRACE 506 /* 507 * if tracing, save a copy of iovec and uio 508 */ 509 if (KTRPOINT(td, KTR_GENIO)) { 510 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 511 bcopy(auio.uio_iov, ktriov, iovlen); 512 ktruio = auio; 513 } 514 #endif 515 cnt = auio.uio_resid; 516 if (fp->f_type == DTYPE_VNODE) 517 bwillwrite(); 518 if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) { 519 if (auio.uio_resid != cnt && (error == ERESTART || 520 error == EINTR || error == EWOULDBLOCK)) 521 error = 0; 522 if (error == EPIPE) { 523 PROC_LOCK(td->td_proc); 524 psignal(td->td_proc, SIGPIPE); 525 PROC_UNLOCK(td->td_proc); 526 } 527 } 528 cnt -= auio.uio_resid; 529 #ifdef KTRACE 530 if (ktriov != NULL) { 531 if (error == 0) { 532 ktruio.uio_iov = ktriov; 533 ktruio.uio_resid = cnt; 534 ktrgenio(uap->fd, UIO_WRITE, &ktruio, error); 535 } 536 FREE(ktriov, M_TEMP); 537 } 538 #endif 539 td->td_retval[0] = cnt; 540 done: 541 fdrop(fp, td); 542 if (needfree) 543 FREE(needfree, M_IOV); 544 done2: 545 mtx_unlock(&Giant); 546 return (error); 547 } 548 549 /* 550 * Ioctl system call 551 */ 552 #ifndef _SYS_SYSPROTO_H_ 553 struct ioctl_args { 554 int fd; 555 u_long com; 556 caddr_t data; 557 }; 558 #endif 559 /* 560 * MPSAFE 561 */ 562 /* ARGSUSED */ 563 int 564 ioctl(td, uap) 565 struct thread *td; 566 register struct ioctl_args *uap; 567 { 568 struct file *fp; 569 register struct filedesc *fdp; 570 register u_long com; 571 int error = 0; 572 register u_int size; 573 caddr_t data, memp; 574 int tmp; 575 #define STK_PARAMS 128 576 union { 577 char stkbuf[STK_PARAMS]; 578 long align; 579 } ubuf; 580 581 if ((error = fget(td, uap->fd, &fp)) != 0) 582 return (error); 583 mtx_lock(&Giant); 584 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 585 fdrop(fp, td); 586 mtx_unlock(&Giant); 587 return (EBADF); 588 } 589 fdp = td->td_proc->p_fd; 590 switch (com = uap->com) { 591 case FIONCLEX: 592 FILEDESC_LOCK(fdp); 593 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 594 FILEDESC_UNLOCK(fdp); 595 fdrop(fp, td); 596 mtx_unlock(&Giant); 597 return (0); 598 case FIOCLEX: 599 FILEDESC_LOCK(fdp); 600 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 601 FILEDESC_UNLOCK(fdp); 602 fdrop(fp, td); 603 mtx_unlock(&Giant); 604 return (0); 605 } 606 607 /* 608 * Interpret high order word to find amount of data to be 609 * copied to/from the user's address space. 610 */ 611 size = IOCPARM_LEN(com); 612 if (size > IOCPARM_MAX) { 613 fdrop(fp, td); 614 mtx_unlock(&Giant); 615 return (ENOTTY); 616 } 617 618 memp = NULL; 619 if (size > sizeof (ubuf.stkbuf)) { 620 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 621 data = memp; 622 } else { 623 data = ubuf.stkbuf; 624 } 625 if (com&IOC_IN) { 626 if (size) { 627 error = copyin(uap->data, data, (u_int)size); 628 if (error) { 629 if (memp) 630 free(memp, M_IOCTLOPS); 631 fdrop(fp, td); 632 goto done; 633 } 634 } else { 635 *(caddr_t *)data = uap->data; 636 } 637 } else if ((com&IOC_OUT) && size) { 638 /* 639 * Zero the buffer so the user always 640 * gets back something deterministic. 641 */ 642 bzero(data, size); 643 } else if (com&IOC_VOID) { 644 *(caddr_t *)data = uap->data; 645 } 646 647 switch (com) { 648 649 case FIONBIO: 650 FILE_LOCK(fp); 651 if ((tmp = *(int *)data)) 652 fp->f_flag |= FNONBLOCK; 653 else 654 fp->f_flag &= ~FNONBLOCK; 655 FILE_UNLOCK(fp); 656 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 657 break; 658 659 case FIOASYNC: 660 FILE_LOCK(fp); 661 if ((tmp = *(int *)data)) 662 fp->f_flag |= FASYNC; 663 else 664 fp->f_flag &= ~FASYNC; 665 FILE_UNLOCK(fp); 666 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); 667 break; 668 669 default: 670 error = fo_ioctl(fp, com, data, td->td_ucred, td); 671 /* 672 * Copy any data to user, size was 673 * already set and checked above. 674 */ 675 if (error == 0 && (com&IOC_OUT) && size) 676 error = copyout(data, uap->data, (u_int)size); 677 break; 678 } 679 if (memp) 680 free(memp, M_IOCTLOPS); 681 fdrop(fp, td); 682 done: 683 mtx_unlock(&Giant); 684 return (error); 685 } 686 687 /* 688 * sellock and selwait are initialized in selectinit() via SYSINIT. 689 */ 690 struct mtx sellock; 691 struct cv selwait; 692 u_int nselcoll; /* Select collisions since boot */ 693 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 694 695 /* 696 * Select system call. 697 */ 698 #ifndef _SYS_SYSPROTO_H_ 699 struct select_args { 700 int nd; 701 fd_set *in, *ou, *ex; 702 struct timeval *tv; 703 }; 704 #endif 705 /* 706 * MPSAFE 707 */ 708 int 709 select(td, uap) 710 register struct thread *td; 711 register struct select_args *uap; 712 { 713 struct timeval tv, *tvp; 714 int error; 715 716 if (uap->tv != NULL) { 717 error = copyin(uap->tv, &tv, sizeof(tv)); 718 if (error) 719 return (error); 720 tvp = &tv; 721 } else 722 tvp = NULL; 723 724 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp)); 725 } 726 727 int 728 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, 729 fd_set *fd_ex, struct timeval *tvp) 730 { 731 struct filedesc *fdp; 732 /* 733 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 734 * infds with the new FD_SETSIZE of 1024, and more than enough for 735 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 736 * of 256. 737 */ 738 fd_mask s_selbits[howmany(2048, NFDBITS)]; 739 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 740 struct timeval atv, rtv, ttv; 741 int error, timo; 742 u_int ncoll, nbufbytes, ncpbytes, nfdbits; 743 744 if (nd < 0) 745 return (EINVAL); 746 fdp = td->td_proc->p_fd; 747 mtx_lock(&Giant); 748 FILEDESC_LOCK(fdp); 749 750 if (nd > td->td_proc->p_fd->fd_nfiles) 751 nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 752 FILEDESC_UNLOCK(fdp); 753 754 /* 755 * Allocate just enough bits for the non-null fd_sets. Use the 756 * preallocated auto buffer if possible. 757 */ 758 nfdbits = roundup(nd, NFDBITS); 759 ncpbytes = nfdbits / NBBY; 760 nbufbytes = 0; 761 if (fd_in != NULL) 762 nbufbytes += 2 * ncpbytes; 763 if (fd_ou != NULL) 764 nbufbytes += 2 * ncpbytes; 765 if (fd_ex != NULL) 766 nbufbytes += 2 * ncpbytes; 767 if (nbufbytes <= sizeof s_selbits) 768 selbits = &s_selbits[0]; 769 else 770 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 771 772 /* 773 * Assign pointers into the bit buffers and fetch the input bits. 774 * Put the output buffers together so that they can be bzeroed 775 * together. 776 */ 777 sbp = selbits; 778 #define getbits(name, x) \ 779 do { \ 780 if (name == NULL) \ 781 ibits[x] = NULL; \ 782 else { \ 783 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 784 obits[x] = sbp; \ 785 sbp += ncpbytes / sizeof *sbp; \ 786 error = copyin(name, ibits[x], ncpbytes); \ 787 if (error != 0) \ 788 goto done_nosellock; \ 789 } \ 790 } while (0) 791 getbits(fd_in, 0); 792 getbits(fd_ou, 1); 793 getbits(fd_ex, 2); 794 #undef getbits 795 if (nbufbytes != 0) 796 bzero(selbits, nbufbytes / 2); 797 798 if (tvp != NULL) { 799 atv = *tvp; 800 if (itimerfix(&atv)) { 801 error = EINVAL; 802 goto done_nosellock; 803 } 804 getmicrouptime(&rtv); 805 timevaladd(&atv, &rtv); 806 } else { 807 atv.tv_sec = 0; 808 atv.tv_usec = 0; 809 } 810 timo = 0; 811 TAILQ_INIT(&td->td_selq); 812 mtx_lock(&sellock); 813 retry: 814 ncoll = nselcoll; 815 mtx_lock_spin(&sched_lock); 816 td->td_flags |= TDF_SELECT; 817 mtx_unlock_spin(&sched_lock); 818 mtx_unlock(&sellock); 819 820 error = selscan(td, ibits, obits, nd); 821 mtx_lock(&sellock); 822 if (error || td->td_retval[0]) 823 goto done; 824 if (atv.tv_sec || atv.tv_usec) { 825 getmicrouptime(&rtv); 826 if (timevalcmp(&rtv, &atv, >=)) 827 goto done; 828 ttv = atv; 829 timevalsub(&ttv, &rtv); 830 timo = ttv.tv_sec > 24 * 60 * 60 ? 831 24 * 60 * 60 * hz : tvtohz(&ttv); 832 } 833 834 /* 835 * An event of interest may occur while we do not hold 836 * sellock, so check TDF_SELECT and the number of 837 * collisions and rescan the file descriptors if 838 * necessary. 839 */ 840 mtx_lock_spin(&sched_lock); 841 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 842 mtx_unlock_spin(&sched_lock); 843 goto retry; 844 } 845 mtx_unlock_spin(&sched_lock); 846 847 if (timo > 0) 848 error = cv_timedwait_sig(&selwait, &sellock, timo); 849 else 850 error = cv_wait_sig(&selwait, &sellock); 851 852 if (error == 0) 853 goto retry; 854 855 done: 856 clear_selinfo_list(td); 857 mtx_lock_spin(&sched_lock); 858 td->td_flags &= ~TDF_SELECT; 859 mtx_unlock_spin(&sched_lock); 860 mtx_unlock(&sellock); 861 862 done_nosellock: 863 /* select is not restarted after signals... */ 864 if (error == ERESTART) 865 error = EINTR; 866 if (error == EWOULDBLOCK) 867 error = 0; 868 #define putbits(name, x) \ 869 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ 870 error = error2; 871 if (error == 0) { 872 int error2; 873 874 putbits(fd_in, 0); 875 putbits(fd_ou, 1); 876 putbits(fd_ex, 2); 877 #undef putbits 878 } 879 if (selbits != &s_selbits[0]) 880 free(selbits, M_SELECT); 881 882 mtx_unlock(&Giant); 883 return (error); 884 } 885 886 static int 887 selscan(td, ibits, obits, nfd) 888 struct thread *td; 889 fd_mask **ibits, **obits; 890 int nfd; 891 { 892 int msk, i, fd; 893 fd_mask bits; 894 struct file *fp; 895 int n = 0; 896 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ 897 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; 898 struct filedesc *fdp = td->td_proc->p_fd; 899 900 FILEDESC_LOCK(fdp); 901 for (msk = 0; msk < 3; msk++) { 902 if (ibits[msk] == NULL) 903 continue; 904 for (i = 0; i < nfd; i += NFDBITS) { 905 bits = ibits[msk][i/NFDBITS]; 906 /* ffs(int mask) not portable, fd_mask is long */ 907 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 908 if (!(bits & 1)) 909 continue; 910 if ((fp = fget_locked(fdp, fd)) == NULL) { 911 FILEDESC_UNLOCK(fdp); 912 return (EBADF); 913 } 914 if (fo_poll(fp, flag[msk], td->td_ucred, 915 td)) { 916 obits[msk][(fd)/NFDBITS] |= 917 ((fd_mask)1 << ((fd) % NFDBITS)); 918 n++; 919 } 920 } 921 } 922 } 923 FILEDESC_UNLOCK(fdp); 924 td->td_retval[0] = n; 925 return (0); 926 } 927 928 /* 929 * Poll system call. 930 */ 931 #ifndef _SYS_SYSPROTO_H_ 932 struct poll_args { 933 struct pollfd *fds; 934 u_int nfds; 935 int timeout; 936 }; 937 #endif 938 /* 939 * MPSAFE 940 */ 941 int 942 poll(td, uap) 943 struct thread *td; 944 struct poll_args *uap; 945 { 946 caddr_t bits; 947 char smallbits[32 * sizeof(struct pollfd)]; 948 struct timeval atv, rtv, ttv; 949 int error = 0, timo; 950 u_int ncoll, nfds; 951 size_t ni; 952 953 nfds = uap->nfds; 954 955 mtx_lock(&Giant); 956 /* 957 * This is kinda bogus. We have fd limits, but that is not 958 * really related to the size of the pollfd array. Make sure 959 * we let the process use at least FD_SETSIZE entries and at 960 * least enough for the current limits. We want to be reasonably 961 * safe, but not overly restrictive. 962 */ 963 if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) && 964 (nfds > FD_SETSIZE)) { 965 error = EINVAL; 966 goto done2; 967 } 968 ni = nfds * sizeof(struct pollfd); 969 if (ni > sizeof(smallbits)) 970 bits = malloc(ni, M_TEMP, M_WAITOK); 971 else 972 bits = smallbits; 973 error = copyin(uap->fds, bits, ni); 974 if (error) 975 goto done_nosellock; 976 if (uap->timeout != INFTIM) { 977 atv.tv_sec = uap->timeout / 1000; 978 atv.tv_usec = (uap->timeout % 1000) * 1000; 979 if (itimerfix(&atv)) { 980 error = EINVAL; 981 goto done_nosellock; 982 } 983 getmicrouptime(&rtv); 984 timevaladd(&atv, &rtv); 985 } else { 986 atv.tv_sec = 0; 987 atv.tv_usec = 0; 988 } 989 timo = 0; 990 TAILQ_INIT(&td->td_selq); 991 mtx_lock(&sellock); 992 retry: 993 ncoll = nselcoll; 994 mtx_lock_spin(&sched_lock); 995 td->td_flags |= TDF_SELECT; 996 mtx_unlock_spin(&sched_lock); 997 mtx_unlock(&sellock); 998 999 error = pollscan(td, (struct pollfd *)bits, nfds); 1000 mtx_lock(&sellock); 1001 if (error || td->td_retval[0]) 1002 goto done; 1003 if (atv.tv_sec || atv.tv_usec) { 1004 getmicrouptime(&rtv); 1005 if (timevalcmp(&rtv, &atv, >=)) 1006 goto done; 1007 ttv = atv; 1008 timevalsub(&ttv, &rtv); 1009 timo = ttv.tv_sec > 24 * 60 * 60 ? 1010 24 * 60 * 60 * hz : tvtohz(&ttv); 1011 } 1012 /* 1013 * An event of interest may occur while we do not hold 1014 * sellock, so check TDF_SELECT and the number of collisions 1015 * and rescan the file descriptors if necessary. 1016 */ 1017 mtx_lock_spin(&sched_lock); 1018 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 1019 mtx_unlock_spin(&sched_lock); 1020 goto retry; 1021 } 1022 mtx_unlock_spin(&sched_lock); 1023 1024 if (timo > 0) 1025 error = cv_timedwait_sig(&selwait, &sellock, timo); 1026 else 1027 error = cv_wait_sig(&selwait, &sellock); 1028 1029 if (error == 0) 1030 goto retry; 1031 1032 done: 1033 clear_selinfo_list(td); 1034 mtx_lock_spin(&sched_lock); 1035 td->td_flags &= ~TDF_SELECT; 1036 mtx_unlock_spin(&sched_lock); 1037 mtx_unlock(&sellock); 1038 1039 done_nosellock: 1040 /* poll is not restarted after signals... */ 1041 if (error == ERESTART) 1042 error = EINTR; 1043 if (error == EWOULDBLOCK) 1044 error = 0; 1045 if (error == 0) { 1046 error = copyout(bits, uap->fds, ni); 1047 if (error) 1048 goto out; 1049 } 1050 out: 1051 if (ni > sizeof(smallbits)) 1052 free(bits, M_TEMP); 1053 done2: 1054 mtx_unlock(&Giant); 1055 return (error); 1056 } 1057 1058 static int 1059 pollscan(td, fds, nfd) 1060 struct thread *td; 1061 struct pollfd *fds; 1062 u_int nfd; 1063 { 1064 register struct filedesc *fdp = td->td_proc->p_fd; 1065 int i; 1066 struct file *fp; 1067 int n = 0; 1068 1069 FILEDESC_LOCK(fdp); 1070 for (i = 0; i < nfd; i++, fds++) { 1071 if (fds->fd >= fdp->fd_nfiles) { 1072 fds->revents = POLLNVAL; 1073 n++; 1074 } else if (fds->fd < 0) { 1075 fds->revents = 0; 1076 } else { 1077 fp = fdp->fd_ofiles[fds->fd]; 1078 if (fp == NULL) { 1079 fds->revents = POLLNVAL; 1080 n++; 1081 } else { 1082 /* 1083 * Note: backend also returns POLLHUP and 1084 * POLLERR if appropriate. 1085 */ 1086 fds->revents = fo_poll(fp, fds->events, 1087 td->td_ucred, td); 1088 if (fds->revents != 0) 1089 n++; 1090 } 1091 } 1092 } 1093 FILEDESC_UNLOCK(fdp); 1094 td->td_retval[0] = n; 1095 return (0); 1096 } 1097 1098 /* 1099 * OpenBSD poll system call. 1100 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 1101 */ 1102 #ifndef _SYS_SYSPROTO_H_ 1103 struct openbsd_poll_args { 1104 struct pollfd *fds; 1105 u_int nfds; 1106 int timeout; 1107 }; 1108 #endif 1109 /* 1110 * MPSAFE 1111 */ 1112 int 1113 openbsd_poll(td, uap) 1114 register struct thread *td; 1115 register struct openbsd_poll_args *uap; 1116 { 1117 return (poll(td, (struct poll_args *)uap)); 1118 } 1119 1120 /* 1121 * Remove the references to the thread from all of the objects 1122 * we were polling. 1123 * 1124 * This code assumes that the underlying owner of the selinfo 1125 * structure will hold sellock before it changes it, and that 1126 * it will unlink itself from our list if it goes away. 1127 */ 1128 void 1129 clear_selinfo_list(td) 1130 struct thread *td; 1131 { 1132 struct selinfo *si; 1133 1134 mtx_assert(&sellock, MA_OWNED); 1135 TAILQ_FOREACH(si, &td->td_selq, si_thrlist) 1136 si->si_thread = NULL; 1137 TAILQ_INIT(&td->td_selq); 1138 } 1139 1140 /*ARGSUSED*/ 1141 int 1142 seltrue(dev, events, td) 1143 dev_t dev; 1144 int events; 1145 struct thread *td; 1146 { 1147 1148 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 1149 } 1150 1151 /* 1152 * Record a select request. 1153 */ 1154 void 1155 selrecord(selector, sip) 1156 struct thread *selector; 1157 struct selinfo *sip; 1158 { 1159 1160 mtx_lock(&sellock); 1161 /* 1162 * If the selinfo's thread pointer is NULL then take ownership of it. 1163 * 1164 * If the thread pointer is not NULL and it points to another 1165 * thread, then we have a collision. 1166 * 1167 * If the thread pointer is not NULL and points back to us then leave 1168 * it alone as we've already added pointed it at us and added it to 1169 * our list. 1170 */ 1171 if (sip->si_thread == NULL) { 1172 sip->si_thread = selector; 1173 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); 1174 } else if (sip->si_thread != selector) { 1175 sip->si_flags |= SI_COLL; 1176 } 1177 1178 mtx_unlock(&sellock); 1179 } 1180 1181 /* 1182 * Do a wakeup when a selectable event occurs. 1183 */ 1184 void 1185 selwakeup(sip) 1186 struct selinfo *sip; 1187 { 1188 struct thread *td; 1189 1190 mtx_lock(&sellock); 1191 td = sip->si_thread; 1192 if ((sip->si_flags & SI_COLL) != 0) { 1193 nselcoll++; 1194 sip->si_flags &= ~SI_COLL; 1195 cv_broadcast(&selwait); 1196 } 1197 if (td == NULL) { 1198 mtx_unlock(&sellock); 1199 return; 1200 } 1201 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); 1202 sip->si_thread = NULL; 1203 mtx_lock_spin(&sched_lock); 1204 if (td->td_wchan == &selwait) { 1205 cv_waitq_remove(td); 1206 TD_CLR_SLEEPING(td); 1207 setrunnable(td); 1208 } else 1209 td->td_flags &= ~TDF_SELECT; 1210 mtx_unlock_spin(&sched_lock); 1211 mtx_unlock(&sellock); 1212 } 1213 1214 static void selectinit(void *); 1215 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) 1216 1217 /* ARGSUSED*/ 1218 static void 1219 selectinit(dummy) 1220 void *dummy; 1221 { 1222 cv_init(&selwait, "select"); 1223 mtx_init(&sellock, "sellck", NULL, MTX_DEF); 1224 } 1225