/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)sys_generic.c       8.5 (Berkeley) 1/21/94
 * $FreeBSD$
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

#include <machine/limits.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int      pollscan(struct thread *, struct pollfd *, u_int);
static int      selscan(struct thread *, fd_mask **, fd_mask **, int);
static int      dofileread(struct thread *, struct file *, int, void *,
                    size_t, off_t, int);
static int      dofilewrite(struct thread *, struct file *, int,
                    const void *, size_t, off_t, int);

/*
 * Read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct read_args {
        int     fd;
        void    *buf;
        size_t  nbyte;
};
#endif
/*
 * MPSAFE
 */
int
read(td, uap)
        struct thread *td;
        struct read_args *uap;
{
        struct file *fp;
        int error;

        if ((error = fget_read(td, uap->fd, &fp)) == 0) {
                error = dofileread(td, fp, uap->fd, uap->buf,
                    uap->nbyte, (off_t)-1, 0);
                fdrop(fp, td);
        }
        return (error);
}

/*
 * Pread system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
        int     fd;
        void    *buf;
        size_t  nbyte;
        int     pad;
        off_t   offset;
};
#endif
/*
 * MPSAFE
 */
int
pread(td, uap)
        struct thread *td;
        struct pread_args *uap;
{
        struct file *fp;
        int error;

        if ((error = fget_read(td, uap->fd, &fp)) != 0)
                return (error);
        if (fp->f_type != DTYPE_VNODE) {
                error = ESPIPE;
        } else {
                error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
                    uap->offset, FOF_OFFSET);
        }
        fdrop(fp, td);
        return (error);
}

/*
 * Code common to read() and pread().
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
        struct thread *td;
        struct file *fp;
        int fd, flags;
        void *buf;
        size_t nbyte;
        off_t offset;
{
        struct uio auio;
        struct iovec aiov;
        long cnt, error = 0;
#ifdef KTRACE
        struct iovec ktriov;
        struct uio ktruio;
        int didktr = 0;
#endif

        if (nbyte > INT_MAX)
                return (EINVAL);
        aiov.iov_base = buf;
        aiov.iov_len = nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_offset = offset;
        auio.uio_resid = nbyte;
        auio.uio_rw = UIO_READ;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_td = td;
#ifdef KTRACE
        /*
         * if tracing, save a copy of iovec
         */
        if (KTRPOINT(td, KTR_GENIO)) {
                ktriov = aiov;
                ktruio = auio;
                didktr = 1;
        }
#endif
        cnt = nbyte;

        if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
                if (auio.uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
        }
        cnt -= auio.uio_resid;
#ifdef KTRACE
        if (didktr && error == 0) {
                ktruio.uio_iov = &ktriov;
                ktruio.uio_resid = cnt;
                ktrgenio(fd, UIO_READ, &ktruio, error);
        }
#endif
        td->td_retval[0] = cnt;
        return (error);
}
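/*
 * Example (illustrative sketch, not part of the kernel build): from
 * userland the two entry points above differ only in how the file
 * offset is chosen.  read() uses and advances the per-open-file
 * offset, while pread() takes an explicit offset and leaves the seek
 * pointer untouched:
 *
 *      char buf[512];
 *      ssize_t n;
 *
 *      n = read(fd, buf, sizeof(buf));         // implicit offset
 *      n = pread(fd, buf, sizeof(buf), 4096);  // explicit offset
 *
 * Because of the DTYPE_VNODE check in pread() above, the second call
 * fails with ESPIPE on pipes, FIFOs and sockets, which have no notion
 * of a file offset.
 */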
/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
        int     fd;
        struct  iovec *iovp;
        u_int   iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
readv(td, uap)
        struct thread *td;
        struct readv_args *uap;
{
        struct file *fp;
        struct uio auio;
        struct iovec *iov;
        struct iovec *needfree;
        struct iovec aiov[UIO_SMALLIOV];
        long i, cnt;
        int error;
        u_int iovlen;
#ifdef KTRACE
        struct iovec *ktriov = NULL;
        struct uio ktruio;
#endif

        if ((error = fget_read(td, uap->fd, &fp)) != 0)
                return (error);
        needfree = NULL;
        /* note: can't use iovlen until iovcnt is validated */
        iovlen = uap->iovcnt * sizeof (struct iovec);
        if (uap->iovcnt > UIO_SMALLIOV) {
                if (uap->iovcnt > UIO_MAXIOV) {
                        error = EINVAL;
                        goto done;
                }
                MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
                needfree = iov;
        } else
                iov = aiov;
        auio.uio_iov = iov;
        auio.uio_iovcnt = uap->iovcnt;
        auio.uio_rw = UIO_READ;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_td = td;
        auio.uio_offset = -1;
        if ((error = copyin(uap->iovp, iov, iovlen)))
                goto done;
        auio.uio_resid = 0;
        for (i = 0; i < uap->iovcnt; i++) {
                if (iov->iov_len > INT_MAX - auio.uio_resid) {
                        error = EINVAL;
                        goto done;
                }
                auio.uio_resid += iov->iov_len;
                iov++;
        }
#ifdef KTRACE
        /*
         * if tracing, save a copy of iovec
         */
        if (KTRPOINT(td, KTR_GENIO)) {
                MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
                bcopy(auio.uio_iov, ktriov, iovlen);
                ktruio = auio;
        }
#endif
        cnt = auio.uio_resid;
        if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
                if (auio.uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
        }
        cnt -= auio.uio_resid;
#ifdef KTRACE
        if (ktriov != NULL) {
                if (error == 0) {
                        ktruio.uio_iov = ktriov;
                        ktruio.uio_resid = cnt;
                        ktrgenio(uap->fd, UIO_READ, &ktruio, error);
                }
                FREE(ktriov, M_TEMP);
        }
#endif
        td->td_retval[0] = cnt;
done:
        fdrop(fp, td);
        if (needfree)
                FREE(needfree, M_IOV);
        return (error);
}

/*
 * Write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct write_args {
        int     fd;
        const void *buf;
        size_t  nbyte;
};
#endif
/*
 * MPSAFE
 */
int
write(td, uap)
        struct thread *td;
        struct write_args *uap;
{
        struct file *fp;
        int error;

        /*
         * Propagate the fget_write() error directly; the old code
         * overwrote it with EBADF, which its own XXX comment flagged
         * as wrong.
         */
        if ((error = fget_write(td, uap->fd, &fp)) == 0) {
                error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
                    (off_t)-1, 0);
                fdrop(fp, td);
        }
        return (error);
}

/*
 * Pwrite system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
        int     fd;
        const void *buf;
        size_t  nbyte;
        int     pad;
        off_t   offset;
};
#endif
/*
 * MPSAFE
 */
int
pwrite(td, uap)
        struct thread *td;
        struct pwrite_args *uap;
{
        struct file *fp;
        int error;

        if ((error = fget_write(td, uap->fd, &fp)) == 0) {
                if (fp->f_type == DTYPE_VNODE) {
                        error = dofilewrite(td, fp, uap->fd, uap->buf,
                            uap->nbyte, uap->offset, FOF_OFFSET);
                } else {
                        error = ESPIPE;
                }
                fdrop(fp, td);
        }
        return (error);
}
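/*
 * Example (illustrative sketch): readv() above and writev() below
 * transfer to or from several buffers in one call.  The caller
 * describes the buffers with an iovec array; the kernel copies the
 * array in, validates that the total length fits in INT_MAX, and
 * builds a single uio from it:
 *
 *      struct iovec iov[2];
 *      char hdr[16], body[4096];
 *
 *      iov[0].iov_base = hdr;  iov[0].iov_len = sizeof(hdr);
 *      iov[1].iov_base = body; iov[1].iov_len = sizeof(body);
 *      nread = readv(fd, iov, 2);      // fills hdr first, then body
 *
 * Up to UIO_SMALLIOV entries are staged in the on-stack aiov array;
 * larger arrays (bounded by UIO_MAXIOV) are allocated from M_IOV.
 */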
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
        struct thread *td;
        struct file *fp;
        int fd, flags;
        const void *buf;
        size_t nbyte;
        off_t offset;
{
        struct uio auio;
        struct iovec aiov;
        long cnt, error = 0;
#ifdef KTRACE
        struct iovec ktriov;
        struct uio ktruio;
        int didktr = 0;
#endif

        if (nbyte > INT_MAX)
                return (EINVAL);
        aiov.iov_base = (void *)(uintptr_t)buf;
        aiov.iov_len = nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_offset = offset;
        auio.uio_resid = nbyte;
        auio.uio_rw = UIO_WRITE;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_td = td;
#ifdef KTRACE
        /*
         * if tracing, save a copy of iovec and uio
         */
        if (KTRPOINT(td, KTR_GENIO)) {
                ktriov = aiov;
                ktruio = auio;
                didktr = 1;
        }
#endif
        cnt = nbyte;
        if (fp->f_type == DTYPE_VNODE)
                bwillwrite();
        if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
                if (auio.uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
                /* Socket layer is responsible for issuing SIGPIPE. */
                if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
                        PROC_LOCK(td->td_proc);
                        psignal(td->td_proc, SIGPIPE);
                        PROC_UNLOCK(td->td_proc);
                }
        }
        cnt -= auio.uio_resid;
#ifdef KTRACE
        if (didktr && error == 0) {
                ktruio.uio_iov = &ktriov;
                ktruio.uio_resid = cnt;
                ktrgenio(fd, UIO_WRITE, &ktruio, error);
        }
#endif
        td->td_retval[0] = cnt;
        return (error);
}

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
        int     fd;
        struct  iovec *iovp;
        u_int   iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
writev(td, uap)
        struct thread *td;
        register struct writev_args *uap;
{
        struct file *fp;
        struct uio auio;
        register struct iovec *iov;
        struct iovec *needfree;
        struct iovec aiov[UIO_SMALLIOV];
        long i, cnt, error = 0;
        u_int iovlen;
#ifdef KTRACE
        struct iovec *ktriov = NULL;
        struct uio ktruio;
#endif

        mtx_lock(&Giant);
        /* As in write(), let the fget_write() error propagate. */
        if ((error = fget_write(td, uap->fd, &fp)) != 0)
                goto done2;
        /* note: can't use iovlen until iovcnt is validated */
        iovlen = uap->iovcnt * sizeof (struct iovec);
        if (uap->iovcnt > UIO_SMALLIOV) {
                if (uap->iovcnt > UIO_MAXIOV) {
                        needfree = NULL;
                        error = EINVAL;
                        goto done;
                }
                MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
                needfree = iov;
        } else {
                iov = aiov;
                needfree = NULL;
        }
        auio.uio_iov = iov;
        auio.uio_iovcnt = uap->iovcnt;
        auio.uio_rw = UIO_WRITE;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_td = td;
        auio.uio_offset = -1;
        if ((error = copyin(uap->iovp, iov, iovlen)))
                goto done;
        auio.uio_resid = 0;
        for (i = 0; i < uap->iovcnt; i++) {
                if (iov->iov_len > INT_MAX - auio.uio_resid) {
                        error = EINVAL;
                        goto done;
                }
                auio.uio_resid += iov->iov_len;
                iov++;
        }
#ifdef KTRACE
        /*
         * if tracing, save a copy of iovec and uio
         */
        if (KTRPOINT(td, KTR_GENIO)) {
                MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
                bcopy(auio.uio_iov, ktriov, iovlen);
                ktruio = auio;
        }
#endif
        cnt = auio.uio_resid;
        if (fp->f_type == DTYPE_VNODE)
                bwillwrite();
        if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
                if (auio.uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
                /*
                 * As in dofilewrite() above, the socket layer is
                 * responsible for issuing SIGPIPE itself.
                 */
                if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
                        PROC_LOCK(td->td_proc);
                        psignal(td->td_proc, SIGPIPE);
                        PROC_UNLOCK(td->td_proc);
                }
        }
        cnt -= auio.uio_resid;
#ifdef KTRACE
        if (ktriov != NULL) {
                if (error == 0) {
                        ktruio.uio_iov = ktriov;
                        ktruio.uio_resid = cnt;
                        ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
                }
                FREE(ktriov, M_TEMP);
        }
#endif
        td->td_retval[0] = cnt;
done:
        fdrop(fp, td);
        if (needfree)
                FREE(needfree, M_IOV);
done2:
        mtx_unlock(&Giant);
        return (error);
}
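/*
 * Example (illustrative note): the error rewriting after fo_read()/
 * fo_write() above implements the usual short-transfer rule.  If a
 * signal arrives after some bytes have moved, the call reports the
 * partial count instead of failing; e.g. for a 1000-byte write
 * interrupted after 400 bytes:
 *
 *      cnt = 1000;                     // requested
 *      // fo_write() returns EINTR with auio.uio_resid == 600,
 *      // so error is reset to 0 and cnt -= 600
 *      td->td_retval[0] = 400;         // userland sees write() == 400
 *
 * Only a transfer interrupted before any byte moved propagates the
 * error (or, for ERESTART, restarts the system call).
 */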
/*
 * Ioctl system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
        int     fd;
        u_long  com;
        caddr_t data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(td, uap)
        struct thread *td;
        register struct ioctl_args *uap;
{
        struct file *fp;
        register struct filedesc *fdp;
        register u_long com;
        int error = 0;
        register u_int size;
        caddr_t data, memp;
        int tmp;
#define STK_PARAMS      128
        union {
                char stkbuf[STK_PARAMS];
                long align;
        } ubuf;

        if ((error = fget(td, uap->fd, &fp)) != 0)
                return (error);
        mtx_lock(&Giant);
        if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
                fdrop(fp, td);
                mtx_unlock(&Giant);
                return (EBADF);
        }
        fdp = td->td_proc->p_fd;
        switch (com = uap->com) {
        case FIONCLEX:
                FILEDESC_LOCK(fdp);
                fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
                FILEDESC_UNLOCK(fdp);
                fdrop(fp, td);
                mtx_unlock(&Giant);
                return (0);
        case FIOCLEX:
                FILEDESC_LOCK(fdp);
                fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
                FILEDESC_UNLOCK(fdp);
                fdrop(fp, td);
                mtx_unlock(&Giant);
                return (0);
        }

        /*
         * Interpret high order word to find amount of data to be
         * copied to/from the user's address space.
         */
        size = IOCPARM_LEN(com);
        if (size > IOCPARM_MAX) {
                fdrop(fp, td);
                mtx_unlock(&Giant);
                return (ENOTTY);
        }

        memp = NULL;
        if (size > sizeof (ubuf.stkbuf)) {
                memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
                data = memp;
        } else {
                data = ubuf.stkbuf;
        }
        if (com & IOC_IN) {
                if (size) {
                        error = copyin(uap->data, data, (u_int)size);
                        if (error) {
                                if (memp)
                                        free(memp, M_IOCTLOPS);
                                fdrop(fp, td);
                                goto done;
                        }
                } else {
                        *(caddr_t *)data = uap->data;
                }
        } else if ((com & IOC_OUT) && size) {
                /*
                 * Zero the buffer so the user always
                 * gets back something deterministic.
                 */
                bzero(data, size);
        } else if (com & IOC_VOID) {
                *(caddr_t *)data = uap->data;
        }

        switch (com) {

        case FIONBIO:
                FILE_LOCK(fp);
                if ((tmp = *(int *)data))
                        fp->f_flag |= FNONBLOCK;
                else
                        fp->f_flag &= ~FNONBLOCK;
                FILE_UNLOCK(fp);
                error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
                break;

        case FIOASYNC:
                FILE_LOCK(fp);
                if ((tmp = *(int *)data))
                        fp->f_flag |= FASYNC;
                else
                        fp->f_flag &= ~FASYNC;
                FILE_UNLOCK(fp);
                error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
                break;

        default:
                error = fo_ioctl(fp, com, data, td->td_ucred, td);
                /*
                 * Copy any data to user, size was
                 * already set and checked above.
                 */
                if (error == 0 && (com & IOC_OUT) && size)
                        error = copyout(data, uap->data, (u_int)size);
                break;
        }
        if (memp)
                free(memp, M_IOCTLOPS);
        fdrop(fp, td);
done:
        mtx_unlock(&Giant);
        return (error);
}
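/*
 * Example (illustrative sketch): the size and direction that ioctl()
 * decodes above are packed into the command word by the _IO* macros
 * from <sys/ioccom.h>.  For instance, <sys/filio.h> defines
 *
 *      #define FIONBIO _IOW('f', 126, int)
 *
 * which sets IOC_IN and stores sizeof(int) in the high-order word, so
 * IOCPARM_LEN(FIONBIO) == sizeof(int) and the argument is copied in
 * before fo_ioctl() runs.  A userland caller simply passes a pointer:
 *
 *      int on = 1;
 *      error = ioctl(s, FIONBIO, &on);         // set non-blocking mode
 */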
/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 */
struct mtx      sellock;
struct cv       selwait;
u_int           nselcoll;       /* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
        int     nd;
        fd_set  *in, *ou, *ex;
        struct  timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
        register struct thread *td;
        register struct select_args *uap;
{
        struct timeval tv, *tvp;
        int error;

        if (uap->tv != NULL) {
                error = copyin(uap->tv, &tv, sizeof(tv));
                if (error)
                        return (error);
                tvp = &tv;
        } else
                tvp = NULL;

        return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}
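/*
 * Example (illustrative sketch): a typical userland caller builds the
 * fd_sets with the FD_* macros and passes an optional timeout, which
 * the wrapper above copies in before handing off to kern_select():
 *
 *      fd_set rfds;
 *      struct timeval tv = { 5, 0 };   // five-second timeout
 *
 *      FD_ZERO(&rfds);
 *      FD_SET(sock, &rfds);
 *      n = select(sock + 1, &rfds, NULL, NULL, &tv);
 *
 * Note that nd is the highest descriptor of interest plus one, not a
 * count of descriptors.
 */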
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
        struct filedesc *fdp;
        /*
         * The magic 2048 here is chosen to be just enough for FD_SETSIZE
         * infds with the new FD_SETSIZE of 1024, and more than enough for
         * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
         * of 256.
         */
        fd_mask s_selbits[howmany(2048, NFDBITS)];
        fd_mask *ibits[3], *obits[3], *selbits, *sbp;
        struct timeval atv, rtv, ttv;
        int error, timo;
        u_int ncoll, nbufbytes, ncpbytes, nfdbits;

        if (nd < 0)
                return (EINVAL);
        fdp = td->td_proc->p_fd;
        mtx_lock(&Giant);
        FILEDESC_LOCK(fdp);

        if (nd > td->td_proc->p_fd->fd_nfiles)
                nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
        FILEDESC_UNLOCK(fdp);

        /*
         * Allocate just enough bits for the non-null fd_sets.  Use the
         * preallocated auto buffer if possible.
         */
        nfdbits = roundup(nd, NFDBITS);
        ncpbytes = nfdbits / NBBY;
        nbufbytes = 0;
        if (fd_in != NULL)
                nbufbytes += 2 * ncpbytes;
        if (fd_ou != NULL)
                nbufbytes += 2 * ncpbytes;
        if (fd_ex != NULL)
                nbufbytes += 2 * ncpbytes;
        if (nbufbytes <= sizeof s_selbits)
                selbits = &s_selbits[0];
        else
                selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

        /*
         * Assign pointers into the bit buffers and fetch the input bits.
         * Put the output buffers together so that they can be bzeroed
         * together.
         */
        sbp = selbits;
#define getbits(name, x) \
        do {                                                            \
                if (name == NULL)                                       \
                        ibits[x] = NULL;                                \
                else {                                                  \
                        ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;   \
                        obits[x] = sbp;                                 \
                        sbp += ncpbytes / sizeof *sbp;                  \
                        error = copyin(name, ibits[x], ncpbytes);       \
                        if (error != 0)                                 \
                                goto done_nosellock;                    \
                }                                                       \
        } while (0)
        getbits(fd_in, 0);
        getbits(fd_ou, 1);
        getbits(fd_ex, 2);
#undef getbits
        if (nbufbytes != 0)
                bzero(selbits, nbufbytes / 2);

        if (tvp != NULL) {
                atv = *tvp;
                if (itimerfix(&atv)) {
                        error = EINVAL;
                        goto done_nosellock;
                }
                getmicrouptime(&rtv);
                timevaladd(&atv, &rtv);
        } else {
                atv.tv_sec = 0;
                atv.tv_usec = 0;
        }
        timo = 0;
        TAILQ_INIT(&td->td_selq);
        mtx_lock(&sellock);
retry:
        ncoll = nselcoll;
        mtx_lock_spin(&sched_lock);
        td->td_flags |= TDF_SELECT;
        mtx_unlock_spin(&sched_lock);
        mtx_unlock(&sellock);

        error = selscan(td, ibits, obits, nd);
        mtx_lock(&sellock);
        if (error || td->td_retval[0])
                goto done;
        if (atv.tv_sec || atv.tv_usec) {
                getmicrouptime(&rtv);
                if (timevalcmp(&rtv, &atv, >=))
                        goto done;
                ttv = atv;
                timevalsub(&ttv, &rtv);
                timo = ttv.tv_sec > 24 * 60 * 60 ?
                    24 * 60 * 60 * hz : tvtohz(&ttv);
        }

        /*
         * An event of interest may occur while we do not hold
         * sellock, so check TDF_SELECT and the number of
         * collisions and rescan the file descriptors if
         * necessary.
         */
        mtx_lock_spin(&sched_lock);
        if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
                mtx_unlock_spin(&sched_lock);
                goto retry;
        }
        mtx_unlock_spin(&sched_lock);

        if (timo > 0)
                error = cv_timedwait_sig(&selwait, &sellock, timo);
        else
                error = cv_wait_sig(&selwait, &sellock);

        if (error == 0)
                goto retry;

done:
        clear_selinfo_list(td);
        mtx_lock_spin(&sched_lock);
        td->td_flags &= ~TDF_SELECT;
        mtx_unlock_spin(&sched_lock);
        mtx_unlock(&sellock);

done_nosellock:
        /* select is not restarted after signals... */
        if (error == ERESTART)
                error = EINTR;
        if (error == EWOULDBLOCK)
                error = 0;
#define putbits(name, x) \
        if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
                error = error2;
        if (error == 0) {
                int error2;

                putbits(fd_in, 0);
                putbits(fd_ou, 1);
                putbits(fd_ex, 2);
#undef putbits
        }
        if (selbits != &s_selbits[0])
                free(selbits, M_SELECT);

        mtx_unlock(&Giant);
        return (error);
}
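/*
 * Worked example of the bit-buffer layout above (illustrative only):
 * with nd = 100 and 32-bit fd_masks (NFDBITS == 32), nfdbits rounds up
 * to 128 and ncpbytes to 16.  If only fd_in and fd_ex are non-null,
 * nbufbytes is 4 * 16 = 64 bytes, laid out in selbits as
 *
 *      obits[0] obits[2] | ibits[0] ibits[2]
 *      ^ selbits           ^ selbits + nbufbytes/2
 *
 * i.e. the output maps sit together in the first half, which is why a
 * single bzero(selbits, nbufbytes / 2) clears them all at once.
 */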
static int
selscan(td, ibits, obits, nfd)
        struct thread *td;
        fd_mask **ibits, **obits;
        int nfd;
{
        int msk, i, fd;
        fd_mask bits;
        struct file *fp;
        int n = 0;
        /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
        static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
        struct filedesc *fdp = td->td_proc->p_fd;

        FILEDESC_LOCK(fdp);
        for (msk = 0; msk < 3; msk++) {
                if (ibits[msk] == NULL)
                        continue;
                for (i = 0; i < nfd; i += NFDBITS) {
                        bits = ibits[msk][i/NFDBITS];
                        /* ffs(int mask) not portable, fd_mask is long */
                        for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
                                if (!(bits & 1))
                                        continue;
                                if ((fp = fget_locked(fdp, fd)) == NULL) {
                                        FILEDESC_UNLOCK(fdp);
                                        return (EBADF);
                                }
                                if (fo_poll(fp, flag[msk], td->td_ucred,
                                    td)) {
                                        obits[msk][(fd)/NFDBITS] |=
                                            ((fd_mask)1 << ((fd) % NFDBITS));
                                        n++;
                                }
                        }
                }
        }
        FILEDESC_UNLOCK(fdp);
        td->td_retval[0] = n;
        return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
        struct pollfd *fds;
        u_int   nfds;
        int     timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
        struct thread *td;
        struct poll_args *uap;
{
        caddr_t bits;
        char smallbits[32 * sizeof(struct pollfd)];
        struct timeval atv, rtv, ttv;
        int error = 0, timo;
        u_int ncoll, nfds;
        size_t ni;

        nfds = uap->nfds;

        mtx_lock(&Giant);
        /*
         * This is kinda bogus.  We have fd limits, but that is not
         * really related to the size of the pollfd array.  Make sure
         * we let the process use at least FD_SETSIZE entries and at
         * least enough for the current limits.  We want to be reasonably
         * safe, but not overly restrictive.
         */
        if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
            (nfds > FD_SETSIZE)) {
                error = EINVAL;
                goto done2;
        }
        ni = nfds * sizeof(struct pollfd);
        if (ni > sizeof(smallbits))
                bits = malloc(ni, M_TEMP, M_WAITOK);
        else
                bits = smallbits;
        error = copyin(uap->fds, bits, ni);
        if (error)
                goto done_nosellock;
        if (uap->timeout != INFTIM) {
                atv.tv_sec = uap->timeout / 1000;
                atv.tv_usec = (uap->timeout % 1000) * 1000;
                if (itimerfix(&atv)) {
                        error = EINVAL;
                        goto done_nosellock;
                }
                getmicrouptime(&rtv);
                timevaladd(&atv, &rtv);
        } else {
                atv.tv_sec = 0;
                atv.tv_usec = 0;
        }
        timo = 0;
        TAILQ_INIT(&td->td_selq);
        mtx_lock(&sellock);
retry:
        ncoll = nselcoll;
        mtx_lock_spin(&sched_lock);
        td->td_flags |= TDF_SELECT;
        mtx_unlock_spin(&sched_lock);
        mtx_unlock(&sellock);

        error = pollscan(td, (struct pollfd *)bits, nfds);
        mtx_lock(&sellock);
        if (error || td->td_retval[0])
                goto done;
        if (atv.tv_sec || atv.tv_usec) {
                getmicrouptime(&rtv);
                if (timevalcmp(&rtv, &atv, >=))
                        goto done;
                ttv = atv;
                timevalsub(&ttv, &rtv);
                timo = ttv.tv_sec > 24 * 60 * 60 ?
                    24 * 60 * 60 * hz : tvtohz(&ttv);
        }
        /*
         * An event of interest may occur while we do not hold
         * sellock, so check TDF_SELECT and the number of collisions
         * and rescan the file descriptors if necessary.
         */
        mtx_lock_spin(&sched_lock);
        if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
                mtx_unlock_spin(&sched_lock);
                goto retry;
        }
        mtx_unlock_spin(&sched_lock);

        if (timo > 0)
                error = cv_timedwait_sig(&selwait, &sellock, timo);
        else
                error = cv_wait_sig(&selwait, &sellock);

        if (error == 0)
                goto retry;

done:
        clear_selinfo_list(td);
        mtx_lock_spin(&sched_lock);
        td->td_flags &= ~TDF_SELECT;
        mtx_unlock_spin(&sched_lock);
        mtx_unlock(&sellock);

done_nosellock:
        /* poll is not restarted after signals... */
        if (error == ERESTART)
                error = EINTR;
        if (error == EWOULDBLOCK)
                error = 0;
        if (error == 0) {
                error = copyout(bits, uap->fds, ni);
                if (error)
                        goto out;
        }
out:
        if (ni > sizeof(smallbits))
                free(bits, M_TEMP);
done2:
        mtx_unlock(&Giant);
        return (error);
}
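/*
 * Example (illustrative sketch): unlike select(), poll() takes an
 * explicit array of descriptors and a timeout in milliseconds, with
 * INFTIM (-1) meaning "block indefinitely":
 *
 *      struct pollfd pfd;
 *
 *      pfd.fd = sock;
 *      pfd.events = POLLIN;
 *      n = poll(&pfd, 1, 1000);        // wait at most one second
 *      if (n > 0 && (pfd.revents & POLLIN))
 *              ;                       // descriptor is readable
 *
 * Results come back in revents; pollscan() below reports a descriptor
 * that is not open as POLLNVAL in revents rather than failing the
 * whole call the way selscan() returns EBADF.
 */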
static int
pollscan(td, fds, nfd)
        struct thread *td;
        struct pollfd *fds;
        u_int nfd;
{
        register struct filedesc *fdp = td->td_proc->p_fd;
        int i;
        struct file *fp;
        int n = 0;

        FILEDESC_LOCK(fdp);
        for (i = 0; i < nfd; i++, fds++) {
                if (fds->fd >= fdp->fd_nfiles) {
                        fds->revents = POLLNVAL;
                        n++;
                } else if (fds->fd < 0) {
                        fds->revents = 0;
                } else {
                        fp = fdp->fd_ofiles[fds->fd];
                        if (fp == NULL) {
                                fds->revents = POLLNVAL;
                                n++;
                        } else {
                                /*
                                 * Note: backend also returns POLLHUP and
                                 * POLLERR if appropriate.
                                 */
                                fds->revents = fo_poll(fp, fds->events,
                                    td->td_ucred, td);
                                if (fds->revents != 0)
                                        n++;
                        }
                }
        }
        FILEDESC_UNLOCK(fdp);
        td->td_retval[0] = n;
        return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
        struct pollfd *fds;
        u_int   nfds;
        int     timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
        register struct thread *td;
        register struct openbsd_poll_args *uap;
{
        return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
        struct thread *td;
{
        struct selinfo *si;

        mtx_assert(&sellock, MA_OWNED);
        TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
                si->si_thread = NULL;
        TAILQ_INIT(&td->td_selq);
}

/*ARGSUSED*/
int
seltrue(dev, events, td)
        dev_t dev;
        int events;
        struct thread *td;
{

        return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
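/*
 * Example (hypothetical driver sketch): a driver's poll backend uses
 * selrecord() and selwakeup() below as a pair.  The poll routine
 * records the polling thread when no data is ready:
 *
 *      static int
 *      foo_poll(dev_t dev, int events, struct thread *td)
 *      {
 *              int revents = 0;
 *
 *              if (data_available(sc))
 *                      revents = events & (POLLIN | POLLRDNORM);
 *              else
 *                      selrecord(td, &sc->sc_rsel);
 *              return (revents);
 *      }
 *
 * and the interrupt or input path later calls selwakeup(&sc->sc_rsel)
 * when data arrives, waking any select()/poll() sleeper.  Here sc,
 * sc_rsel and data_available() are stand-ins for driver state, not
 * names defined by this file.
 */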
1167 * 1168 * If the thread pointer is not NULL and points back to us then leave 1169 * it alone as we've already added pointed it at us and added it to 1170 * our list. 1171 */ 1172 if (sip->si_thread == NULL) { 1173 sip->si_thread = selector; 1174 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); 1175 } else if (sip->si_thread != selector) { 1176 sip->si_flags |= SI_COLL; 1177 } 1178 1179 mtx_unlock(&sellock); 1180 } 1181 1182 /* 1183 * Do a wakeup when a selectable event occurs. 1184 */ 1185 void 1186 selwakeup(sip) 1187 struct selinfo *sip; 1188 { 1189 struct thread *td; 1190 1191 mtx_lock(&sellock); 1192 td = sip->si_thread; 1193 if ((sip->si_flags & SI_COLL) != 0) { 1194 nselcoll++; 1195 sip->si_flags &= ~SI_COLL; 1196 cv_broadcast(&selwait); 1197 } 1198 if (td == NULL) { 1199 mtx_unlock(&sellock); 1200 return; 1201 } 1202 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); 1203 sip->si_thread = NULL; 1204 mtx_lock_spin(&sched_lock); 1205 if (td->td_wchan == &selwait) { 1206 cv_waitq_remove(td); 1207 TD_CLR_SLEEPING(td); 1208 setrunnable(td); 1209 } else 1210 td->td_flags &= ~TDF_SELECT; 1211 mtx_unlock_spin(&sched_lock); 1212 mtx_unlock(&sellock); 1213 } 1214 1215 static void selectinit(void *); 1216 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) 1217 1218 /* ARGSUSED*/ 1219 static void 1220 selectinit(dummy) 1221 void *dummy; 1222 { 1223 cv_init(&selwait, "select"); 1224 mtx_init(&sellock, "sellck", NULL, MTX_DEF); 1225 } 1226