1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 */ 40 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 #include "opt_ktrace.h" 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/sysproto.h> 49 #include <sys/filedesc.h> 50 #include <sys/filio.h> 51 #include <sys/fcntl.h> 52 #include <sys/file.h> 53 #include <sys/proc.h> 54 #include <sys/signalvar.h> 55 #include <sys/socketvar.h> 56 #include <sys/uio.h> 57 #include <sys/kernel.h> 58 #include <sys/limits.h> 59 #include <sys/malloc.h> 60 #include <sys/poll.h> 61 #include <sys/resourcevar.h> 62 #include <sys/selinfo.h> 63 #include <sys/syscallsubr.h> 64 #include <sys/sysctl.h> 65 #include <sys/sysent.h> 66 #include <sys/bio.h> 67 #include <sys/buf.h> 68 #include <sys/condvar.h> 69 #ifdef KTRACE 70 #include <sys/ktrace.h> 71 #endif 72 #include <vm/vm.h> 73 #include <vm/vm_page.h> 74 75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 76 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 77 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 78 79 static int pollscan(struct thread *, struct pollfd *, u_int); 80 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 81 static int dofileread(struct thread *, struct file *, int, void *, 82 size_t, off_t, int); 83 static int dofilewrite(struct thread *, struct file *, int, 84 const void *, size_t, off_t, int); 85 86 /* 87 * Read system call. 
88 */ 89 #ifndef _SYS_SYSPROTO_H_ 90 struct read_args { 91 int fd; 92 void *buf; 93 size_t nbyte; 94 }; 95 #endif 96 /* 97 * MPSAFE 98 */ 99 int 100 read(td, uap) 101 struct thread *td; 102 struct read_args *uap; 103 { 104 struct file *fp; 105 int error; 106 107 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 108 error = dofileread(td, fp, uap->fd, uap->buf, 109 uap->nbyte, (off_t)-1, 0); 110 fdrop(fp, td); 111 } 112 return(error); 113 } 114 115 /* 116 * Pread system call 117 */ 118 #ifndef _SYS_SYSPROTO_H_ 119 struct pread_args { 120 int fd; 121 void *buf; 122 size_t nbyte; 123 int pad; 124 off_t offset; 125 }; 126 #endif 127 /* 128 * MPSAFE 129 */ 130 int 131 pread(td, uap) 132 struct thread *td; 133 struct pread_args *uap; 134 { 135 struct file *fp; 136 int error; 137 138 if ((error = fget_read(td, uap->fd, &fp)) != 0) 139 return (error); 140 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) { 141 error = ESPIPE; 142 } else { 143 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 144 uap->offset, FOF_OFFSET); 145 } 146 fdrop(fp, td); 147 return(error); 148 } 149 150 /* 151 * Code common for read and pread 152 */ 153 static int 154 dofileread(td, fp, fd, buf, nbyte, offset, flags) 155 struct thread *td; 156 struct file *fp; 157 int fd, flags; 158 void *buf; 159 size_t nbyte; 160 off_t offset; 161 { 162 struct uio auio; 163 struct iovec aiov; 164 long cnt, error = 0; 165 #ifdef KTRACE 166 struct iovec ktriov; 167 struct uio ktruio; 168 int didktr = 0; 169 #endif 170 171 aiov.iov_base = buf; 172 aiov.iov_len = nbyte; 173 auio.uio_iov = &aiov; 174 auio.uio_iovcnt = 1; 175 auio.uio_offset = offset; 176 if (nbyte > INT_MAX) 177 return (EINVAL); 178 auio.uio_resid = nbyte; 179 auio.uio_rw = UIO_READ; 180 auio.uio_segflg = UIO_USERSPACE; 181 auio.uio_td = td; 182 #ifdef KTRACE 183 /* 184 * if tracing, save a copy of iovec 185 */ 186 if (KTRPOINT(td, KTR_GENIO)) { 187 ktriov = aiov; 188 ktruio = auio; 189 didktr = 1; 190 } 191 #endif 192 cnt = nbyte; 193 194 if 
((error = fo_read(fp, &auio, td->td_ucred, flags, td))) { 195 if (auio.uio_resid != cnt && (error == ERESTART || 196 error == EINTR || error == EWOULDBLOCK)) 197 error = 0; 198 } 199 cnt -= auio.uio_resid; 200 #ifdef KTRACE 201 if (didktr && error == 0) { 202 ktruio.uio_iov = &ktriov; 203 ktruio.uio_resid = cnt; 204 ktrgenio(fd, UIO_READ, &ktruio, error); 205 } 206 #endif 207 td->td_retval[0] = cnt; 208 return (error); 209 } 210 211 /* 212 * Scatter read system call. 213 */ 214 #ifndef _SYS_SYSPROTO_H_ 215 struct readv_args { 216 int fd; 217 struct iovec *iovp; 218 u_int iovcnt; 219 }; 220 #endif 221 /* 222 * MPSAFE 223 */ 224 int 225 readv(td, uap) 226 struct thread *td; 227 struct readv_args *uap; 228 { 229 struct file *fp; 230 struct uio auio; 231 struct iovec *iov; 232 struct iovec *needfree; 233 struct iovec aiov[UIO_SMALLIOV]; 234 long i, cnt; 235 int error; 236 u_int iovlen; 237 #ifdef KTRACE 238 struct iovec *ktriov = NULL; 239 struct uio ktruio; 240 #endif 241 242 if ((error = fget_read(td, uap->fd, &fp)) != 0) 243 return (error); 244 needfree = NULL; 245 /* note: can't use iovlen until iovcnt is validated */ 246 iovlen = uap->iovcnt * sizeof (struct iovec); 247 if (uap->iovcnt > UIO_SMALLIOV) { 248 if (uap->iovcnt > UIO_MAXIOV) { 249 error = EINVAL; 250 goto done; 251 } 252 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 253 needfree = iov; 254 } else 255 iov = aiov; 256 auio.uio_iov = iov; 257 auio.uio_iovcnt = uap->iovcnt; 258 auio.uio_rw = UIO_READ; 259 auio.uio_segflg = UIO_USERSPACE; 260 auio.uio_td = td; 261 auio.uio_offset = -1; 262 if ((error = copyin(uap->iovp, iov, iovlen))) 263 goto done; 264 auio.uio_resid = 0; 265 for (i = 0; i < uap->iovcnt; i++) { 266 if (iov->iov_len > INT_MAX - auio.uio_resid) { 267 error = EINVAL; 268 goto done; 269 } 270 auio.uio_resid += iov->iov_len; 271 iov++; 272 } 273 #ifdef KTRACE 274 /* 275 * if tracing, save a copy of iovec 276 */ 277 if (KTRPOINT(td, KTR_GENIO)) { 278 MALLOC(ktriov, struct iovec *, 
iovlen, M_TEMP, M_WAITOK); 279 bcopy(auio.uio_iov, ktriov, iovlen); 280 ktruio = auio; 281 } 282 #endif 283 cnt = auio.uio_resid; 284 if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) { 285 if (auio.uio_resid != cnt && (error == ERESTART || 286 error == EINTR || error == EWOULDBLOCK)) 287 error = 0; 288 } 289 cnt -= auio.uio_resid; 290 #ifdef KTRACE 291 if (ktriov != NULL) { 292 if (error == 0) { 293 ktruio.uio_iov = ktriov; 294 ktruio.uio_resid = cnt; 295 ktrgenio(uap->fd, UIO_READ, &ktruio, error); 296 } 297 FREE(ktriov, M_TEMP); 298 } 299 #endif 300 td->td_retval[0] = cnt; 301 done: 302 fdrop(fp, td); 303 if (needfree) 304 FREE(needfree, M_IOV); 305 return (error); 306 } 307 308 /* 309 * Write system call 310 */ 311 #ifndef _SYS_SYSPROTO_H_ 312 struct write_args { 313 int fd; 314 const void *buf; 315 size_t nbyte; 316 }; 317 #endif 318 /* 319 * MPSAFE 320 */ 321 int 322 write(td, uap) 323 struct thread *td; 324 struct write_args *uap; 325 { 326 struct file *fp; 327 int error; 328 329 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 330 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 331 (off_t)-1, 0); 332 fdrop(fp, td); 333 } else { 334 error = EBADF; /* XXX this can't be right */ 335 } 336 return(error); 337 } 338 339 /* 340 * Pwrite system call 341 */ 342 #ifndef _SYS_SYSPROTO_H_ 343 struct pwrite_args { 344 int fd; 345 const void *buf; 346 size_t nbyte; 347 int pad; 348 off_t offset; 349 }; 350 #endif 351 /* 352 * MPSAFE 353 */ 354 int 355 pwrite(td, uap) 356 struct thread *td; 357 struct pwrite_args *uap; 358 { 359 struct file *fp; 360 int error; 361 362 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 363 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) { 364 error = ESPIPE; 365 } else { 366 error = dofilewrite(td, fp, uap->fd, uap->buf, 367 uap->nbyte, uap->offset, FOF_OFFSET); 368 } 369 fdrop(fp, td); 370 } else { 371 error = EBADF; /* this can't be right */ 372 } 373 return(error); 374 } 375 376 static int 377 dofilewrite(td, fp, fd, buf, 
nbyte, offset, flags) 378 struct thread *td; 379 struct file *fp; 380 int fd, flags; 381 const void *buf; 382 size_t nbyte; 383 off_t offset; 384 { 385 struct uio auio; 386 struct iovec aiov; 387 long cnt, error = 0; 388 #ifdef KTRACE 389 struct iovec ktriov; 390 struct uio ktruio; 391 int didktr = 0; 392 #endif 393 394 aiov.iov_base = (void *)(uintptr_t)buf; 395 aiov.iov_len = nbyte; 396 auio.uio_iov = &aiov; 397 auio.uio_iovcnt = 1; 398 auio.uio_offset = offset; 399 if (nbyte > INT_MAX) 400 return (EINVAL); 401 auio.uio_resid = nbyte; 402 auio.uio_rw = UIO_WRITE; 403 auio.uio_segflg = UIO_USERSPACE; 404 auio.uio_td = td; 405 #ifdef KTRACE 406 /* 407 * if tracing, save a copy of iovec and uio 408 */ 409 if (KTRPOINT(td, KTR_GENIO)) { 410 ktriov = aiov; 411 ktruio = auio; 412 didktr = 1; 413 } 414 #endif 415 cnt = nbyte; 416 if (fp->f_type == DTYPE_VNODE) 417 bwillwrite(); 418 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 419 if (auio.uio_resid != cnt && (error == ERESTART || 420 error == EINTR || error == EWOULDBLOCK)) 421 error = 0; 422 /* Socket layer is responsible for issuing SIGPIPE. 
*/ 423 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 424 PROC_LOCK(td->td_proc); 425 psignal(td->td_proc, SIGPIPE); 426 PROC_UNLOCK(td->td_proc); 427 } 428 } 429 cnt -= auio.uio_resid; 430 #ifdef KTRACE 431 if (didktr && error == 0) { 432 ktruio.uio_iov = &ktriov; 433 ktruio.uio_resid = cnt; 434 ktrgenio(fd, UIO_WRITE, &ktruio, error); 435 } 436 #endif 437 td->td_retval[0] = cnt; 438 return (error); 439 } 440 441 /* 442 * Gather write system call 443 */ 444 #ifndef _SYS_SYSPROTO_H_ 445 struct writev_args { 446 int fd; 447 struct iovec *iovp; 448 u_int iovcnt; 449 }; 450 #endif 451 /* 452 * MPSAFE 453 */ 454 int 455 writev(td, uap) 456 struct thread *td; 457 register struct writev_args *uap; 458 { 459 struct file *fp; 460 struct uio auio; 461 register struct iovec *iov; 462 struct iovec *needfree; 463 struct iovec aiov[UIO_SMALLIOV]; 464 long i, cnt, error = 0; 465 u_int iovlen; 466 #ifdef KTRACE 467 struct iovec *ktriov = NULL; 468 struct uio ktruio; 469 #endif 470 471 if ((error = fget_write(td, uap->fd, &fp)) != 0) 472 return (EBADF); 473 needfree = NULL; 474 /* note: can't use iovlen until iovcnt is validated */ 475 iovlen = uap->iovcnt * sizeof (struct iovec); 476 if (uap->iovcnt > UIO_SMALLIOV) { 477 if (uap->iovcnt > UIO_MAXIOV) { 478 error = EINVAL; 479 goto done; 480 } 481 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 482 needfree = iov; 483 } else 484 iov = aiov; 485 auio.uio_iov = iov; 486 auio.uio_iovcnt = uap->iovcnt; 487 auio.uio_rw = UIO_WRITE; 488 auio.uio_segflg = UIO_USERSPACE; 489 auio.uio_td = td; 490 auio.uio_offset = -1; 491 if ((error = copyin(uap->iovp, iov, iovlen))) 492 goto done; 493 auio.uio_resid = 0; 494 for (i = 0; i < uap->iovcnt; i++) { 495 if (iov->iov_len > INT_MAX - auio.uio_resid) { 496 error = EINVAL; 497 goto done; 498 } 499 auio.uio_resid += iov->iov_len; 500 iov++; 501 } 502 #ifdef KTRACE 503 /* 504 * if tracing, save a copy of iovec and uio 505 */ 506 if (KTRPOINT(td, KTR_GENIO)) { 507 MALLOC(ktriov, struct 
iovec *, iovlen, M_TEMP, M_WAITOK); 508 bcopy(auio.uio_iov, ktriov, iovlen); 509 ktruio = auio; 510 } 511 #endif 512 cnt = auio.uio_resid; 513 if (fp->f_type == DTYPE_VNODE) 514 bwillwrite(); 515 if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) { 516 if (auio.uio_resid != cnt && (error == ERESTART || 517 error == EINTR || error == EWOULDBLOCK)) 518 error = 0; 519 if (error == EPIPE) { 520 PROC_LOCK(td->td_proc); 521 psignal(td->td_proc, SIGPIPE); 522 PROC_UNLOCK(td->td_proc); 523 } 524 } 525 cnt -= auio.uio_resid; 526 #ifdef KTRACE 527 if (ktriov != NULL) { 528 if (error == 0) { 529 ktruio.uio_iov = ktriov; 530 ktruio.uio_resid = cnt; 531 ktrgenio(uap->fd, UIO_WRITE, &ktruio, error); 532 } 533 FREE(ktriov, M_TEMP); 534 } 535 #endif 536 td->td_retval[0] = cnt; 537 done: 538 fdrop(fp, td); 539 if (needfree) 540 FREE(needfree, M_IOV); 541 return (error); 542 } 543 544 /* 545 * Ioctl system call 546 */ 547 #ifndef _SYS_SYSPROTO_H_ 548 struct ioctl_args { 549 int fd; 550 u_long com; 551 caddr_t data; 552 }; 553 #endif 554 /* 555 * MPSAFE 556 */ 557 /* ARGSUSED */ 558 int 559 ioctl(td, uap) 560 struct thread *td; 561 register struct ioctl_args *uap; 562 { 563 struct file *fp; 564 register struct filedesc *fdp; 565 register u_long com; 566 int error = 0; 567 register u_int size; 568 caddr_t data, memp; 569 int tmp; 570 #define STK_PARAMS 128 571 union { 572 char stkbuf[STK_PARAMS]; 573 long align; 574 } ubuf; 575 576 if ((error = fget(td, uap->fd, &fp)) != 0) 577 return (error); 578 mtx_lock(&Giant); 579 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 580 fdrop(fp, td); 581 mtx_unlock(&Giant); 582 return (EBADF); 583 } 584 fdp = td->td_proc->p_fd; 585 switch (com = uap->com) { 586 case FIONCLEX: 587 FILEDESC_LOCK(fdp); 588 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 589 FILEDESC_UNLOCK(fdp); 590 fdrop(fp, td); 591 mtx_unlock(&Giant); 592 return (0); 593 case FIOCLEX: 594 FILEDESC_LOCK(fdp); 595 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 596 FILEDESC_UNLOCK(fdp); 597 
fdrop(fp, td); 598 mtx_unlock(&Giant); 599 return (0); 600 } 601 602 /* 603 * Interpret high order word to find amount of data to be 604 * copied to/from the user's address space. 605 */ 606 size = IOCPARM_LEN(com); 607 if (size > IOCPARM_MAX) { 608 fdrop(fp, td); 609 mtx_unlock(&Giant); 610 return (ENOTTY); 611 } 612 613 memp = NULL; 614 if (size > sizeof (ubuf.stkbuf)) { 615 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 616 data = memp; 617 } else { 618 data = ubuf.stkbuf; 619 } 620 if (com&IOC_IN) { 621 if (size) { 622 error = copyin(uap->data, data, (u_int)size); 623 if (error) { 624 if (memp) 625 free(memp, M_IOCTLOPS); 626 fdrop(fp, td); 627 goto done; 628 } 629 } else { 630 *(caddr_t *)data = uap->data; 631 } 632 } else if ((com&IOC_OUT) && size) { 633 /* 634 * Zero the buffer so the user always 635 * gets back something deterministic. 636 */ 637 bzero(data, size); 638 } else if (com&IOC_VOID) { 639 *(caddr_t *)data = uap->data; 640 } 641 642 switch (com) { 643 644 case FIONBIO: 645 FILE_LOCK(fp); 646 if ((tmp = *(int *)data)) 647 fp->f_flag |= FNONBLOCK; 648 else 649 fp->f_flag &= ~FNONBLOCK; 650 FILE_UNLOCK(fp); 651 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 652 break; 653 654 case FIOASYNC: 655 FILE_LOCK(fp); 656 if ((tmp = *(int *)data)) 657 fp->f_flag |= FASYNC; 658 else 659 fp->f_flag &= ~FASYNC; 660 FILE_UNLOCK(fp); 661 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); 662 break; 663 664 default: 665 error = fo_ioctl(fp, com, data, td->td_ucred, td); 666 /* 667 * Copy any data to user, size was 668 * already set and checked above. 669 */ 670 if (error == 0 && (com&IOC_OUT) && size) 671 error = copyout(data, uap->data, (u_int)size); 672 break; 673 } 674 if (memp) 675 free(memp, M_IOCTLOPS); 676 fdrop(fp, td); 677 done: 678 mtx_unlock(&Giant); 679 return (error); 680 } 681 682 /* 683 * sellock and selwait are initialized in selectinit() via SYSINIT. 
 */
struct mtx	sellock;		/* protects td_selq lists and selinfo ownership */
struct cv	selwait;		/* select()/poll() sleepers wait on this */
u_int	nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	/* Copy in the optional timeout, then let kern_select() do the work. */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

/*
 * Guts of select(): copy in the three descriptor sets, scan them via
 * selscan(), and sleep on selwait until an event fires, the timeout
 * expires, or a signal arrives.  Output bits are copied back to the
 * user sets on success; the count of ready descriptors is returned
 * in td->td_retval[0].
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	/* Each non-null set needs an input copy and an output copy. */
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	/* Output bit halves all sit at the front; zero them in one go. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout into an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count; compared again before sleeping. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24 hours to avoid tvtohz() overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* Spurious or event wakeup: rescan to find out which fds fired. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}

/*
 * Poll every descriptor whose bit is set in one of the input sets and
 * record ready descriptors in the corresponding output set.  Returns
 * EBADF if a set bit names a closed descriptor; the ready count goes
 * in td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;	/* small request: use the stack buffer */
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout into an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count; compared again before sleeping. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24 hours to avoid tvtohz() overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* Spurious or event wakeup: rescan to find out which fds fired. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy the revents back to the user's pollfd array. */
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Poll each entry of the pollfd array, filling in revents.  Invalid
 * descriptors get POLLNVAL; negative fds are skipped per poll(2).
 * The count of entries with nonzero revents goes in td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	/* Argument layout matches poll_args exactly; just forward. */
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Trivial poll backend for devices that are always ready for
 * non-exceptional I/O: report only the read/write events requested.
 */
/*ARGSUSED*/
int
seltrue(dev, events, td)
	dev_t dev;
	int events;
	struct thread *td;
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then
	 * leave it alone: we have already pointed it at ourselves and
	 * added it to our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	/*
	 * On a collision more than one thread may be interested in this
	 * selinfo; bump the collision counter (forcing waiters to rescan)
	 * and wake them all.
	 */
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from its owning thread's list. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == &selwait) {
		/* Owner is asleep on selwait: pull it off and run it. */
		cv_waitq_remove(td);
		TD_CLR_SLEEPING(td);
		setrunnable(td);
	} else
		/* Owner is still scanning; clearing TDF_SELECT forces a retry. */
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * Initialize the select/poll synchronization primitives at boot
 * (runs via the SYSINIT above).
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}