1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_compat.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/sysproto.h> 46 #include <sys/filedesc.h> 47 #include <sys/filio.h> 48 #include <sys/fcntl.h> 49 #include <sys/file.h> 50 #include <sys/proc.h> 51 #include <sys/signalvar.h> 52 #include <sys/socketvar.h> 53 #include <sys/uio.h> 54 #include <sys/kernel.h> 55 #include <sys/limits.h> 56 #include <sys/malloc.h> 57 #include <sys/poll.h> 58 #include <sys/resourcevar.h> 59 #include <sys/selinfo.h> 60 #include <sys/sleepqueue.h> 61 #include <sys/syscallsubr.h> 62 #include <sys/sysctl.h> 63 #include <sys/sysent.h> 64 #include <sys/vnode.h> 65 #include <sys/bio.h> 66 #include <sys/buf.h> 67 #include <sys/condvar.h> 68 #ifdef KTRACE 69 #include <sys/ktrace.h> 70 #endif 71 72 #include <sys/ktr.h> 73 74 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 75 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 76 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 77 78 static int pollscan(struct thread *, struct pollfd *, u_int); 79 static int pollrescan(struct thread *); 80 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 81 static int selrescan(struct thread *, fd_mask **, fd_mask **); 82 static void selfdalloc(struct thread *, void *); 83 static void selfdfree(struct seltd *, struct selfd *); 84 static int dofileread(struct thread *, int, struct file *, struct uio *, 85 off_t, int); 86 static int dofilewrite(struct thread *, int, struct file *, struct uio *, 87 off_t, int); 88 static void doselwakeup(struct selinfo *, int); 89 static void seltdinit(struct thread *); 90 static int seltdwait(struct thread *, int); 91 static void seltdclear(struct thread *); 92 93 /* 94 * One seltd per-thread allocated on demand as needed. 
 *
 * t - protected by st_mtx
 * k - Only accessed by curthread or read-only
 */
struct seltd {
	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
	struct selfd		*st_free1;	/* (k) free fd for read set. */
	struct selfd		*st_free2;	/* (k) free fd for write set. */
	struct mtx		st_mtx;		/* Protects struct seltd */
	struct cv		st_wait;	/* (t) Wait channel. */
	int			st_flags;	/* (t) SELTD_ flags. */
};

#define	SELTD_PENDING	0x0001			/* We have pending events. */
#define	SELTD_RESCAN	0x0002			/* Doing a rescan. */

/*
 * One selfd allocated per-thread per-file-descriptor.
 *	f - protected by sf_mtx
 */
struct selfd {
	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
	struct seltd		*sf_td;		/* (k) owning seltd. */
	void			*sf_cookie;	/* (k) fd or pollfd. */
};

/* UMA zone that all selfd structures are allocated from (see selectinit()). */
static uma_zone_t selfd_zone;

#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
/*
 * Read system call: read(2).  Builds a single-element uio around the
 * user buffer and hands off to kern_readv().
 */
int
read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	/* uio_resid cannot represent more than INT_MAX; reject larger reads. */
	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return(error);
}

/*
 * Positioned read system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, uap->fd, &auio, uap->offset);
	return(error);
}

/*
 * FreeBSD 6 compatible pread(2): repackage the old argument layout into
 * the current pread_args and forward to pread() above.
 */
int
freebsd6_pread(td, uap)
	struct thread *td;
	struct freebsd6_pread_args *uap;
{
	struct pread_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (pread(td, &oargs));
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	/* copyinuio() allocates the uio from M_IOV; we must free it here. */
	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

/*
 * Back end shared by read(2) and readv(2): look up the descriptor for
 * reading and perform the I/O at the file's current offset (offset -1,
 * flags 0 mean "use and advance the implicit file offset").
 */
int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

/*
 * Back end shared by pread(2) and preadv(2).  Positioned reads are only
 * valid on seekable descriptors (ESPIPE otherwise), and negative offsets
 * are only allowed on character devices.
 */
int
kern_preadv(td, fd, auio, offset)
	struct thread *td;
	int fd;
	struct uio *auio;
	off_t offset;
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 */
static int
dofileread(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return(0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	/* Clone the uio before fo_read() consumes it, for ktrace logging. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		/* A partially completed read is reported as success. */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* cnt becomes the number of bytes actually transferred. */
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
/*
 * Write system call: write(2).  Builds a single-element uio around the
 * user buffer and hands off to kern_writev().
 */
int
write(td, uap)
	struct thread *td;
	struct write_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	/* Cast away const: the uio machinery takes a non-const iov_base. */
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return(error);
}

/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
pwrite(td, uap)
	struct thread *td;
	struct pwrite_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	/* Cast away const: the uio machinery takes a non-const iov_base. */
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
	return(error);
}

/*
 * FreeBSD 6 compatible pwrite(2): repackage the old argument layout into
 * the current pwrite_args and forward to pwrite() above.
 */
int
freebsd6_pwrite(td, uap)
	struct thread *td;
	struct freebsd6_pwrite_args *uap;
{
	struct pwrite_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (pwrite(td, &oargs));
}

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	/* copyinuio() allocates the uio from M_IOV; we must free it here. */
	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

/*
 * Back end shared by write(2) and writev(2): look up the descriptor for
 * writing and perform the I/O at the file's current offset (offset -1,
 * flags 0 mean "use and advance the implicit file offset").
 */
int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

/*
 * Back end shared by pwrite(2) and pwritev(2).  Positioned writes are only
 * valid on seekable descriptors (ESPIPE otherwise), and negative offsets
 * are only allowed on character devices.
 */
int
kern_pwritev(td, fd, auio, offset)
	struct thread *td;
	struct uio *auio;
	int fd;
	off_t offset;
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 */
static int
dofilewrite(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	/* Clone the uio before fo_write() consumes it, for ktrace logging. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		/* A partially completed write is reported as success. */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	/* cnt becomes the number of bytes actually transferred. */
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
/*
 * ioctl(2): validate and decode the command word, stage the in/out
 * argument data in a kernel buffer, and dispatch to kern_ioctl().
 */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	/*
	 * 32-bit userland may sign-extend the command word; warn once per
	 * call and truncate back to 32 bits.
	 */
	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (!(com & IOC_VOID))
			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		else {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			/* size reset to 0 so we never free() this path. */
			size = 0;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			if (size > 0)
				free(data, M_IOCTLOPS);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (size > 0)
		free(data, M_IOCTLOPS);
	return (error);
}

/*
 * Kernel back end for ioctl(2).  Handles the generic descriptor-level
 * commands (close-on-exec, non-blocking, async) here; everything else is
 * passed down to the file's fo_ioctl method.
 */
int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error;
	int tmp;

	if ((error = fget(td, fd, &fp)) != 0)
		return (error);
	/* ioctl is only permitted on descriptors open for read or write. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com) {
	case FIONCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIONBIO:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		/* Fall through to fo_ioctl with the normalized argument. */
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	fdrop(fp, td);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * select(2): copy in the optional timeout and forward to kern_select().
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

/*
 * Kernel back end for select(2): stage the user fd_sets in kernel bit
 * buffers, then scan/sleep/rescan until a descriptor is ready, the
 * timeout expires, or a signal arrives.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_SLOCK(fdp);
	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_SUNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done;				\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = selscan(td, ibits, obits, nd);
		if (error || td->td_retval[0] != 0)
			break;
		if (atv.tv_sec || atv.tv_usec) {
			getmicrouptime(&rtv);
			if (timevalcmp(&rtv, &atv, >=))
				break;
			ttv = atv;
			timevalsub(&ttv, &rtv);
			/* Clamp the sleep to one day to avoid tvtohz overflow. */
			timo = ttv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&ttv);
		}
		error = seltdwait(td, timo);
		if (error)
			break;
		error = selrescan(td, ibits, obits);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

/*
 * Traverse the list of fds attached to this thread's seltd and check for
 * completion.
 */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct file *fp;
	int msk, fd;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	stp = td->td_sel;
	FILEDESC_SLOCK(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (int)(uintptr_t)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		if ((fp = fget_locked(fdp, fd)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			return (EBADF);
		}
		/* Re-poll the fd in each set (read/write/except) it was in. */
		for (msk = 0; msk < 3; msk++) {
			if (ibits[msk] == NULL)
				continue;
			if ((ibits[msk][fd/NFDBITS] &
			    ((fd_mask) 1 << (fd % NFDBITS))) == 0)
				continue;
			if (fo_poll(fp, flag[msk], td->td_ucred, td)) {
				obits[msk][(fd)/NFDBITS] |=
				    ((fd_mask)1 << ((fd) % NFDBITS));
				n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

/*
 * Perform the initial filedescriptor scan and register ourselves with
 * each selinfo.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_SLOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_SUNLOCK(fdp);
					return (EBADF);
				}
				/* Stage selfds so fo_poll can selrecord us. */
				selfdalloc(td, (void *)(uintptr_t)fd);
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * poll(2): copy in the pollfd array, then scan/sleep/rescan until an
 * event fires, the timeout expires, or a signal arrives.
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		return (EINVAL);
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done;
	if (uap->timeout != INFTIM) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = pollscan(td, bits, nfds);
		if (error || td->td_retval[0] != 0)
			break;
		if (atv.tv_sec || atv.tv_usec) {
			getmicrouptime(&rtv);
			if (timevalcmp(&rtv, &atv, >=))
				break;
			ttv = atv;
			timevalsub(&ttv, &rtv);
			/* Clamp the sleep to one day to avoid tvtohz overflow. */
			timo = ttv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&ttv);
		}
		error = seltdwait(td, timo);
		if (error)
			break;
		error = pollrescan(td);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}

/*
 * Re-poll only the pollfds whose selinfo fired while we slept; the
 * pollfd pointer was stashed in each selfd's sf_cookie by pollscan().
 */
static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
	int n;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	FILEDESC_SLOCK(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		fp = fdp->fd_ofiles[fd->fd];
		if (fp == NULL) {
			fd->revents = POLLNVAL;
			n++;
			continue;
		}
		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (fd->revents != 0)
			n++;
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}


/*
 * Initial scan for poll(2): poll every pollfd once and register this
 * thread with each backing selinfo via selfdalloc()/selrecord().
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored per POSIX. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				selfdalloc(td, fds);
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}

/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
	struct timeval atv, rtv, ttv;
	int error, timo;

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv))
			return (EINVAL);
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}

	timo = 0;
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		selfdalloc(td, NULL);
		error = sopoll(so, events, NULL, td);
		/* error here is actually the ready events. */
		if (error)
			return (0);
		if (atv.tv_sec || atv.tv_usec) {
			getmicrouptime(&rtv);
			if (timevalcmp(&rtv, &atv, >=)) {
				seltdclear(td);
				return (EWOULDBLOCK);
			}
			ttv = atv;
			timevalsub(&ttv, &rtv);
			/* Clamp the sleep to one day to avoid tvtohz overflow. */
			timo = ttv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&ttv);
		}
		error = seltdwait(td, timo);
		seltdclear(td);
		if (error)
			break;
	}
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}

/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp->st_free1 == NULL)
		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free1->sf_td = stp;
	stp->st_free1->sf_cookie = cookie;
	if (stp->st_free2 == NULL)
		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free2->sf_td = stp;
	stp->st_free2->sf_cookie = cookie;
}

/*
 * Unlink a selfd from both its owning seltd and (under the selinfo mutex)
 * from the selinfo's thread list, then return it to the zone.
 */
static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	mtx_lock(sfp->sf_mtx);
	if (sfp->sf_si)
		TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
	mtx_unlock(sfp->sf_mtx);
	uma_zfree(selfd_zone, sfp);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{
	struct selfd *sfp;
	struct seltd *stp;
	struct mtx *mtxp;

	stp = selector->td_sel;
	/*
	 * Don't record when doing a rescan.
	 */
	if (stp->st_flags & SELTD_RESCAN)
		return;
	/*
	 * Grab one of the preallocated descriptors.
	 */
	sfp = NULL;
	if ((sfp = stp->st_free1) != NULL)
		stp->st_free1 = NULL;
	else if ((sfp = stp->st_free2) != NULL)
		stp->st_free2 = NULL;
	else
		panic("selrecord: No free selfd on selq");
	mtxp = mtx_pool_find(mtxpool_sleep, sip);
	/*
	 * Initialize the sfp and queue it in the thread.
	 */
	sfp->sf_si = sip;
	sfp->sf_mtx = mtxp;
	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
	/*
	 * Now that we've locked the sip, check for initialization.
	 */
	mtx_lock(mtxp);
	if (sip->si_mtx == NULL) {
		sip->si_mtx = mtxp;
		TAILQ_INIT(&sip->si_tdlist);
	}
	/*
	 * Add this thread to the list of selfds listening on this selinfo.
	 */
	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
	mtx_unlock(sip->si_mtx);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct selfd *sfp;
	struct selfd *sfn;
	struct seltd *stp;

	/* If it's not initialized there can't be any waiters. */
	if (sip->si_mtx == NULL)
		return;
	/*
	 * Locking the selinfo locks all selfds associated with it.
	 */
	mtx_lock(sip->si_mtx);
	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
		/*
		 * Once we remove this sfp from the list and clear the
		 * sf_si seltdclear will know to ignore this si.
		 */
		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
		sfp->sf_si = NULL;
		stp = sfp->sf_td;
		mtx_lock(&stp->st_mtx);
		stp->st_flags |= SELTD_PENDING;
		cv_broadcastpri(&stp->st_wait, pri);
		mtx_unlock(&stp->st_mtx);
	}
	mtx_unlock(sip->si_mtx);
}

/*
 * Lazily allocate this thread's seltd on first use, and reset its state
 * for a new select/poll cycle.
 */
static void
seltdinit(struct thread *td)
{
	struct seltd *stp;

	if ((stp = td->td_sel) != NULL)
		goto out;
	td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
	cv_init(&stp->st_wait, "select");
out:
	stp->st_flags = 0;
	STAILQ_INIT(&stp->st_selq);
}

/*
 * Sleep until a registered event fires, the timeout expires (timo ticks,
 * 0 means no timeout), or a signal arrives.
 */
static int
seltdwait(struct thread *td, int timo)
{
	struct seltd *stp;
	int error;

	stp = td->td_sel;
	/*
	 * An event of interest may occur while we do not hold the seltd
	 * locked so check the pending flag before we sleep.
	 */
	mtx_lock(&stp->st_mtx);
	/*
	 * Any further calls to selrecord will be a rescan.
	 */
	stp->st_flags |= SELTD_RESCAN;
	if (stp->st_flags & SELTD_PENDING) {
		mtx_unlock(&stp->st_mtx);
		return (0);
	}
	if (timo > 0)
		error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo);
	else
		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
	mtx_unlock(&stp->st_mtx);

	return (error);
}

/*
 * Release a thread's seltd and its cached free selfds; called at thread
 * teardown.
 */
void
seltdfini(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp == NULL)
		return;
	if (stp->st_free1)
		uma_zfree(selfd_zone, stp->st_free1);
	if (stp->st_free2)
		uma_zfree(selfd_zone, stp->st_free2);
	td->td_sel = NULL;
	free(stp, M_SELECT);
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 */
static void
seltdclear(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;

	stp = td->td_sel;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
		selfdfree(stp, sfp);
	stp->st_flags = 0;
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);

/* Create the UMA zone backing all selfd allocations, at boot time. */
static void
selectinit(void *dummy __unused)
{
	selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
}