1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD$ 40 */ 41 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/sysproto.h> 47 #include <sys/filedesc.h> 48 #include <sys/filio.h> 49 #include <sys/fcntl.h> 50 #include <sys/file.h> 51 #include <sys/proc.h> 52 #include <sys/signalvar.h> 53 #include <sys/socketvar.h> 54 #include <sys/uio.h> 55 #include <sys/kernel.h> 56 #include <sys/malloc.h> 57 #include <sys/poll.h> 58 #include <sys/resourcevar.h> 59 #include <sys/selinfo.h> 60 #include <sys/sysctl.h> 61 #include <sys/sysent.h> 62 #include <sys/bio.h> 63 #include <sys/buf.h> 64 #include <sys/condvar.h> 65 #ifdef __alpha__ 66 #include <sys/disklabel.h> 67 #endif 68 #ifdef KTRACE 69 #include <sys/ktrace.h> 70 #endif 71 #include <vm/vm.h> 72 #include <vm/vm_page.h> 73 74 #include <machine/limits.h> 75 76 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 78 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 79 80 static int pollscan(struct thread *, struct pollfd *, u_int); 81 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 82 static int dofileread(struct thread *, struct file *, int, void *, 83 size_t, off_t, int); 84 static int dofilewrite(struct thread *, struct file *, int, 85 const void *, size_t, off_t, int); 86 87 /* 88 * Read system call. 
89 */ 90 #ifndef _SYS_SYSPROTO_H_ 91 struct read_args { 92 int fd; 93 void *buf; 94 size_t nbyte; 95 }; 96 #endif 97 /* 98 * MPSAFE 99 */ 100 int 101 read(td, uap) 102 struct thread *td; 103 struct read_args *uap; 104 { 105 struct file *fp; 106 int error; 107 108 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 109 error = dofileread(td, fp, uap->fd, uap->buf, 110 uap->nbyte, (off_t)-1, 0); 111 fdrop(fp, td); 112 } 113 return(error); 114 } 115 116 /* 117 * Pread system call 118 */ 119 #ifndef _SYS_SYSPROTO_H_ 120 struct pread_args { 121 int fd; 122 void *buf; 123 size_t nbyte; 124 int pad; 125 off_t offset; 126 }; 127 #endif 128 /* 129 * MPSAFE 130 */ 131 int 132 pread(td, uap) 133 struct thread *td; 134 struct pread_args *uap; 135 { 136 struct file *fp; 137 int error; 138 139 if ((error = fget_read(td, uap->fd, &fp)) != 0) 140 return (error); 141 if (fp->f_type != DTYPE_VNODE) { 142 error = ESPIPE; 143 } else { 144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 145 uap->offset, FOF_OFFSET); 146 } 147 fdrop(fp, td); 148 return(error); 149 } 150 151 /* 152 * Code common for read and pread 153 */ 154 int 155 dofileread(td, fp, fd, buf, nbyte, offset, flags) 156 struct thread *td; 157 struct file *fp; 158 int fd, flags; 159 void *buf; 160 size_t nbyte; 161 off_t offset; 162 { 163 struct uio auio; 164 struct iovec aiov; 165 long cnt, error = 0; 166 #ifdef KTRACE 167 struct iovec ktriov; 168 struct uio ktruio; 169 int didktr = 0; 170 #endif 171 172 aiov.iov_base = (caddr_t)buf; 173 aiov.iov_len = nbyte; 174 auio.uio_iov = &aiov; 175 auio.uio_iovcnt = 1; 176 auio.uio_offset = offset; 177 if (nbyte > INT_MAX) 178 return (EINVAL); 179 auio.uio_resid = nbyte; 180 auio.uio_rw = UIO_READ; 181 auio.uio_segflg = UIO_USERSPACE; 182 auio.uio_td = td; 183 #ifdef KTRACE 184 /* 185 * if tracing, save a copy of iovec 186 */ 187 if (KTRPOINT(td, KTR_GENIO)) { 188 ktriov = aiov; 189 ktruio = auio; 190 didktr = 1; 191 } 192 #endif 193 cnt = nbyte; 194 195 if ((error = 
fo_read(fp, &auio, fp->f_cred, flags, td))) { 196 if (auio.uio_resid != cnt && (error == ERESTART || 197 error == EINTR || error == EWOULDBLOCK)) 198 error = 0; 199 } 200 cnt -= auio.uio_resid; 201 #ifdef KTRACE 202 if (didktr && error == 0) { 203 ktruio.uio_iov = &ktriov; 204 ktruio.uio_resid = cnt; 205 ktrgenio(fd, UIO_READ, &ktruio, error); 206 } 207 #endif 208 td->td_retval[0] = cnt; 209 return (error); 210 } 211 212 /* 213 * Scatter read system call. 214 */ 215 #ifndef _SYS_SYSPROTO_H_ 216 struct readv_args { 217 int fd; 218 struct iovec *iovp; 219 u_int iovcnt; 220 }; 221 #endif 222 /* 223 * MPSAFE 224 */ 225 int 226 readv(td, uap) 227 struct thread *td; 228 struct readv_args *uap; 229 { 230 struct file *fp; 231 struct uio auio; 232 struct iovec *iov; 233 struct iovec *needfree; 234 struct iovec aiov[UIO_SMALLIOV]; 235 long i, cnt; 236 int error; 237 u_int iovlen; 238 #ifdef KTRACE 239 struct iovec *ktriov = NULL; 240 struct uio ktruio; 241 #endif 242 243 if ((error = fget_read(td, uap->fd, &fp)) != 0) 244 return (error); 245 needfree = NULL; 246 /* note: can't use iovlen until iovcnt is validated */ 247 iovlen = uap->iovcnt * sizeof (struct iovec); 248 if (uap->iovcnt > UIO_SMALLIOV) { 249 if (uap->iovcnt > UIO_MAXIOV) { 250 error = EINVAL; 251 goto done; 252 } 253 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 254 needfree = iov; 255 } else 256 iov = aiov; 257 auio.uio_iov = iov; 258 auio.uio_iovcnt = uap->iovcnt; 259 auio.uio_rw = UIO_READ; 260 auio.uio_segflg = UIO_USERSPACE; 261 auio.uio_td = td; 262 auio.uio_offset = -1; 263 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 264 goto done; 265 auio.uio_resid = 0; 266 for (i = 0; i < uap->iovcnt; i++) { 267 if (iov->iov_len > INT_MAX - auio.uio_resid) { 268 error = EINVAL; 269 goto done; 270 } 271 auio.uio_resid += iov->iov_len; 272 iov++; 273 } 274 #ifdef KTRACE 275 /* 276 * if tracing, save a copy of iovec 277 */ 278 if (KTRPOINT(td, KTR_GENIO)) { 279 MALLOC(ktriov, struct iovec *, 
iovlen, M_TEMP, M_WAITOK); 280 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 281 ktruio = auio; 282 } 283 #endif 284 cnt = auio.uio_resid; 285 if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) { 286 if (auio.uio_resid != cnt && (error == ERESTART || 287 error == EINTR || error == EWOULDBLOCK)) 288 error = 0; 289 } 290 cnt -= auio.uio_resid; 291 #ifdef KTRACE 292 if (ktriov != NULL) { 293 if (error == 0) { 294 ktruio.uio_iov = ktriov; 295 ktruio.uio_resid = cnt; 296 ktrgenio(uap->fd, UIO_READ, &ktruio, error); 297 } 298 FREE(ktriov, M_TEMP); 299 } 300 #endif 301 td->td_retval[0] = cnt; 302 done: 303 fdrop(fp, td); 304 if (needfree) 305 FREE(needfree, M_IOV); 306 return (error); 307 } 308 309 /* 310 * Write system call 311 */ 312 #ifndef _SYS_SYSPROTO_H_ 313 struct write_args { 314 int fd; 315 const void *buf; 316 size_t nbyte; 317 }; 318 #endif 319 /* 320 * MPSAFE 321 */ 322 int 323 write(td, uap) 324 struct thread *td; 325 struct write_args *uap; 326 { 327 struct file *fp; 328 int error; 329 330 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 331 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 332 (off_t)-1, 0); 333 fdrop(fp, td); 334 } else { 335 error = EBADF; /* XXX this can't be right */ 336 } 337 return(error); 338 } 339 340 /* 341 * Pwrite system call 342 */ 343 #ifndef _SYS_SYSPROTO_H_ 344 struct pwrite_args { 345 int fd; 346 const void *buf; 347 size_t nbyte; 348 int pad; 349 off_t offset; 350 }; 351 #endif 352 /* 353 * MPSAFE 354 */ 355 int 356 pwrite(td, uap) 357 struct thread *td; 358 struct pwrite_args *uap; 359 { 360 struct file *fp; 361 int error; 362 363 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 364 if (fp->f_type == DTYPE_VNODE) { 365 error = dofilewrite(td, fp, uap->fd, uap->buf, 366 uap->nbyte, uap->offset, FOF_OFFSET); 367 } else { 368 error = ESPIPE; 369 } 370 fdrop(fp, td); 371 } else { 372 error = EBADF; /* this can't be right */ 373 } 374 return(error); 375 } 376 377 static int 378 dofilewrite(td, fp, fd, 
buf, nbyte, offset, flags) 379 struct thread *td; 380 struct file *fp; 381 int fd, flags; 382 const void *buf; 383 size_t nbyte; 384 off_t offset; 385 { 386 struct uio auio; 387 struct iovec aiov; 388 long cnt, error = 0; 389 #ifdef KTRACE 390 struct iovec ktriov; 391 struct uio ktruio; 392 int didktr = 0; 393 #endif 394 395 aiov.iov_base = (void *)(uintptr_t)buf; 396 aiov.iov_len = nbyte; 397 auio.uio_iov = &aiov; 398 auio.uio_iovcnt = 1; 399 auio.uio_offset = offset; 400 if (nbyte > INT_MAX) 401 return (EINVAL); 402 auio.uio_resid = nbyte; 403 auio.uio_rw = UIO_WRITE; 404 auio.uio_segflg = UIO_USERSPACE; 405 auio.uio_td = td; 406 #ifdef KTRACE 407 /* 408 * if tracing, save a copy of iovec and uio 409 */ 410 if (KTRPOINT(td, KTR_GENIO)) { 411 ktriov = aiov; 412 ktruio = auio; 413 didktr = 1; 414 } 415 #endif 416 cnt = nbyte; 417 if (fp->f_type == DTYPE_VNODE) 418 bwillwrite(); 419 if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) { 420 if (auio.uio_resid != cnt && (error == ERESTART || 421 error == EINTR || error == EWOULDBLOCK)) 422 error = 0; 423 if (error == EPIPE) { 424 PROC_LOCK(td->td_proc); 425 psignal(td->td_proc, SIGPIPE); 426 PROC_UNLOCK(td->td_proc); 427 } 428 } 429 cnt -= auio.uio_resid; 430 #ifdef KTRACE 431 if (didktr && error == 0) { 432 ktruio.uio_iov = &ktriov; 433 ktruio.uio_resid = cnt; 434 ktrgenio(fd, UIO_WRITE, &ktruio, error); 435 } 436 #endif 437 td->td_retval[0] = cnt; 438 return (error); 439 } 440 441 /* 442 * Gather write system call 443 */ 444 #ifndef _SYS_SYSPROTO_H_ 445 struct writev_args { 446 int fd; 447 struct iovec *iovp; 448 u_int iovcnt; 449 }; 450 #endif 451 /* 452 * MPSAFE 453 */ 454 int 455 writev(td, uap) 456 struct thread *td; 457 register struct writev_args *uap; 458 { 459 struct file *fp; 460 struct uio auio; 461 register struct iovec *iov; 462 struct iovec *needfree; 463 struct iovec aiov[UIO_SMALLIOV]; 464 long i, cnt, error = 0; 465 u_int iovlen; 466 #ifdef KTRACE 467 struct iovec *ktriov = NULL; 468 struct uio 
ktruio; 469 #endif 470 471 mtx_lock(&Giant); 472 if ((error = fget_write(td, uap->fd, &fp)) != 0) { 473 error = EBADF; 474 goto done2; 475 } 476 /* note: can't use iovlen until iovcnt is validated */ 477 iovlen = uap->iovcnt * sizeof (struct iovec); 478 if (uap->iovcnt > UIO_SMALLIOV) { 479 if (uap->iovcnt > UIO_MAXIOV) { 480 needfree = NULL; 481 error = EINVAL; 482 goto done; 483 } 484 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 485 needfree = iov; 486 } else { 487 iov = aiov; 488 needfree = NULL; 489 } 490 auio.uio_iov = iov; 491 auio.uio_iovcnt = uap->iovcnt; 492 auio.uio_rw = UIO_WRITE; 493 auio.uio_segflg = UIO_USERSPACE; 494 auio.uio_td = td; 495 auio.uio_offset = -1; 496 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 497 goto done; 498 auio.uio_resid = 0; 499 for (i = 0; i < uap->iovcnt; i++) { 500 if (iov->iov_len > INT_MAX - auio.uio_resid) { 501 error = EINVAL; 502 goto done; 503 } 504 auio.uio_resid += iov->iov_len; 505 iov++; 506 } 507 #ifdef KTRACE 508 /* 509 * if tracing, save a copy of iovec and uio 510 */ 511 if (KTRPOINT(td, KTR_GENIO)) { 512 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 513 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 514 ktruio = auio; 515 } 516 #endif 517 cnt = auio.uio_resid; 518 if (fp->f_type == DTYPE_VNODE) 519 bwillwrite(); 520 if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) { 521 if (auio.uio_resid != cnt && (error == ERESTART || 522 error == EINTR || error == EWOULDBLOCK)) 523 error = 0; 524 if (error == EPIPE) { 525 PROC_LOCK(td->td_proc); 526 psignal(td->td_proc, SIGPIPE); 527 PROC_UNLOCK(td->td_proc); 528 } 529 } 530 cnt -= auio.uio_resid; 531 #ifdef KTRACE 532 if (ktriov != NULL) { 533 if (error == 0) { 534 ktruio.uio_iov = ktriov; 535 ktruio.uio_resid = cnt; 536 ktrgenio(uap->fd, UIO_WRITE, &ktruio, error); 537 } 538 FREE(ktriov, M_TEMP); 539 } 540 #endif 541 td->td_retval[0] = cnt; 542 done: 543 fdrop(fp, td); 544 if (needfree) 545 FREE(needfree, M_IOV); 546 
done2: 547 mtx_unlock(&Giant); 548 return (error); 549 } 550 551 /* 552 * Ioctl system call 553 */ 554 #ifndef _SYS_SYSPROTO_H_ 555 struct ioctl_args { 556 int fd; 557 u_long com; 558 caddr_t data; 559 }; 560 #endif 561 /* 562 * MPSAFE 563 */ 564 /* ARGSUSED */ 565 int 566 ioctl(td, uap) 567 struct thread *td; 568 register struct ioctl_args *uap; 569 { 570 struct file *fp; 571 register struct filedesc *fdp; 572 register u_long com; 573 int error = 0; 574 register u_int size; 575 caddr_t data, memp; 576 int tmp; 577 #define STK_PARAMS 128 578 union { 579 char stkbuf[STK_PARAMS]; 580 long align; 581 } ubuf; 582 583 if ((error = fget(td, uap->fd, &fp)) != 0) 584 return (error); 585 mtx_lock(&Giant); 586 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 587 fdrop(fp, td); 588 mtx_unlock(&Giant); 589 return (EBADF); 590 } 591 fdp = td->td_proc->p_fd; 592 switch (com = uap->com) { 593 case FIONCLEX: 594 FILEDESC_LOCK(fdp); 595 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 596 FILEDESC_UNLOCK(fdp); 597 fdrop(fp, td); 598 mtx_unlock(&Giant); 599 return (0); 600 case FIOCLEX: 601 FILEDESC_LOCK(fdp); 602 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 603 FILEDESC_UNLOCK(fdp); 604 fdrop(fp, td); 605 mtx_unlock(&Giant); 606 return (0); 607 } 608 609 /* 610 * Interpret high order word to find amount of data to be 611 * copied to/from the user's address space. 
612 */ 613 size = IOCPARM_LEN(com); 614 if (size > IOCPARM_MAX) { 615 fdrop(fp, td); 616 mtx_unlock(&Giant); 617 return (ENOTTY); 618 } 619 620 memp = NULL; 621 if (size > sizeof (ubuf.stkbuf)) { 622 memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 623 data = memp; 624 } else { 625 data = ubuf.stkbuf; 626 } 627 if (com&IOC_IN) { 628 if (size) { 629 error = copyin(uap->data, data, (u_int)size); 630 if (error) { 631 if (memp) 632 free(memp, M_IOCTLOPS); 633 fdrop(fp, td); 634 goto done; 635 } 636 } else { 637 *(caddr_t *)data = uap->data; 638 } 639 } else if ((com&IOC_OUT) && size) { 640 /* 641 * Zero the buffer so the user always 642 * gets back something deterministic. 643 */ 644 bzero(data, size); 645 } else if (com&IOC_VOID) { 646 *(caddr_t *)data = uap->data; 647 } 648 649 switch (com) { 650 651 case FIONBIO: 652 FILE_LOCK(fp); 653 if ((tmp = *(int *)data)) 654 fp->f_flag |= FNONBLOCK; 655 else 656 fp->f_flag &= ~FNONBLOCK; 657 FILE_UNLOCK(fp); 658 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); 659 break; 660 661 case FIOASYNC: 662 FILE_LOCK(fp); 663 if ((tmp = *(int *)data)) 664 fp->f_flag |= FASYNC; 665 else 666 fp->f_flag &= ~FASYNC; 667 FILE_UNLOCK(fp); 668 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); 669 break; 670 671 default: 672 error = fo_ioctl(fp, com, data, td); 673 /* 674 * Copy any data to user, size was 675 * already set and checked above. 676 */ 677 if (error == 0 && (com&IOC_OUT) && size) 678 error = copyout(data, uap->data, (u_int)size); 679 break; 680 } 681 if (memp) 682 free(memp, M_IOCTLOPS); 683 fdrop(fp, td); 684 done: 685 mtx_unlock(&Giant); 686 return (error); 687 } 688 689 /* 690 * sellock and selwait are initialized in selectinit() via SYSINIT. 691 */ 692 struct mtx sellock; 693 struct cv selwait; 694 u_int nselcoll; /* Select collisions since boot */ 695 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 696 697 /* 698 * Select system call. 
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	/* Each requested set needs an input copy and an output copy. */
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together: outputs occupy the first half of selbits, inputs the
	 * second half.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done_nosellock;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* atv == 0 means block indefinitely. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	mtx_lock(&sellock);
retry:
	/*
	 * Snapshot the collision counter and mark ourselves as selecting
	 * before dropping sellock; selwakeup() clears TDF_SELECT and/or
	 * bumps nselcoll, which we re-check below to detect missed events.
	 */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* XXX Is there a better place for this? */
	TAILQ_INIT(&td->td_selq);
	error = selscan(td, ibits, obits, uap->nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the cv timeout to 24 hours' worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* Woken without error/timeout/signal: something may be ready. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}

/*
 * Scan the three input bit sets, polling each marked descriptor via
 * fo_poll() and setting the corresponding output bit for those that
 * are ready.  Returns 0 (with the ready count in td->td_retval[0])
 * or EBADF for a bit set on a closed descriptor.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	/* Use the on-stack array for small requests, else malloc. */
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_nosellock;
	if (SCARG(uap, timeout) != INFTIM) {
		/* Millisecond timeout -> absolute uptime deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* atv == 0 means block indefinitely. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	mtx_lock(&sellock);
retry:
	/* Same wakeup/collision protocol as select(); see comments there. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* XXX Is there a better place for this? */
	TAILQ_INIT(&td->td_selq);
	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the cv timeout to 24 hours' worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the revents fields back to the user's array. */
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Fill in fds[i].revents for each pollfd by calling fo_poll(); a bad
 * descriptor yields POLLNVAL rather than an error.  The count of
 * entries with non-zero revents is returned in td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fd: ignore, per poll() convention. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1089 */ 1090 #ifndef _SYS_SYSPROTO_H_ 1091 struct openbsd_poll_args { 1092 struct pollfd *fds; 1093 u_int nfds; 1094 int timeout; 1095 }; 1096 #endif 1097 /* 1098 * MPSAFE 1099 */ 1100 int 1101 openbsd_poll(td, uap) 1102 register struct thread *td; 1103 register struct openbsd_poll_args *uap; 1104 { 1105 return (poll(td, (struct poll_args *)uap)); 1106 } 1107 1108 /* 1109 * Remove the references to the thread from all of the objects 1110 * we were polling. 1111 * 1112 * This code assumes that the underlying owner of the selinfo 1113 * structure will hold sellock before it changes it, and that 1114 * it will unlink itself from our list if it goes away. 1115 */ 1116 void 1117 clear_selinfo_list(td) 1118 struct thread *td; 1119 { 1120 struct selinfo *si; 1121 1122 mtx_assert(&sellock, MA_OWNED); 1123 TAILQ_FOREACH(si, &td->td_selq, si_thrlist) 1124 si->si_thread = NULL; 1125 TAILQ_INIT(&td->td_selq); 1126 } 1127 1128 /*ARGSUSED*/ 1129 int 1130 seltrue(dev, events, td) 1131 dev_t dev; 1132 int events; 1133 struct thread *td; 1134 { 1135 1136 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 1137 } 1138 1139 /* 1140 * Record a select request. 1141 */ 1142 void 1143 selrecord(selector, sip) 1144 struct thread *selector; 1145 struct selinfo *sip; 1146 { 1147 1148 mtx_lock(&sellock); 1149 /* 1150 * If the thread is NULL then take ownership of selinfo 1151 * however if the thread is not NULL and the thread points to 1152 * someone else, then we have a collision, otherwise leave it alone 1153 * as we've owned it in a previous selrecord on this selinfo. 1154 */ 1155 if (sip->si_thread == NULL) { 1156 sip->si_thread = selector; 1157 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); 1158 } else if (sip->si_thread != selector) { 1159 sip->si_flags |= SI_COLL; 1160 } 1161 1162 mtx_unlock(&sellock); 1163 } 1164 1165 /* 1166 * Do a wakeup when a selectable event occurs. 
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/*
		 * Several threads collided on this selinfo; we cannot tell
		 * which, so count the collision and wake them all.
		 */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	if (td == NULL) {
		/* No single owner to wake. */
		mtx_unlock(&sellock);
		return;
	}
	/* Unlink the selinfo from the owning thread and disown it. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == (caddr_t)&selwait) {
		/* Thread is waiting on selwait: make it runnable. */
		if (td->td_proc->p_stat == SSLEEP)
			setrunnable(td);
		else
			cv_waitq_remove(td);
	} else
		/* Not asleep yet: clearing TDF_SELECT forces a rescan. */
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * One-time initialization of the select machinery (selwait cv and
 * sellock mutex), run via SYSINIT at lock-setup time.
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}