1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD$ 40 */ 41 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/sysproto.h> 47 #include <sys/filedesc.h> 48 #include <sys/filio.h> 49 #include <sys/fcntl.h> 50 #include <sys/file.h> 51 #include <sys/proc.h> 52 #include <sys/signalvar.h> 53 #include <sys/socketvar.h> 54 #include <sys/uio.h> 55 #include <sys/kernel.h> 56 #include <sys/malloc.h> 57 #include <sys/poll.h> 58 #include <sys/resourcevar.h> 59 #include <sys/selinfo.h> 60 #include <sys/sysctl.h> 61 #include <sys/sysent.h> 62 #include <sys/bio.h> 63 #include <sys/buf.h> 64 #include <sys/condvar.h> 65 #ifdef KTRACE 66 #include <sys/ktrace.h> 67 #endif 68 #include <vm/vm.h> 69 #include <vm/vm_page.h> 70 71 #include <machine/limits.h> 72 73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77 static int pollscan __P((struct thread *, struct pollfd *, u_int)); 78 static int pollholddrop __P((struct thread *, struct pollfd *, u_int, int)); 79 static int selscan __P((struct thread *, fd_mask **, fd_mask **, int)); 80 static int selholddrop __P((struct thread *, fd_mask *, fd_mask *, int, int)); 81 static int dofileread __P((struct thread *, struct file *, int, void *, 82 size_t, off_t, int)); 83 static int dofilewrite __P((struct thread *, struct file *, int, 84 const void *, size_t, off_t, int)); 85 86 struct file* 87 holdfp(fdp, fd, flag) 88 struct filedesc* fdp; 89 int fd, flag; 90 { 91 struct file* fp; 92 93 if (((u_int)fd) >= fdp->fd_nfiles || 94 (fp = fdp->fd_ofiles[fd]) == NULL || 95 (fp->f_flag & flag) == 0) { 96 return (NULL); 97 } 98 fhold(fp); 99 return (fp); 100 } 101 102 /* 103 * Read system call. 104 */ 105 #ifndef _SYS_SYSPROTO_H_ 106 struct read_args { 107 int fd; 108 void *buf; 109 size_t nbyte; 110 }; 111 #endif 112 /* 113 * MPSAFE 114 */ 115 int 116 read(td, uap) 117 struct thread *td; 118 register struct read_args *uap; 119 { 120 register struct file *fp; 121 int error; 122 123 mtx_lock(&Giant); 124 if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD)) != NULL) { 125 error = dofileread(td, fp, uap->fd, uap->buf, 126 uap->nbyte, (off_t)-1, 0); 127 fdrop(fp, td); 128 } else { 129 error = EBADF; 130 } 131 mtx_unlock(&Giant); 132 return(error); 133 } 134 135 /* 136 * Pread system call 137 */ 138 #ifndef _SYS_SYSPROTO_H_ 139 struct pread_args { 140 int fd; 141 void *buf; 142 size_t nbyte; 143 int pad; 144 off_t offset; 145 }; 146 #endif 147 /* 148 * MPSAFE 149 */ 150 int 151 pread(td, uap) 152 struct thread *td; 153 register struct pread_args *uap; 154 { 155 register struct file *fp; 156 int error; 157 158 mtx_lock(&Giant); 159 if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD)) == NULL) { 160 error = EBADF; 161 } else if (fp->f_type != DTYPE_VNODE) { 162 error = ESPIPE; 163 fdrop(fp, td); 164 } else { 165 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 166 uap->offset, FOF_OFFSET); 167 fdrop(fp, td); 168 } 169 mtx_unlock(&Giant); 170 return(error); 171 } 172 173 /* 174 * Code common for read and pread 175 */ 176 int 177 dofileread(td, fp, fd, buf, nbyte, offset, flags) 178 struct thread *td; 179 struct file *fp; 180 int fd, flags; 181 void *buf; 182 size_t nbyte; 183 off_t offset; 184 { 185 struct uio auio; 186 struct iovec aiov; 187 long cnt, error = 0; 188 #ifdef KTRACE 189 struct iovec ktriov; 190 struct uio ktruio; 191 int didktr = 0; 192 #endif 193 194 aiov.iov_base = (caddr_t)buf; 195 aiov.iov_len = nbyte; 196 auio.uio_iov = &aiov; 197 auio.uio_iovcnt = 1; 198 auio.uio_offset = offset; 199 if (nbyte > INT_MAX) 200 return (EINVAL); 201 auio.uio_resid = nbyte; 202 auio.uio_rw = UIO_READ; 203 auio.uio_segflg = UIO_USERSPACE; 204 auio.uio_td = td; 205 #ifdef KTRACE 206 /* 207 * if tracing, save a copy of iovec 208 */ 209 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 210 ktriov = aiov; 211 ktruio = auio; 212 didktr = 1; 213 } 214 #endif 215 cnt = nbyte; 216 217 if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) { 218 if (auio.uio_resid != cnt && (error == ERESTART || 219 error == EINTR || error == EWOULDBLOCK)) 220 error = 0; 221 } 222 cnt -= auio.uio_resid; 223 #ifdef KTRACE 224 if (didktr && error == 0) { 225 ktruio.uio_iov = &ktriov; 226 ktruio.uio_resid = cnt; 227 ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error); 228 } 229 #endif 230 td->td_retval[0] = cnt; 231 return (error); 232 } 233 234 /* 235 * Scatter read system call. 236 */ 237 #ifndef _SYS_SYSPROTO_H_ 238 struct readv_args { 239 int fd; 240 struct iovec *iovp; 241 u_int iovcnt; 242 }; 243 #endif 244 /* 245 * MPSAFE 246 */ 247 int 248 readv(td, uap) 249 struct thread *td; 250 register struct readv_args *uap; 251 { 252 register struct file *fp; 253 register struct filedesc *fdp; 254 struct uio auio; 255 register struct iovec *iov; 256 struct iovec *needfree; 257 struct iovec aiov[UIO_SMALLIOV]; 258 long i, cnt, error = 0; 259 u_int iovlen; 260 #ifdef KTRACE 261 struct iovec *ktriov = NULL; 262 struct uio ktruio; 263 #endif 264 mtx_lock(&Giant); 265 fdp = td->td_proc->p_fd; 266 267 if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL) { 268 error = EBADF; 269 goto done2; 270 } 271 /* note: can't use iovlen until iovcnt is validated */ 272 iovlen = uap->iovcnt * sizeof (struct iovec); 273 if (uap->iovcnt > UIO_SMALLIOV) { 274 if (uap->iovcnt > UIO_MAXIOV) { 275 error = EINVAL; 276 goto done2; 277 } 278 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 279 needfree = iov; 280 } else { 281 iov = aiov; 282 needfree = NULL; 283 } 284 auio.uio_iov = iov; 285 auio.uio_iovcnt = uap->iovcnt; 286 auio.uio_rw = UIO_READ; 287 auio.uio_segflg = UIO_USERSPACE; 288 auio.uio_td = td; 289 auio.uio_offset = -1; 290 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 291 goto done; 292 auio.uio_resid = 0; 293 for (i = 0; i < uap->iovcnt; i++) { 294 if (iov->iov_len > INT_MAX - auio.uio_resid) { 295 error = EINVAL; 296 goto done; 297 } 298 auio.uio_resid += iov->iov_len; 299 iov++; 300 } 301 #ifdef KTRACE 302 /* 303 * if tracing, save a copy of iovec 304 */ 305 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 306 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 307 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 308 ktruio = auio; 309 } 310 #endif 311 cnt = auio.uio_resid; 312 if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) { 313 if (auio.uio_resid != cnt && (error == ERESTART || 314 error == EINTR || error == EWOULDBLOCK)) 315 error = 0; 316 } 317 cnt -= auio.uio_resid; 318 #ifdef KTRACE 319 if (ktriov != NULL) { 320 if (error == 0) { 321 ktruio.uio_iov = ktriov; 322 ktruio.uio_resid = cnt; 323 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio, 324 error); 325 } 326 FREE(ktriov, M_TEMP); 327 } 328 #endif 329 td->td_retval[0] = cnt; 330 done: 331 fdrop(fp, td); 332 if (needfree) 333 FREE(needfree, M_IOV); 334 done2: 335 mtx_unlock(&Giant); 336 return (error); 337 } 338 339 /* 340 * Write system call 341 */ 342 #ifndef _SYS_SYSPROTO_H_ 343 struct write_args { 344 int fd; 345 const void *buf; 346 size_t nbyte; 347 }; 348 #endif 349 /* 350 * MPSAFE 351 */ 352 int 353 write(td, uap) 354 struct thread *td; 355 register struct write_args *uap; 356 { 357 register struct file *fp; 358 int error; 359 360 mtx_lock(&Giant); 361 if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FWRITE)) != NULL) { 362 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 363 (off_t)-1, 0); 364 fdrop(fp, td); 365 } else { 366 error = EBADF; 367 } 368 mtx_unlock(&Giant); 369 return(error); 370 } 371 372 /* 373 * Pwrite system call 374 */ 375 #ifndef _SYS_SYSPROTO_H_ 376 struct pwrite_args { 377 int fd; 378 const void *buf; 379 size_t nbyte; 380 int pad; 381 off_t offset; 382 }; 383 #endif 384 /* 385 * MPSAFE 386 */ 387 int 388 pwrite(td, uap) 389 struct thread *td; 390 register struct pwrite_args *uap; 391 { 392 register struct file *fp; 393 int error; 394 395 mtx_lock(&Giant); 396 if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FWRITE)) == NULL) { 397 error = EBADF; 398 } else if (fp->f_type != DTYPE_VNODE) { 399 error = ESPIPE; 400 fdrop(fp, td); 401 } else { 402 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 403 uap->offset, FOF_OFFSET); 404 fdrop(fp, td); 405 } 406 mtx_unlock(&Giant); 407 return(error); 408 } 409 410 static int 411 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 412 struct thread *td; 413 struct file *fp; 414 int fd, flags; 415 const void *buf; 416 size_t nbyte; 417 off_t offset; 418 { 419 struct uio auio; 420 struct iovec aiov; 421 long cnt, error = 0; 422 #ifdef KTRACE 423 struct iovec ktriov; 424 struct uio ktruio; 425 int didktr = 0; 426 #endif 427 428 aiov.iov_base = (void *)(uintptr_t)buf; 429 aiov.iov_len = nbyte; 430 auio.uio_iov = &aiov; 431 auio.uio_iovcnt = 1; 432 auio.uio_offset = offset; 433 if (nbyte > INT_MAX) 434 return (EINVAL); 435 auio.uio_resid = nbyte; 436 auio.uio_rw = UIO_WRITE; 437 auio.uio_segflg = UIO_USERSPACE; 438 auio.uio_td = td; 439 #ifdef KTRACE 440 /* 441 * if tracing, save a copy of iovec and uio 442 */ 443 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 444 ktriov = aiov; 445 ktruio = auio; 446 didktr = 1; 447 } 448 #endif 449 cnt = nbyte; 450 if (fp->f_type == DTYPE_VNODE) 451 bwillwrite(); 452 if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) { 453 if (auio.uio_resid != cnt && (error == ERESTART || 454 error == EINTR || error == EWOULDBLOCK)) 455 error = 0; 456 if (error == EPIPE) { 457 PROC_LOCK(td->td_proc); 458 psignal(td->td_proc, SIGPIPE); 459 PROC_UNLOCK(td->td_proc); 460 } 461 } 462 cnt -= auio.uio_resid; 463 #ifdef KTRACE 464 if (didktr && error == 0) { 465 ktruio.uio_iov = &ktriov; 466 ktruio.uio_resid = cnt; 467 ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error); 468 } 469 #endif 470 td->td_retval[0] = cnt; 471 return (error); 472 } 473 474 /* 475 * Gather write system call 476 */ 477 #ifndef _SYS_SYSPROTO_H_ 478 struct writev_args { 479 int fd; 480 struct iovec *iovp; 481 u_int iovcnt; 482 }; 483 #endif 484 /* 485 * MPSAFE 486 */ 487 int 488 writev(td, uap) 489 struct thread *td; 490 register struct writev_args *uap; 491 { 492 register struct file *fp; 493 register struct filedesc *fdp; 494 struct uio auio; 495 register struct iovec *iov; 496 struct iovec *needfree; 497 struct iovec aiov[UIO_SMALLIOV]; 498 long i, cnt, error = 0; 499 u_int iovlen; 500 #ifdef KTRACE 501 struct iovec *ktriov = NULL; 502 struct uio ktruio; 503 #endif 504 505 mtx_lock(&Giant); 506 fdp = td->td_proc->p_fd; 507 if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL) { 508 error = EBADF; 509 goto done2; 510 } 511 /* note: can't use iovlen until iovcnt is validated */ 512 iovlen = uap->iovcnt * sizeof (struct iovec); 513 if (uap->iovcnt > UIO_SMALLIOV) { 514 if (uap->iovcnt > UIO_MAXIOV) { 515 needfree = NULL; 516 error = EINVAL; 517 goto done; 518 } 519 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 520 needfree = iov; 521 } else { 522 iov = aiov; 523 needfree = NULL; 524 } 525 auio.uio_iov = iov; 526 auio.uio_iovcnt = uap->iovcnt; 527 auio.uio_rw = UIO_WRITE; 528 auio.uio_segflg = UIO_USERSPACE; 529 auio.uio_td = td; 530 auio.uio_offset = -1; 531 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 532 goto done; 533 auio.uio_resid = 0; 534 for (i = 0; i < uap->iovcnt; i++) { 535 if (iov->iov_len > INT_MAX - auio.uio_resid) { 536 error = EINVAL; 537 goto done; 538 } 539 auio.uio_resid += iov->iov_len; 540 iov++; 541 } 542 #ifdef KTRACE 543 /* 544 * if tracing, save a copy of iovec and uio 545 */ 546 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 547 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 548 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 549 ktruio = auio; 550 } 551 #endif 552 cnt = auio.uio_resid; 553 if (fp->f_type == DTYPE_VNODE) 554 bwillwrite(); 555 if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) { 556 if (auio.uio_resid != cnt && (error == ERESTART || 557 error == EINTR || error == EWOULDBLOCK)) 558 error = 0; 559 if (error == EPIPE) { 560 PROC_LOCK(td->td_proc); 561 psignal(td->td_proc, SIGPIPE); 562 PROC_UNLOCK(td->td_proc); 563 } 564 } 565 cnt -= auio.uio_resid; 566 #ifdef KTRACE 567 if (ktriov != NULL) { 568 if (error == 0) { 569 ktruio.uio_iov = ktriov; 570 ktruio.uio_resid = cnt; 571 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio, 572 error); 573 } 574 FREE(ktriov, M_TEMP); 575 } 576 #endif 577 td->td_retval[0] = cnt; 578 done: 579 fdrop(fp, td); 580 if (needfree) 581 FREE(needfree, M_IOV); 582 done2: 583 mtx_unlock(&Giant); 584 return (error); 585 } 586 587 /* 588 * Ioctl system call 589 */ 590 #ifndef _SYS_SYSPROTO_H_ 591 struct ioctl_args { 592 int fd; 593 u_long com; 594 caddr_t data; 595 }; 596 #endif 597 /* 598 * MPSAFE 599 */ 600 /* ARGSUSED */ 601 int 602 ioctl(td, uap) 603 struct thread *td; 604 register struct ioctl_args *uap; 605 { 606 register struct file *fp; 607 register struct filedesc *fdp; 608 register u_long com; 609 int error = 0; 610 register u_int size; 611 caddr_t data, memp; 612 int tmp; 613 #define STK_PARAMS 128 614 union { 615 char stkbuf[STK_PARAMS]; 616 long align; 617 } ubuf; 618 619 mtx_lock(&Giant); 620 fdp = td->td_proc->p_fd; 621 if ((u_int)uap->fd >= fdp->fd_nfiles || 622 (fp = fdp->fd_ofiles[uap->fd]) == NULL) { 623 error = EBADF; 624 goto done2; 625 } 626 627 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 628 error = EBADF; 629 goto done2; 630 } 631 632 switch (com = uap->com) { 633 case FIONCLEX: 634 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 635 goto done2; 636 case FIOCLEX: 637 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 638 goto done2; 639 } 640 641 /* 642 * Interpret high order word to find amount of data to be 643 * copied to/from the user's address space. 644 */ 645 size = IOCPARM_LEN(com); 646 if (size > IOCPARM_MAX) { 647 error = ENOTTY; 648 goto done2; 649 } 650 651 fhold(fp); 652 653 memp = NULL; 654 if (size > sizeof (ubuf.stkbuf)) { 655 memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 656 data = memp; 657 } else { 658 data = ubuf.stkbuf; 659 } 660 if (com&IOC_IN) { 661 if (size) { 662 error = copyin(uap->data, data, (u_int)size); 663 if (error) { 664 if (memp) 665 free(memp, M_IOCTLOPS); 666 fdrop(fp, td); 667 goto done2; 668 } 669 } else { 670 *(caddr_t *)data = uap->data; 671 } 672 } else if ((com&IOC_OUT) && size) { 673 /* 674 * Zero the buffer so the user always 675 * gets back something deterministic. 676 */ 677 bzero(data, size); 678 } else if (com&IOC_VOID) { 679 *(caddr_t *)data = uap->data; 680 } 681 682 switch (com) { 683 684 case FIONBIO: 685 if ((tmp = *(int *)data)) 686 fp->f_flag |= FNONBLOCK; 687 else 688 fp->f_flag &= ~FNONBLOCK; 689 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); 690 break; 691 692 case FIOASYNC: 693 if ((tmp = *(int *)data)) 694 fp->f_flag |= FASYNC; 695 else 696 fp->f_flag &= ~FASYNC; 697 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); 698 break; 699 700 default: 701 error = fo_ioctl(fp, com, data, td); 702 /* 703 * Copy any data to user, size was 704 * already set and checked above. 705 */ 706 if (error == 0 && (com&IOC_OUT) && size) 707 error = copyout(data, uap->data, (u_int)size); 708 break; 709 } 710 if (memp) 711 free(memp, M_IOCTLOPS); 712 fdrop(fp, td); 713 done2: 714 mtx_unlock(&Giant); 715 return (error); 716 } 717 718 static int nselcoll; /* Select collisions since boot */ 719 struct cv selwait; 720 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 721 722 /* 723 * Select system call. 724 */ 725 #ifndef _SYS_SYSPROTO_H_ 726 struct select_args { 727 int nd; 728 fd_set *in, *ou, *ex; 729 struct timeval *tv; 730 }; 731 #endif 732 /* 733 * MPSAFE 734 */ 735 int 736 select(td, uap) 737 register struct thread *td; 738 register struct select_args *uap; 739 { 740 /* 741 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 742 * infds with the new FD_SETSIZE of 1024, and more than enough for 743 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 744 * of 256. 745 */ 746 fd_mask s_selbits[howmany(2048, NFDBITS)]; 747 fd_mask s_heldbits[howmany(2048, NFDBITS)]; 748 fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits; 749 struct timeval atv, rtv, ttv; 750 int ncoll, error, timo, i; 751 u_int nbufbytes, ncpbytes, nfdbits; 752 753 if (uap->nd < 0) 754 return (EINVAL); 755 756 mtx_lock(&Giant); 757 758 if (uap->nd > td->td_proc->p_fd->fd_nfiles) 759 uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 760 761 /* 762 * Allocate just enough bits for the non-null fd_sets. Use the 763 * preallocated auto buffer if possible. 764 */ 765 nfdbits = roundup(uap->nd, NFDBITS); 766 ncpbytes = nfdbits / NBBY; 767 nbufbytes = 0; 768 if (uap->in != NULL) 769 nbufbytes += 2 * ncpbytes; 770 if (uap->ou != NULL) 771 nbufbytes += 2 * ncpbytes; 772 if (uap->ex != NULL) 773 nbufbytes += 2 * ncpbytes; 774 if (nbufbytes <= sizeof s_selbits) 775 selbits = &s_selbits[0]; 776 else 777 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 778 if (2 * ncpbytes <= sizeof s_heldbits) { 779 bzero(s_heldbits, sizeof(s_heldbits)); 780 heldbits = &s_heldbits[0]; 781 } else 782 heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO); 783 784 /* 785 * Assign pointers into the bit buffers and fetch the input bits. 786 * Put the output buffers together so that they can be bzeroed 787 * together. 788 */ 789 sbp = selbits; 790 hibits = heldbits + ncpbytes / sizeof *heldbits; 791 hobits = heldbits; 792 #define getbits(name, x) \ 793 do { \ 794 if (uap->name == NULL) \ 795 ibits[x] = NULL; \ 796 else { \ 797 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 798 obits[x] = sbp; \ 799 sbp += ncpbytes / sizeof *sbp; \ 800 error = copyin(uap->name, ibits[x], ncpbytes); \ 801 if (error != 0) \ 802 goto done_noproclock; \ 803 for (i = 0; \ 804 i < ncpbytes / sizeof ibits[i][0]; \ 805 i++) \ 806 hibits[i] |= ibits[x][i]; \ 807 } \ 808 } while (0) 809 getbits(in, 0); 810 getbits(ou, 1); 811 getbits(ex, 2); 812 #undef getbits 813 if (nbufbytes != 0) 814 bzero(selbits, nbufbytes / 2); 815 816 if (uap->tv) { 817 error = copyin((caddr_t)uap->tv, (caddr_t)&atv, 818 sizeof (atv)); 819 if (error) 820 goto done_noproclock; 821 if (itimerfix(&atv)) { 822 error = EINVAL; 823 goto done_noproclock; 824 } 825 getmicrouptime(&rtv); 826 timevaladd(&atv, &rtv); 827 } else { 828 atv.tv_sec = 0; 829 atv.tv_usec = 0; 830 } 831 selholddrop(td, hibits, hobits, uap->nd, 1); 832 timo = 0; 833 PROC_LOCK(td->td_proc); 834 retry: 835 ncoll = nselcoll; 836 mtx_lock_spin(&sched_lock); 837 td->td_flags |= TDF_SELECT; 838 mtx_unlock_spin(&sched_lock); 839 PROC_UNLOCK(td->td_proc); 840 error = selscan(td, ibits, obits, uap->nd); 841 PROC_LOCK(td->td_proc); 842 if (error || td->td_retval[0]) 843 goto done; 844 if (atv.tv_sec || atv.tv_usec) { 845 getmicrouptime(&rtv); 846 if (timevalcmp(&rtv, &atv, >=)) { 847 /* 848 * An event of our interest may occur during locking a process. 849 * In order to avoid missing the event that occured during locking 850 * the process, test TDF_SELECT and rescan file descriptors if 851 * necessary. 852 */ 853 mtx_lock_spin(&sched_lock); 854 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 855 ncoll = nselcoll; 856 td->td_flags |= TDF_SELECT; 857 mtx_unlock_spin(&sched_lock); 858 PROC_UNLOCK(td->td_proc); 859 error = selscan(td, ibits, obits, uap->nd); 860 PROC_LOCK(td->td_proc); 861 } else 862 mtx_unlock_spin(&sched_lock); 863 goto done; 864 } 865 ttv = atv; 866 timevalsub(&ttv, &rtv); 867 timo = ttv.tv_sec > 24 * 60 * 60 ? 868 24 * 60 * 60 * hz : tvtohz(&ttv); 869 } 870 mtx_lock_spin(&sched_lock); 871 td->td_flags &= ~TDF_SELECT; 872 mtx_unlock_spin(&sched_lock); 873 874 if (timo > 0) 875 error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo); 876 else 877 error = cv_wait_sig(&selwait, &td->td_proc->p_mtx); 878 879 if (error == 0) 880 goto retry; 881 882 done: 883 mtx_lock_spin(&sched_lock); 884 td->td_flags &= ~TDF_SELECT; 885 mtx_unlock_spin(&sched_lock); 886 PROC_UNLOCK(td->td_proc); 887 selholddrop(td, hibits, hobits, uap->nd, 0); 888 done_noproclock: 889 /* select is not restarted after signals... */ 890 if (error == ERESTART) 891 error = EINTR; 892 if (error == EWOULDBLOCK) 893 error = 0; 894 #define putbits(name, x) \ 895 if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \ 896 error = error2; 897 if (error == 0) { 898 int error2; 899 900 putbits(in, 0); 901 putbits(ou, 1); 902 putbits(ex, 2); 903 #undef putbits 904 } 905 if (selbits != &s_selbits[0]) 906 free(selbits, M_SELECT); 907 if (heldbits != &s_heldbits[0]) 908 free(heldbits, M_SELECT); 909 910 mtx_unlock(&Giant); 911 return (error); 912 } 913 914 static int 915 selholddrop(td, ibits, obits, nfd, hold) 916 struct thread *td; 917 fd_mask *ibits, *obits; 918 int nfd, hold; 919 { 920 struct filedesc *fdp = td->td_proc->p_fd; 921 int i, fd; 922 fd_mask bits; 923 struct file *fp; 924 925 for (i = 0; i < nfd; i += NFDBITS) { 926 if (hold) 927 bits = ibits[i/NFDBITS]; 928 else 929 bits = obits[i/NFDBITS]; 930 /* ffs(int mask) not portable, fd_mask is long */ 931 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 932 if (!(bits & 1)) 933 continue; 934 fp = fdp->fd_ofiles[fd]; 935 if (fp == NULL) 936 return (EBADF); 937 if (hold) { 938 fhold(fp); 939 obits[(fd)/NFDBITS] |= 940 ((fd_mask)1 << ((fd) % NFDBITS)); 941 } else 942 fdrop(fp, td); 943 } 944 } 945 return (0); 946 } 947 948 static int 949 selscan(td, ibits, obits, nfd) 950 struct thread *td; 951 fd_mask **ibits, **obits; 952 int nfd; 953 { 954 struct filedesc *fdp = td->td_proc->p_fd; 955 int msk, i, fd; 956 fd_mask bits; 957 struct file *fp; 958 int n = 0; 959 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ 960 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; 961 962 for (msk = 0; msk < 3; msk++) { 963 if (ibits[msk] == NULL) 964 continue; 965 for (i = 0; i < nfd; i += NFDBITS) { 966 bits = ibits[msk][i/NFDBITS]; 967 /* ffs(int mask) not portable, fd_mask is long */ 968 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 969 if (!(bits & 1)) 970 continue; 971 fp = fdp->fd_ofiles[fd]; 972 if (fp == NULL) 973 return (EBADF); 974 if (fo_poll(fp, flag[msk], fp->f_cred, td)) { 975 obits[msk][(fd)/NFDBITS] |= 976 ((fd_mask)1 << ((fd) % NFDBITS)); 977 n++; 978 } 979 } 980 } 981 } 982 td->td_retval[0] = n; 983 return (0); 984 } 985 986 /* 987 * Poll system call. 988 */ 989 #ifndef _SYS_SYSPROTO_H_ 990 struct poll_args { 991 struct pollfd *fds; 992 u_int nfds; 993 int timeout; 994 }; 995 #endif 996 /* 997 * MPSAFE 998 */ 999 int 1000 poll(td, uap) 1001 struct thread *td; 1002 struct poll_args *uap; 1003 { 1004 caddr_t bits; 1005 char smallbits[32 * sizeof(struct pollfd)]; 1006 struct timeval atv, rtv, ttv; 1007 int ncoll, error = 0, timo; 1008 u_int nfds; 1009 size_t ni; 1010 struct pollfd p_heldbits[32]; 1011 struct pollfd *heldbits; 1012 1013 nfds = SCARG(uap, nfds); 1014 1015 mtx_lock(&Giant); 1016 /* 1017 * This is kinda bogus. We have fd limits, but that is not 1018 * really related to the size of the pollfd array. Make sure 1019 * we let the process use at least FD_SETSIZE entries and at 1020 * least enough for the current limits. We want to be reasonably 1021 * safe, but not overly restrictive. 1022 */ 1023 if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) && 1024 (nfds > FD_SETSIZE)) { 1025 error = EINVAL; 1026 goto done2; 1027 } 1028 ni = nfds * sizeof(struct pollfd); 1029 if (ni > sizeof(smallbits)) 1030 bits = malloc(ni, M_TEMP, M_WAITOK); 1031 else 1032 bits = smallbits; 1033 if (ni > sizeof(p_heldbits)) 1034 heldbits = malloc(ni, M_TEMP, M_WAITOK); 1035 else { 1036 bzero(p_heldbits, sizeof(p_heldbits)); 1037 heldbits = p_heldbits; 1038 } 1039 error = copyin(SCARG(uap, fds), bits, ni); 1040 if (error) 1041 goto done_noproclock; 1042 bcopy(bits, heldbits, ni); 1043 if (SCARG(uap, timeout) != INFTIM) { 1044 atv.tv_sec = SCARG(uap, timeout) / 1000; 1045 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000; 1046 if (itimerfix(&atv)) { 1047 error = EINVAL; 1048 goto done_noproclock; 1049 } 1050 getmicrouptime(&rtv); 1051 timevaladd(&atv, &rtv); 1052 } else { 1053 atv.tv_sec = 0; 1054 atv.tv_usec = 0; 1055 } 1056 pollholddrop(td, heldbits, nfds, 1); 1057 timo = 0; 1058 PROC_LOCK(td->td_proc); 1059 retry: 1060 ncoll = nselcoll; 1061 mtx_lock_spin(&sched_lock); 1062 td->td_flags |= TDF_SELECT; 1063 mtx_unlock_spin(&sched_lock); 1064 PROC_UNLOCK(td->td_proc); 1065 error = pollscan(td, (struct pollfd *)bits, nfds); 1066 PROC_LOCK(td->td_proc); 1067 if (error || td->td_retval[0]) 1068 goto done; 1069 if (atv.tv_sec || atv.tv_usec) { 1070 getmicrouptime(&rtv); 1071 if (timevalcmp(&rtv, &atv, >=)) { 1072 /* 1073 * An event of our interest may occur during locking a process. 1074 * In order to avoid missing the event that occured during locking 1075 * the process, test TDF_SELECT and rescan file descriptors if 1076 * necessary. 1077 */ 1078 mtx_lock_spin(&sched_lock); 1079 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 1080 ncoll = nselcoll; 1081 td->td_flags |= TDF_SELECT; 1082 mtx_unlock_spin(&sched_lock); 1083 PROC_UNLOCK(td->td_proc); 1084 error = pollscan(td, (struct pollfd *)bits, nfds); 1085 PROC_LOCK(td->td_proc); 1086 } else 1087 mtx_unlock_spin(&sched_lock); 1088 goto done; 1089 } 1090 ttv = atv; 1091 timevalsub(&ttv, &rtv); 1092 timo = ttv.tv_sec > 24 * 60 * 60 ? 1093 24 * 60 * 60 * hz : tvtohz(&ttv); 1094 } 1095 mtx_lock_spin(&sched_lock); 1096 td->td_flags &= ~TDF_SELECT; 1097 mtx_unlock_spin(&sched_lock); 1098 if (timo > 0) 1099 error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo); 1100 else 1101 error = cv_wait_sig(&selwait, &td->td_proc->p_mtx); 1102 if (error == 0) 1103 goto retry; 1104 1105 done: 1106 mtx_lock_spin(&sched_lock); 1107 td->td_flags &= ~TDF_SELECT; 1108 mtx_unlock_spin(&sched_lock); 1109 PROC_UNLOCK(td->td_proc); 1110 pollholddrop(td, heldbits, nfds, 0); 1111 done_noproclock: 1112 /* poll is not restarted after signals... */ 1113 if (error == ERESTART) 1114 error = EINTR; 1115 if (error == EWOULDBLOCK) 1116 error = 0; 1117 if (error == 0) { 1118 error = copyout(bits, SCARG(uap, fds), ni); 1119 if (error) 1120 goto out; 1121 } 1122 out: 1123 if (ni > sizeof(smallbits)) 1124 free(bits, M_TEMP); 1125 if (ni > sizeof(p_heldbits)) 1126 free(heldbits, M_TEMP); 1127 done2: 1128 mtx_unlock(&Giant); 1129 return (error); 1130 } 1131 1132 static int 1133 pollholddrop(td, fds, nfd, hold) 1134 struct thread *td; 1135 struct pollfd *fds; 1136 u_int nfd; 1137 int hold; 1138 { 1139 register struct filedesc *fdp = td->td_proc->p_fd; 1140 int i; 1141 struct file *fp; 1142 1143 for (i = 0; i < nfd; i++, fds++) { 1144 if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) { 1145 fp = fdp->fd_ofiles[fds->fd]; 1146 if (hold) { 1147 if (fp != NULL) { 1148 fhold(fp); 1149 fds->revents = 1; 1150 } else 1151 fds->revents = 0; 1152 } else if(fp != NULL && fds->revents) 1153 fdrop(fp, td); 1154 } 1155 } 1156 return (0); 1157 } 1158 1159 static int 1160 pollscan(td, fds, nfd) 1161 struct thread *td; 1162 struct pollfd *fds; 1163 u_int nfd; 1164 { 1165 register struct filedesc *fdp = td->td_proc->p_fd; 1166 int i; 1167 struct file *fp; 1168 int n = 0; 1169 1170 for (i = 0; i < nfd; i++, fds++) { 1171 if (fds->fd >= fdp->fd_nfiles) { 1172 fds->revents = POLLNVAL; 1173 n++; 1174 } else if (fds->fd < 0) { 1175 fds->revents = 0; 1176 } else { 1177 fp = fdp->fd_ofiles[fds->fd]; 1178 if (fp == NULL) { 1179 fds->revents = POLLNVAL; 1180 n++; 1181 } else { 1182 /* 1183 * Note: backend also returns POLLHUP and 1184 * POLLERR if appropriate. 1185 */ 1186 fds->revents = fo_poll(fp, fds->events, 1187 fp->f_cred, td); 1188 if (fds->revents != 0) 1189 n++; 1190 } 1191 } 1192 } 1193 td->td_retval[0] = n; 1194 return (0); 1195 } 1196 1197 /* 1198 * OpenBSD poll system call. 1199 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 1200 */ 1201 #ifndef _SYS_SYSPROTO_H_ 1202 struct openbsd_poll_args { 1203 struct pollfd *fds; 1204 u_int nfds; 1205 int timeout; 1206 }; 1207 #endif 1208 /* 1209 * MPSAFE 1210 */ 1211 int 1212 openbsd_poll(td, uap) 1213 register struct thread *td; 1214 register struct openbsd_poll_args *uap; 1215 { 1216 return (poll(td, (struct poll_args *)uap)); 1217 } 1218 1219 /*ARGSUSED*/ 1220 int 1221 seltrue(dev, events, td) 1222 dev_t dev; 1223 int events; 1224 struct thread *td; 1225 { 1226 1227 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 1228 } 1229 1230 static int 1231 find_thread_in_proc(struct proc *p, struct thread *td) 1232 { 1233 struct thread *td2; 1234 FOREACH_THREAD_IN_PROC(p, td2) { 1235 if (td2 == td) { 1236 return (1); 1237 } 1238 } 1239 return (0); 1240 } 1241 1242 /* 1243 * Record a select request. 1244 */ 1245 void 1246 selrecord(selector, sip) 1247 struct thread *selector; 1248 struct selinfo *sip; 1249 { 1250 struct proc *p; 1251 pid_t mypid; 1252 1253 mypid = selector->td_proc->p_pid; 1254 if ((sip->si_pid == mypid) && 1255 (sip->si_thread == selector)) { /* XXXKSE should be an ID? */ 1256 return; 1257 } 1258 if (sip->si_pid && 1259 (p = pfind(sip->si_pid)) && 1260 (find_thread_in_proc(p, sip->si_thread))) { 1261 mtx_lock_spin(&sched_lock); 1262 if (sip->si_thread->td_wchan == (caddr_t)&selwait) { 1263 mtx_unlock_spin(&sched_lock); 1264 PROC_UNLOCK(p); 1265 sip->si_flags |= SI_COLL; 1266 return; 1267 } 1268 mtx_unlock_spin(&sched_lock); 1269 PROC_UNLOCK(p); 1270 } 1271 sip->si_pid = mypid; 1272 sip->si_thread = selector; 1273 } 1274 1275 /* 1276 * Do a wakeup when a selectable event occurs. 1277 */ 1278 void 1279 selwakeup(sip) 1280 register struct selinfo *sip; 1281 { 1282 struct thread *td; 1283 register struct proc *p; 1284 1285 if (sip->si_pid == 0) 1286 return; 1287 if (sip->si_flags & SI_COLL) { 1288 nselcoll++; 1289 sip->si_flags &= ~SI_COLL; 1290 cv_broadcast(&selwait); 1291 } 1292 p = pfind(sip->si_pid); 1293 sip->si_pid = 0; 1294 td = sip->si_thread; 1295 if (p != NULL) { 1296 if (!find_thread_in_proc(p, td)) { 1297 PROC_UNLOCK(p); /* lock is in pfind() */; 1298 return; 1299 } 1300 mtx_lock_spin(&sched_lock); 1301 if (td->td_wchan == (caddr_t)&selwait) { 1302 if (td->td_proc->p_stat == SSLEEP) 1303 setrunnable(td); 1304 else 1305 cv_waitq_remove(td); 1306 } else 1307 td->td_flags &= ~TDF_SELECT; 1308 mtx_unlock_spin(&sched_lock); 1309 PROC_UNLOCK(p); /* Lock is in pfind() */ 1310 } 1311 } 1312 1313 static void selectinit __P((void *)); 1314 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) 1315 1316 /* ARGSUSED*/ 1317 static void 1318 selectinit(dummy) 1319 void *dummy; 1320 { 1321 cv_init(&selwait, "select"); 1322 } 1323