1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD$ 40 */ 41 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/sysproto.h> 47 #include <sys/filedesc.h> 48 #include <sys/filio.h> 49 #include <sys/fcntl.h> 50 #include <sys/file.h> 51 #include <sys/proc.h> 52 #include <sys/signalvar.h> 53 #include <sys/socketvar.h> 54 #include <sys/uio.h> 55 #include <sys/kernel.h> 56 #include <sys/malloc.h> 57 #include <sys/poll.h> 58 #include <sys/resourcevar.h> 59 #include <sys/selinfo.h> 60 #include <sys/sysctl.h> 61 #include <sys/sysent.h> 62 #include <sys/bio.h> 63 #include <sys/buf.h> 64 #include <sys/condvar.h> 65 #ifdef KTRACE 66 #include <sys/ktrace.h> 67 #endif 68 #include <vm/vm.h> 69 #include <vm/vm_page.h> 70 71 #include <machine/limits.h> 72 73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77 static int pollscan __P((struct thread *, struct pollfd *, u_int)); 78 static int selscan __P((struct thread *, fd_mask **, fd_mask **, int)); 79 static int dofileread __P((struct thread *, struct file *, int, void *, 80 size_t, off_t, int)); 81 static int dofilewrite __P((struct thread *, struct file *, int, 82 const void *, size_t, off_t, int)); 83 84 /* 85 * Read system call. 
86 */ 87 #ifndef _SYS_SYSPROTO_H_ 88 struct read_args { 89 int fd; 90 void *buf; 91 size_t nbyte; 92 }; 93 #endif 94 /* 95 * MPSAFE 96 */ 97 int 98 read(td, uap) 99 struct thread *td; 100 struct read_args *uap; 101 { 102 struct file *fp; 103 int error; 104 105 mtx_lock(&Giant); 106 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 107 error = dofileread(td, fp, uap->fd, uap->buf, 108 uap->nbyte, (off_t)-1, 0); 109 fdrop(fp, td); 110 } 111 mtx_unlock(&Giant); 112 return(error); 113 } 114 115 /* 116 * Pread system call 117 */ 118 #ifndef _SYS_SYSPROTO_H_ 119 struct pread_args { 120 int fd; 121 void *buf; 122 size_t nbyte; 123 int pad; 124 off_t offset; 125 }; 126 #endif 127 /* 128 * MPSAFE 129 */ 130 int 131 pread(td, uap) 132 struct thread *td; 133 struct pread_args *uap; 134 { 135 struct file *fp; 136 int error; 137 138 if ((error = fget_read(td, uap->fd, &fp)) != 0) 139 return (error); 140 mtx_lock(&Giant); 141 if (fp->f_type != DTYPE_VNODE) { 142 error = ESPIPE; 143 } else { 144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 145 uap->offset, FOF_OFFSET); 146 } 147 fdrop(fp, td); 148 mtx_unlock(&Giant); 149 return(error); 150 } 151 152 /* 153 * Code common for read and pread 154 */ 155 int 156 dofileread(td, fp, fd, buf, nbyte, offset, flags) 157 struct thread *td; 158 struct file *fp; 159 int fd, flags; 160 void *buf; 161 size_t nbyte; 162 off_t offset; 163 { 164 struct uio auio; 165 struct iovec aiov; 166 long cnt, error = 0; 167 #ifdef KTRACE 168 struct iovec ktriov; 169 struct uio ktruio; 170 int didktr = 0; 171 #endif 172 173 aiov.iov_base = (caddr_t)buf; 174 aiov.iov_len = nbyte; 175 auio.uio_iov = &aiov; 176 auio.uio_iovcnt = 1; 177 auio.uio_offset = offset; 178 if (nbyte > INT_MAX) 179 return (EINVAL); 180 auio.uio_resid = nbyte; 181 auio.uio_rw = UIO_READ; 182 auio.uio_segflg = UIO_USERSPACE; 183 auio.uio_td = td; 184 #ifdef KTRACE 185 /* 186 * if tracing, save a copy of iovec 187 */ 188 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 189 ktriov = 
aiov; 190 ktruio = auio; 191 didktr = 1; 192 } 193 #endif 194 cnt = nbyte; 195 196 if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) { 197 if (auio.uio_resid != cnt && (error == ERESTART || 198 error == EINTR || error == EWOULDBLOCK)) 199 error = 0; 200 } 201 cnt -= auio.uio_resid; 202 #ifdef KTRACE 203 if (didktr && error == 0) { 204 ktruio.uio_iov = &ktriov; 205 ktruio.uio_resid = cnt; 206 ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error); 207 } 208 #endif 209 td->td_retval[0] = cnt; 210 return (error); 211 } 212 213 /* 214 * Scatter read system call. 215 */ 216 #ifndef _SYS_SYSPROTO_H_ 217 struct readv_args { 218 int fd; 219 struct iovec *iovp; 220 u_int iovcnt; 221 }; 222 #endif 223 /* 224 * MPSAFE 225 */ 226 int 227 readv(td, uap) 228 struct thread *td; 229 struct readv_args *uap; 230 { 231 struct file *fp; 232 struct uio auio; 233 struct iovec *iov; 234 struct iovec *needfree; 235 struct iovec aiov[UIO_SMALLIOV]; 236 long i, cnt, error = 0; 237 u_int iovlen; 238 #ifdef KTRACE 239 struct iovec *ktriov = NULL; 240 struct uio ktruio; 241 #endif 242 mtx_lock(&Giant); 243 244 if ((error = fget_read(td, uap->fd, &fp)) != 0) 245 goto done2; 246 /* note: can't use iovlen until iovcnt is validated */ 247 iovlen = uap->iovcnt * sizeof (struct iovec); 248 if (uap->iovcnt > UIO_SMALLIOV) { 249 if (uap->iovcnt > UIO_MAXIOV) { 250 error = EINVAL; 251 goto done2; 252 } 253 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 254 needfree = iov; 255 } else { 256 iov = aiov; 257 needfree = NULL; 258 } 259 auio.uio_iov = iov; 260 auio.uio_iovcnt = uap->iovcnt; 261 auio.uio_rw = UIO_READ; 262 auio.uio_segflg = UIO_USERSPACE; 263 auio.uio_td = td; 264 auio.uio_offset = -1; 265 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 266 goto done; 267 auio.uio_resid = 0; 268 for (i = 0; i < uap->iovcnt; i++) { 269 if (iov->iov_len > INT_MAX - auio.uio_resid) { 270 error = EINVAL; 271 goto done; 272 } 273 auio.uio_resid += iov->iov_len; 274 iov++; 275 } 
276 #ifdef KTRACE 277 /* 278 * if tracing, save a copy of iovec 279 */ 280 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 281 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 282 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 283 ktruio = auio; 284 } 285 #endif 286 cnt = auio.uio_resid; 287 if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) { 288 if (auio.uio_resid != cnt && (error == ERESTART || 289 error == EINTR || error == EWOULDBLOCK)) 290 error = 0; 291 } 292 cnt -= auio.uio_resid; 293 #ifdef KTRACE 294 if (ktriov != NULL) { 295 if (error == 0) { 296 ktruio.uio_iov = ktriov; 297 ktruio.uio_resid = cnt; 298 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio, 299 error); 300 } 301 FREE(ktriov, M_TEMP); 302 } 303 #endif 304 td->td_retval[0] = cnt; 305 done: 306 fdrop(fp, td); 307 if (needfree) 308 FREE(needfree, M_IOV); 309 done2: 310 mtx_unlock(&Giant); 311 return (error); 312 } 313 314 /* 315 * Write system call 316 */ 317 #ifndef _SYS_SYSPROTO_H_ 318 struct write_args { 319 int fd; 320 const void *buf; 321 size_t nbyte; 322 }; 323 #endif 324 /* 325 * MPSAFE 326 */ 327 int 328 write(td, uap) 329 struct thread *td; 330 struct write_args *uap; 331 { 332 struct file *fp; 333 int error; 334 335 mtx_lock(&Giant); 336 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 337 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 338 (off_t)-1, 0); 339 fdrop(fp, td); 340 } else { 341 error = EBADF; /* XXX this can't be right */ 342 } 343 mtx_unlock(&Giant); 344 return(error); 345 } 346 347 /* 348 * Pwrite system call 349 */ 350 #ifndef _SYS_SYSPROTO_H_ 351 struct pwrite_args { 352 int fd; 353 const void *buf; 354 size_t nbyte; 355 int pad; 356 off_t offset; 357 }; 358 #endif 359 /* 360 * MPSAFE 361 */ 362 int 363 pwrite(td, uap) 364 struct thread *td; 365 struct pwrite_args *uap; 366 { 367 struct file *fp; 368 int error; 369 370 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 371 mtx_lock(&Giant); 372 if (fp->f_type == DTYPE_VNODE) { 373 error 
= dofilewrite(td, fp, uap->fd, uap->buf, 374 uap->nbyte, uap->offset, FOF_OFFSET); 375 } else { 376 error = ESPIPE; 377 } 378 fdrop(fp, td); 379 mtx_unlock(&Giant); 380 } else { 381 error = EBADF; /* this can't be right */ 382 } 383 return(error); 384 } 385 386 static int 387 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 388 struct thread *td; 389 struct file *fp; 390 int fd, flags; 391 const void *buf; 392 size_t nbyte; 393 off_t offset; 394 { 395 struct uio auio; 396 struct iovec aiov; 397 long cnt, error = 0; 398 #ifdef KTRACE 399 struct iovec ktriov; 400 struct uio ktruio; 401 int didktr = 0; 402 #endif 403 404 aiov.iov_base = (void *)(uintptr_t)buf; 405 aiov.iov_len = nbyte; 406 auio.uio_iov = &aiov; 407 auio.uio_iovcnt = 1; 408 auio.uio_offset = offset; 409 if (nbyte > INT_MAX) 410 return (EINVAL); 411 auio.uio_resid = nbyte; 412 auio.uio_rw = UIO_WRITE; 413 auio.uio_segflg = UIO_USERSPACE; 414 auio.uio_td = td; 415 #ifdef KTRACE 416 /* 417 * if tracing, save a copy of iovec and uio 418 */ 419 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 420 ktriov = aiov; 421 ktruio = auio; 422 didktr = 1; 423 } 424 #endif 425 cnt = nbyte; 426 if (fp->f_type == DTYPE_VNODE) 427 bwillwrite(); 428 if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) { 429 if (auio.uio_resid != cnt && (error == ERESTART || 430 error == EINTR || error == EWOULDBLOCK)) 431 error = 0; 432 if (error == EPIPE) { 433 PROC_LOCK(td->td_proc); 434 psignal(td->td_proc, SIGPIPE); 435 PROC_UNLOCK(td->td_proc); 436 } 437 } 438 cnt -= auio.uio_resid; 439 #ifdef KTRACE 440 if (didktr && error == 0) { 441 ktruio.uio_iov = &ktriov; 442 ktruio.uio_resid = cnt; 443 ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error); 444 } 445 #endif 446 td->td_retval[0] = cnt; 447 return (error); 448 } 449 450 /* 451 * Gather write system call 452 */ 453 #ifndef _SYS_SYSPROTO_H_ 454 struct writev_args { 455 int fd; 456 struct iovec *iovp; 457 u_int iovcnt; 458 }; 459 #endif 460 /* 461 * MPSAFE 462 */ 463 int 464 
writev(td, uap) 465 struct thread *td; 466 register struct writev_args *uap; 467 { 468 struct file *fp; 469 struct uio auio; 470 register struct iovec *iov; 471 struct iovec *needfree; 472 struct iovec aiov[UIO_SMALLIOV]; 473 long i, cnt, error = 0; 474 u_int iovlen; 475 #ifdef KTRACE 476 struct iovec *ktriov = NULL; 477 struct uio ktruio; 478 #endif 479 480 mtx_lock(&Giant); 481 if ((error = fget_write(td, uap->fd, &fp)) != 0) { 482 error = EBADF; 483 goto done2; 484 } 485 /* note: can't use iovlen until iovcnt is validated */ 486 iovlen = uap->iovcnt * sizeof (struct iovec); 487 if (uap->iovcnt > UIO_SMALLIOV) { 488 if (uap->iovcnt > UIO_MAXIOV) { 489 needfree = NULL; 490 error = EINVAL; 491 goto done; 492 } 493 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 494 needfree = iov; 495 } else { 496 iov = aiov; 497 needfree = NULL; 498 } 499 auio.uio_iov = iov; 500 auio.uio_iovcnt = uap->iovcnt; 501 auio.uio_rw = UIO_WRITE; 502 auio.uio_segflg = UIO_USERSPACE; 503 auio.uio_td = td; 504 auio.uio_offset = -1; 505 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 506 goto done; 507 auio.uio_resid = 0; 508 for (i = 0; i < uap->iovcnt; i++) { 509 if (iov->iov_len > INT_MAX - auio.uio_resid) { 510 error = EINVAL; 511 goto done; 512 } 513 auio.uio_resid += iov->iov_len; 514 iov++; 515 } 516 #ifdef KTRACE 517 /* 518 * if tracing, save a copy of iovec and uio 519 */ 520 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 521 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 522 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 523 ktruio = auio; 524 } 525 #endif 526 cnt = auio.uio_resid; 527 if (fp->f_type == DTYPE_VNODE) 528 bwillwrite(); 529 if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) { 530 if (auio.uio_resid != cnt && (error == ERESTART || 531 error == EINTR || error == EWOULDBLOCK)) 532 error = 0; 533 if (error == EPIPE) { 534 PROC_LOCK(td->td_proc); 535 psignal(td->td_proc, SIGPIPE); 536 PROC_UNLOCK(td->td_proc); 537 } 538 } 539 cnt -= 
auio.uio_resid; 540 #ifdef KTRACE 541 if (ktriov != NULL) { 542 if (error == 0) { 543 ktruio.uio_iov = ktriov; 544 ktruio.uio_resid = cnt; 545 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio, 546 error); 547 } 548 FREE(ktriov, M_TEMP); 549 } 550 #endif 551 td->td_retval[0] = cnt; 552 done: 553 fdrop(fp, td); 554 if (needfree) 555 FREE(needfree, M_IOV); 556 done2: 557 mtx_unlock(&Giant); 558 return (error); 559 } 560 561 /* 562 * Ioctl system call 563 */ 564 #ifndef _SYS_SYSPROTO_H_ 565 struct ioctl_args { 566 int fd; 567 u_long com; 568 caddr_t data; 569 }; 570 #endif 571 /* 572 * MPSAFE 573 */ 574 /* ARGSUSED */ 575 int 576 ioctl(td, uap) 577 struct thread *td; 578 register struct ioctl_args *uap; 579 { 580 struct file *fp; 581 register struct filedesc *fdp; 582 register u_long com; 583 int error = 0; 584 register u_int size; 585 caddr_t data, memp; 586 int tmp; 587 #define STK_PARAMS 128 588 union { 589 char stkbuf[STK_PARAMS]; 590 long align; 591 } ubuf; 592 593 if ((error = fget(td, uap->fd, &fp)) != 0) 594 return (error); 595 mtx_lock(&Giant); 596 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 597 fdrop(fp, td); 598 mtx_unlock(&Giant); 599 return (EBADF); 600 } 601 fdp = td->td_proc->p_fd; 602 switch (com = uap->com) { 603 case FIONCLEX: 604 FILEDESC_LOCK(fdp); 605 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 606 FILEDESC_UNLOCK(fdp); 607 fdrop(fp, td); 608 mtx_unlock(&Giant); 609 return (0); 610 case FIOCLEX: 611 FILEDESC_LOCK(fdp); 612 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 613 FILEDESC_UNLOCK(fdp); 614 fdrop(fp, td); 615 mtx_unlock(&Giant); 616 return (0); 617 } 618 619 /* 620 * Interpret high order word to find amount of data to be 621 * copied to/from the user's address space. 
622 */ 623 size = IOCPARM_LEN(com); 624 if (size > IOCPARM_MAX) { 625 fdrop(fp, td); 626 mtx_unlock(&Giant); 627 return (ENOTTY); 628 } 629 630 memp = NULL; 631 if (size > sizeof (ubuf.stkbuf)) { 632 memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 633 data = memp; 634 } else { 635 data = ubuf.stkbuf; 636 } 637 if (com&IOC_IN) { 638 if (size) { 639 error = copyin(uap->data, data, (u_int)size); 640 if (error) { 641 if (memp) 642 free(memp, M_IOCTLOPS); 643 fdrop(fp, td); 644 goto done; 645 } 646 } else { 647 *(caddr_t *)data = uap->data; 648 } 649 } else if ((com&IOC_OUT) && size) { 650 /* 651 * Zero the buffer so the user always 652 * gets back something deterministic. 653 */ 654 bzero(data, size); 655 } else if (com&IOC_VOID) { 656 *(caddr_t *)data = uap->data; 657 } 658 659 switch (com) { 660 661 case FIONBIO: 662 FILE_LOCK(fp); 663 if ((tmp = *(int *)data)) 664 fp->f_flag |= FNONBLOCK; 665 else 666 fp->f_flag &= ~FNONBLOCK; 667 FILE_UNLOCK(fp); 668 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); 669 break; 670 671 case FIOASYNC: 672 FILE_LOCK(fp); 673 if ((tmp = *(int *)data)) 674 fp->f_flag |= FASYNC; 675 else 676 fp->f_flag &= ~FASYNC; 677 FILE_UNLOCK(fp); 678 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); 679 break; 680 681 default: 682 error = fo_ioctl(fp, com, data, td); 683 /* 684 * Copy any data to user, size was 685 * already set and checked above. 686 */ 687 if (error == 0 && (com&IOC_OUT) && size) 688 error = copyout(data, uap->data, (u_int)size); 689 break; 690 } 691 if (memp) 692 free(memp, M_IOCTLOPS); 693 fdrop(fp, td); 694 done: 695 mtx_unlock(&Giant); 696 return (error); 697 } 698 699 static int nselcoll; /* Select collisions since boot */ 700 struct cv selwait; 701 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 702 703 /* 704 * Select system call. 
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  The input copies live in the second half of the
	 * buffer, the zeroed output bits in the first half.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_noproclock;			\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done_noproclock;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* atv == 0 means wait indefinitely. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	/* Snapshot the collision counter so a wakeup during the scan is seen. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = selscan(td, ibits, obits, uap->nd);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = selscan(td, ibits, obits, uap->nd);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		/* Cap the sleep at 24 hours to keep tvtohz() in range. */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);

	/* Sleep on the global select condvar until woken or timed out. */
	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);

	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
done_noproclock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}

/*
 * Poll each descriptor whose bit is set in the input sets and set the
 * corresponding bit in the output sets if it is ready.  Returns EBADF
 * if a set bit names a closed descriptor; td_retval[0] holds the count
 * of ready descriptors.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_noproclock;
	if (SCARG(uap, timeout) != INFTIM) {
		/* Millisecond timeout -> absolute deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* atv == 0 means wait indefinitely. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	/* Same TDF_SELECT/collision protocol as select() above. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = pollscan(td, (struct pollfd *)bits, nfds);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = pollscan(td, (struct pollfd *)bits, nfds);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		/* Cap the sleep at 24 hours to keep tvtohz() in range. */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
done_noproclock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the revents back to the user's pollfd array. */
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Query each pollfd entry once.  Bad descriptors get POLLNVAL (and
 * count as "ready", matching poll() semantics); negative fds are
 * ignored.  td_retval[0] holds the count of entries with nonzero
 * revents.
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	/* Argument layouts match, so just forward to the native poll(). */
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Generic poll routine for devices that are always readable/writable.
 */
/*ARGSUSED*/
int
seltrue(dev, events, td)
	dev_t dev;
	int events;
	struct thread *td;
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Return 1 iff td is still a member of p's thread list.  Used to
 * validate the cached si_thread pointer before dereferencing it,
 * since the recorded thread may have exited.
 */
static int
find_thread_in_proc(struct proc *p, struct thread *td)
{
	struct thread *td2;
	FOREACH_THREAD_IN_PROC(p, td2) {
		if (td2 == td) {
			return (1);
		}
	}
	return (0);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{
	struct proc *p;
	pid_t mypid;

	mypid = selector->td_proc->p_pid;
	/* Already recorded for this thread; nothing to do. */
	if ((sip->si_pid == mypid) &&
	    (sip->si_thread == selector)) {  /* XXXKSE should be an ID? */
		return;
	}
	/*
	 * Another live thread is already recorded and sleeping in
	 * select; mark a collision so selwakeup() broadcasts.
	 * NOTE(review): pfind() appears to return with the proc locked
	 * (see PROC_UNLOCK on both paths) -- confirm against pfind().
	 */
	if (sip->si_pid &&
	    (p = pfind(sip->si_pid)) &&
	    (find_thread_in_proc(p, sip->si_thread))) {
		mtx_lock_spin(&sched_lock);
		if (sip->si_thread->td_wchan == (caddr_t)&selwait) {
			mtx_unlock_spin(&sched_lock);
			PROC_UNLOCK(p);
			sip->si_flags |= SI_COLL;
			return;
		}
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p);
	}
	/* Record ourselves as the (single) waiter on this selinfo. */
	sip->si_pid = mypid;
	sip->si_thread = selector;
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	struct thread *td;
	register struct proc *p;

	/* No waiter recorded; nothing to wake. */
	if (sip->si_pid == 0)
		return;
	/*
	 * Multiple waiters collided on this selinfo: wake every select
	 * sleeper and let them rescan.
	 */
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	td = sip->si_thread;
	if (p != NULL) {
		/* The recorded thread may have exited; validate it first. */
		if (!find_thread_in_proc(p, td)) {
			PROC_UNLOCK(p); /* lock is in pfind() */;
			return;
		}
		mtx_lock_spin(&sched_lock);
		if (td->td_wchan == (caddr_t)&selwait) {
			/* Waiter is asleep on selwait: make it runnable. */
			if (td->td_proc->p_stat == SSLEEP)
				setrunnable(td);
			else
				cv_waitq_remove(td);
		} else
			/* Not asleep yet: clearing TDF_SELECT forces a rescan. */
			td->td_flags &= ~TDF_SELECT;
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p); /* Lock is in pfind() */
	}
}

static void selectinit __P((void *));
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * Initialize the global select condition variable at boot.
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
}