1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD$ 40 */ 41 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/sysproto.h> 47 #include <sys/filedesc.h> 48 #include <sys/filio.h> 49 #include <sys/fcntl.h> 50 #include <sys/file.h> 51 #include <sys/proc.h> 52 #include <sys/signalvar.h> 53 #include <sys/socketvar.h> 54 #include <sys/uio.h> 55 #include <sys/kernel.h> 56 #include <sys/malloc.h> 57 #include <sys/poll.h> 58 #include <sys/resourcevar.h> 59 #include <sys/selinfo.h> 60 #include <sys/sysctl.h> 61 #include <sys/sysent.h> 62 #include <sys/bio.h> 63 #include <sys/buf.h> 64 #include <sys/condvar.h> 65 #ifdef KTRACE 66 #include <sys/ktrace.h> 67 #endif 68 #include <vm/vm.h> 69 #include <vm/vm_page.h> 70 71 #include <machine/limits.h> 72 73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77 static int pollscan(struct thread *, struct pollfd *, u_int); 78 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 79 static int dofileread(struct thread *, struct file *, int, void *, 80 size_t, off_t, int); 81 static int dofilewrite(struct thread *, struct file *, int, 82 const void *, size_t, off_t, int); 83 84 /* 85 * Read system call. 
86 */ 87 #ifndef _SYS_SYSPROTO_H_ 88 struct read_args { 89 int fd; 90 void *buf; 91 size_t nbyte; 92 }; 93 #endif 94 /* 95 * MPSAFE 96 */ 97 int 98 read(td, uap) 99 struct thread *td; 100 struct read_args *uap; 101 { 102 struct file *fp; 103 int error; 104 105 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 106 error = dofileread(td, fp, uap->fd, uap->buf, 107 uap->nbyte, (off_t)-1, 0); 108 fdrop(fp, td); 109 } 110 return(error); 111 } 112 113 /* 114 * Pread system call 115 */ 116 #ifndef _SYS_SYSPROTO_H_ 117 struct pread_args { 118 int fd; 119 void *buf; 120 size_t nbyte; 121 int pad; 122 off_t offset; 123 }; 124 #endif 125 /* 126 * MPSAFE 127 */ 128 int 129 pread(td, uap) 130 struct thread *td; 131 struct pread_args *uap; 132 { 133 struct file *fp; 134 int error; 135 136 if ((error = fget_read(td, uap->fd, &fp)) != 0) 137 return (error); 138 if (fp->f_type != DTYPE_VNODE) { 139 error = ESPIPE; 140 } else { 141 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 142 uap->offset, FOF_OFFSET); 143 } 144 fdrop(fp, td); 145 return(error); 146 } 147 148 /* 149 * Code common for read and pread 150 */ 151 int 152 dofileread(td, fp, fd, buf, nbyte, offset, flags) 153 struct thread *td; 154 struct file *fp; 155 int fd, flags; 156 void *buf; 157 size_t nbyte; 158 off_t offset; 159 { 160 struct uio auio; 161 struct iovec aiov; 162 long cnt, error = 0; 163 #ifdef KTRACE 164 struct iovec ktriov; 165 struct uio ktruio; 166 int didktr = 0; 167 #endif 168 169 aiov.iov_base = (caddr_t)buf; 170 aiov.iov_len = nbyte; 171 auio.uio_iov = &aiov; 172 auio.uio_iovcnt = 1; 173 auio.uio_offset = offset; 174 if (nbyte > INT_MAX) 175 return (EINVAL); 176 auio.uio_resid = nbyte; 177 auio.uio_rw = UIO_READ; 178 auio.uio_segflg = UIO_USERSPACE; 179 auio.uio_td = td; 180 #ifdef KTRACE 181 /* 182 * if tracing, save a copy of iovec 183 */ 184 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 185 ktriov = aiov; 186 ktruio = auio; 187 didktr = 1; 188 } 189 #endif 190 cnt = nbyte; 191 192 if ((error = 
fo_read(fp, &auio, fp->f_cred, flags, td))) { 193 if (auio.uio_resid != cnt && (error == ERESTART || 194 error == EINTR || error == EWOULDBLOCK)) 195 error = 0; 196 } 197 cnt -= auio.uio_resid; 198 #ifdef KTRACE 199 if (didktr && error == 0) { 200 ktruio.uio_iov = &ktriov; 201 ktruio.uio_resid = cnt; 202 ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error); 203 } 204 #endif 205 td->td_retval[0] = cnt; 206 return (error); 207 } 208 209 /* 210 * Scatter read system call. 211 */ 212 #ifndef _SYS_SYSPROTO_H_ 213 struct readv_args { 214 int fd; 215 struct iovec *iovp; 216 u_int iovcnt; 217 }; 218 #endif 219 /* 220 * MPSAFE 221 */ 222 int 223 readv(td, uap) 224 struct thread *td; 225 struct readv_args *uap; 226 { 227 struct file *fp; 228 struct uio auio; 229 struct iovec *iov; 230 struct iovec *needfree; 231 struct iovec aiov[UIO_SMALLIOV]; 232 long i, cnt, error = 0; 233 u_int iovlen; 234 #ifdef KTRACE 235 struct iovec *ktriov = NULL; 236 struct uio ktruio; 237 #endif 238 mtx_lock(&Giant); 239 240 if ((error = fget_read(td, uap->fd, &fp)) != 0) 241 goto done2; 242 /* note: can't use iovlen until iovcnt is validated */ 243 iovlen = uap->iovcnt * sizeof (struct iovec); 244 if (uap->iovcnt > UIO_SMALLIOV) { 245 if (uap->iovcnt > UIO_MAXIOV) { 246 error = EINVAL; 247 goto done2; 248 } 249 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 250 needfree = iov; 251 } else { 252 iov = aiov; 253 needfree = NULL; 254 } 255 auio.uio_iov = iov; 256 auio.uio_iovcnt = uap->iovcnt; 257 auio.uio_rw = UIO_READ; 258 auio.uio_segflg = UIO_USERSPACE; 259 auio.uio_td = td; 260 auio.uio_offset = -1; 261 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 262 goto done; 263 auio.uio_resid = 0; 264 for (i = 0; i < uap->iovcnt; i++) { 265 if (iov->iov_len > INT_MAX - auio.uio_resid) { 266 error = EINVAL; 267 goto done; 268 } 269 auio.uio_resid += iov->iov_len; 270 iov++; 271 } 272 #ifdef KTRACE 273 /* 274 * if tracing, save a copy of iovec 275 */ 276 if 
(KTRPOINT(td->td_proc, KTR_GENIO)) { 277 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 278 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 279 ktruio = auio; 280 } 281 #endif 282 cnt = auio.uio_resid; 283 if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) { 284 if (auio.uio_resid != cnt && (error == ERESTART || 285 error == EINTR || error == EWOULDBLOCK)) 286 error = 0; 287 } 288 cnt -= auio.uio_resid; 289 #ifdef KTRACE 290 if (ktriov != NULL) { 291 if (error == 0) { 292 ktruio.uio_iov = ktriov; 293 ktruio.uio_resid = cnt; 294 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio, 295 error); 296 } 297 FREE(ktriov, M_TEMP); 298 } 299 #endif 300 td->td_retval[0] = cnt; 301 done: 302 fdrop(fp, td); 303 if (needfree) 304 FREE(needfree, M_IOV); 305 done2: 306 mtx_unlock(&Giant); 307 return (error); 308 } 309 310 /* 311 * Write system call 312 */ 313 #ifndef _SYS_SYSPROTO_H_ 314 struct write_args { 315 int fd; 316 const void *buf; 317 size_t nbyte; 318 }; 319 #endif 320 /* 321 * MPSAFE 322 */ 323 int 324 write(td, uap) 325 struct thread *td; 326 struct write_args *uap; 327 { 328 struct file *fp; 329 int error; 330 331 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 332 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 333 (off_t)-1, 0); 334 fdrop(fp, td); 335 } else { 336 error = EBADF; /* XXX this can't be right */ 337 } 338 return(error); 339 } 340 341 /* 342 * Pwrite system call 343 */ 344 #ifndef _SYS_SYSPROTO_H_ 345 struct pwrite_args { 346 int fd; 347 const void *buf; 348 size_t nbyte; 349 int pad; 350 off_t offset; 351 }; 352 #endif 353 /* 354 * MPSAFE 355 */ 356 int 357 pwrite(td, uap) 358 struct thread *td; 359 struct pwrite_args *uap; 360 { 361 struct file *fp; 362 int error; 363 364 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 365 if (fp->f_type == DTYPE_VNODE) { 366 error = dofilewrite(td, fp, uap->fd, uap->buf, 367 uap->nbyte, uap->offset, FOF_OFFSET); 368 } else { 369 error = ESPIPE; 370 } 371 fdrop(fp, td); 372 
} else { 373 error = EBADF; /* this can't be right */ 374 } 375 return(error); 376 } 377 378 static int 379 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 380 struct thread *td; 381 struct file *fp; 382 int fd, flags; 383 const void *buf; 384 size_t nbyte; 385 off_t offset; 386 { 387 struct uio auio; 388 struct iovec aiov; 389 long cnt, error = 0; 390 #ifdef KTRACE 391 struct iovec ktriov; 392 struct uio ktruio; 393 int didktr = 0; 394 #endif 395 396 aiov.iov_base = (void *)(uintptr_t)buf; 397 aiov.iov_len = nbyte; 398 auio.uio_iov = &aiov; 399 auio.uio_iovcnt = 1; 400 auio.uio_offset = offset; 401 if (nbyte > INT_MAX) 402 return (EINVAL); 403 auio.uio_resid = nbyte; 404 auio.uio_rw = UIO_WRITE; 405 auio.uio_segflg = UIO_USERSPACE; 406 auio.uio_td = td; 407 #ifdef KTRACE 408 /* 409 * if tracing, save a copy of iovec and uio 410 */ 411 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 412 ktriov = aiov; 413 ktruio = auio; 414 didktr = 1; 415 } 416 #endif 417 cnt = nbyte; 418 if (fp->f_type == DTYPE_VNODE) 419 bwillwrite(); 420 if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) { 421 if (auio.uio_resid != cnt && (error == ERESTART || 422 error == EINTR || error == EWOULDBLOCK)) 423 error = 0; 424 if (error == EPIPE) { 425 PROC_LOCK(td->td_proc); 426 psignal(td->td_proc, SIGPIPE); 427 PROC_UNLOCK(td->td_proc); 428 } 429 } 430 cnt -= auio.uio_resid; 431 #ifdef KTRACE 432 if (didktr && error == 0) { 433 ktruio.uio_iov = &ktriov; 434 ktruio.uio_resid = cnt; 435 ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error); 436 } 437 #endif 438 td->td_retval[0] = cnt; 439 return (error); 440 } 441 442 /* 443 * Gather write system call 444 */ 445 #ifndef _SYS_SYSPROTO_H_ 446 struct writev_args { 447 int fd; 448 struct iovec *iovp; 449 u_int iovcnt; 450 }; 451 #endif 452 /* 453 * MPSAFE 454 */ 455 int 456 writev(td, uap) 457 struct thread *td; 458 register struct writev_args *uap; 459 { 460 struct file *fp; 461 struct uio auio; 462 register struct iovec *iov; 463 struct 
iovec *needfree; 464 struct iovec aiov[UIO_SMALLIOV]; 465 long i, cnt, error = 0; 466 u_int iovlen; 467 #ifdef KTRACE 468 struct iovec *ktriov = NULL; 469 struct uio ktruio; 470 #endif 471 472 mtx_lock(&Giant); 473 if ((error = fget_write(td, uap->fd, &fp)) != 0) { 474 error = EBADF; 475 goto done2; 476 } 477 /* note: can't use iovlen until iovcnt is validated */ 478 iovlen = uap->iovcnt * sizeof (struct iovec); 479 if (uap->iovcnt > UIO_SMALLIOV) { 480 if (uap->iovcnt > UIO_MAXIOV) { 481 needfree = NULL; 482 error = EINVAL; 483 goto done; 484 } 485 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 486 needfree = iov; 487 } else { 488 iov = aiov; 489 needfree = NULL; 490 } 491 auio.uio_iov = iov; 492 auio.uio_iovcnt = uap->iovcnt; 493 auio.uio_rw = UIO_WRITE; 494 auio.uio_segflg = UIO_USERSPACE; 495 auio.uio_td = td; 496 auio.uio_offset = -1; 497 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 498 goto done; 499 auio.uio_resid = 0; 500 for (i = 0; i < uap->iovcnt; i++) { 501 if (iov->iov_len > INT_MAX - auio.uio_resid) { 502 error = EINVAL; 503 goto done; 504 } 505 auio.uio_resid += iov->iov_len; 506 iov++; 507 } 508 #ifdef KTRACE 509 /* 510 * if tracing, save a copy of iovec and uio 511 */ 512 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 513 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 514 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 515 ktruio = auio; 516 } 517 #endif 518 cnt = auio.uio_resid; 519 if (fp->f_type == DTYPE_VNODE) 520 bwillwrite(); 521 if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) { 522 if (auio.uio_resid != cnt && (error == ERESTART || 523 error == EINTR || error == EWOULDBLOCK)) 524 error = 0; 525 if (error == EPIPE) { 526 PROC_LOCK(td->td_proc); 527 psignal(td->td_proc, SIGPIPE); 528 PROC_UNLOCK(td->td_proc); 529 } 530 } 531 cnt -= auio.uio_resid; 532 #ifdef KTRACE 533 if (ktriov != NULL) { 534 if (error == 0) { 535 ktruio.uio_iov = ktriov; 536 ktruio.uio_resid = cnt; 537 
ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio, 538 error); 539 } 540 FREE(ktriov, M_TEMP); 541 } 542 #endif 543 td->td_retval[0] = cnt; 544 done: 545 fdrop(fp, td); 546 if (needfree) 547 FREE(needfree, M_IOV); 548 done2: 549 mtx_unlock(&Giant); 550 return (error); 551 } 552 553 /* 554 * Ioctl system call 555 */ 556 #ifndef _SYS_SYSPROTO_H_ 557 struct ioctl_args { 558 int fd; 559 u_long com; 560 caddr_t data; 561 }; 562 #endif 563 /* 564 * MPSAFE 565 */ 566 /* ARGSUSED */ 567 int 568 ioctl(td, uap) 569 struct thread *td; 570 register struct ioctl_args *uap; 571 { 572 struct file *fp; 573 register struct filedesc *fdp; 574 register u_long com; 575 int error = 0; 576 register u_int size; 577 caddr_t data, memp; 578 int tmp; 579 #define STK_PARAMS 128 580 union { 581 char stkbuf[STK_PARAMS]; 582 long align; 583 } ubuf; 584 585 if ((error = fget(td, uap->fd, &fp)) != 0) 586 return (error); 587 mtx_lock(&Giant); 588 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 589 fdrop(fp, td); 590 mtx_unlock(&Giant); 591 return (EBADF); 592 } 593 fdp = td->td_proc->p_fd; 594 switch (com = uap->com) { 595 case FIONCLEX: 596 FILEDESC_LOCK(fdp); 597 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 598 FILEDESC_UNLOCK(fdp); 599 fdrop(fp, td); 600 mtx_unlock(&Giant); 601 return (0); 602 case FIOCLEX: 603 FILEDESC_LOCK(fdp); 604 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 605 FILEDESC_UNLOCK(fdp); 606 fdrop(fp, td); 607 mtx_unlock(&Giant); 608 return (0); 609 } 610 611 /* 612 * Interpret high order word to find amount of data to be 613 * copied to/from the user's address space. 
614 */ 615 size = IOCPARM_LEN(com); 616 if (size > IOCPARM_MAX) { 617 fdrop(fp, td); 618 mtx_unlock(&Giant); 619 return (ENOTTY); 620 } 621 622 memp = NULL; 623 if (size > sizeof (ubuf.stkbuf)) { 624 memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 625 data = memp; 626 } else { 627 data = ubuf.stkbuf; 628 } 629 if (com&IOC_IN) { 630 if (size) { 631 error = copyin(uap->data, data, (u_int)size); 632 if (error) { 633 if (memp) 634 free(memp, M_IOCTLOPS); 635 fdrop(fp, td); 636 goto done; 637 } 638 } else { 639 *(caddr_t *)data = uap->data; 640 } 641 } else if ((com&IOC_OUT) && size) { 642 /* 643 * Zero the buffer so the user always 644 * gets back something deterministic. 645 */ 646 bzero(data, size); 647 } else if (com&IOC_VOID) { 648 *(caddr_t *)data = uap->data; 649 } 650 651 switch (com) { 652 653 case FIONBIO: 654 FILE_LOCK(fp); 655 if ((tmp = *(int *)data)) 656 fp->f_flag |= FNONBLOCK; 657 else 658 fp->f_flag &= ~FNONBLOCK; 659 FILE_UNLOCK(fp); 660 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); 661 break; 662 663 case FIOASYNC: 664 FILE_LOCK(fp); 665 if ((tmp = *(int *)data)) 666 fp->f_flag |= FASYNC; 667 else 668 fp->f_flag &= ~FASYNC; 669 FILE_UNLOCK(fp); 670 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); 671 break; 672 673 default: 674 error = fo_ioctl(fp, com, data, td); 675 /* 676 * Copy any data to user, size was 677 * already set and checked above. 678 */ 679 if (error == 0 && (com&IOC_OUT) && size) 680 error = copyout(data, uap->data, (u_int)size); 681 break; 682 } 683 if (memp) 684 free(memp, M_IOCTLOPS); 685 fdrop(fp, td); 686 done: 687 mtx_unlock(&Giant); 688 return (error); 689 } 690 691 /* 692 * sellock and selwait are initialized in selectinit() via SYSINIT. 693 */ 694 struct mtx sellock; 695 struct cv selwait; 696 int nselcoll; /* Select collisions since boot */ 697 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 698 699 /* 700 * Select system call. 
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	/* Clamp nd to the table size rather than failing. */
	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
		uap->nd = td->td_proc->p_fd->fd_nfiles;	/* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	/* Each non-null set needs an input copy and an output copy. */
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  Output buffers occupy the first half of selbits,
	 * input copies the second half.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done_nosellock;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* tv == NULL: block indefinitely (atv zero means no deadline). */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count; a change means we must rescan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* XXX Is there a better place for this? */
	TAILQ_INIT(&td->td_selq);
	error = selscan(td, ibits, obits, uap->nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap the tick conversion at 24 hours to avoid overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* Woken without error: something may be ready, rescan. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}

/*
 * Scan the given descriptor bit masks, polling each set descriptor for
 * readiness via fo_poll().  Ready descriptors are marked in obits and
 * counted into td->td_retval[0].  Returns EBADF if a set bit names a
 * closed descriptor; 0 otherwise.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	/* msk 0/1/2 correspond to the in/ou/ex sets. */
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	/* On-stack buffer avoids malloc for up to 32 pollfds. */
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_nosellock;
	if (SCARG(uap, timeout) != INFTIM) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* INFTIM: block indefinitely (atv zero means no deadline). */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count; a change means we must rescan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* XXX Is there a better place for this? */
	TAILQ_INIT(&td->td_selq);
	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap the tick conversion at 24 hours to avoid overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* Woken without error: something may be ready, rescan. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the revents back out to the user's pollfd array. */
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Poll each pollfd entry via fo_poll(), storing the result in revents.
 * Unlike selscan(), a bad descriptor is not an error: it is reported
 * as POLLNVAL in that entry.  The count of entries with non-zero
 * revents is returned in td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored per poll() semantics. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	/* Argument layouts are identical, so just forward to poll(). */
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	/* Disown every selinfo we recorded, then empty the list. */
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * A poll backend for devices that are always ready for the normal
 * read/write events; returns the requested subset of those events.
 */
/*ARGSUSED*/
int
seltrue(dev, events, td)
	dev_t dev;
	int events;
	struct thread *td;
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the thread is NULL then take ownership of selinfo
	 * however if the thread is not NULL and the thread points to
	 * someone else, then we have a collision, otherwise leave it alone
	 * as we've owned it in a previous selrecord on this selinfo.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* SI_COLL makes selwakeup() broadcast to all waiters. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	/*
	 * On a collision more than one thread may be waiting on this
	 * selinfo; we cannot tell which, so wake them all and let them
	 * rescan.  This bumps the nselcoll counter the waiters check.
	 */
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from its owning thread's list. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == (caddr_t)&selwait) {
		/* Owner is blocked on selwait: make it runnable. */
		if (td->td_proc->p_stat == SSLEEP)
			setrunnable(td);
		else
			cv_waitq_remove(td);
	} else
		/* Owner is still scanning; clearing TDF_SELECT forces a rescan. */
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * Initialize the select/poll wait channel and its lock at boot
 * (run via the SYSINIT above).
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", MTX_DEF);
}