/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD$
 */
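/*
 * "Generic" file descriptor I/O: the read/write family (read, pread,
 * readv, write, pwrite, writev), ioctl, and the select/poll event
 * multiplexing machinery.
 */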
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

#include <machine/limits.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollscan __P((struct thread *, struct pollfd *, u_int));
static int	pollholddrop __P((struct thread *, struct pollfd *, u_int, int));
static int	selscan __P((struct thread *, fd_mask **, fd_mask **, int));
static int	selholddrop __P((struct thread *, fd_mask *, fd_mask *, int, int));
static int	dofileread __P((struct thread *, struct file *, int, void *,
		    size_t, off_t, int));
static int	dofilewrite __P((struct thread *, struct file *, int,
		    const void *, size_t, off_t, int));

/*
 * Read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct read_args {
        int     fd;
        void    *buf;
        size_t  nbyte;
};
#endif
/*
 * MPSAFE
 */
int
read(td, uap)
        struct thread *td;
        struct read_args *uap;
{
        struct file *fp;
        int error;

        mtx_lock(&Giant);
        if ((error = fget_read(td, uap->fd, &fp)) == 0) {
                error = dofileread(td, fp, uap->fd, uap->buf,
                    uap->nbyte, (off_t)-1, 0);
                fdrop(fp, td);
        }
        mtx_unlock(&Giant);
        return (error);
}

/*
 * Pread system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
        int     fd;
        void    *buf;
        size_t  nbyte;
        int     pad;
        off_t   offset;
};
#endif
/*
 * MPSAFE
 */
int
pread(td, uap)
        struct thread *td;
        struct pread_args *uap;
{
        struct file *fp;
        int error;

        mtx_lock(&Giant);
        if ((error = fget_read(td, uap->fd, &fp)) == 0) {
                if (fp->f_type == DTYPE_VNODE) {
                        error = dofileread(td, fp, uap->fd, uap->buf,
                            uap->nbyte, uap->offset, FOF_OFFSET);
                } else {
                        error = ESPIPE;
                }
                fdrop(fp, td);
        }
        mtx_unlock(&Giant);
        return (error);
}

/*
 * Code common to read() and pread().
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
        struct thread *td;
        struct file *fp;
        int fd, flags;
        void *buf;
        size_t nbyte;
        off_t offset;
{
        struct uio auio;
        struct iovec aiov;
        long cnt, error = 0;
#ifdef KTRACE
        struct iovec ktriov;
        struct uio ktruio;
        int didktr = 0;
#endif

        aiov.iov_base = (caddr_t)buf;
        aiov.iov_len = nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_offset = offset;
        if (nbyte > INT_MAX)
                return (EINVAL);
        auio.uio_resid = nbyte;
        auio.uio_rw = UIO_READ;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_td = td;
#ifdef KTRACE
        /*
         * If tracing, save a copy of the iovec.
         */
        if (KTRPOINT(td->td_proc, KTR_GENIO)) {
                ktriov = aiov;
                ktruio = auio;
                didktr = 1;
        }
#endif
        cnt = nbyte;

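        /*
         * If the transfer was interrupted after some data had already
         * been moved, report the partial count as a short read instead
         * of failing: the bytes already transferred must be returned
         * to the caller.
         */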
        if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
                if (auio.uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
        }
        cnt -= auio.uio_resid;
#ifdef KTRACE
        if (didktr && error == 0) {
                ktruio.uio_iov = &ktriov;
                ktruio.uio_resid = cnt;
                ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
        }
#endif
        td->td_retval[0] = cnt;
        return (error);
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
        int     fd;
        struct iovec *iovp;
        u_int   iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
readv(td, uap)
        struct thread *td;
        struct readv_args *uap;
{
        struct file *fp;
        struct uio auio;
        struct iovec *iov;
        struct iovec *needfree;
        struct iovec aiov[UIO_SMALLIOV];
        long i, cnt, error = 0;
        u_int iovlen;
#ifdef KTRACE
        struct iovec *ktriov = NULL;
        struct uio ktruio;
#endif

        mtx_lock(&Giant);
        if ((error = fget_read(td, uap->fd, &fp)) != 0)
                goto done2;
        /* note: can't use iovlen until iovcnt is validated */
        iovlen = uap->iovcnt * sizeof (struct iovec);
        if (uap->iovcnt > UIO_SMALLIOV) {
                if (uap->iovcnt > UIO_MAXIOV) {
                        /* must drop the file reference we already hold */
                        needfree = NULL;
                        error = EINVAL;
                        goto done;
                }
                MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
                needfree = iov;
        } else {
                iov = aiov;
                needfree = NULL;
        }
        auio.uio_iov = iov;
        auio.uio_iovcnt = uap->iovcnt;
        auio.uio_rw = UIO_READ;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_td = td;
        auio.uio_offset = -1;
        if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
                goto done;
        auio.uio_resid = 0;
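        /*
         * Total the iovec lengths, rejecting any set whose sum would
         * exceed INT_MAX.  Writing the test as a subtraction keeps the
         * running total itself from overflowing.
         */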
        for (i = 0; i < uap->iovcnt; i++) {
                if (iov->iov_len > INT_MAX - auio.uio_resid) {
                        error = EINVAL;
                        goto done;
                }
                auio.uio_resid += iov->iov_len;
                iov++;
        }
#ifdef KTRACE
        /*
         * If tracing, save a copy of the iovec.
         */
        if (KTRPOINT(td->td_proc, KTR_GENIO)) {
                MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
                bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
                ktruio = auio;
        }
#endif
        cnt = auio.uio_resid;
        if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
                if (auio.uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
        }
        cnt -= auio.uio_resid;
#ifdef KTRACE
        if (ktriov != NULL) {
                if (error == 0) {
                        ktruio.uio_iov = ktriov;
                        ktruio.uio_resid = cnt;
                        ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ,
                            &ktruio, error);
                }
                FREE(ktriov, M_TEMP);
        }
#endif
        td->td_retval[0] = cnt;
done:
        fdrop(fp, td);
        if (needfree)
                FREE(needfree, M_IOV);
done2:
        mtx_unlock(&Giant);
        return (error);
}

/*
 * Write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct write_args {
        int     fd;
        const void *buf;
        size_t  nbyte;
};
#endif
/*
 * MPSAFE
 */
int
write(td, uap)
        struct thread *td;
        struct write_args *uap;
{
        struct file *fp;
        int error;

        mtx_lock(&Giant);
        if ((error = fget_write(td, uap->fd, &fp)) == 0) {
                error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
                    (off_t)-1, 0);
                fdrop(fp, td);
        }
        mtx_unlock(&Giant);
        return (error);
}

/*
 * Pwrite system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
        int     fd;
        const void *buf;
        size_t  nbyte;
        int     pad;
        off_t   offset;
};
#endif
/*
 * MPSAFE
 */
int
pwrite(td, uap)
        struct thread *td;
        struct pwrite_args *uap;
{
        struct file *fp;
        int error;

        mtx_lock(&Giant);
        if ((error = fget_write(td, uap->fd, &fp)) == 0) {
                if (fp->f_type == DTYPE_VNODE) {
                        error = dofilewrite(td, fp, uap->fd, uap->buf,
                            uap->nbyte, uap->offset, FOF_OFFSET);
                } else {
                        error = ESPIPE;
                }
                fdrop(fp, td);
        }
        mtx_unlock(&Giant);
        return (error);
}

/*
 * Code common to write() and pwrite().
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
        struct thread *td;
        struct file *fp;
        int fd, flags;
        const void *buf;
        size_t nbyte;
        off_t offset;
{
        struct uio auio;
        struct iovec aiov;
        long cnt, error = 0;
#ifdef KTRACE
        struct iovec ktriov;
        struct uio ktruio;
        int didktr = 0;
#endif

        aiov.iov_base = (void *)(uintptr_t)buf;
        aiov.iov_len = nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_offset = offset;
        if (nbyte > INT_MAX)
                return (EINVAL);
        auio.uio_resid = nbyte;
        auio.uio_rw = UIO_WRITE;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_td = td;
#ifdef KTRACE
        /*
         * If tracing, save a copy of the iovec and uio.
         */
        if (KTRPOINT(td->td_proc, KTR_GENIO)) {
                ktriov = aiov;
                ktruio = auio;
                didktr = 1;
        }
#endif
        cnt = nbyte;
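        /*
         * Vnode writes dirty buffer cache buffers; bwillwrite() below
         * blocks the caller while the system is over its dirty buffer
         * threshold, keeping runaway writers from exhausting the
         * buffer cache.  (See bwillwrite() for the exact policy.)
         */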
        if (fp->f_type == DTYPE_VNODE)
                bwillwrite();
        if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
                if (auio.uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
                if (error == EPIPE) {
                        PROC_LOCK(td->td_proc);
                        psignal(td->td_proc, SIGPIPE);
                        PROC_UNLOCK(td->td_proc);
                }
        }
        cnt -= auio.uio_resid;
#ifdef KTRACE
        if (didktr && error == 0) {
                ktruio.uio_iov = &ktriov;
                ktruio.uio_resid = cnt;
                ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
        }
#endif
        td->td_retval[0] = cnt;
        return (error);
}

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
        int     fd;
        struct iovec *iovp;
        u_int   iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
writev(td, uap)
        struct thread *td;
        register struct writev_args *uap;
{
        struct file *fp;
        struct uio auio;
        register struct iovec *iov;
        struct iovec *needfree;
        struct iovec aiov[UIO_SMALLIOV];
        long i, cnt, error = 0;
        u_int iovlen;
#ifdef KTRACE
        struct iovec *ktriov = NULL;
        struct uio ktruio;
#endif

        mtx_lock(&Giant);
        if ((error = fget_write(td, uap->fd, &fp)) != 0)
                goto done2;
        /* note: can't use iovlen until iovcnt is validated */
        iovlen = uap->iovcnt * sizeof (struct iovec);
        if (uap->iovcnt > UIO_SMALLIOV) {
                if (uap->iovcnt > UIO_MAXIOV) {
                        needfree = NULL;
                        error = EINVAL;
                        goto done;
                }
                MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
                needfree = iov;
        } else {
                iov = aiov;
                needfree = NULL;
        }
        auio.uio_iov = iov;
        auio.uio_iovcnt = uap->iovcnt;
        auio.uio_rw = UIO_WRITE;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_td = td;
        auio.uio_offset = -1;
        if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
                goto done;
        auio.uio_resid = 0;
        for (i = 0; i < uap->iovcnt; i++) {
                if (iov->iov_len > INT_MAX - auio.uio_resid) {
                        error = EINVAL;
                        goto done;
                }
                auio.uio_resid += iov->iov_len;
                iov++;
        }
#ifdef KTRACE
        /*
         * If tracing, save a copy of the iovec and uio.
         */
        if (KTRPOINT(td->td_proc, KTR_GENIO)) {
                MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
                bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
                ktruio = auio;
        }
#endif
        cnt = auio.uio_resid;
        if (fp->f_type == DTYPE_VNODE)
                bwillwrite();
        if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
                if (auio.uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
                if (error == EPIPE) {
                        PROC_LOCK(td->td_proc);
                        psignal(td->td_proc, SIGPIPE);
                        PROC_UNLOCK(td->td_proc);
                }
        }
        cnt -= auio.uio_resid;
#ifdef KTRACE
        if (ktriov != NULL) {
                if (error == 0) {
                        ktruio.uio_iov = ktriov;
                        ktruio.uio_resid = cnt;
                        ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE,
                            &ktruio, error);
                }
                FREE(ktriov, M_TEMP);
        }
#endif
        td->td_retval[0] = cnt;
done:
        fdrop(fp, td);
        if (needfree)
                FREE(needfree, M_IOV);
done2:
        mtx_unlock(&Giant);
        return (error);
}

/*
 * Ioctl system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
        int     fd;
        u_long  com;
        caddr_t data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(td, uap)
        struct thread *td;
        register struct ioctl_args *uap;
{
        register struct file *fp;
        register struct filedesc *fdp;
        register u_long com;
        int error = 0;
        register u_int size;
        caddr_t data, memp;
        int tmp;
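        /*
         * Small ioctl arguments are staged in a buffer on the stack;
         * the union with a long forces alignment suitable for the
         * common argument types.  Anything larger than STK_PARAMS
         * falls back to a malloc()ed buffer below.
         */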
#define STK_PARAMS      128
        union {
                char stkbuf[STK_PARAMS];
                long align;
        } ubuf;

        mtx_lock(&Giant);
        fdp = td->td_proc->p_fd;
        if ((u_int)uap->fd >= fdp->fd_nfiles ||
            (fp = fdp->fd_ofiles[uap->fd]) == NULL) {
                error = EBADF;
                goto done2;
        }

        if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
                error = EBADF;
                goto done2;
        }

        switch (com = uap->com) {
        case FIONCLEX:
                fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
                goto done2;
        case FIOCLEX:
                fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
                goto done2;
        }

        /*
         * Interpret high order word to find amount of data to be
         * copied to/from the user's address space.
         */
        size = IOCPARM_LEN(com);
        if (size > IOCPARM_MAX) {
                error = ENOTTY;
                goto done2;
        }

        fhold(fp);

        memp = NULL;
        if (size > sizeof (ubuf.stkbuf)) {
                memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
                data = memp;
        } else {
                data = ubuf.stkbuf;
        }
        if (com & IOC_IN) {
                if (size) {
                        error = copyin(uap->data, data, (u_int)size);
                        if (error) {
                                if (memp)
                                        free(memp, M_IOCTLOPS);
                                fdrop(fp, td);
                                goto done2;
                        }
                } else {
                        *(caddr_t *)data = uap->data;
                }
        } else if ((com & IOC_OUT) && size) {
                /*
                 * Zero the buffer so the user always
                 * gets back something deterministic.
                 */
                bzero(data, size);
        } else if (com & IOC_VOID) {
                *(caddr_t *)data = uap->data;
        }

        switch (com) {

        case FIONBIO:
                if ((tmp = *(int *)data))
                        fp->f_flag |= FNONBLOCK;
                else
                        fp->f_flag &= ~FNONBLOCK;
                error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
                break;

        case FIOASYNC:
                if ((tmp = *(int *)data))
                        fp->f_flag |= FASYNC;
                else
                        fp->f_flag &= ~FASYNC;
                error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
                break;

        default:
                error = fo_ioctl(fp, com, data, td);
                /*
                 * Copy any data to user, size was
                 * already set and checked above.
                 */
                if (error == 0 && (com & IOC_OUT) && size)
                        error = copyout(data, uap->data, (u_int)size);
                break;
        }
        if (memp)
                free(memp, M_IOCTLOPS);
        fdrop(fp, td);
done2:
        mtx_unlock(&Giant);
        return (error);
}

static int      nselcoll;       /* Select collisions since boot */
struct cv selwait;
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
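/*
 * A select "collision" occurs when more than one thread sleeps on the
 * same selinfo.  selrecord() marks the selinfo with SI_COLL in that
 * case, and selwakeup() must then broadcast on selwait, since there
 * is no record of which individual threads to wake; nselcoll counts
 * these events for the sysctl above.
 */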
/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
        int     nd;
        fd_set  *in, *ou, *ex;
        struct timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
        register struct thread *td;
        register struct select_args *uap;
{
        /*
         * The magic 2048 here is chosen to be just enough for FD_SETSIZE
         * infds with the new FD_SETSIZE of 1024, and more than enough for
         * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
         * of 256.
         */
        fd_mask s_selbits[howmany(2048, NFDBITS)];
        fd_mask s_heldbits[howmany(2048, NFDBITS)];
        fd_mask *ibits[3], *obits[3], *selbits, *sbp;
        fd_mask *heldbits, *hibits, *hobits;
        struct timeval atv, rtv, ttv;
        int ncoll, error, timo, i;
        u_int nbufbytes, ncpbytes, nfdbits;

        if (uap->nd < 0)
                return (EINVAL);

        mtx_lock(&Giant);

        if (uap->nd > td->td_proc->p_fd->fd_nfiles)
                uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */

        /*
         * Allocate just enough bits for the non-null fd_sets.  Use the
         * preallocated auto buffer if possible.
         */
        nfdbits = roundup(uap->nd, NFDBITS);
        ncpbytes = nfdbits / NBBY;
        nbufbytes = 0;
        if (uap->in != NULL)
                nbufbytes += 2 * ncpbytes;
        if (uap->ou != NULL)
                nbufbytes += 2 * ncpbytes;
        if (uap->ex != NULL)
                nbufbytes += 2 * ncpbytes;
        if (nbufbytes <= sizeof s_selbits)
                selbits = &s_selbits[0];
        else
                selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
        if (2 * ncpbytes <= sizeof s_heldbits) {
                bzero(s_heldbits, sizeof(s_heldbits));
                heldbits = &s_heldbits[0];
        } else
                heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO);

        /*
         * Assign pointers into the bit buffers and fetch the input bits.
         * Put the output buffers together so that they can be bzeroed
         * together.
         */
        sbp = selbits;
        hibits = heldbits + ncpbytes / sizeof *heldbits;
        hobits = heldbits;
#define getbits(name, x) \
        do {                                                            \
                if (uap->name == NULL)                                  \
                        ibits[x] = NULL;                                \
                else {                                                  \
                        ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;   \
                        obits[x] = sbp;                                 \
                        sbp += ncpbytes / sizeof *sbp;                  \
                        error = copyin(uap->name, ibits[x], ncpbytes);  \
                        if (error != 0)                                 \
                                goto done_noproclock;                   \
                        for (i = 0;                                     \
                             i < ncpbytes / sizeof ibits[x][0];         \
                             i++)                                       \
                                hibits[i] |= ibits[x][i];               \
                }                                                       \
        } while (0)
        getbits(in, 0);
        getbits(ou, 1);
        getbits(ex, 2);
#undef getbits
        if (nbufbytes != 0)
                bzero(selbits, nbufbytes / 2);

        if (uap->tv) {
                error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
                    sizeof (atv));
                if (error)
                        goto done_noproclock;
                if (itimerfix(&atv)) {
                        error = EINVAL;
                        goto done_noproclock;
                }
                getmicrouptime(&rtv);
                timevaladd(&atv, &rtv);
        } else {
                atv.tv_sec = 0;
                atv.tv_usec = 0;
        }
        selholddrop(td, hibits, hobits, uap->nd, 1);
        timo = 0;
        PROC_LOCK(td->td_proc);
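        /*
         * Scan/sleep loop.  TDF_SELECT is set before each scan so that
         * selwakeup() can see that this thread is between scans; if the
         * flag has been cleared, or nselcoll has advanced, an event may
         * have fired while we were not looking, and the descriptors
         * must be rescanned before committing to sleep.
         */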
retry:
        ncoll = nselcoll;
        mtx_lock_spin(&sched_lock);
        td->td_flags |= TDF_SELECT;
        mtx_unlock_spin(&sched_lock);
        PROC_UNLOCK(td->td_proc);
        error = selscan(td, ibits, obits, uap->nd);
        PROC_LOCK(td->td_proc);
        if (error || td->td_retval[0])
                goto done;
        if (atv.tv_sec || atv.tv_usec) {
                getmicrouptime(&rtv);
                if (timevalcmp(&rtv, &atv, >=)) {
                        /*
                         * An event of interest may have occurred while we
                         * were acquiring the process lock.  To avoid
                         * missing it, test TDF_SELECT and rescan the file
                         * descriptors if necessary.
                         */
                        mtx_lock_spin(&sched_lock);
                        if ((td->td_flags & TDF_SELECT) == 0 ||
                            nselcoll != ncoll) {
                                ncoll = nselcoll;
                                td->td_flags |= TDF_SELECT;
                                mtx_unlock_spin(&sched_lock);
                                PROC_UNLOCK(td->td_proc);
                                error = selscan(td, ibits, obits, uap->nd);
                                PROC_LOCK(td->td_proc);
                        } else
                                mtx_unlock_spin(&sched_lock);
                        goto done;
                }
                ttv = atv;
                timevalsub(&ttv, &rtv);
                timo = ttv.tv_sec > 24 * 60 * 60 ?
                    24 * 60 * 60 * hz : tvtohz(&ttv);
        }
        mtx_lock_spin(&sched_lock);
        td->td_flags &= ~TDF_SELECT;
        mtx_unlock_spin(&sched_lock);

        if (timo > 0)
                error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
        else
                error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);

        if (error == 0)
                goto retry;

done:
        mtx_lock_spin(&sched_lock);
        td->td_flags &= ~TDF_SELECT;
        mtx_unlock_spin(&sched_lock);
        PROC_UNLOCK(td->td_proc);
        selholddrop(td, hibits, hobits, uap->nd, 0);
done_noproclock:
        /* select is not restarted after signals... */
        if (error == ERESTART)
                error = EINTR;
        if (error == EWOULDBLOCK)
                error = 0;
#define putbits(name, x) \
        if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
                error = error2;
        if (error == 0) {
                int error2;

                putbits(in, 0);
                putbits(ou, 1);
                putbits(ex, 2);
#undef putbits
        }
        if (selbits != &s_selbits[0])
                free(selbits, M_SELECT);
        if (heldbits != &s_heldbits[0])
                free(heldbits, M_SELECT);

        mtx_unlock(&Giant);
        return (error);
}

static int
selholddrop(td, ibits, obits, nfd, hold)
        struct thread *td;
        fd_mask *ibits, *obits;
        int nfd, hold;
{
        struct filedesc *fdp = td->td_proc->p_fd;
        int i, fd;
        fd_mask bits;
        struct file *fp;

        for (i = 0; i < nfd; i += NFDBITS) {
                if (hold)
                        bits = ibits[i/NFDBITS];
                else
                        bits = obits[i/NFDBITS];
                /* ffs(int mask) not portable, fd_mask is long */
                for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
                        if (!(bits & 1))
                                continue;
                        fp = fdp->fd_ofiles[fd];
                        if (fp == NULL)
                                return (EBADF);
                        if (hold) {
                                fhold(fp);
                                obits[(fd)/NFDBITS] |=
                                    ((fd_mask)1 << ((fd) % NFDBITS));
                        } else
                                fdrop(fp, td);
                }
        }
        return (0);
}

static int
selscan(td, ibits, obits, nfd)
        struct thread *td;
        fd_mask **ibits, **obits;
        int nfd;
{
        struct filedesc *fdp = td->td_proc->p_fd;
        int msk, i, fd;
        fd_mask bits;
        struct file *fp;
        int n = 0;
        /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
        static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

        for (msk = 0; msk < 3; msk++) {
                if (ibits[msk] == NULL)
                        continue;
                for (i = 0; i < nfd; i += NFDBITS) {
                        bits = ibits[msk][i/NFDBITS];
                        /* ffs(int mask) not portable, fd_mask is long */
                        for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
                                if (!(bits & 1))
                                        continue;
                                fp = fdp->fd_ofiles[fd];
                                if (fp == NULL)
                                        return (EBADF);
                                if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
                                        obits[msk][(fd)/NFDBITS] |=
                                            ((fd_mask)1 << ((fd) % NFDBITS));
                                        n++;
                                }
                        }
                }
        }
        td->td_retval[0] = n;
        return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
        struct pollfd *fds;
        u_int   nfds;
        int     timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
        struct thread *td;
        struct poll_args *uap;
{
        caddr_t bits;
        char smallbits[32 * sizeof(struct pollfd)];
        struct timeval atv, rtv, ttv;
        int ncoll, error = 0, timo;
        u_int nfds;
        size_t ni;
        struct pollfd p_heldbits[32];
        struct pollfd *heldbits;

        nfds = SCARG(uap, nfds);

        mtx_lock(&Giant);
        /*
         * This is kinda bogus.  We have fd limits, but that is not
         * really related to the size of the pollfd array.  Make sure
         * we let the process use at least FD_SETSIZE entries and at
         * least enough for the current limits.  We want to be reasonably
         * safe, but not overly restrictive.
         */
        if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
            (nfds > FD_SETSIZE)) {
                error = EINVAL;
                goto done2;
        }
        ni = nfds * sizeof(struct pollfd);
        if (ni > sizeof(smallbits))
                bits = malloc(ni, M_TEMP, M_WAITOK);
        else
                bits = smallbits;
        if (ni > sizeof(p_heldbits))
                heldbits = malloc(ni, M_TEMP, M_WAITOK);
        else {
                bzero(p_heldbits, sizeof(p_heldbits));
                heldbits = p_heldbits;
        }
        error = copyin(SCARG(uap, fds), bits, ni);
        if (error)
                goto done_noproclock;
        bcopy(bits, heldbits, ni);
        if (SCARG(uap, timeout) != INFTIM) {
                atv.tv_sec = SCARG(uap, timeout) / 1000;
                atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
                if (itimerfix(&atv)) {
                        error = EINVAL;
                        goto done_noproclock;
                }
                getmicrouptime(&rtv);
                timevaladd(&atv, &rtv);
        } else {
                atv.tv_sec = 0;
                atv.tv_usec = 0;
        }
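        /*
         * Take a hold on every file referenced by the pollfd array so
         * that none of them can be closed out from under us while we
         * sleep.  pollholddrop() records which entries were held by
         * temporarily reusing the revents field as a marker.
         */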
        pollholddrop(td, heldbits, nfds, 1);
        timo = 0;
        PROC_LOCK(td->td_proc);
retry:
        ncoll = nselcoll;
        mtx_lock_spin(&sched_lock);
        td->td_flags |= TDF_SELECT;
        mtx_unlock_spin(&sched_lock);
        PROC_UNLOCK(td->td_proc);
        error = pollscan(td, (struct pollfd *)bits, nfds);
        PROC_LOCK(td->td_proc);
        if (error || td->td_retval[0])
                goto done;
        if (atv.tv_sec || atv.tv_usec) {
                getmicrouptime(&rtv);
                if (timevalcmp(&rtv, &atv, >=)) {
                        /*
                         * An event of interest may have occurred while we
                         * were acquiring the process lock.  To avoid
                         * missing it, test TDF_SELECT and rescan the file
                         * descriptors if necessary.
                         */
                        mtx_lock_spin(&sched_lock);
                        if ((td->td_flags & TDF_SELECT) == 0 ||
                            nselcoll != ncoll) {
                                ncoll = nselcoll;
                                td->td_flags |= TDF_SELECT;
                                mtx_unlock_spin(&sched_lock);
                                PROC_UNLOCK(td->td_proc);
                                error = pollscan(td, (struct pollfd *)bits,
                                    nfds);
                                PROC_LOCK(td->td_proc);
                        } else
                                mtx_unlock_spin(&sched_lock);
                        goto done;
                }
                ttv = atv;
                timevalsub(&ttv, &rtv);
                timo = ttv.tv_sec > 24 * 60 * 60 ?
                    24 * 60 * 60 * hz : tvtohz(&ttv);
        }
        mtx_lock_spin(&sched_lock);
        td->td_flags &= ~TDF_SELECT;
        mtx_unlock_spin(&sched_lock);
        if (timo > 0)
                error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
        else
                error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
        if (error == 0)
                goto retry;

done:
        mtx_lock_spin(&sched_lock);
        td->td_flags &= ~TDF_SELECT;
        mtx_unlock_spin(&sched_lock);
        PROC_UNLOCK(td->td_proc);
        pollholddrop(td, heldbits, nfds, 0);
done_noproclock:
        /* poll is not restarted after signals... */
        if (error == ERESTART)
                error = EINTR;
        if (error == EWOULDBLOCK)
                error = 0;
        if (error == 0)
                error = copyout(bits, SCARG(uap, fds), ni);
        if (ni > sizeof(smallbits))
                free(bits, M_TEMP);
        if (ni > sizeof(p_heldbits))
                free(heldbits, M_TEMP);
done2:
        mtx_unlock(&Giant);
        return (error);
}

static int
pollholddrop(td, fds, nfd, hold)
        struct thread *td;
        struct pollfd *fds;
        u_int nfd;
        int hold;
{
        register struct filedesc *fdp = td->td_proc->p_fd;
        int i;
        struct file *fp;

        for (i = 0; i < nfd; i++, fds++) {
                if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) {
                        fp = fdp->fd_ofiles[fds->fd];
                        if (hold) {
                                if (fp != NULL) {
                                        fhold(fp);
                                        fds->revents = 1;
                                } else
                                        fds->revents = 0;
                        } else if (fp != NULL && fds->revents)
                                fdrop(fp, td);
                }
        }
        return (0);
}

static int
pollscan(td, fds, nfd)
        struct thread *td;
        struct pollfd *fds;
        u_int nfd;
{
        register struct filedesc *fdp = td->td_proc->p_fd;
        int i;
        struct file *fp;
        int n = 0;

        for (i = 0; i < nfd; i++, fds++) {
                if (fds->fd >= fdp->fd_nfiles) {
                        fds->revents = POLLNVAL;
                        n++;
                } else if (fds->fd < 0) {
                        fds->revents = 0;
                } else {
                        fp = fdp->fd_ofiles[fds->fd];
                        if (fp == NULL) {
                                fds->revents = POLLNVAL;
                                n++;
                        } else {
                                /*
                                 * Note: backend also returns POLLHUP and
                                 * POLLERR if appropriate.
                                 */
                                fds->revents = fo_poll(fp, fds->events,
                                    fp->f_cred, td);
                                if (fds->revents != 0)
                                        n++;
                        }
                }
        }
        td->td_retval[0] = n;
        return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation.  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
        struct pollfd *fds;
        u_int   nfds;
        int     timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
        register struct thread *td;
        register struct openbsd_poll_args *uap;
{
        return (poll(td, (struct poll_args *)uap));
}
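/*
 * Generic poll backend for drivers whose devices are always ready:
 * report whichever of the normal read/write events the caller asked
 * about, and never block.
 */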
/* ARGSUSED */
int
seltrue(dev, events, td)
        dev_t dev;
        int events;
        struct thread *td;
{

        return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
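/*
 * A selinfo records only a pid and a bare thread pointer, with no
 * reference held, so the recorded thread may have exited by the time
 * the record is used.  Verify that the thread still belongs to the
 * process before dereferencing it.
 */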
static int
find_thread_in_proc(struct proc *p, struct thread *td)
{
        struct thread *td2;

        FOREACH_THREAD_IN_PROC(p, td2) {
                if (td2 == td) {
                        return (1);
                }
        }
        return (0);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
        struct thread *selector;
        struct selinfo *sip;
{
        struct proc *p;
        pid_t mypid;

        mypid = selector->td_proc->p_pid;
        if ((sip->si_pid == mypid) &&
            (sip->si_thread == selector)) {     /* XXXKSE should be an ID? */
                return;
        }
        if (sip->si_pid != 0 && (p = pfind(sip->si_pid)) != NULL) {
                if (find_thread_in_proc(p, sip->si_thread)) {
                        mtx_lock_spin(&sched_lock);
                        if (sip->si_thread->td_wchan == (caddr_t)&selwait) {
                                mtx_unlock_spin(&sched_lock);
                                PROC_UNLOCK(p);
                                sip->si_flags |= SI_COLL;
                                return;
                        }
                        mtx_unlock_spin(&sched_lock);
                }
                /* pfind() returned the process locked; release it. */
                PROC_UNLOCK(p);
        }
        sip->si_pid = mypid;
        sip->si_thread = selector;
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(sip)
        register struct selinfo *sip;
{
        struct thread *td;
        register struct proc *p;

        if (sip->si_pid == 0)
                return;
        if (sip->si_flags & SI_COLL) {
                nselcoll++;
                sip->si_flags &= ~SI_COLL;
                cv_broadcast(&selwait);
        }
        p = pfind(sip->si_pid);
        sip->si_pid = 0;
        td = sip->si_thread;
        if (p != NULL) {
                if (!find_thread_in_proc(p, td)) {
                        PROC_UNLOCK(p);         /* lock is in pfind() */
                        return;
                }
                mtx_lock_spin(&sched_lock);
                if (td->td_wchan == (caddr_t)&selwait) {
                        if (td->td_proc->p_stat == SSLEEP)
                                setrunnable(td);
                        else
                                cv_waitq_remove(td);
                } else
                        td->td_flags &= ~TDF_SELECT;
                mtx_unlock_spin(&sched_lock);
                PROC_UNLOCK(p);                 /* lock is in pfind() */
        }
}

static void selectinit __P((void *));
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED */
static void
selectinit(dummy)
        void *dummy;
{
        cv_init(&selwait, "select");
}