1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD$ 40 */ 41 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/sysproto.h> 47 #include <sys/filedesc.h> 48 #include <sys/filio.h> 49 #include <sys/fcntl.h> 50 #include <sys/file.h> 51 #include <sys/proc.h> 52 #include <sys/signalvar.h> 53 #include <sys/socketvar.h> 54 #include <sys/uio.h> 55 #include <sys/kernel.h> 56 #include <sys/malloc.h> 57 #include <sys/poll.h> 58 #include <sys/resourcevar.h> 59 #include <sys/selinfo.h> 60 #include <sys/sysctl.h> 61 #include <sys/sysent.h> 62 #include <sys/bio.h> 63 #include <sys/buf.h> 64 #include <sys/condvar.h> 65 #ifdef __alpha__ 66 #include <sys/disklabel.h> 67 #endif 68 #ifdef KTRACE 69 #include <sys/ktrace.h> 70 #endif 71 #include <vm/vm.h> 72 #include <vm/vm_page.h> 73 74 #include <machine/limits.h> 75 76 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 78 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 79 80 static int pollscan(struct thread *, struct pollfd *, u_int); 81 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 82 static int dofileread(struct thread *, struct file *, int, void *, 83 size_t, off_t, int); 84 static int dofilewrite(struct thread *, struct file *, int, 85 const void *, size_t, off_t, int); 86 87 /* 88 * Read system call. 
89 */ 90 #ifndef _SYS_SYSPROTO_H_ 91 struct read_args { 92 int fd; 93 void *buf; 94 size_t nbyte; 95 }; 96 #endif 97 /* 98 * MPSAFE 99 */ 100 int 101 read(td, uap) 102 struct thread *td; 103 struct read_args *uap; 104 { 105 struct file *fp; 106 int error; 107 108 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 109 error = dofileread(td, fp, uap->fd, uap->buf, 110 uap->nbyte, (off_t)-1, 0); 111 fdrop(fp, td); 112 } 113 return(error); 114 } 115 116 /* 117 * Pread system call 118 */ 119 #ifndef _SYS_SYSPROTO_H_ 120 struct pread_args { 121 int fd; 122 void *buf; 123 size_t nbyte; 124 int pad; 125 off_t offset; 126 }; 127 #endif 128 /* 129 * MPSAFE 130 */ 131 int 132 pread(td, uap) 133 struct thread *td; 134 struct pread_args *uap; 135 { 136 struct file *fp; 137 int error; 138 139 if ((error = fget_read(td, uap->fd, &fp)) != 0) 140 return (error); 141 if (fp->f_type != DTYPE_VNODE) { 142 error = ESPIPE; 143 } else { 144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 145 uap->offset, FOF_OFFSET); 146 } 147 fdrop(fp, td); 148 return(error); 149 } 150 151 /* 152 * Code common for read and pread 153 */ 154 int 155 dofileread(td, fp, fd, buf, nbyte, offset, flags) 156 struct thread *td; 157 struct file *fp; 158 int fd, flags; 159 void *buf; 160 size_t nbyte; 161 off_t offset; 162 { 163 struct uio auio; 164 struct iovec aiov; 165 long cnt, error = 0; 166 #ifdef KTRACE 167 struct iovec ktriov; 168 struct uio ktruio; 169 int didktr = 0; 170 #endif 171 172 aiov.iov_base = (caddr_t)buf; 173 aiov.iov_len = nbyte; 174 auio.uio_iov = &aiov; 175 auio.uio_iovcnt = 1; 176 auio.uio_offset = offset; 177 if (nbyte > INT_MAX) 178 return (EINVAL); 179 auio.uio_resid = nbyte; 180 auio.uio_rw = UIO_READ; 181 auio.uio_segflg = UIO_USERSPACE; 182 auio.uio_td = td; 183 #ifdef KTRACE 184 /* 185 * if tracing, save a copy of iovec 186 */ 187 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 188 ktriov = aiov; 189 ktruio = auio; 190 didktr = 1; 191 } 192 #endif 193 cnt = nbyte; 194 195 if ((error 
= fo_read(fp, &auio, fp->f_cred, flags, td))) { 196 if (auio.uio_resid != cnt && (error == ERESTART || 197 error == EINTR || error == EWOULDBLOCK)) 198 error = 0; 199 } 200 cnt -= auio.uio_resid; 201 #ifdef KTRACE 202 if (didktr && error == 0) { 203 ktruio.uio_iov = &ktriov; 204 ktruio.uio_resid = cnt; 205 ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error); 206 } 207 #endif 208 td->td_retval[0] = cnt; 209 return (error); 210 } 211 212 /* 213 * Scatter read system call. 214 */ 215 #ifndef _SYS_SYSPROTO_H_ 216 struct readv_args { 217 int fd; 218 struct iovec *iovp; 219 u_int iovcnt; 220 }; 221 #endif 222 /* 223 * MPSAFE 224 */ 225 int 226 readv(td, uap) 227 struct thread *td; 228 struct readv_args *uap; 229 { 230 struct file *fp; 231 struct uio auio; 232 struct iovec *iov; 233 struct iovec *needfree; 234 struct iovec aiov[UIO_SMALLIOV]; 235 long i, cnt, error = 0; 236 u_int iovlen; 237 #ifdef KTRACE 238 struct iovec *ktriov = NULL; 239 struct uio ktruio; 240 #endif 241 mtx_lock(&Giant); 242 243 if ((error = fget_read(td, uap->fd, &fp)) != 0) 244 goto done2; 245 /* note: can't use iovlen until iovcnt is validated */ 246 iovlen = uap->iovcnt * sizeof (struct iovec); 247 if (uap->iovcnt > UIO_SMALLIOV) { 248 if (uap->iovcnt > UIO_MAXIOV) { 249 error = EINVAL; 250 goto done2; 251 } 252 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 253 needfree = iov; 254 } else { 255 iov = aiov; 256 needfree = NULL; 257 } 258 auio.uio_iov = iov; 259 auio.uio_iovcnt = uap->iovcnt; 260 auio.uio_rw = UIO_READ; 261 auio.uio_segflg = UIO_USERSPACE; 262 auio.uio_td = td; 263 auio.uio_offset = -1; 264 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 265 goto done; 266 auio.uio_resid = 0; 267 for (i = 0; i < uap->iovcnt; i++) { 268 if (iov->iov_len > INT_MAX - auio.uio_resid) { 269 error = EINVAL; 270 goto done; 271 } 272 auio.uio_resid += iov->iov_len; 273 iov++; 274 } 275 #ifdef KTRACE 276 /* 277 * if tracing, save a copy of iovec 278 */ 279 if 
(KTRPOINT(td->td_proc, KTR_GENIO)) { 280 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 281 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 282 ktruio = auio; 283 } 284 #endif 285 cnt = auio.uio_resid; 286 if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) { 287 if (auio.uio_resid != cnt && (error == ERESTART || 288 error == EINTR || error == EWOULDBLOCK)) 289 error = 0; 290 } 291 cnt -= auio.uio_resid; 292 #ifdef KTRACE 293 if (ktriov != NULL) { 294 if (error == 0) { 295 ktruio.uio_iov = ktriov; 296 ktruio.uio_resid = cnt; 297 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio, 298 error); 299 } 300 FREE(ktriov, M_TEMP); 301 } 302 #endif 303 td->td_retval[0] = cnt; 304 done: 305 fdrop(fp, td); 306 if (needfree) 307 FREE(needfree, M_IOV); 308 done2: 309 mtx_unlock(&Giant); 310 return (error); 311 } 312 313 /* 314 * Write system call 315 */ 316 #ifndef _SYS_SYSPROTO_H_ 317 struct write_args { 318 int fd; 319 const void *buf; 320 size_t nbyte; 321 }; 322 #endif 323 /* 324 * MPSAFE 325 */ 326 int 327 write(td, uap) 328 struct thread *td; 329 struct write_args *uap; 330 { 331 struct file *fp; 332 int error; 333 334 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 335 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 336 (off_t)-1, 0); 337 fdrop(fp, td); 338 } else { 339 error = EBADF; /* XXX this can't be right */ 340 } 341 return(error); 342 } 343 344 /* 345 * Pwrite system call 346 */ 347 #ifndef _SYS_SYSPROTO_H_ 348 struct pwrite_args { 349 int fd; 350 const void *buf; 351 size_t nbyte; 352 int pad; 353 off_t offset; 354 }; 355 #endif 356 /* 357 * MPSAFE 358 */ 359 int 360 pwrite(td, uap) 361 struct thread *td; 362 struct pwrite_args *uap; 363 { 364 struct file *fp; 365 int error; 366 367 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 368 if (fp->f_type == DTYPE_VNODE) { 369 error = dofilewrite(td, fp, uap->fd, uap->buf, 370 uap->nbyte, uap->offset, FOF_OFFSET); 371 } else { 372 error = ESPIPE; 373 } 374 fdrop(fp, td); 375 
} else { 376 error = EBADF; /* this can't be right */ 377 } 378 return(error); 379 } 380 381 static int 382 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 383 struct thread *td; 384 struct file *fp; 385 int fd, flags; 386 const void *buf; 387 size_t nbyte; 388 off_t offset; 389 { 390 struct uio auio; 391 struct iovec aiov; 392 long cnt, error = 0; 393 #ifdef KTRACE 394 struct iovec ktriov; 395 struct uio ktruio; 396 int didktr = 0; 397 #endif 398 399 aiov.iov_base = (void *)(uintptr_t)buf; 400 aiov.iov_len = nbyte; 401 auio.uio_iov = &aiov; 402 auio.uio_iovcnt = 1; 403 auio.uio_offset = offset; 404 if (nbyte > INT_MAX) 405 return (EINVAL); 406 auio.uio_resid = nbyte; 407 auio.uio_rw = UIO_WRITE; 408 auio.uio_segflg = UIO_USERSPACE; 409 auio.uio_td = td; 410 #ifdef KTRACE 411 /* 412 * if tracing, save a copy of iovec and uio 413 */ 414 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 415 ktriov = aiov; 416 ktruio = auio; 417 didktr = 1; 418 } 419 #endif 420 cnt = nbyte; 421 if (fp->f_type == DTYPE_VNODE) 422 bwillwrite(); 423 if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) { 424 if (auio.uio_resid != cnt && (error == ERESTART || 425 error == EINTR || error == EWOULDBLOCK)) 426 error = 0; 427 if (error == EPIPE) { 428 PROC_LOCK(td->td_proc); 429 psignal(td->td_proc, SIGPIPE); 430 PROC_UNLOCK(td->td_proc); 431 } 432 } 433 cnt -= auio.uio_resid; 434 #ifdef KTRACE 435 if (didktr && error == 0) { 436 ktruio.uio_iov = &ktriov; 437 ktruio.uio_resid = cnt; 438 ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error); 439 } 440 #endif 441 td->td_retval[0] = cnt; 442 return (error); 443 } 444 445 /* 446 * Gather write system call 447 */ 448 #ifndef _SYS_SYSPROTO_H_ 449 struct writev_args { 450 int fd; 451 struct iovec *iovp; 452 u_int iovcnt; 453 }; 454 #endif 455 /* 456 * MPSAFE 457 */ 458 int 459 writev(td, uap) 460 struct thread *td; 461 register struct writev_args *uap; 462 { 463 struct file *fp; 464 struct uio auio; 465 register struct iovec *iov; 466 struct 
iovec *needfree; 467 struct iovec aiov[UIO_SMALLIOV]; 468 long i, cnt, error = 0; 469 u_int iovlen; 470 #ifdef KTRACE 471 struct iovec *ktriov = NULL; 472 struct uio ktruio; 473 #endif 474 475 mtx_lock(&Giant); 476 if ((error = fget_write(td, uap->fd, &fp)) != 0) { 477 error = EBADF; 478 goto done2; 479 } 480 /* note: can't use iovlen until iovcnt is validated */ 481 iovlen = uap->iovcnt * sizeof (struct iovec); 482 if (uap->iovcnt > UIO_SMALLIOV) { 483 if (uap->iovcnt > UIO_MAXIOV) { 484 needfree = NULL; 485 error = EINVAL; 486 goto done; 487 } 488 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 489 needfree = iov; 490 } else { 491 iov = aiov; 492 needfree = NULL; 493 } 494 auio.uio_iov = iov; 495 auio.uio_iovcnt = uap->iovcnt; 496 auio.uio_rw = UIO_WRITE; 497 auio.uio_segflg = UIO_USERSPACE; 498 auio.uio_td = td; 499 auio.uio_offset = -1; 500 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 501 goto done; 502 auio.uio_resid = 0; 503 for (i = 0; i < uap->iovcnt; i++) { 504 if (iov->iov_len > INT_MAX - auio.uio_resid) { 505 error = EINVAL; 506 goto done; 507 } 508 auio.uio_resid += iov->iov_len; 509 iov++; 510 } 511 #ifdef KTRACE 512 /* 513 * if tracing, save a copy of iovec and uio 514 */ 515 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 516 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 517 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 518 ktruio = auio; 519 } 520 #endif 521 cnt = auio.uio_resid; 522 if (fp->f_type == DTYPE_VNODE) 523 bwillwrite(); 524 if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) { 525 if (auio.uio_resid != cnt && (error == ERESTART || 526 error == EINTR || error == EWOULDBLOCK)) 527 error = 0; 528 if (error == EPIPE) { 529 PROC_LOCK(td->td_proc); 530 psignal(td->td_proc, SIGPIPE); 531 PROC_UNLOCK(td->td_proc); 532 } 533 } 534 cnt -= auio.uio_resid; 535 #ifdef KTRACE 536 if (ktriov != NULL) { 537 if (error == 0) { 538 ktruio.uio_iov = ktriov; 539 ktruio.uio_resid = cnt; 540 
ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio, 541 error); 542 } 543 FREE(ktriov, M_TEMP); 544 } 545 #endif 546 td->td_retval[0] = cnt; 547 done: 548 fdrop(fp, td); 549 if (needfree) 550 FREE(needfree, M_IOV); 551 done2: 552 mtx_unlock(&Giant); 553 return (error); 554 } 555 556 /* 557 * Ioctl system call 558 */ 559 #ifndef _SYS_SYSPROTO_H_ 560 struct ioctl_args { 561 int fd; 562 u_long com; 563 caddr_t data; 564 }; 565 #endif 566 /* 567 * MPSAFE 568 */ 569 /* ARGSUSED */ 570 int 571 ioctl(td, uap) 572 struct thread *td; 573 register struct ioctl_args *uap; 574 { 575 struct file *fp; 576 register struct filedesc *fdp; 577 register u_long com; 578 int error = 0; 579 register u_int size; 580 caddr_t data, memp; 581 int tmp; 582 #define STK_PARAMS 128 583 union { 584 char stkbuf[STK_PARAMS]; 585 long align; 586 } ubuf; 587 588 if ((error = fget(td, uap->fd, &fp)) != 0) 589 return (error); 590 mtx_lock(&Giant); 591 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 592 fdrop(fp, td); 593 mtx_unlock(&Giant); 594 return (EBADF); 595 } 596 fdp = td->td_proc->p_fd; 597 switch (com = uap->com) { 598 case FIONCLEX: 599 FILEDESC_LOCK(fdp); 600 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 601 FILEDESC_UNLOCK(fdp); 602 fdrop(fp, td); 603 mtx_unlock(&Giant); 604 return (0); 605 case FIOCLEX: 606 FILEDESC_LOCK(fdp); 607 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 608 FILEDESC_UNLOCK(fdp); 609 fdrop(fp, td); 610 mtx_unlock(&Giant); 611 return (0); 612 } 613 614 /* 615 * Interpret high order word to find amount of data to be 616 * copied to/from the user's address space. 
617 */ 618 size = IOCPARM_LEN(com); 619 if (size > IOCPARM_MAX) { 620 fdrop(fp, td); 621 mtx_unlock(&Giant); 622 return (ENOTTY); 623 } 624 625 memp = NULL; 626 if (size > sizeof (ubuf.stkbuf)) { 627 memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 628 data = memp; 629 } else { 630 data = ubuf.stkbuf; 631 } 632 if (com&IOC_IN) { 633 if (size) { 634 error = copyin(uap->data, data, (u_int)size); 635 if (error) { 636 if (memp) 637 free(memp, M_IOCTLOPS); 638 fdrop(fp, td); 639 goto done; 640 } 641 } else { 642 *(caddr_t *)data = uap->data; 643 } 644 } else if ((com&IOC_OUT) && size) { 645 /* 646 * Zero the buffer so the user always 647 * gets back something deterministic. 648 */ 649 bzero(data, size); 650 } else if (com&IOC_VOID) { 651 *(caddr_t *)data = uap->data; 652 } 653 654 #ifdef __alpha__ 655 { 656 int annoy = 1; 657 658 if (com == DIOCGDINFO_ALPHAHACK) 659 com = DIOCGDINFO; 660 else if (com == DIOCSDINFO_ALPHAHACK) 661 com = DIOCSDINFO; 662 else if (com == DIOCWDINFO_ALPHAHACK) 663 com = DIOCWDINFO; 664 else if (com == DIOCGDVIRGIN_ALPHAHACK) 665 com = DIOCGDVIRGIN; 666 else 667 annoy = 0; 668 if (annoy) { 669 uprintf("Recompile this program, it uses obsolete ioctls.\n"); 670 printf("Program using uses obsolete ioctls used, recompile.\n"); 671 tsleep(&annoy, PPAUSE, "syncer", 15 * hz); 672 } 673 } 674 #endif 675 676 switch (com) { 677 678 case FIONBIO: 679 FILE_LOCK(fp); 680 if ((tmp = *(int *)data)) 681 fp->f_flag |= FNONBLOCK; 682 else 683 fp->f_flag &= ~FNONBLOCK; 684 FILE_UNLOCK(fp); 685 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); 686 break; 687 688 case FIOASYNC: 689 FILE_LOCK(fp); 690 if ((tmp = *(int *)data)) 691 fp->f_flag |= FASYNC; 692 else 693 fp->f_flag &= ~FASYNC; 694 FILE_UNLOCK(fp); 695 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); 696 break; 697 698 default: 699 error = fo_ioctl(fp, com, data, td); 700 /* 701 * Copy any data to user, size was 702 * already set and checked above. 
703 */ 704 if (error == 0 && (com&IOC_OUT) && size) 705 error = copyout(data, uap->data, (u_int)size); 706 break; 707 } 708 if (memp) 709 free(memp, M_IOCTLOPS); 710 fdrop(fp, td); 711 done: 712 mtx_unlock(&Giant); 713 return (error); 714 } 715 716 /* 717 * sellock and selwait are initialized in selectinit() via SYSINIT. 718 */ 719 struct mtx sellock; 720 struct cv selwait; 721 int nselcoll; /* Select collisions since boot */ 722 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 723 724 /* 725 * Select system call. 726 */ 727 #ifndef _SYS_SYSPROTO_H_ 728 struct select_args { 729 int nd; 730 fd_set *in, *ou, *ex; 731 struct timeval *tv; 732 }; 733 #endif 734 /* 735 * MPSAFE 736 */ 737 int 738 select(td, uap) 739 register struct thread *td; 740 register struct select_args *uap; 741 { 742 struct filedesc *fdp; 743 /* 744 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 745 * infds with the new FD_SETSIZE of 1024, and more than enough for 746 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 747 * of 256. 748 */ 749 fd_mask s_selbits[howmany(2048, NFDBITS)]; 750 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 751 struct timeval atv, rtv, ttv; 752 int ncoll, error, timo; 753 u_int nbufbytes, ncpbytes, nfdbits; 754 755 if (uap->nd < 0) 756 return (EINVAL); 757 fdp = td->td_proc->p_fd; 758 mtx_lock(&Giant); 759 FILEDESC_LOCK(fdp); 760 761 if (uap->nd > td->td_proc->p_fd->fd_nfiles) 762 uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 763 FILEDESC_UNLOCK(fdp); 764 765 /* 766 * Allocate just enough bits for the non-null fd_sets. Use the 767 * preallocated auto buffer if possible. 
768 */ 769 nfdbits = roundup(uap->nd, NFDBITS); 770 ncpbytes = nfdbits / NBBY; 771 nbufbytes = 0; 772 if (uap->in != NULL) 773 nbufbytes += 2 * ncpbytes; 774 if (uap->ou != NULL) 775 nbufbytes += 2 * ncpbytes; 776 if (uap->ex != NULL) 777 nbufbytes += 2 * ncpbytes; 778 if (nbufbytes <= sizeof s_selbits) 779 selbits = &s_selbits[0]; 780 else 781 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 782 783 /* 784 * Assign pointers into the bit buffers and fetch the input bits. 785 * Put the output buffers together so that they can be bzeroed 786 * together. 787 */ 788 sbp = selbits; 789 #define getbits(name, x) \ 790 do { \ 791 if (uap->name == NULL) \ 792 ibits[x] = NULL; \ 793 else { \ 794 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 795 obits[x] = sbp; \ 796 sbp += ncpbytes / sizeof *sbp; \ 797 error = copyin(uap->name, ibits[x], ncpbytes); \ 798 if (error != 0) \ 799 goto done_nosellock; \ 800 } \ 801 } while (0) 802 getbits(in, 0); 803 getbits(ou, 1); 804 getbits(ex, 2); 805 #undef getbits 806 if (nbufbytes != 0) 807 bzero(selbits, nbufbytes / 2); 808 809 if (uap->tv) { 810 error = copyin((caddr_t)uap->tv, (caddr_t)&atv, 811 sizeof (atv)); 812 if (error) 813 goto done_nosellock; 814 if (itimerfix(&atv)) { 815 error = EINVAL; 816 goto done_nosellock; 817 } 818 getmicrouptime(&rtv); 819 timevaladd(&atv, &rtv); 820 } else { 821 atv.tv_sec = 0; 822 atv.tv_usec = 0; 823 } 824 timo = 0; 825 mtx_lock(&sellock); 826 retry: 827 ncoll = nselcoll; 828 mtx_lock_spin(&sched_lock); 829 td->td_flags |= TDF_SELECT; 830 mtx_unlock_spin(&sched_lock); 831 mtx_unlock(&sellock); 832 833 /* XXX Is there a better place for this? */ 834 TAILQ_INIT(&td->td_selq); 835 error = selscan(td, ibits, obits, uap->nd); 836 mtx_lock(&sellock); 837 if (error || td->td_retval[0]) 838 goto done; 839 if (atv.tv_sec || atv.tv_usec) { 840 getmicrouptime(&rtv); 841 if (timevalcmp(&rtv, &atv, >=)) 842 goto done; 843 ttv = atv; 844 timevalsub(&ttv, &rtv); 845 timo = ttv.tv_sec > 24 * 60 * 60 ? 
846 24 * 60 * 60 * hz : tvtohz(&ttv); 847 } 848 849 /* 850 * An event of interest may occur while we do not hold 851 * sellock, so check TDF_SELECT and the number of 852 * collisions and rescan the file descriptors if 853 * necessary. 854 */ 855 mtx_lock_spin(&sched_lock); 856 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 857 mtx_unlock_spin(&sched_lock); 858 goto retry; 859 } 860 mtx_unlock_spin(&sched_lock); 861 862 if (timo > 0) 863 error = cv_timedwait_sig(&selwait, &sellock, timo); 864 else 865 error = cv_wait_sig(&selwait, &sellock); 866 867 if (error == 0) 868 goto retry; 869 870 done: 871 clear_selinfo_list(td); 872 mtx_lock_spin(&sched_lock); 873 td->td_flags &= ~TDF_SELECT; 874 mtx_unlock_spin(&sched_lock); 875 mtx_unlock(&sellock); 876 877 done_nosellock: 878 /* select is not restarted after signals... */ 879 if (error == ERESTART) 880 error = EINTR; 881 if (error == EWOULDBLOCK) 882 error = 0; 883 #define putbits(name, x) \ 884 if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \ 885 error = error2; 886 if (error == 0) { 887 int error2; 888 889 putbits(in, 0); 890 putbits(ou, 1); 891 putbits(ex, 2); 892 #undef putbits 893 } 894 if (selbits != &s_selbits[0]) 895 free(selbits, M_SELECT); 896 897 mtx_unlock(&Giant); 898 return (error); 899 } 900 901 static int 902 selscan(td, ibits, obits, nfd) 903 struct thread *td; 904 fd_mask **ibits, **obits; 905 int nfd; 906 { 907 int msk, i, fd; 908 fd_mask bits; 909 struct file *fp; 910 int n = 0; 911 /* Note: backend also returns POLLHUP/POLLERR if appropriate. 
 */
	/* Maps the in/out/except set index to the poll events scanned. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	/* Stack buffer avoids malloc for small (<= 32 entry) arrays. */
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_nosellock;
	if (SCARG(uap, timeout) != INFTIM) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* Zero atv means wait indefinitely. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* XXX Is there a better place for this? */
	TAILQ_INIT(&td->td_selq);
	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the cv timeout to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy revents back out to the user's pollfd array. */
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Scan the pollfd array, filling in revents for each entry and
 * returning the number of entries with events in td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			/* Out-of-range descriptor: flag it, count it. */
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fd entries are ignored per poll(2). */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	/* Argument layout matches; delegate directly to poll(). */
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Poll backend for devices that are always ready for the basic
 * read/write events.
 */
/*ARGSUSED*/
int
seltrue(dev, events, td)
	dev_t dev;
	int events;
	struct thread *td;
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the thread is NULL then take ownership of selinfo
	 * however if the thread is not NULL and the thread points to
	 * someone else, then we have a collision, otherwise leave it alone
	 * as we've owned it in a previous selrecord on this selinfo.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* selwakeup() will broadcast to all waiters on collision. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/* Collision: multiple threads selected on this object. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread's list. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == (caddr_t)&selwait) {
		/* Thread is blocked in the select/poll cv wait; wake it. */
		if (td->td_proc->p_stat == SSLEEP)
			setrunnable(td);
		else
			cv_waitq_remove(td);
	} else
		/* Not asleep yet: clearing TDF_SELECT forces a rescan. */
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * Initialize the select/poll condition variable and lock at boot.
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}