1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD$ 40 */ 41 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/sysproto.h> 47 #include <sys/filedesc.h> 48 #include <sys/filio.h> 49 #include <sys/fcntl.h> 50 #include <sys/file.h> 51 #include <sys/proc.h> 52 #include <sys/signalvar.h> 53 #include <sys/socketvar.h> 54 #include <sys/uio.h> 55 #include <sys/kernel.h> 56 #include <sys/malloc.h> 57 #include <sys/poll.h> 58 #include <sys/resourcevar.h> 59 #include <sys/selinfo.h> 60 #include <sys/sysctl.h> 61 #include <sys/sysent.h> 62 #include <sys/bio.h> 63 #include <sys/buf.h> 64 #include <sys/condvar.h> 65 #ifdef KTRACE 66 #include <sys/ktrace.h> 67 #endif 68 #include <vm/vm.h> 69 #include <vm/vm_page.h> 70 71 #include <machine/limits.h> 72 73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77 static int pollscan __P((struct proc *, struct pollfd *, u_int)); 78 static int pollholddrop __P((struct proc *, struct pollfd *, u_int, int)); 79 static int selscan __P((struct proc *, fd_mask **, fd_mask **, int)); 80 static int selholddrop __P((struct proc *, fd_mask *, fd_mask *, int, int)); 81 static int dofileread __P((struct proc *, struct file *, int, void *, 82 size_t, off_t, int)); 83 static int dofilewrite __P((struct proc *, struct file *, int, 
84 const void *, size_t, off_t, int)); 85 86 struct file* 87 holdfp(fdp, fd, flag) 88 struct filedesc* fdp; 89 int fd, flag; 90 { 91 struct file* fp; 92 93 if (((u_int)fd) >= fdp->fd_nfiles || 94 (fp = fdp->fd_ofiles[fd]) == NULL || 95 (fp->f_flag & flag) == 0) { 96 return (NULL); 97 } 98 fhold(fp); 99 return (fp); 100 } 101 102 /* 103 * Read system call. 104 */ 105 #ifndef _SYS_SYSPROTO_H_ 106 struct read_args { 107 int fd; 108 void *buf; 109 size_t nbyte; 110 }; 111 #endif 112 int 113 read(p, uap) 114 struct proc *p; 115 register struct read_args *uap; 116 { 117 register struct file *fp; 118 int error; 119 120 if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL) 121 return (EBADF); 122 error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0); 123 fdrop(fp, p); 124 return(error); 125 } 126 127 /* 128 * Pread system call 129 */ 130 #ifndef _SYS_SYSPROTO_H_ 131 struct pread_args { 132 int fd; 133 void *buf; 134 size_t nbyte; 135 int pad; 136 off_t offset; 137 }; 138 #endif 139 int 140 pread(p, uap) 141 struct proc *p; 142 register struct pread_args *uap; 143 { 144 register struct file *fp; 145 int error; 146 147 if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL) 148 return (EBADF); 149 if (fp->f_type != DTYPE_VNODE) { 150 error = ESPIPE; 151 } else { 152 error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, 153 uap->offset, FOF_OFFSET); 154 } 155 fdrop(fp, p); 156 return(error); 157 } 158 159 /* 160 * Code common for read and pread 161 */ 162 int 163 dofileread(p, fp, fd, buf, nbyte, offset, flags) 164 struct proc *p; 165 struct file *fp; 166 int fd, flags; 167 void *buf; 168 size_t nbyte; 169 off_t offset; 170 { 171 struct uio auio; 172 struct iovec aiov; 173 long cnt, error = 0; 174 #ifdef KTRACE 175 struct iovec ktriov; 176 struct uio ktruio; 177 int didktr = 0; 178 #endif 179 180 aiov.iov_base = (caddr_t)buf; 181 aiov.iov_len = nbyte; 182 auio.uio_iov = &aiov; 183 auio.uio_iovcnt = 1; 184 auio.uio_offset = offset; 185 if (nbyte > INT_MAX) 186 
return (EINVAL); 187 auio.uio_resid = nbyte; 188 auio.uio_rw = UIO_READ; 189 auio.uio_segflg = UIO_USERSPACE; 190 auio.uio_procp = p; 191 #ifdef KTRACE 192 /* 193 * if tracing, save a copy of iovec 194 */ 195 if (KTRPOINT(p, KTR_GENIO)) { 196 ktriov = aiov; 197 ktruio = auio; 198 didktr = 1; 199 } 200 #endif 201 cnt = nbyte; 202 203 if ((error = fo_read(fp, &auio, fp->f_cred, flags, p))) { 204 if (auio.uio_resid != cnt && (error == ERESTART || 205 error == EINTR || error == EWOULDBLOCK)) 206 error = 0; 207 } 208 cnt -= auio.uio_resid; 209 #ifdef KTRACE 210 if (didktr && error == 0) { 211 ktruio.uio_iov = &ktriov; 212 ktruio.uio_resid = cnt; 213 ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error); 214 } 215 #endif 216 p->p_retval[0] = cnt; 217 return (error); 218 } 219 220 /* 221 * Scatter read system call. 222 */ 223 #ifndef _SYS_SYSPROTO_H_ 224 struct readv_args { 225 int fd; 226 struct iovec *iovp; 227 u_int iovcnt; 228 }; 229 #endif 230 int 231 readv(p, uap) 232 struct proc *p; 233 register struct readv_args *uap; 234 { 235 register struct file *fp; 236 register struct filedesc *fdp = p->p_fd; 237 struct uio auio; 238 register struct iovec *iov; 239 struct iovec *needfree; 240 struct iovec aiov[UIO_SMALLIOV]; 241 long i, cnt, error = 0; 242 u_int iovlen; 243 #ifdef KTRACE 244 struct iovec *ktriov = NULL; 245 struct uio ktruio; 246 #endif 247 248 if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL) 249 return (EBADF); 250 /* note: can't use iovlen until iovcnt is validated */ 251 iovlen = uap->iovcnt * sizeof (struct iovec); 252 if (uap->iovcnt > UIO_SMALLIOV) { 253 if (uap->iovcnt > UIO_MAXIOV) 254 return (EINVAL); 255 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 256 needfree = iov; 257 } else { 258 iov = aiov; 259 needfree = NULL; 260 } 261 auio.uio_iov = iov; 262 auio.uio_iovcnt = uap->iovcnt; 263 auio.uio_rw = UIO_READ; 264 auio.uio_segflg = UIO_USERSPACE; 265 auio.uio_procp = p; 266 auio.uio_offset = -1; 267 if ((error = copyin((caddr_t)uap->iovp, 
(caddr_t)iov, iovlen))) 268 goto done; 269 auio.uio_resid = 0; 270 for (i = 0; i < uap->iovcnt; i++) { 271 if (iov->iov_len > INT_MAX - auio.uio_resid) { 272 error = EINVAL; 273 goto done; 274 } 275 auio.uio_resid += iov->iov_len; 276 iov++; 277 } 278 #ifdef KTRACE 279 /* 280 * if tracing, save a copy of iovec 281 */ 282 if (KTRPOINT(p, KTR_GENIO)) { 283 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 284 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 285 ktruio = auio; 286 } 287 #endif 288 cnt = auio.uio_resid; 289 if ((error = fo_read(fp, &auio, fp->f_cred, 0, p))) { 290 if (auio.uio_resid != cnt && (error == ERESTART || 291 error == EINTR || error == EWOULDBLOCK)) 292 error = 0; 293 } 294 cnt -= auio.uio_resid; 295 #ifdef KTRACE 296 if (ktriov != NULL) { 297 if (error == 0) { 298 ktruio.uio_iov = ktriov; 299 ktruio.uio_resid = cnt; 300 ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktruio, 301 error); 302 } 303 FREE(ktriov, M_TEMP); 304 } 305 #endif 306 p->p_retval[0] = cnt; 307 done: 308 fdrop(fp, p); 309 if (needfree) 310 FREE(needfree, M_IOV); 311 return (error); 312 } 313 314 /* 315 * Write system call 316 */ 317 #ifndef _SYS_SYSPROTO_H_ 318 struct write_args { 319 int fd; 320 const void *buf; 321 size_t nbyte; 322 }; 323 #endif 324 int 325 write(p, uap) 326 struct proc *p; 327 register struct write_args *uap; 328 { 329 register struct file *fp; 330 int error; 331 332 if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL) 333 return (EBADF); 334 error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0); 335 fdrop(fp, p); 336 return(error); 337 } 338 339 /* 340 * Pwrite system call 341 */ 342 #ifndef _SYS_SYSPROTO_H_ 343 struct pwrite_args { 344 int fd; 345 const void *buf; 346 size_t nbyte; 347 int pad; 348 off_t offset; 349 }; 350 #endif 351 int 352 pwrite(p, uap) 353 struct proc *p; 354 register struct pwrite_args *uap; 355 { 356 register struct file *fp; 357 int error; 358 359 if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL) 
360 return (EBADF); 361 if (fp->f_type != DTYPE_VNODE) { 362 error = ESPIPE; 363 } else { 364 error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, 365 uap->offset, FOF_OFFSET); 366 } 367 fdrop(fp, p); 368 return(error); 369 } 370 371 static int 372 dofilewrite(p, fp, fd, buf, nbyte, offset, flags) 373 struct proc *p; 374 struct file *fp; 375 int fd, flags; 376 const void *buf; 377 size_t nbyte; 378 off_t offset; 379 { 380 struct uio auio; 381 struct iovec aiov; 382 long cnt, error = 0; 383 #ifdef KTRACE 384 struct iovec ktriov; 385 struct uio ktruio; 386 int didktr = 0; 387 #endif 388 389 aiov.iov_base = (void *)(uintptr_t)buf; 390 aiov.iov_len = nbyte; 391 auio.uio_iov = &aiov; 392 auio.uio_iovcnt = 1; 393 auio.uio_offset = offset; 394 if (nbyte > INT_MAX) 395 return (EINVAL); 396 auio.uio_resid = nbyte; 397 auio.uio_rw = UIO_WRITE; 398 auio.uio_segflg = UIO_USERSPACE; 399 auio.uio_procp = p; 400 #ifdef KTRACE 401 /* 402 * if tracing, save a copy of iovec and uio 403 */ 404 if (KTRPOINT(p, KTR_GENIO)) { 405 ktriov = aiov; 406 ktruio = auio; 407 didktr = 1; 408 } 409 #endif 410 cnt = nbyte; 411 if (fp->f_type == DTYPE_VNODE) 412 bwillwrite(); 413 if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) { 414 if (auio.uio_resid != cnt && (error == ERESTART || 415 error == EINTR || error == EWOULDBLOCK)) 416 error = 0; 417 if (error == EPIPE) { 418 PROC_LOCK(p); 419 psignal(p, SIGPIPE); 420 PROC_UNLOCK(p); 421 } 422 } 423 cnt -= auio.uio_resid; 424 #ifdef KTRACE 425 if (didktr && error == 0) { 426 ktruio.uio_iov = &ktriov; 427 ktruio.uio_resid = cnt; 428 ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error); 429 } 430 #endif 431 p->p_retval[0] = cnt; 432 return (error); 433 } 434 435 /* 436 * Gather write system call 437 */ 438 #ifndef _SYS_SYSPROTO_H_ 439 struct writev_args { 440 int fd; 441 struct iovec *iovp; 442 u_int iovcnt; 443 }; 444 #endif 445 int 446 writev(p, uap) 447 struct proc *p; 448 register struct writev_args *uap; 449 { 450 register struct file 
*fp; 451 register struct filedesc *fdp = p->p_fd; 452 struct uio auio; 453 register struct iovec *iov; 454 struct iovec *needfree; 455 struct iovec aiov[UIO_SMALLIOV]; 456 long i, cnt, error = 0; 457 u_int iovlen; 458 #ifdef KTRACE 459 struct iovec *ktriov = NULL; 460 struct uio ktruio; 461 #endif 462 463 if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL) 464 return (EBADF); 465 /* note: can't use iovlen until iovcnt is validated */ 466 iovlen = uap->iovcnt * sizeof (struct iovec); 467 if (uap->iovcnt > UIO_SMALLIOV) { 468 if (uap->iovcnt > UIO_MAXIOV) { 469 needfree = NULL; 470 error = EINVAL; 471 goto done; 472 } 473 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 474 needfree = iov; 475 } else { 476 iov = aiov; 477 needfree = NULL; 478 } 479 auio.uio_iov = iov; 480 auio.uio_iovcnt = uap->iovcnt; 481 auio.uio_rw = UIO_WRITE; 482 auio.uio_segflg = UIO_USERSPACE; 483 auio.uio_procp = p; 484 auio.uio_offset = -1; 485 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 486 goto done; 487 auio.uio_resid = 0; 488 for (i = 0; i < uap->iovcnt; i++) { 489 if (iov->iov_len > INT_MAX - auio.uio_resid) { 490 error = EINVAL; 491 goto done; 492 } 493 auio.uio_resid += iov->iov_len; 494 iov++; 495 } 496 #ifdef KTRACE 497 /* 498 * if tracing, save a copy of iovec and uio 499 */ 500 if (KTRPOINT(p, KTR_GENIO)) { 501 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 502 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 503 ktruio = auio; 504 } 505 #endif 506 cnt = auio.uio_resid; 507 if (fp->f_type == DTYPE_VNODE) 508 bwillwrite(); 509 if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) { 510 if (auio.uio_resid != cnt && (error == ERESTART || 511 error == EINTR || error == EWOULDBLOCK)) 512 error = 0; 513 if (error == EPIPE) { 514 PROC_LOCK(p); 515 psignal(p, SIGPIPE); 516 PROC_UNLOCK(p); 517 } 518 } 519 cnt -= auio.uio_resid; 520 #ifdef KTRACE 521 if (ktriov != NULL) { 522 if (error == 0) { 523 ktruio.uio_iov = ktriov; 524 ktruio.uio_resid = cnt; 525 
ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktruio, 526 error); 527 } 528 FREE(ktriov, M_TEMP); 529 } 530 #endif 531 p->p_retval[0] = cnt; 532 done: 533 fdrop(fp, p); 534 if (needfree) 535 FREE(needfree, M_IOV); 536 return (error); 537 } 538 539 /* 540 * Ioctl system call 541 */ 542 #ifndef _SYS_SYSPROTO_H_ 543 struct ioctl_args { 544 int fd; 545 u_long com; 546 caddr_t data; 547 }; 548 #endif 549 /* ARGSUSED */ 550 int 551 ioctl(p, uap) 552 struct proc *p; 553 register struct ioctl_args *uap; 554 { 555 register struct file *fp; 556 register struct filedesc *fdp; 557 register u_long com; 558 int error; 559 register u_int size; 560 caddr_t data, memp; 561 int tmp; 562 #define STK_PARAMS 128 563 union { 564 char stkbuf[STK_PARAMS]; 565 long align; 566 } ubuf; 567 568 fdp = p->p_fd; 569 if ((u_int)uap->fd >= fdp->fd_nfiles || 570 (fp = fdp->fd_ofiles[uap->fd]) == NULL) 571 return (EBADF); 572 573 if ((fp->f_flag & (FREAD | FWRITE)) == 0) 574 return (EBADF); 575 576 switch (com = uap->com) { 577 case FIONCLEX: 578 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 579 return (0); 580 case FIOCLEX: 581 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 582 return (0); 583 } 584 585 /* 586 * Interpret high order word to find amount of data to be 587 * copied to/from the user's address space. 588 */ 589 size = IOCPARM_LEN(com); 590 if (size > IOCPARM_MAX) 591 return (ENOTTY); 592 593 fhold(fp); 594 595 memp = NULL; 596 if (size > sizeof (ubuf.stkbuf)) { 597 memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 598 data = memp; 599 } else { 600 data = ubuf.stkbuf; 601 } 602 if (com&IOC_IN) { 603 if (size) { 604 error = copyin(uap->data, data, (u_int)size); 605 if (error) { 606 if (memp) 607 free(memp, M_IOCTLOPS); 608 fdrop(fp, p); 609 return (error); 610 } 611 } else { 612 *(caddr_t *)data = uap->data; 613 } 614 } else if ((com&IOC_OUT) && size) { 615 /* 616 * Zero the buffer so the user always 617 * gets back something deterministic. 
618 */ 619 bzero(data, size); 620 } else if (com&IOC_VOID) { 621 *(caddr_t *)data = uap->data; 622 } 623 624 switch (com) { 625 626 case FIONBIO: 627 if ((tmp = *(int *)data)) 628 fp->f_flag |= FNONBLOCK; 629 else 630 fp->f_flag &= ~FNONBLOCK; 631 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p); 632 break; 633 634 case FIOASYNC: 635 if ((tmp = *(int *)data)) 636 fp->f_flag |= FASYNC; 637 else 638 fp->f_flag &= ~FASYNC; 639 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p); 640 break; 641 642 default: 643 error = fo_ioctl(fp, com, data, p); 644 /* 645 * Copy any data to user, size was 646 * already set and checked above. 647 */ 648 if (error == 0 && (com&IOC_OUT) && size) 649 error = copyout(data, uap->data, (u_int)size); 650 break; 651 } 652 if (memp) 653 free(memp, M_IOCTLOPS); 654 fdrop(fp, p); 655 return (error); 656 } 657 658 static int nselcoll; /* Select collisions since boot */ 659 struct cv selwait; 660 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 661 662 /* 663 * Select system call. 664 */ 665 #ifndef _SYS_SYSPROTO_H_ 666 struct select_args { 667 int nd; 668 fd_set *in, *ou, *ex; 669 struct timeval *tv; 670 }; 671 #endif 672 int 673 select(p, uap) 674 register struct proc *p; 675 register struct select_args *uap; 676 { 677 /* 678 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 679 * infds with the new FD_SETSIZE of 1024, and more than enough for 680 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 681 * of 256. 
682 */ 683 fd_mask s_selbits[howmany(2048, NFDBITS)]; 684 fd_mask s_heldbits[howmany(2048, NFDBITS)]; 685 fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits; 686 struct timeval atv, rtv, ttv; 687 int ncoll, error, timo, i; 688 u_int nbufbytes, ncpbytes, nfdbits; 689 690 if (uap->nd < 0) 691 return (EINVAL); 692 if (uap->nd > p->p_fd->fd_nfiles) 693 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 694 695 /* 696 * Allocate just enough bits for the non-null fd_sets. Use the 697 * preallocated auto buffer if possible. 698 */ 699 nfdbits = roundup(uap->nd, NFDBITS); 700 ncpbytes = nfdbits / NBBY; 701 nbufbytes = 0; 702 if (uap->in != NULL) 703 nbufbytes += 2 * ncpbytes; 704 if (uap->ou != NULL) 705 nbufbytes += 2 * ncpbytes; 706 if (uap->ex != NULL) 707 nbufbytes += 2 * ncpbytes; 708 if (nbufbytes <= sizeof s_selbits) 709 selbits = &s_selbits[0]; 710 else 711 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 712 if (2 * ncpbytes <= sizeof s_heldbits) { 713 bzero(s_heldbits, sizeof(s_heldbits)); 714 heldbits = &s_heldbits[0]; 715 } else 716 heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO); 717 718 /* 719 * Assign pointers into the bit buffers and fetch the input bits. 720 * Put the output buffers together so that they can be bzeroed 721 * together. 
722 */ 723 sbp = selbits; 724 hibits = heldbits + ncpbytes / sizeof *heldbits; 725 hobits = heldbits; 726 #define getbits(name, x) \ 727 do { \ 728 if (uap->name == NULL) \ 729 ibits[x] = NULL; \ 730 else { \ 731 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 732 obits[x] = sbp; \ 733 sbp += ncpbytes / sizeof *sbp; \ 734 error = copyin(uap->name, ibits[x], ncpbytes); \ 735 if (error != 0) \ 736 goto done_noproclock; \ 737 for (i = 0; \ 738 i < ncpbytes / sizeof ibits[i][0]; \ 739 i++) \ 740 hibits[i] |= ibits[x][i]; \ 741 } \ 742 } while (0) 743 getbits(in, 0); 744 getbits(ou, 1); 745 getbits(ex, 2); 746 #undef getbits 747 if (nbufbytes != 0) 748 bzero(selbits, nbufbytes / 2); 749 750 if (uap->tv) { 751 error = copyin((caddr_t)uap->tv, (caddr_t)&atv, 752 sizeof (atv)); 753 if (error) 754 goto done_noproclock; 755 if (itimerfix(&atv)) { 756 error = EINVAL; 757 goto done_noproclock; 758 } 759 getmicrouptime(&rtv); 760 timevaladd(&atv, &rtv); 761 } else { 762 atv.tv_sec = 0; 763 atv.tv_usec = 0; 764 } 765 selholddrop(p, hibits, hobits, uap->nd, 1); 766 timo = 0; 767 PROC_LOCK(p); 768 retry: 769 ncoll = nselcoll; 770 p->p_flag |= P_SELECT; 771 PROC_UNLOCK(p); 772 error = selscan(p, ibits, obits, uap->nd); 773 PROC_LOCK(p); 774 if (error || p->p_retval[0]) 775 goto done; 776 if (atv.tv_sec || atv.tv_usec) { 777 getmicrouptime(&rtv); 778 if (timevalcmp(&rtv, &atv, >=)) { 779 /* 780 * An event of our interest may occur during locking a process. 781 * In order to avoid missing the event that occured during locking 782 * the process, test P_SELECT and rescan file descriptors if 783 * necessary. 784 */ 785 if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { 786 ncoll = nselcoll; 787 p->p_flag |= P_SELECT; 788 PROC_UNLOCK(p); 789 error = selscan(p, ibits, obits, uap->nd); 790 PROC_LOCK(p); 791 } 792 goto done; 793 } 794 ttv = atv; 795 timevalsub(&ttv, &rtv); 796 timo = ttv.tv_sec > 24 * 60 * 60 ? 
797 24 * 60 * 60 * hz : tvtohz(&ttv); 798 } 799 p->p_flag &= ~P_SELECT; 800 801 if (timo > 0) 802 error = cv_timedwait_sig(&selwait, &p->p_mtx, timo); 803 else 804 error = cv_wait_sig(&selwait, &p->p_mtx); 805 806 if (error == 0) 807 goto retry; 808 809 done: 810 p->p_flag &= ~P_SELECT; 811 PROC_UNLOCK(p); 812 selholddrop(p, hibits, hobits, uap->nd, 0); 813 done_noproclock: 814 /* select is not restarted after signals... */ 815 if (error == ERESTART) 816 error = EINTR; 817 if (error == EWOULDBLOCK) 818 error = 0; 819 #define putbits(name, x) \ 820 if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \ 821 error = error2; 822 if (error == 0) { 823 int error2; 824 825 putbits(in, 0); 826 putbits(ou, 1); 827 putbits(ex, 2); 828 #undef putbits 829 } 830 if (selbits != &s_selbits[0]) 831 free(selbits, M_SELECT); 832 if (heldbits != &s_heldbits[0]) 833 free(heldbits, M_SELECT); 834 return (error); 835 } 836 837 static int 838 selholddrop(p, ibits, obits, nfd, hold) 839 struct proc *p; 840 fd_mask *ibits, *obits; 841 int nfd, hold; 842 { 843 struct filedesc *fdp = p->p_fd; 844 int i, fd; 845 fd_mask bits; 846 struct file *fp; 847 848 for (i = 0; i < nfd; i += NFDBITS) { 849 if (hold) 850 bits = ibits[i/NFDBITS]; 851 else 852 bits = obits[i/NFDBITS]; 853 /* ffs(int mask) not portable, fd_mask is long */ 854 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 855 if (!(bits & 1)) 856 continue; 857 fp = fdp->fd_ofiles[fd]; 858 if (fp == NULL) 859 return (EBADF); 860 if (hold) { 861 fhold(fp); 862 obits[(fd)/NFDBITS] |= 863 ((fd_mask)1 << ((fd) % NFDBITS)); 864 } else 865 fdrop(fp, p); 866 } 867 } 868 return (0); 869 } 870 871 static int 872 selscan(p, ibits, obits, nfd) 873 struct proc *p; 874 fd_mask **ibits, **obits; 875 int nfd; 876 { 877 struct filedesc *fdp = p->p_fd; 878 int msk, i, fd; 879 fd_mask bits; 880 struct file *fp; 881 int n = 0; 882 /* Note: backend also returns POLLHUP/POLLERR if appropriate. 
*/ 883 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; 884 885 for (msk = 0; msk < 3; msk++) { 886 if (ibits[msk] == NULL) 887 continue; 888 for (i = 0; i < nfd; i += NFDBITS) { 889 bits = ibits[msk][i/NFDBITS]; 890 /* ffs(int mask) not portable, fd_mask is long */ 891 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 892 if (!(bits & 1)) 893 continue; 894 fp = fdp->fd_ofiles[fd]; 895 if (fp == NULL) 896 return (EBADF); 897 if (fo_poll(fp, flag[msk], fp->f_cred, p)) { 898 obits[msk][(fd)/NFDBITS] |= 899 ((fd_mask)1 << ((fd) % NFDBITS)); 900 n++; 901 } 902 } 903 } 904 } 905 p->p_retval[0] = n; 906 return (0); 907 } 908 909 /* 910 * Poll system call. 911 */ 912 #ifndef _SYS_SYSPROTO_H_ 913 struct poll_args { 914 struct pollfd *fds; 915 u_int nfds; 916 int timeout; 917 }; 918 #endif 919 int 920 poll(p, uap) 921 struct proc *p; 922 struct poll_args *uap; 923 { 924 caddr_t bits; 925 char smallbits[32 * sizeof(struct pollfd)]; 926 struct timeval atv, rtv, ttv; 927 int ncoll, error = 0, timo; 928 u_int nfds; 929 size_t ni; 930 struct pollfd p_heldbits[32]; 931 struct pollfd *heldbits; 932 933 nfds = SCARG(uap, nfds); 934 /* 935 * This is kinda bogus. We have fd limits, but that is not 936 * really related to the size of the pollfd array. Make sure 937 * we let the process use at least FD_SETSIZE entries and at 938 * least enough for the current limits. We want to be reasonably 939 * safe, but not overly restrictive. 
940 */ 941 if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE) 942 return (EINVAL); 943 ni = nfds * sizeof(struct pollfd); 944 if (ni > sizeof(smallbits)) 945 bits = malloc(ni, M_TEMP, M_WAITOK); 946 else 947 bits = smallbits; 948 if (ni > sizeof(p_heldbits)) 949 heldbits = malloc(ni, M_TEMP, M_WAITOK); 950 else { 951 bzero(p_heldbits, sizeof(p_heldbits)); 952 heldbits = p_heldbits; 953 } 954 error = copyin(SCARG(uap, fds), bits, ni); 955 if (error) 956 goto done_noproclock; 957 bcopy(bits, heldbits, ni); 958 if (SCARG(uap, timeout) != INFTIM) { 959 atv.tv_sec = SCARG(uap, timeout) / 1000; 960 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000; 961 if (itimerfix(&atv)) { 962 error = EINVAL; 963 goto done_noproclock; 964 } 965 getmicrouptime(&rtv); 966 timevaladd(&atv, &rtv); 967 } else { 968 atv.tv_sec = 0; 969 atv.tv_usec = 0; 970 } 971 pollholddrop(p, heldbits, nfds, 1); 972 timo = 0; 973 PROC_LOCK(p); 974 retry: 975 ncoll = nselcoll; 976 p->p_flag |= P_SELECT; 977 PROC_UNLOCK(p); 978 error = pollscan(p, (struct pollfd *)bits, nfds); 979 PROC_LOCK(p); 980 if (error || p->p_retval[0]) 981 goto done; 982 if (atv.tv_sec || atv.tv_usec) { 983 getmicrouptime(&rtv); 984 if (timevalcmp(&rtv, &atv, >=)) { 985 /* 986 * An event of our interest may occur during locking a process. 987 * In order to avoid missing the event that occured during locking 988 * the process, test P_SELECT and rescan file descriptors if 989 * necessary. 990 */ 991 if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { 992 ncoll = nselcoll; 993 p->p_flag |= P_SELECT; 994 PROC_UNLOCK(p); 995 error = pollscan(p, (struct pollfd *)bits, nfds); 996 PROC_LOCK(p); 997 } 998 goto done; 999 } 1000 ttv = atv; 1001 timevalsub(&ttv, &rtv); 1002 timo = ttv.tv_sec > 24 * 60 * 60 ? 
1003 24 * 60 * 60 * hz : tvtohz(&ttv); 1004 } 1005 p->p_flag &= ~P_SELECT; 1006 if (timo > 0) 1007 error = cv_timedwait_sig(&selwait, &p->p_mtx, timo); 1008 else 1009 error = cv_wait_sig(&selwait, &p->p_mtx); 1010 if (error == 0) 1011 goto retry; 1012 1013 done: 1014 p->p_flag &= ~P_SELECT; 1015 PROC_UNLOCK(p); 1016 pollholddrop(p, heldbits, nfds, 0); 1017 done_noproclock: 1018 /* poll is not restarted after signals... */ 1019 if (error == ERESTART) 1020 error = EINTR; 1021 if (error == EWOULDBLOCK) 1022 error = 0; 1023 if (error == 0) { 1024 error = copyout(bits, SCARG(uap, fds), ni); 1025 if (error) 1026 goto out; 1027 } 1028 out: 1029 if (ni > sizeof(smallbits)) 1030 free(bits, M_TEMP); 1031 if (ni > sizeof(p_heldbits)) 1032 free(heldbits, M_TEMP); 1033 return (error); 1034 } 1035 1036 static int 1037 pollholddrop(p, fds, nfd, hold) 1038 struct proc *p; 1039 struct pollfd *fds; 1040 u_int nfd; 1041 int hold; 1042 { 1043 register struct filedesc *fdp = p->p_fd; 1044 int i; 1045 struct file *fp; 1046 1047 for (i = 0; i < nfd; i++, fds++) { 1048 if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) { 1049 fp = fdp->fd_ofiles[fds->fd]; 1050 if (hold) { 1051 if (fp != NULL) { 1052 fhold(fp); 1053 fds->revents = 1; 1054 } else 1055 fds->revents = 0; 1056 } else if(fp != NULL && fds->revents) 1057 fdrop(fp, p); 1058 } 1059 } 1060 return (0); 1061 } 1062 1063 static int 1064 pollscan(p, fds, nfd) 1065 struct proc *p; 1066 struct pollfd *fds; 1067 u_int nfd; 1068 { 1069 register struct filedesc *fdp = p->p_fd; 1070 int i; 1071 struct file *fp; 1072 int n = 0; 1073 1074 for (i = 0; i < nfd; i++, fds++) { 1075 if (fds->fd >= fdp->fd_nfiles) { 1076 fds->revents = POLLNVAL; 1077 n++; 1078 } else if (fds->fd < 0) { 1079 fds->revents = 0; 1080 } else { 1081 fp = fdp->fd_ofiles[fds->fd]; 1082 if (fp == NULL) { 1083 fds->revents = POLLNVAL; 1084 n++; 1085 } else { 1086 /* 1087 * Note: backend also returns POLLHUP and 1088 * POLLERR if appropriate. 
1089 */ 1090 fds->revents = fo_poll(fp, fds->events, 1091 fp->f_cred, p); 1092 if (fds->revents != 0) 1093 n++; 1094 } 1095 } 1096 } 1097 p->p_retval[0] = n; 1098 return (0); 1099 } 1100 1101 /* 1102 * OpenBSD poll system call. 1103 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 1104 */ 1105 #ifndef _SYS_SYSPROTO_H_ 1106 struct openbsd_poll_args { 1107 struct pollfd *fds; 1108 u_int nfds; 1109 int timeout; 1110 }; 1111 #endif 1112 int 1113 openbsd_poll(p, uap) 1114 register struct proc *p; 1115 register struct openbsd_poll_args *uap; 1116 { 1117 return (poll(p, (struct poll_args *)uap)); 1118 } 1119 1120 /*ARGSUSED*/ 1121 int 1122 seltrue(dev, events, p) 1123 dev_t dev; 1124 int events; 1125 struct proc *p; 1126 { 1127 1128 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 1129 } 1130 1131 /* 1132 * Record a select request. 1133 */ 1134 void 1135 selrecord(selector, sip) 1136 struct proc *selector; 1137 struct selinfo *sip; 1138 { 1139 struct proc *p; 1140 pid_t mypid; 1141 1142 mypid = selector->p_pid; 1143 if (sip->si_pid == mypid) 1144 return; 1145 if (sip->si_pid && (p = pfind(sip->si_pid))) { 1146 mtx_lock_spin(&sched_lock); 1147 if (p->p_wchan == (caddr_t)&selwait) { 1148 mtx_unlock_spin(&sched_lock); 1149 PROC_UNLOCK(p); 1150 sip->si_flags |= SI_COLL; 1151 return; 1152 } 1153 mtx_unlock_spin(&sched_lock); 1154 PROC_UNLOCK(p); 1155 } 1156 sip->si_pid = mypid; 1157 } 1158 1159 /* 1160 * Do a wakeup when a selectable event occurs. 
 */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	register struct proc *p;

	/* Nothing recorded a select on this object. */
	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		/*
		 * Multiple selectors collided on this selinfo; we do not
		 * know which processes they are, so wake everyone sleeping
		 * on selwait and let them rescan.
		 */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p != NULL) {
		mtx_lock_spin(&sched_lock);
		if (p->p_wchan == (caddr_t)&selwait) {
			/* Recorded selector is asleep on selwait; wake it. */
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				cv_waitq_remove(p);
		} else
			/* Not asleep: clear P_SELECT so it rescans. */
			p->p_flag &= ~P_SELECT;
		mtx_unlock_spin(&sched_lock);
		/* NOTE(review): presumably pfind() returns p locked — verify. */
		PROC_UNLOCK(p);
	}
}

static void selectinit __P((void *));
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * Initialize the condition variable that select/poll sleep on.
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
}