/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capsicum.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>

/*
 * The following macro defines how many bytes will be allocated from
 * the stack instead of memory allocated when passing the IOCTL data
 * structures from userspace to the kernel.  Some IOCTLs having small
 * data structures are used very frequently and this small buffer on
 * the stack gives a significant speedup for those requests.  The value
 * of this define should be greater than or equal to 64 bytes and
 * should also be a power of two.  The data structure is currently
 * hard-aligned to an 8-byte boundary on the stack.  This should
 * currently be sufficient for all supported platforms.
 */
#define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
#define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */
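/*
 * For instance, requests such as FIONBIO and FIOASYNC (handled in
 * kern_ioctl() below) carry only an int worth of data, so they
 * comfortably fit the small on-stack buffer; only requests whose
 * IOCPARM_LEN() exceeds SYS_IOCTL_SMALL_SIZE take the malloc(9) path
 * in sys_ioctl().
 */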
int iosize_max_clamp = 0;
SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
    &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
int devfs_iosize_max_clamp = 1;
SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
    &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");

/*
 * Assert that the return value of read(2) and write(2) syscalls fits
 * into a register.  If not, an architecture will need to provide the
 * usermode wrappers to reconstruct the result.
 */
CTASSERT(sizeof(register_t) >= sizeof(size_t));

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
		    u_int);
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	pollrescan(struct thread *);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	selrescan(struct thread *, fd_mask **, fd_mask **);
static void	selfdalloc(struct thread *, void *);
static void	selfdfree(struct seltd *, struct selfd *);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
static void	seltdinit(struct thread *);
static int	seltdwait(struct thread *, sbintime_t, sbintime_t);
static void	seltdclear(struct thread *);

/*
 * One seltd per-thread allocated on demand as needed.
 *
 *	t - protected by st_mtx
 *	k - Only accessed by curthread or read-only
 */
struct seltd {
	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
	struct selfd		*st_free1;	/* (k) free fd for read set. */
	struct selfd		*st_free2;	/* (k) free fd for write set. */
	struct mtx		st_mtx;		/* Protects struct seltd */
	struct cv		st_wait;	/* (t) Wait channel. */
	int			st_flags;	/* (t) SELTD_ flags. */
};

#define	SELTD_PENDING	0x0001			/* We have pending events. */
#define	SELTD_RESCAN	0x0002			/* Doing a rescan. */

/*
 * One selfd allocated per-thread per-file-descriptor.
 *	f - protected by sf_mtx
 */
struct selfd {
	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
	struct seltd		*sf_td;		/* (k) owning seltd. */
	void			*sf_cookie;	/* (k) fd or pollfd. */
	u_int			sf_refs;
};

static uma_zone_t selfd_zone;
static struct mtx_pool *mtxpool_select;
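/*
 * How the select/poll machinery declared above fits together:
 * kern_select() and kern_poll() call selscan()/pollscan(), which invoke
 * fo_poll() on each descriptor of interest.  A backend with no ready
 * events calls selrecord(), linking one of the thread's preallocated
 * selfds onto the backend's selinfo.  The thread then sleeps in
 * seltdwait() until doselwakeup() clears sf_si and broadcasts on
 * st_wait, after which selrescan()/pollrescan() recheck only the
 * descriptors whose events fired.
 */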
#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
sys_read(struct thread *td, struct read_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pread(struct thread *td, struct pread_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, uap->fd, &auio, uap->offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap)
{
	struct pread_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (sys_pread(td, &oargs));
}
#endif

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}
/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 */
static int
dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero-length reads right here. */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return (0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
int
sys_write(struct thread *td, struct write_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return (error);
}
/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pwrite(struct thread *td, struct pwrite_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap)
{
	struct pwrite_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (sys_pwrite(td, &oargs));
}
#endif

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}
/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 */
static int
dofilewrite(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE &&
	    (fp->f_vnread_flags & FDEVFS_VNODE) == 0)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Truncate a file given a file descriptor.
 *
 * Can't use fget_write() here, since we must return EINVAL and not
 * EBADF if the descriptor isn't writable.
 */
int
kern_ftruncate(struct thread *td, int fd, off_t length)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(fd);
	if (length < 0)
		return (EINVAL);
	error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp);
	if (error)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if (!(fp->f_flag & FWRITE)) {
		fdrop(fp, td);
		return (EINVAL);
	}
	error = fo_truncate(fp, length, td->td_ucred, td);
	fdrop(fp, td);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
	int	fd;
	int	pad;
	off_t	length;
};
#endif
int
sys_ftruncate(struct thread *td, struct ftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
	int	fd;
	long	length;
};
#endif
int
oftruncate(struct thread *td, struct oftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret the high-order word to find the amount of data to be
	 * copied to/from the user's address space.
	 */
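	/*
	 * Layout of the request word, as encoded by the _IO*() macros in
	 * <sys/ioccom.h>: the low 16 bits identify the command, the next
	 * 13 bits give the parameter length extracted by IOCPARM_LEN(),
	 * and the top bits carry the IOC_VOID, IOC_OUT and IOC_IN
	 * direction flags tested below.
	 */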
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (com & IOC_VOID) {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		} else {
			if (size > SYS_IOCTL_SMALL_SIZE)
				data = malloc((u_long)size, M_IOCTLOPS,
				    M_WAITOK);
			else
				data = smalldata;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error != 0)
			goto out;
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

out:
	if (size > SYS_IOCTL_SMALL_SIZE)
		free(data, M_IOCTLOPS);
	return (error);
}

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
#ifndef CAPABILITIES
	cap_rights_t rights;
#endif
	int error, tmp, locked;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(com);

	fdp = td->td_proc->p_fd;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		locked = LA_XLOCKED;
		break;
	default:
#ifdef CAPABILITIES
		FILEDESC_SLOCK(fdp);
		locked = LA_SLOCKED;
#else
		locked = LA_UNLOCKED;
#endif
		break;
	}

#ifdef CAPABILITIES
	if ((fp = fget_locked(fdp, fd)) == NULL) {
		error = EBADF;
		goto out;
	}
	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
		fp = NULL;	/* fhold() was not called yet */
		goto out;
	}
	fhold(fp);
	if (locked == LA_SLOCKED) {
		FILEDESC_SUNLOCK(fdp);
		locked = LA_UNLOCKED;
	}
#else
	error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
	if (error != 0) {
		fp = NULL;
		goto out;
	}
#endif
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	switch (com) {
	case FIONCLEX:
		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
		goto out;
	case FIOCLEX:
		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
		goto out;
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	switch (locked) {
	case LA_XLOCKED:
		FILEDESC_XUNLOCK(fdp);
		break;
#ifdef CAPABILITIES
	case LA_SLOCKED:
		FILEDESC_SUNLOCK(fdp);
		break;
#endif
	default:
		FILEDESC_UNLOCK_ASSERT(fdp);
		break;
	}
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}

int
poll_no_poll(int events)
{
	/*
	 * Return true for read/write.  If the user asked for something
	 * special, return POLLNVAL, so that clients have a way of
	 * determining reliably whether or not the extended
	 * functionality is present without hard-coding knowledge
	 * of specific filesystem implementations.
	 */
	if (events & ~POLLSTANDARD)
		return (POLLNVAL);

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
int
sys_pselect(struct thread *td, struct pselect_args *uap)
{
	struct timespec ts;
	struct timeval tv, *tvp;
	sigset_t set, *uset;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error != 0)
			return (error);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		tvp = &tv;
	} else
		tvp = NULL;
	if (uap->sm != NULL) {
		error = copyin(uap->sm, &set, sizeof(set));
		if (error != 0)
			return (error);
		uset = &set;
	} else
		uset = NULL;
	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    uset, NFDBITS));
}

int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
	int error;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error != 0)
			return (error);
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}
	error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
int
sys_select(struct thread *td, struct select_args *uap)
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    NFDBITS));
}
/*
 * In the unlikely case when the user specified n greater than the last
 * open file descriptor, check that no bits are set after the last
 * valid fd.  We must return EBADF if any is set.
 *
 * There are applications that rely on the behaviour.
 *
 * nd is fd_lastfile + 1.
 */
static int
select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
{
	char *addr, *oaddr;
	int b, i, res;
	uint8_t bits;

	if (nd >= ndu || fd_in == NULL)
		return (0);

	oaddr = NULL;
	bits = 0;		/* silence gcc */
	for (i = nd; i < ndu; i++) {
		b = i / NBBY;
#if BYTE_ORDER == LITTLE_ENDIAN
		addr = (char *)fd_in + b;
#else
		addr = (char *)fd_in;
		if (abi_nfdbits == NFDBITS) {
			addr += rounddown(b, sizeof(fd_mask)) +
			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
		} else {
			addr += rounddown(b, sizeof(uint32_t)) +
			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
		}
#endif
		if (addr != oaddr) {
			res = fubyte(addr);
			if (res == -1)
				return (EFAULT);
			oaddr = addr;
			bits = res;
		}
		if ((bits & (1 << (i % NBBY))) != 0)
			return (EBADF);
	}
	return (0);
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
	int error, lf, ndu;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	ndu = nd;
	lf = fdp->fd_lastfile;
	if (nd > lf + 1)
		nd = lf + 1;

	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL) {					\
			ibits[x] = NULL;				\
			obits[x] = NULL;				\
		} else {						\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpubytes);	\
			if (error != 0)					\
				goto done;				\
			bzero((char *)ibits[x] + ncpubytes,		\
			    ncpbytes - ncpubytes);			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
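	/*
	 * To make the layout concrete: with all three sets non-NULL and
	 * nd = 1024 on a 64-bit platform, ncpbytes is 128 and nbufbytes
	 * is 768; the three output buffers occupy the first 384 bytes of
	 * selbits (so one bzero() below clears them all) and the three
	 * input copies occupy the remaining 384 bytes.
	 */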
#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
	/*
	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
	 * we are running under 32-bit emulation.  This should be
	 * more generic.
	 */
#define	swizzle_fdset(bits)						\
	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
		int i;							\
		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
	}
#else
#define	swizzle_fdset(bits)
#endif

	/* Make sure the bit order makes it through an ABI transition */
	swizzle_fdset(ibits[0]);
	swizzle_fdset(ibits[1]);
	swizzle_fdset(ibits[2]);

	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	precision = 0;
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000) {
			error = EINVAL;
			goto done;
		}
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = selscan(td, ibits, obits, nd);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
		error = selrescan(td, ibits, obits);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;

	/* swizzle bit order back, if necessary */
	swizzle_fdset(obits[0]);
	swizzle_fdset(obits[1]);
	swizzle_fdset(obits[2]);
#undef swizzle_fdset

#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

/*
 * Convert a select bit set to poll flags.
 *
 * The backend always returns POLLHUP/POLLERR if appropriate and we
 * return this as a set bit in any set.
 */
static int select_flags[3] = {
    POLLRDNORM | POLLHUP | POLLERR,
    POLLWRNORM | POLLHUP | POLLERR,
    POLLRDBAND | POLLERR
};

/*
 * Compute the fo_poll flags required for a fd given by the index and
 * bit position in the fd_mask array.
 */
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
	int flags;
	int msk;

	flags = 0;
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		flags |= select_flags[msk];
	}
	return (flags);
}
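/*
 * For example, a descriptor present only in the read set is polled with
 * POLLRDNORM | POLLHUP | POLLERR, so a hangup or error on the file also
 * reports the descriptor as ready in that set, matching historical
 * select() semantics.
 */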
/*
 * Set the appropriate output bits given a mask of fired events and the
 * input bits originally requested.
 */
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
	int msk;
	int n;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		if ((events & select_flags[msk]) == 0)
			continue;
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		/*
		 * XXX Check for a duplicate set.  This can occur because a
		 * socket calls selrecord() twice for each poll() call
		 * resulting in two selfds per real fd.  selrescan() will
		 * call selsetbits twice as a result.
		 */
		if ((obits[msk][idx] & bit) != 0)
			continue;
		obits[msk][idx] |= bit;
		n++;
	}

	return (n);
}

static __inline int
getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_EVENT);

	return (fget_unlocked(fdp, fd, &rights, fpp, NULL));
}

/*
 * Traverse the list of fds attached to this thread's seltd and check for
 * completion.
 */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
	struct filedesc *fdp;
	struct selinfo *si;
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct file *fp;
	fd_mask bit;
	int fd, ev, n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	n = 0;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (int)(uintptr_t)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		error = getselfd_cap(fdp, fd, &fp);
		if (error)
			return (error);
		idx = fd / NFDBITS;
		bit = (fd_mask)1 << (fd % NFDBITS);
		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
		fdrop(fp, td);
		if (ev != 0)
			n += selsetbits(ibits, obits, idx, bit, ev);
	}
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

/*
 * Perform the initial filedescriptor scan and register ourselves with
 * each selinfo.
 */
static int
selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd)
{
	struct filedesc *fdp;
	struct file *fp;
	fd_mask bit;
	int ev, flags, end, fd;
	int n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	n = 0;
	for (idx = 0, fd = 0; fd < nfd; idx++) {
		end = imin(fd + NFDBITS, nfd);
		for (bit = 1; fd < end; bit <<= 1, fd++) {
			/* Compute the list of events we're interested in. */
			flags = selflags(ibits, idx, bit);
			if (flags == 0)
				continue;
			error = getselfd_cap(fdp, fd, &fp);
			if (error)
				return (error);
			selfdalloc(td, (void *)(uintptr_t)fd);
			ev = fo_poll(fp, flags, td->td_ucred, td);
			fdrop(fp, td);
			if (ev != 0)
				n += selsetbits(ibits, obits, idx, bit, ev);
		}
	}

	td->td_retval[0] = n;
	return (0);
}
int
sys_poll(struct thread *td, struct poll_args *uap)
{
	struct timespec ts, *tsp;

	if (uap->timeout != INFTIM) {
		if (uap->timeout < 0)
			return (EINVAL);
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000000;
		tsp = &ts;
	} else
		tsp = NULL;

	return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL));
}
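/*
 * Worked example of the conversion above: a timeout of 1500 ms becomes
 * ts = { .tv_sec = 1, .tv_nsec = 500000000 }, which kern_poll() below
 * turns into an absolute sbintime_t deadline whose precision is scaled
 * down by tc_precexp; INFTIM (-1) instead selects an indefinite sleep
 * (sbt = -1).
 */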
int
kern_poll(struct thread *td, struct pollfd *fds, u_int nfds,
    struct timespec *tsp, sigset_t *uset)
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	sbintime_t sbt, precision, tmp;
	time_t over;
	struct timespec ts;
	int error;
	size_t ni;

	precision = 0;
	if (tsp != NULL) {
		if (tsp->tv_sec < 0)
			return (EINVAL);
		if (tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000)
			return (EINVAL);
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
			sbt = 0;
		else {
			ts = *tsp;
			if (ts.tv_sec > INT32_MAX / 2) {
				over = ts.tv_sec - INT32_MAX / 2;
				ts.tv_sec -= over;
			} else
				over = 0;
			tmp = tstosbt(ts);
			precision = tmp;
			precision >>= tc_precexp;
			if (TIMESEL(&sbt, tmp))
				sbt += tc_tick_sbt;
			sbt += tmp;
		}
	} else
		sbt = -1;

	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(fds, bits, ni);
	if (error)
		goto done;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error)
			goto done;
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}

	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = pollscan(td, bits, nfds);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, sbt, precision);
		if (error)
			break;
		error = pollrescan(td);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = pollout(td, bits, fds, nfds);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}

int
sys_ppoll(struct thread *td, struct ppoll_args *uap)
{
	struct timespec ts, *tsp;
	sigset_t set, *ssp;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;
	if (uap->set != NULL) {
		error = copyin(uap->set, &set, sizeof(set));
		if (error)
			return (error);
		ssp = &set;
	} else
		ssp = NULL;
	/*
	 * fds is still a pointer into user space.  kern_poll() will
	 * take care of copying that array into kernel space.
	 */

	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
}
static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
#ifdef CAPABILITIES
	cap_rights_t rights;
#endif
	int n;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	FILEDESC_SLOCK(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		fp = fdp->fd_ofiles[fd->fd].fde_file;
#ifdef CAPABILITIES
		if (fp == NULL ||
		    cap_check(cap_rights(fdp, fd->fd),
		    cap_rights_init(&rights, CAP_EVENT)) != 0)
#else
		if (fp == NULL)
#endif
		{
			fd->revents = POLLNVAL;
			n++;
			continue;
		}

		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (fd->revents != 0)
			n++;
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

static int
pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
{
	int error = 0;
	u_int i = 0;
	u_int n = 0;

	for (i = 0; i < nfd; i++) {
		error = copyout(&fds->revents, &ufds->revents,
		    sizeof(ufds->revents));
		if (error)
			return (error);
		if (fds->revents != 0)
			n++;
		fds++;
		ufds++;
	}
	td->td_retval[0] = n;
	return (0);
}

static int
pollscan(struct thread *td, struct pollfd *fds, u_int nfd)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp;
#ifdef CAPABILITIES
	cap_rights_t rights;
#endif
	int i, n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd > fdp->fd_lastfile) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd].fde_file;
#ifdef CAPABILITIES
			if (fp == NULL ||
			    cap_check(cap_rights(fdp, fds->fd),
			    cap_rights_init(&rights, CAP_EVENT)) != 0)
#else
			if (fp == NULL)
#endif
			{
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				selfdalloc(td, fds);
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				/*
				 * POSIX requires that POLLOUT never be
				 * set simultaneously with POLLHUP.
				 */
				if ((fds->revents & POLLHUP) != 0)
					fds->revents &= ~POLLOUT;

				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation... OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
sys_openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{

	return (sys_poll(td, (struct poll_args *)uap));
}
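/*
 * Note that pollout() above copies out only the revents member of each
 * pollfd, so the user-supplied fd and events members are never written
 * back and a caller can reuse the same array across calls.
 */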
/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp,
    struct thread *td)
{
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	int error;

	precision = 0;	/* silence gcc */
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000)
			return (EINVAL);
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		selfdalloc(td, NULL);
		error = sopoll(so, events, NULL, td);
		/* error here is actually the ready events. */
		if (error)
			return (0);
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
	}
	seltdclear(td);
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}

/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp->st_free1 == NULL)
		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free1->sf_td = stp;
	stp->st_free1->sf_cookie = cookie;
	if (stp->st_free2 == NULL)
		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free2->sf_td = stp;
	stp->st_free2->sf_cookie = cookie;
}

static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	if (sfp->sf_si != NULL) {
		mtx_lock(sfp->sf_mtx);
		if (sfp->sf_si != NULL) {
			TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
			refcount_release(&sfp->sf_refs);
		}
		mtx_unlock(sfp->sf_mtx);
	}
	if (refcount_release(&sfp->sf_refs))
		uma_zfree(selfd_zone, sfp);
}

/*
 * Drain the waiters tied to all the selfd belonging to the specified
 * selinfo.
 */
void
seldrain(struct selinfo *sip)
{

	/*
	 * doselwakeup() already provides this functionality, so it is
	 * enough to call it here.  Eventually, the context should take
	 * care to avoid races between a thread calling select()/poll()
	 * and file descriptor detaching, but, again, the races are just
	 * the same as for selwakeup().
	 */
	doselwakeup(sip, -1);
}

/*
 * Record a select request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct selfd *sfp;
	struct seltd *stp;
	struct mtx *mtxp;

	stp = selector->td_sel;
	/*
	 * Don't record when doing a rescan.
	 */
	if (stp->st_flags & SELTD_RESCAN)
		return;
	/*
	 * Grab one of the preallocated descriptors.
	 */
	sfp = NULL;
	if ((sfp = stp->st_free1) != NULL)
		stp->st_free1 = NULL;
	else if ((sfp = stp->st_free2) != NULL)
		stp->st_free2 = NULL;
	else
		panic("selrecord: No free selfd on selq");
	mtxp = sip->si_mtx;
	if (mtxp == NULL)
		mtxp = mtx_pool_find(mtxpool_select, sip);
	/*
	 * Initialize the sfp and queue it in the thread.
	 */
	sfp->sf_si = sip;
	sfp->sf_mtx = mtxp;
	refcount_init(&sfp->sf_refs, 2);
	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
	/*
	 * Now that we've locked the sip, check for initialization.
	 */
	mtx_lock(mtxp);
	if (sip->si_mtx == NULL) {
		sip->si_mtx = mtxp;
		TAILQ_INIT(&sip->si_tdlist);
	}
	/*
	 * Add this thread to the list of selfds listening on this selinfo.
	 */
	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
	mtx_unlock(sip->si_mtx);
}
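/*
 * Lifetime note: selrecord() initializes sf_refs to 2, one reference for
 * the thread's st_selq list and one for the selinfo's si_tdlist.
 * doselwakeup() and selfdfree() each drop the reference they own, and
 * whichever releases the last one returns the selfd to selfd_zone.
 */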
/* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(struct selinfo *sip, int pri)
{
	struct selfd *sfp;
	struct selfd *sfn;
	struct seltd *stp;

	/* If it's not initialized there can't be any waiters. */
	if (sip->si_mtx == NULL)
		return;
	/*
	 * Locking the selinfo locks all selfds associated with it.
	 */
	mtx_lock(sip->si_mtx);
	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
		/*
		 * Once we remove this sfp from the list and clear
		 * sf_si, seltdclear will know to ignore this si.
		 */
		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
		sfp->sf_si = NULL;
		stp = sfp->sf_td;
		mtx_lock(&stp->st_mtx);
		stp->st_flags |= SELTD_PENDING;
		cv_broadcastpri(&stp->st_wait, pri);
		mtx_unlock(&stp->st_mtx);
		if (refcount_release(&sfp->sf_refs))
			uma_zfree(selfd_zone, sfp);
	}
	mtx_unlock(sip->si_mtx);
}

static void
seltdinit(struct thread *td)
{
	struct seltd *stp;

	if ((stp = td->td_sel) != NULL)
		goto out;
	td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
	cv_init(&stp->st_wait, "select");
out:
	stp->st_flags = 0;
	STAILQ_INIT(&stp->st_selq);
}

static int
seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
{
	struct seltd *stp;
	int error;

	stp = td->td_sel;
	/*
	 * An event of interest may occur while we do not hold the seltd
	 * locked so check the pending flag before we sleep.
	 */
	mtx_lock(&stp->st_mtx);
	/*
	 * Any further calls to selrecord will be a rescan.
	 */
	stp->st_flags |= SELTD_RESCAN;
	if (stp->st_flags & SELTD_PENDING) {
		mtx_unlock(&stp->st_mtx);
		return (0);
	}
	if (sbt == 0)
		error = EWOULDBLOCK;
	else if (sbt != -1)
		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
		    sbt, precision, C_ABSOLUTE);
	else
		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
	mtx_unlock(&stp->st_mtx);

	return (error);
}

void
seltdfini(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp == NULL)
		return;
	if (stp->st_free1)
		uma_zfree(selfd_zone, stp->st_free1);
	if (stp->st_free2)
		uma_zfree(selfd_zone, stp->st_free2);
	td->td_sel = NULL;
	free(stp, M_SELECT);
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 */
static void
seltdclear(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;

	stp = td->td_sel;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
		selfdfree(stp, sfp);
	stp->st_flags = 0;
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{

	selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}
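/*
 * Typical backend usage (a sketch, not part of this file): a driver's
 * d_poll method checks readiness and calls selrecord() on its per-device
 * selinfo when no requested event is ready, e.g.:
 *
 *	static int
 *	foo_poll(struct cdev *dev, int events, struct thread *td)
 *	{
 *		struct foo_softc *sc = dev->si_drv1;
 *		int revents = 0;
 *
 *		mtx_lock(&sc->sc_mtx);
 *		if ((events & (POLLIN | POLLRDNORM)) && sc->sc_ready)
 *			revents = events & (POLLIN | POLLRDNORM);
 *		else
 *			selrecord(td, &sc->sc_rsel);
 *		mtx_unlock(&sc->sc_mtx);
 *		return (revents);
 *	}
 *
 * and later calls selwakeup(&sc->sc_rsel) when data arrives.  foo_poll,
 * foo_softc, sc_mtx, sc_ready and sc_rsel are hypothetical names used
 * only for illustration.
 */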