/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capsicum.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>
/*
 * The following macro defines how many bytes are allocated on the
 * stack, instead of with malloc(), when copying ioctl data structures
 * between userspace and the kernel.  Some ioctls with small data
 * structures are issued very frequently, and this small on-stack
 * buffer gives those requests a significant speedup.  The value of
 * this define should be greater than or equal to 64 bytes and should
 * also be a power of two.  The data structure is currently
 * hard-aligned to an 8-byte boundary on the stack.  This should
 * currently be sufficient for all supported platforms.
 */
#define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
#define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */

int iosize_max_clamp = 0;
SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
    &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
int devfs_iosize_max_clamp = 1;
SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
    &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");

/*
 * Assert that the return value of read(2) and write(2) syscalls fits
 * into a register.  If not, an architecture will need to provide the
 * usermode wrappers to reconstruct the result.
 */
CTASSERT(sizeof(register_t) >= sizeof(size_t));

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
		    u_int);
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	pollrescan(struct thread *);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	selrescan(struct thread *, fd_mask **, fd_mask **);
static void	selfdalloc(struct thread *, void *);
static void	selfdfree(struct seltd *, struct selfd *);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
static void	seltdinit(struct thread *);
static int	seltdwait(struct thread *, sbintime_t, sbintime_t);
static void	seltdclear(struct thread *);

/*
 * One seltd per thread, allocated on demand.
 *
 * t - protected by st_mtx
 * k - Only accessed by curthread or read-only
 */
struct seltd {
	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
	struct selfd		*st_free1;	/* (k) free fd for read set. */
	struct selfd		*st_free2;	/* (k) free fd for write set. */
	struct mtx		st_mtx;		/* Protects struct seltd */
	struct cv		st_wait;	/* (t) Wait channel. */
	int			st_flags;	/* (t) SELTD_ flags. */
};

#define	SELTD_PENDING	0x0001		/* We have pending events. */
#define	SELTD_RESCAN	0x0002		/* Doing a rescan. */

/*
 * One selfd allocated per-thread per-file-descriptor.
 *	f - protected by sf_mtx
 */
struct selfd {
	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
	struct seltd		*sf_td;		/* (k) owning seltd. */
	void			*sf_cookie;	/* (k) fd or pollfd. */
};
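/*
 * Life-cycle sketch (informational only, derived from the code below):
 * a select(2) or poll(2) call proceeds roughly as
 *
 *	seltdinit(td);
 *	for (;;) {
 *		selscan()/pollscan();	selfdalloc() + fo_poll() per fd
 *		if (ready or error)
 *			break;
 *		seltdwait(td, sbt, precision);
 *		selrescan()/pollrescan();
 *	}
 *	seltdclear(td);
 *
 * doselwakeup() hands an event to a waiting thread by unlinking the
 * selfd, clearing sf_si and broadcasting on st_wait.
 */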
static uma_zone_t selfd_zone;
static struct mtx_pool *mtxpool_select;

#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
sys_read(struct thread *td, struct read_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pread(struct thread *td, struct pread_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, uap->fd, &auio, uap->offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap)
{
	struct pread_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (sys_pread(td, &oargs));
}
#endif

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}
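/*
 * Userspace view (a hedged sketch; the buffer names are illustrative):
 * a scatter read fills several buffers in one call, and copyinuio()
 * above brings the iovec array into the kernel before kern_readv()
 * runs:
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = sizeof(hdr) },
 *		{ .iov_base = body, .iov_len = sizeof(body) },
 *	};
 *	ssize_t n = readv(fd, iov, 2);
 */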
/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed-in uio, offset, and flags.
 */
static int
dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero-length reads right here. */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return (0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
int
sys_write(struct thread *td, struct write_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return (error);
}
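/*
 * Note on short transfers (a sketch; "buf" and "len" are illustrative):
 * dofileread() above reports success once some bytes have moved even
 * if the call was interrupted, so callers that need a full transfer
 * must loop:
 *
 *	size_t done = 0;
 *	while (done < len) {
 *		ssize_t n = read(fd, buf + done, len - done);
 *		if (n <= 0)
 *			break;		0 is EOF, -1 is an error
 *		done += n;
 *	}
 */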
/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pwrite(struct thread *td, struct pwrite_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap)
{
	struct pwrite_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (sys_pwrite(td, &oargs));
}
#endif

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed-in uio, offset, and flags.
 */
static int
dofilewrite(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE &&
	    (fp->f_vnread_flags & FDEVFS_VNODE) == 0)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Truncate a file given a file descriptor.
 *
 * Can't use fget_write() here, since we must return EINVAL and not
 * EBADF if the descriptor isn't writable.
 */
int
kern_ftruncate(struct thread *td, int fd, off_t length)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(fd);
	if (length < 0)
		return (EINVAL);
	error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp);
	if (error)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if (!(fp->f_flag & FWRITE)) {
		fdrop(fp, td);
		return (EINVAL);
	}
	error = fo_truncate(fp, length, td->td_ucred, td);
	fdrop(fp, td);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
	int	fd;
	int	pad;
	off_t	length;
};
#endif
int
sys_ftruncate(struct thread *td, struct ftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
	int	fd;
	long	length;
};
#endif
int
oftruncate(struct thread *td, struct oftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret the high-order word to find the amount of data to be
	 * copied to/from the user's address space.
	 */
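	/*
	 * Command word layout sketch, as encoded by _IO/_IOR/_IOW/_IOWR
	 * in <sys/ioccom.h>: bits 0-15 hold the group and command, bits
	 * 16-28 the parameter length, and the top bits the
	 * IOC_VOID/IOC_OUT/IOC_IN direction flags.  For example,
	 * FIONREAD = _IOR('f', 127, int) decodes to
	 * IOCPARM_LEN(com) == sizeof(int) with IOC_OUT set, so an int
	 * is copied back out to uap->data.
	 */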
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (com & IOC_VOID) {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		} else {
			if (size > SYS_IOCTL_SMALL_SIZE)
				data = malloc((u_long)size, M_IOCTLOPS,
				    M_WAITOK);
			else
				data = smalldata;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error != 0)
			goto out;
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

out:
	if (size > SYS_IOCTL_SMALL_SIZE)
		free(data, M_IOCTLOPS);
	return (error);
}

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
#ifndef CAPABILITIES
	cap_rights_t rights;
#endif
	int error, tmp, locked;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(com);

	fdp = td->td_proc->p_fd;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		locked = LA_XLOCKED;
		break;
	default:
#ifdef CAPABILITIES
		FILEDESC_SLOCK(fdp);
		locked = LA_SLOCKED;
#else
		locked = LA_UNLOCKED;
#endif
		break;
	}

#ifdef CAPABILITIES
	if ((fp = fget_locked(fdp, fd)) == NULL) {
		error = EBADF;
		goto out;
	}
	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
		fp = NULL;	/* fhold() was not called yet */
		goto out;
	}
	fhold(fp);
	if (locked == LA_SLOCKED) {
		FILEDESC_SUNLOCK(fdp);
		locked = LA_UNLOCKED;
	}
#else
	error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
	if (error != 0) {
		fp = NULL;
		goto out;
	}
#endif
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	switch (com) {
	case FIONCLEX:
		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
		goto out;
	case FIOCLEX:
		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
		goto out;
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	switch (locked) {
	case LA_XLOCKED:
		FILEDESC_XUNLOCK(fdp);
		break;
#ifdef CAPABILITIES
	case LA_SLOCKED:
		FILEDESC_SUNLOCK(fdp);
		break;
#endif
	default:
		FILEDESC_UNLOCK_ASSERT(fdp);
		break;
	}
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}
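/*
 * Example (informational): FIONBIO and FIOASYNC are handled above by
 * toggling f_flag before the request reaches fo_ioctl().  The
 * conventional userspace use is
 *
 *	int on = 1;
 *	if (ioctl(fd, FIONBIO, &on) == -1)
 *		err(1, "ioctl");
 *
 * which has the same effect as setting O_NONBLOCK via fcntl(F_SETFL).
 */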
int
poll_no_poll(int events)
{
	/*
	 * Return true for read/write.  If the user asked for something
	 * special, return POLLNVAL, so that clients have a way of
	 * determining reliably whether or not the extended
	 * functionality is present without hard-coding knowledge
	 * of specific filesystem implementations.
	 */
	if (events & ~POLLSTANDARD)
		return (POLLNVAL);

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

int
sys_pselect(struct thread *td, struct pselect_args *uap)
{
	struct timespec ts;
	struct timeval tv, *tvp;
	sigset_t set, *uset;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error != 0)
			return (error);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		tvp = &tv;
	} else
		tvp = NULL;
	if (uap->sm != NULL) {
		error = copyin(uap->sm, &set, sizeof(set));
		if (error != 0)
			return (error);
		uset = &set;
	} else
		uset = NULL;
	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    uset, NFDBITS));
}

int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
	int error;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error != 0)
			return (error);
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}
	error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
int
sys_select(struct thread *td, struct select_args *uap)
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    NFDBITS));
}
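/*
 * fd_set layout refresher (a sketch): descriptor fd occupies bit
 * (fd % NFDBITS) of word (fd / NFDBITS) in the fd_mask array; with
 * NFDBITS == 64, fd 68 is bit 4 of word 1.  select_check_badfd()
 * below walks these bits one byte at a time with fubyte().
 */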
/*
 * In the unlikely case when the user specified nd greater than the
 * last open file descriptor, check that no bits are set after the
 * last valid fd.  We must return EBADF if any are set.
 *
 * Some applications rely on this behaviour.
 *
 * nd is fd_lastfile + 1.
 */
static int
select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
{
	char *addr, *oaddr;
	int b, i, res;
	uint8_t bits;

	if (nd >= ndu || fd_in == NULL)
		return (0);

	oaddr = NULL;
	bits = 0;	/* silence gcc */
	for (i = nd; i < ndu; i++) {
		b = i / NBBY;
#if BYTE_ORDER == LITTLE_ENDIAN
		addr = (char *)fd_in + b;
#else
		addr = (char *)fd_in;
		if (abi_nfdbits == NFDBITS) {
			addr += rounddown(b, sizeof(fd_mask)) +
			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
		} else {
			addr += rounddown(b, sizeof(uint32_t)) +
			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
		}
#endif
		if (addr != oaddr) {
			res = fubyte(addr);
			if (res == -1)
				return (EFAULT);
			oaddr = addr;
			bits = res;
		}
		if ((bits & (1 << (i % NBBY))) != 0)
			return (EBADF);
	}
	return (0);
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
	int error, lf, ndu;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	ndu = nd;
	lf = fdp->fd_lastfile;
	if (nd > lf + 1)
		nd = lf + 1;

	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL) {					\
			ibits[x] = NULL;				\
			obits[x] = NULL;				\
		} else {						\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpubytes);	\
			if (error != 0)					\
				goto done;				\
			bzero((char *)ibits[x] + ncpubytes,		\
			    ncpbytes - ncpubytes);			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef getbits
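	/*
	 * Layout note (informational): the obits[] buffers are packed
	 * into the first nbufbytes / 2 bytes of selbits and the ibits[]
	 * copies into the second half, which is why a single bzero() of
	 * the first half below clears every output set at once.
	 */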
#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
	/*
	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
	 * we are running under 32-bit emulation.  This should be more
	 * generic.
	 */
#define	swizzle_fdset(bits)						\
	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
		int i;							\
		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
	}
#else
#define	swizzle_fdset(bits)
#endif

	/* Make sure the bit order makes it through an ABI transition */
	swizzle_fdset(ibits[0]);
	swizzle_fdset(ibits[1]);
	swizzle_fdset(ibits[2]);

	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	precision = 0;
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000) {
			error = EINVAL;
			goto done;
		}
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = selscan(td, ibits, obits, nd);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
		error = selrescan(td, ibits, obits);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;

	/* swizzle bit order back, if necessary */
	swizzle_fdset(obits[0]);
	swizzle_fdset(obits[1]);
	swizzle_fdset(obits[2]);
#undef swizzle_fdset

#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpubytes)))	\
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

/*
 * Convert a select bit set to poll flags.
 *
 * The backend always returns POLLHUP/POLLERR if appropriate and we
 * return this as a set bit in any set.
 */
static int select_flags[3] = {
	POLLRDNORM | POLLHUP | POLLERR,
	POLLWRNORM | POLLHUP | POLLERR,
	POLLRDBAND | POLLERR
};

/*
 * Compute the fo_poll flags required for a fd given by the index and
 * bit position in the fd_mask array.
 */
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
	int flags;
	int msk;

	flags = 0;
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		flags |= select_flags[msk];
	}
	return (flags);
}
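/*
 * Example (informational): a descriptor present in both the read and
 * write sets polls with POLLRDNORM | POLLWRNORM | POLLHUP | POLLERR;
 * the error conditions ride along so that selsetbits() below can
 * report them through the requested sets.
 */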
/*
 * Set the appropriate output bits given a mask of fired events and the
 * input bits originally requested.
 */
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
	int msk;
	int n;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		if ((events & select_flags[msk]) == 0)
			continue;
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		/*
		 * XXX Check for a duplicate set.  This can occur because a
		 * socket calls selrecord() twice for each poll() call
		 * resulting in two selfds per real fd.  selrescan() will
		 * call selsetbits twice as a result.
		 */
		if ((obits[msk][idx] & bit) != 0)
			continue;
		obits[msk][idx] |= bit;
		n++;
	}

	return (n);
}

static __inline int
getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_EVENT);

	return (fget_unlocked(fdp, fd, &rights, fpp, NULL));
}

/*
 * Traverse the list of fds attached to this thread's seltd and check for
 * completion.
 */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
	struct filedesc *fdp;
	struct selinfo *si;
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct file *fp;
	fd_mask bit;
	int fd, ev, n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	n = 0;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (int)(uintptr_t)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		error = getselfd_cap(fdp, fd, &fp);
		if (error)
			return (error);
		idx = fd / NFDBITS;
		bit = (fd_mask)1 << (fd % NFDBITS);
		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
		fdrop(fp, td);
		if (ev != 0)
			n += selsetbits(ibits, obits, idx, bit, ev);
	}
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}
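/*
 * Note (informational): rescans never allocate new selfds; seltdwait()
 * sets SELTD_RESCAN, which makes selrecord() return early, so the
 * fo_poll() calls issued from selrescan() and pollrescan() do not
 * re-register the thread.
 */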
/*
 * Perform the initial file descriptor scan and register ourselves with
 * each selinfo.
 */
static int
selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd)
{
	struct filedesc *fdp;
	struct file *fp;
	fd_mask bit;
	int ev, flags, end, fd;
	int n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	n = 0;
	for (idx = 0, fd = 0; fd < nfd; idx++) {
		end = imin(fd + NFDBITS, nfd);
		for (bit = 1; fd < end; bit <<= 1, fd++) {
			/* Compute the list of events we're interested in. */
			flags = selflags(ibits, idx, bit);
			if (flags == 0)
				continue;
			error = getselfd_cap(fdp, fd, &fp);
			if (error)
				return (error);
			selfdalloc(td, (void *)(uintptr_t)fd);
			ev = fo_poll(fp, flags, td->td_ucred, td);
			fdrop(fp, td);
			if (ev != 0)
				n += selsetbits(ibits, obits, idx, bit, ev);
		}
	}

	td->td_retval[0] = n;
	return (0);
}

int
sys_poll(struct thread *td, struct poll_args *uap)
{
	struct timespec ts, *tsp;

	if (uap->timeout != INFTIM) {
		if (uap->timeout < 0)
			return (EINVAL);
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000000;
		tsp = &ts;
	} else
		tsp = NULL;

	return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL));
}
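/*
 * Example (informational): sys_poll() above converts the millisecond
 * timeout into a timespec, so poll(fds, nfds, 1500) enters kern_poll()
 * with ts = { .tv_sec = 1, .tv_nsec = 500000000 }, while INFTIM (-1)
 * passes tsp == NULL and blocks until an event fires.
 */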
int
kern_poll(struct thread *td, struct pollfd *fds, u_int nfds,
    struct timespec *tsp, sigset_t *uset)
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	sbintime_t sbt, precision, tmp;
	time_t over;
	struct timespec ts;
	int error;
	size_t ni;

	precision = 0;
	if (tsp != NULL) {
		if (tsp->tv_sec < 0)
			return (EINVAL);
		if (tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000)
			return (EINVAL);
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
			sbt = 0;
		else {
			ts = *tsp;
			if (ts.tv_sec > INT32_MAX / 2) {
				over = ts.tv_sec - INT32_MAX / 2;
				ts.tv_sec -= over;
			} else
				over = 0;
			tmp = tstosbt(ts);
			precision = tmp;
			precision >>= tc_precexp;
			if (TIMESEL(&sbt, tmp))
				sbt += tc_tick_sbt;
			sbt += tmp;
		}
	} else
		sbt = -1;

	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(fds, bits, ni);
	if (error)
		goto done;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error)
			goto done;
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}

	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = pollscan(td, bits, nfds);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, sbt, precision);
		if (error)
			break;
		error = pollrescan(td);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = pollout(td, bits, fds, nfds);
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}

int
sys_ppoll(struct thread *td, struct ppoll_args *uap)
{
	struct timespec ts, *tsp;
	sigset_t set, *ssp;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;
	if (uap->set != NULL) {
		error = copyin(uap->set, &set, sizeof(set));
		if (error)
			return (error);
		ssp = &set;
	} else
		ssp = NULL;
	/*
	 * fds is still a pointer to user space.  kern_poll() will
	 * take care of copying that array in to the kernel.
	 */
	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
}
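/*
 * Userspace sketch (names illustrative): ppoll() installs the given
 * signal mask only for the duration of the wait, which closes the
 * race between unblocking a signal and calling poll():
 *
 *	sigset_t mask;
 *	sigemptyset(&mask);
 *	struct timespec ts = { 5, 0 };
 *	int n = ppoll(fds, nfds, &ts, &mask);
 */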
static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
#ifdef CAPABILITIES
	cap_rights_t rights;
#endif
	int n;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	FILEDESC_SLOCK(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		fp = fdp->fd_ofiles[fd->fd].fde_file;
#ifdef CAPABILITIES
		if (fp == NULL ||
		    cap_check(cap_rights(fdp, fd->fd),
		    cap_rights_init(&rights, CAP_EVENT)) != 0)
#else
		if (fp == NULL)
#endif
		{
			fd->revents = POLLNVAL;
			n++;
			continue;
		}

		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (fd->revents != 0)
			n++;
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

static int
pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
{
	int error = 0;
	u_int i = 0;
	u_int n = 0;

	for (i = 0; i < nfd; i++) {
		error = copyout(&fds->revents, &ufds->revents,
		    sizeof(ufds->revents));
		if (error)
			return (error);
		if (fds->revents != 0)
			n++;
		fds++;
		ufds++;
	}
	td->td_retval[0] = n;
	return (0);
}

static int
pollscan(struct thread *td, struct pollfd *fds, u_int nfd)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp;
#ifdef CAPABILITIES
	cap_rights_t rights;
#endif
	u_int i;
	int n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd > fdp->fd_lastfile) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd].fde_file;
#ifdef CAPABILITIES
			if (fp == NULL ||
			    cap_check(cap_rights(fdp, fds->fd),
			    cap_rights_init(&rights, CAP_EVENT)) != 0)
#else
			if (fp == NULL)
#endif
			{
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				selfdalloc(td, fds);
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				/*
				 * POSIX requires that POLLOUT never be
				 * set simultaneously with POLLHUP.
				 */
				if ((fds->revents & POLLHUP) != 0)
					fds->revents &= ~POLLOUT;

				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation; OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
sys_openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{

	return (sys_poll(td, (struct poll_args *)uap));
}
/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	int error;

	precision = 0;	/* stupid gcc! */
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000)
			return (EINVAL);
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		selfdalloc(td, NULL);
		error = sopoll(so, events, NULL, td);
		/* error here is actually the ready events. */
		if (error)
			return (0);
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
	}
	seltdclear(td);
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}

/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp->st_free1 == NULL)
		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK | M_ZERO);
	stp->st_free1->sf_td = stp;
	stp->st_free1->sf_cookie = cookie;
	if (stp->st_free2 == NULL)
		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK | M_ZERO);
	stp->st_free2->sf_td = stp;
	stp->st_free2->sf_cookie = cookie;
}

static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	if (sfp->sf_si != NULL) {
		mtx_lock(sfp->sf_mtx);
		if (sfp->sf_si != NULL)
			TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
		mtx_unlock(sfp->sf_mtx);
	}
	uma_zfree(selfd_zone, sfp);
}

/* Drain the waiters tied to all the selfd belonging to the specified selinfo. */
void
seldrain(struct selinfo *sip)
{

	/*
	 * doselwakeup() already provides this functionality, so it is
	 * enough to call it here.  Eventually the caller should take
	 * care to avoid races between threads calling select()/poll()
	 * and file descriptor detaching, but, again, those races are
	 * just the same as for selwakeup().
	 */
	doselwakeup(sip, -1);
}

/*
 * Record a select request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct selfd *sfp;
	struct seltd *stp;
	struct mtx *mtxp;

	stp = selector->td_sel;
	/*
	 * Don't record when doing a rescan.
	 */
	if (stp->st_flags & SELTD_RESCAN)
		return;
	/*
	 * Grab one of the preallocated descriptors.
	 */
	if ((sfp = stp->st_free1) != NULL)
		stp->st_free1 = NULL;
	else if ((sfp = stp->st_free2) != NULL)
		stp->st_free2 = NULL;
	else
		panic("selrecord: No free selfd on selq");
	mtxp = sip->si_mtx;
	if (mtxp == NULL)
		mtxp = mtx_pool_find(mtxpool_select, sip);
	/*
	 * Initialize the sfp and queue it in the thread.
	 */
	sfp->sf_si = sip;
	sfp->sf_mtx = mtxp;
	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
	/*
	 * Now that we've locked the sip, check for initialization.
	 */
	mtx_lock(mtxp);
	if (sip->si_mtx == NULL) {
		sip->si_mtx = mtxp;
		TAILQ_INIT(&sip->si_tdlist);
	}
	/*
	 * Add this thread to the list of selfds listening on this selinfo.
	 */
	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
	mtx_unlock(sip->si_mtx);
}

/* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
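/*
 * Typical driver pattern (a sketch under assumed names; "sc",
 * "sc_rsel" and data_available() are hypothetical): the poll method
 * records the thread when no data is ready, and the interrupt path
 * wakes it later:
 *
 *	static int
 *	foo_poll(struct cdev *dev, int events, struct thread *td)
 *	{
 *		int revents = 0;
 *
 *		if (events & (POLLIN | POLLRDNORM)) {
 *			if (data_available(sc))
 *				revents |= events & (POLLIN | POLLRDNORM);
 *			else
 *				selrecord(td, &sc->sc_rsel);
 *		}
 *		return (revents);
 *	}
 *
 * and from the interrupt handler: selwakeup(&sc->sc_rsel);
 */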
/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(struct selinfo *sip, int pri)
{
	struct selfd *sfp;
	struct selfd *sfn;
	struct seltd *stp;

	/* If it's not initialized there can't be any waiters. */
	if (sip->si_mtx == NULL)
		return;
	/*
	 * Locking the selinfo locks all selfds associated with it.
	 */
	mtx_lock(sip->si_mtx);
	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
		/*
		 * Once we remove this sfp from the list and clear the
		 * sf_si, seltdclear will know to ignore this si.
		 */
		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
		sfp->sf_si = NULL;
		stp = sfp->sf_td;
		mtx_lock(&stp->st_mtx);
		stp->st_flags |= SELTD_PENDING;
		cv_broadcastpri(&stp->st_wait, pri);
		mtx_unlock(&stp->st_mtx);
	}
	mtx_unlock(sip->si_mtx);
}

static void
seltdinit(struct thread *td)
{
	struct seltd *stp;

	if ((stp = td->td_sel) == NULL) {
		td->td_sel = stp = malloc(sizeof(*stp), M_SELECT,
		    M_WAITOK | M_ZERO);
		mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
		cv_init(&stp->st_wait, "select");
	}
	stp->st_flags = 0;
	STAILQ_INIT(&stp->st_selq);
}

static int
seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
{
	struct seltd *stp;
	int error;

	stp = td->td_sel;
	/*
	 * An event of interest may occur while we do not hold the seltd
	 * locked, so check the pending flag before we sleep.
	 */
	mtx_lock(&stp->st_mtx);
	/*
	 * Any further calls to selrecord will be a rescan.
	 */
	stp->st_flags |= SELTD_RESCAN;
	if (stp->st_flags & SELTD_PENDING) {
		mtx_unlock(&stp->st_mtx);
		return (0);
	}
	if (sbt == 0)
		error = EWOULDBLOCK;
	else if (sbt != -1)
		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
		    sbt, precision, C_ABSOLUTE);
	else
		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
	mtx_unlock(&stp->st_mtx);

	return (error);
}

void
seltdfini(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp == NULL)
		return;
	if (stp->st_free1)
		uma_zfree(selfd_zone, stp->st_free1);
	if (stp->st_free2)
		uma_zfree(selfd_zone, stp->st_free2);
	td->td_sel = NULL;
	free(stp, M_SELECT);
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 */
static void
seltdclear(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;

	stp = td->td_sel;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
		selfdfree(stp, sfp);
	stp->st_flags = 0;
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{

	selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}