/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capsicum.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>

/*
 * The following define determines how many bytes are allocated on the
 * stack, instead of from the malloc pool, when passing ioctl data
 * structures between userspace and the kernel.  Some ioctls with small
 * data structures are issued very frequently, and this small stack
 * buffer gives a significant speedup for those requests.  The value of
 * this define should be greater than or equal to 64 bytes and should
 * also be a power of two.  The data structure is currently hard-aligned
 * to an 8-byte boundary on the stack.  This should currently be
 * sufficient for all supported platforms.
 */
#define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
#define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */
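/*
 * For illustration: an ioctl like FIONREAD encodes sizeof(int) in its
 * command word, so its argument is staged in the small stack buffer,
 * while a (hypothetical) request carrying a 256-byte structure would
 * exceed SYS_IOCTL_SMALL_SIZE and be staged in a malloc(9) buffer
 * instead.  See the size logic in sys_ioctl() below.
 */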

#ifdef __LP64__
static int iosize_max_clamp = 0;
SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
    &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
static int devfs_iosize_max_clamp = 1;
SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
    &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");
#endif

/*
 * Assert that the return value of read(2) and write(2) syscalls fits
 * into a register.  If not, an architecture will need to provide the
 * usermode wrappers to reconstruct the result.
 */
CTASSERT(sizeof(register_t) >= sizeof(size_t));

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
		    u_int);
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	pollrescan(struct thread *);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	selrescan(struct thread *, fd_mask **, fd_mask **);
static void	selfdalloc(struct thread *, void *);
static void	selfdfree(struct seltd *, struct selfd *);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
static void	seltdinit(struct thread *);
static int	seltdwait(struct thread *, sbintime_t, sbintime_t);
static void	seltdclear(struct thread *);

/*
 * One seltd per-thread allocated on demand as needed.
 *
 *	t - protected by st_mtx
 *	k - Only accessed by curthread or read-only
 */
struct seltd {
	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
	struct selfd		*st_free1;	/* (k) free fd for read set. */
	struct selfd		*st_free2;	/* (k) free fd for write set. */
	struct mtx		st_mtx;		/* Protects struct seltd */
	struct cv		st_wait;	/* (t) Wait channel. */
	int			st_flags;	/* (t) SELTD_ flags. */
};

#define	SELTD_PENDING	0x0001			/* We have pending events. */
#define	SELTD_RESCAN	0x0002			/* Doing a rescan. */

/*
 * One selfd allocated per-thread per-file-descriptor.
 *	f - protected by sf_mtx
 */
struct selfd {
	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
	struct seltd		*sf_td;		/* (k) owning seltd. */
	void			*sf_cookie;	/* (k) fd or pollfd. */
	u_int			sf_refs;
};

static uma_zone_t selfd_zone;
static struct mtx_pool *mtxpool_select;

#ifdef __LP64__
size_t
devfs_iosize_max(void)
{

	return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
	    INT_MAX : SSIZE_MAX);
}

size_t
iosize_max(void)
{

	return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
	    INT_MAX : SSIZE_MAX);
}
#endif

#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
sys_read(struct thread *td, struct read_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return (error);
}
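/*
 * For illustration: read(2) is simply the single-iovec case of
 * readv(2); read(fd, buf, nbyte) builds the same uio as
 * readv(fd, &(struct iovec){ buf, nbyte }, 1) before handing it to
 * kern_readv().
 */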

/*
 * Positioned read system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pread(struct thread *td, struct pread_args *uap)
{

	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}

int
kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, fd, &auio, offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap)
{

	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}
#endif

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 &&
	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}
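/*
 * For illustration: preadv(2) on a pipe or socket fails with ESPIPE
 * because those files lack DFLAG_SEEKABLE, and a negative offset is
 * rejected with EINVAL except on character devices, which may define
 * their own offset semantics.
 */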

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 */
static int
dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	AUDIT_ARG_FD(fd);

	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return (0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
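/*
 * For illustration: if a 1000-byte read is interrupted by a signal
 * after 400 bytes have been transferred, the ERESTART/EINTR error is
 * discarded above and the caller sees a short count of 400; only a
 * signal that arrives before any data moves surfaces as an error.
 */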

#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
int
sys_write(struct thread *td, struct write_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pwrite(struct thread *td, struct pwrite_args *uap)
{

	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}

int
kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte,
    off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, fd, &auio, offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap)
{

	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}
#endif

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 &&
	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 */
static int
dofilewrite(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	AUDIT_ARG_FD(fd);
	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE &&
	    (fp->f_vnread_flags & FDEVFS_VNODE) == 0)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Truncate a file given a file descriptor.
 *
 * Can't use fget_write() here, since must return EINVAL and not EBADF if the
 * descriptor isn't writable.
 */
int
kern_ftruncate(struct thread *td, int fd, off_t length)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(fd);
	if (length < 0)
		return (EINVAL);
	error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp);
	if (error)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if (!(fp->f_flag & FWRITE)) {
		fdrop(fp, td);
		return (EINVAL);
	}
	error = fo_truncate(fp, length, td->td_ucred, td);
	fdrop(fp, td);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
	int	fd;
	int	pad;
	off_t	length;
};
#endif
int
sys_ftruncate(struct thread *td, struct ftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
	int	fd;
	long	length;
};
#endif
int
oftruncate(struct thread *td, struct oftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;
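	/*
	 * For illustration: an ioctl command word packs the transfer
	 * direction in its top bits (IOC_VOID, IOC_IN, IOC_OUT) and the
	 * argument size in the bits extracted by IOCPARM_LEN().  E.g.
	 * FIONREAD is defined as _IOR('f', 127, int), so IOCPARM_LEN()
	 * yields sizeof(int) and IOC_OUT requests a copyout.
	 */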
	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (com & IOC_VOID) {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		} else {
			if (size > SYS_IOCTL_SMALL_SIZE)
				data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
			else
				data = smalldata;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error != 0)
			goto out;
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

out:
	if (size > SYS_IOCTL_SMALL_SIZE)
		free(data, M_IOCTLOPS);
	return (error);
}

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
#ifndef CAPABILITIES
	cap_rights_t rights;
#endif
	int error, tmp, locked;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(com);

	fdp = td->td_proc->p_fd;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		locked = LA_XLOCKED;
		break;
	default:
#ifdef CAPABILITIES
		FILEDESC_SLOCK(fdp);
		locked = LA_SLOCKED;
#else
		locked = LA_UNLOCKED;
#endif
		break;
	}

#ifdef CAPABILITIES
	if ((fp = fget_locked(fdp, fd)) == NULL) {
		error = EBADF;
		goto out;
	}
	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
		fp = NULL;	/* fhold() was not called yet */
		goto out;
	}
	fhold(fp);
	if (locked == LA_SLOCKED) {
		FILEDESC_SUNLOCK(fdp);
		locked = LA_UNLOCKED;
	}
#else
	error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
	if (error != 0) {
		fp = NULL;
		goto out;
	}
#endif
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	switch (com) {
	case FIONCLEX:
		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
		goto out;
	case FIOCLEX:
		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
		goto out;
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	switch (locked) {
	case LA_XLOCKED:
		FILEDESC_XUNLOCK(fdp);
		break;
#ifdef CAPABILITIES
	case LA_SLOCKED:
		FILEDESC_SUNLOCK(fdp);
		break;
#endif
	default:
		FILEDESC_UNLOCK_ASSERT(fdp);
		break;
	}
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}
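/*
 * For illustration, the non-blocking flag is toggled from userspace as:
 *
 *	int on = 1;
 *	ioctl(fd, FIONBIO, &on);
 *
 * which kern_ioctl() above handles directly by setting FNONBLOCK on
 * the file and then still forwarding the request to fo_ioctl() so the
 * backing object can update its own state as well.
 */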
int
poll_no_poll(int events)
{
	/*
	 * Return true for read/write.  If the user asked for something
	 * special, return POLLNVAL, so that clients have a way of
	 * determining reliably whether or not the extended
	 * functionality is present without hard-coding knowledge
	 * of specific filesystem implementations.
	 */
	if (events & ~POLLSTANDARD)
		return (POLLNVAL);

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

int
sys_pselect(struct thread *td, struct pselect_args *uap)
{
	struct timespec ts;
	struct timeval tv, *tvp;
	sigset_t set, *uset;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error != 0)
			return (error);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		tvp = &tv;
	} else
		tvp = NULL;
	if (uap->sm != NULL) {
		error = copyin(uap->sm, &set, sizeof(set));
		if (error != 0)
			return (error);
		uset = &set;
	} else
		uset = NULL;
	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    uset, NFDBITS));
}

int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
	int error;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error != 0)
			return (error);
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}
	error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
int
sys_select(struct thread *td, struct select_args *uap)
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    NFDBITS));
}

/*
 * In the unlikely case when the user specified nd greater than the last
 * open file descriptor, check that no bits are set after the last valid
 * fd.  We must return EBADF if any are set.
 *
 * There are applications that rely on this behaviour.
 *
 * nd is fd_lastfile + 1.
 */
static int
select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
{
	char *addr, *oaddr;
	int b, i, res;
	uint8_t bits;

	if (nd >= ndu || fd_in == NULL)
		return (0);

	oaddr = NULL;
	bits = 0;	/* silence gcc */
	for (i = nd; i < ndu; i++) {
		b = i / NBBY;
#if BYTE_ORDER == LITTLE_ENDIAN
		addr = (char *)fd_in + b;
#else
		addr = (char *)fd_in;
		if (abi_nfdbits == NFDBITS) {
			addr += rounddown(b, sizeof(fd_mask)) +
			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
		} else {
			addr += rounddown(b, sizeof(uint32_t)) +
			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
		}
#endif
		if (addr != oaddr) {
			res = fubyte(addr);
			if (res == -1)
				return (EFAULT);
			oaddr = addr;
			bits = res;
		}
		if ((bits & (1 << (i % NBBY))) != 0)
			return (EBADF);
	}
	return (0);
}
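/*
 * For illustration: with fd_lastfile == 10, a select(1024, ...) call
 * clamps nd to 11 in kern_select() below but still walks bits 11..1023
 * of each input set with fubyte(); if any of those bits is set the
 * call fails with EBADF, the behaviour the comment above refers to.
 */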
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
	int error, lf, ndu;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	ndu = nd;
	lf = fdp->fd_lastfile;
	if (nd > lf + 1)
		nd = lf + 1;

	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
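	/*
	 * For illustration (LP64, NFDBITS == 64): nd = 100 rounds up to
	 * nfdbits = 128, so ncpbytes = 16 and each non-NULL set needs
	 * 2 * 16 bytes (input and output halves); with all three sets
	 * present nbufbytes = 96, comfortably inside s_selbits.
	 */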
	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL) {					\
			ibits[x] = NULL;				\
			obits[x] = NULL;				\
		} else {						\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpubytes);	\
			if (error != 0)					\
				goto done;				\
			bzero((char *)ibits[x] + ncpubytes,		\
			    ncpbytes - ncpubytes);			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits

#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
	/*
	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
	 * we are running under 32-bit emulation. This should be more
	 * generic.
	 */
#define	swizzle_fdset(bits)						\
	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
		int i;							\
		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
	}
#else
#define	swizzle_fdset(bits)
#endif

	/* Make sure the bit order makes it through an ABI transition */
	swizzle_fdset(ibits[0]);
	swizzle_fdset(ibits[1]);
	swizzle_fdset(ibits[2]);

	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	precision = 0;
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000) {
			error = EINVAL;
			goto done;
		}
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = selscan(td, ibits, obits, nd);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
		error = selrescan(td, ibits, obits);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;

	/* swizzle bit order back, if necessary */
	swizzle_fdset(obits[0]);
	swizzle_fdset(obits[1]);
	swizzle_fdset(obits[2]);
#undef swizzle_fdset

#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
/*
 * Convert a select bit set to poll flags.
 *
 * The backend always returns POLLHUP/POLLERR if appropriate and we
 * return this as a set bit in any set.
 */
static int select_flags[3] = {
    POLLRDNORM | POLLHUP | POLLERR,
    POLLWRNORM | POLLHUP | POLLERR,
    POLLRDBAND | POLLERR
};

/*
 * Compute the fo_poll flags required for a fd given by the index and
 * bit position in the fd_mask array.
 */
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
	int flags;
	int msk;

	flags = 0;
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		flags |= select_flags[msk];
	}
	return (flags);
}
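/*
 * For illustration (NFDBITS == 64): fd 70 maps to idx = 70 / 64 = 1
 * and bit = 1 << (70 % 64) = 1 << 6, i.e. the seventh bit of the
 * second fd_mask word in each set.
 */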
/*
 * Set the appropriate output bits given a mask of fired events and the
 * input bits originally requested.
 */
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
	int msk;
	int n;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		if ((events & select_flags[msk]) == 0)
			continue;
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		/*
		 * XXX Check for a duplicate set.  This can occur because a
		 * socket calls selrecord() twice for each poll() call
		 * resulting in two selfds per real fd.  selrescan() will
		 * call selsetbits twice as a result.
		 */
		if ((obits[msk][idx] & bit) != 0)
			continue;
		obits[msk][idx] |= bit;
		n++;
	}

	return (n);
}

static __inline int
getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_EVENT);

	return (fget_unlocked(fdp, fd, &rights, fpp, NULL));
}

/*
 * Traverse the list of fds attached to this thread's seltd and check for
 * completion.
 */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
	struct filedesc *fdp;
	struct selinfo *si;
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct file *fp;
	fd_mask bit;
	int fd, ev, n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	n = 0;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (int)(uintptr_t)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		error = getselfd_cap(fdp, fd, &fp);
		if (error)
			return (error);
		idx = fd / NFDBITS;
		bit = (fd_mask)1 << (fd % NFDBITS);
		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
		fdrop(fp, td);
		if (ev != 0)
			n += selsetbits(ibits, obits, idx, bit, ev);
	}
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}
/*
 * Perform the initial filedescriptor scan and register ourselves with
 * each selinfo.
 */
static int
selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd)
{
	struct filedesc *fdp;
	struct file *fp;
	fd_mask bit;
	int ev, flags, end, fd;
	int n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	n = 0;
	for (idx = 0, fd = 0; fd < nfd; idx++) {
		end = imin(fd + NFDBITS, nfd);
		for (bit = 1; fd < end; bit <<= 1, fd++) {
			/* Compute the list of events we're interested in. */
			flags = selflags(ibits, idx, bit);
			if (flags == 0)
				continue;
			error = getselfd_cap(fdp, fd, &fp);
			if (error)
				return (error);
			selfdalloc(td, (void *)(uintptr_t)fd);
			ev = fo_poll(fp, flags, td->td_ucred, td);
			fdrop(fp, td);
			if (ev != 0)
				n += selsetbits(ibits, obits, idx, bit, ev);
		}
	}

	td->td_retval[0] = n;
	return (0);
}

int
sys_poll(struct thread *td, struct poll_args *uap)
{
	struct timespec ts, *tsp;

	if (uap->timeout != INFTIM) {
		if (uap->timeout < 0)
			return (EINVAL);
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000000;
		tsp = &ts;
	} else
		tsp = NULL;

	return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL));
}
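/*
 * For illustration: poll(fds, nfds, 1500) converts its millisecond
 * timeout above to ts = { .tv_sec = 1, .tv_nsec = 500000000 }, while
 * INFTIM (-1) becomes a NULL timespec, i.e. an indefinite wait.
 */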
int
kern_poll(struct thread *td, struct pollfd *fds, u_int nfds,
    struct timespec *tsp, sigset_t *uset)
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	sbintime_t sbt, precision, tmp;
	time_t over;
	struct timespec ts;
	int error;
	size_t ni;

	precision = 0;
	if (tsp != NULL) {
		if (tsp->tv_sec < 0)
			return (EINVAL);
		if (tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000)
			return (EINVAL);
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
			sbt = 0;
		else {
			ts = *tsp;
			if (ts.tv_sec > INT32_MAX / 2) {
				over = ts.tv_sec - INT32_MAX / 2;
				ts.tv_sec -= over;
			} else
				over = 0;
			tmp = tstosbt(ts);
			precision = tmp;
			precision >>= tc_precexp;
			if (TIMESEL(&sbt, tmp))
				sbt += tc_tick_sbt;
			sbt += tmp;
		}
	} else
		sbt = -1;

	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(fds, bits, ni);
	if (error)
		goto done;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error)
			goto done;
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}

	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = pollscan(td, bits, nfds);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, sbt, precision);
		if (error)
			break;
		error = pollrescan(td);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = pollout(td, bits, fds, nfds);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}

int
sys_ppoll(struct thread *td, struct ppoll_args *uap)
{
	struct timespec ts, *tsp;
	sigset_t set, *ssp;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;
	if (uap->set != NULL) {
		error = copyin(uap->set, &set, sizeof(set));
		if (error)
			return (error);
		ssp = &set;
	} else
		ssp = NULL;
	/*
	 * fds is still a pointer into user space.  kern_poll() will
	 * take care of copying that array into kernel space.
	 */

	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
}
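/*
 * For illustration: ppoll(2) exists so the signal mask can be swapped
 * atomically with the wait, avoiding the window in the sequence
 * sigprocmask(); poll();.  A typical caller sleeps with a chosen mask:
 *
 *	sigset_t mask;
 *	(void)sigemptyset(&mask);	/- wait with all signals unblocked -/
 *	ppoll(fds, nfds, NULL, &mask);
 *
 * kern_pselect() above applies the same TDP_OLDMASK handling for
 * pselect(2).  (The inner comment delimiters are altered to keep this
 * a single comment block.)
 */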
static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
#ifdef CAPABILITIES
	cap_rights_t rights;
#endif
	int n;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	FILEDESC_SLOCK(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		fp = fdp->fd_ofiles[fd->fd].fde_file;
#ifdef CAPABILITIES
		if (fp == NULL ||
		    cap_check(cap_rights(fdp, fd->fd),
		    cap_rights_init(&rights, CAP_EVENT)) != 0)
#else
		if (fp == NULL)
#endif
		{
			fd->revents = POLLNVAL;
			n++;
			continue;
		}

		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (fd->revents != 0)
			n++;
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

static int
pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
{
	int error = 0;
	u_int i = 0;
	u_int n = 0;

	for (i = 0; i < nfd; i++) {
		error = copyout(&fds->revents, &ufds->revents,
		    sizeof(ufds->revents));
		if (error)
			return (error);
		if (fds->revents != 0)
			n++;
		fds++;
		ufds++;
	}
	td->td_retval[0] = n;
	return (0);
}

static int
pollscan(struct thread *td, struct pollfd *fds, u_int nfd)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp;
#ifdef CAPABILITIES
	cap_rights_t rights;
#endif
	int i, n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd > fdp->fd_lastfile) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd].fde_file;
#ifdef CAPABILITIES
			if (fp == NULL ||
			    cap_check(cap_rights(fdp, fds->fd),
			    cap_rights_init(&rights, CAP_EVENT)) != 0)
#else
			if (fp == NULL)
#endif
			{
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				selfdalloc(td, fds);
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				/*
				 * POSIX requires POLLOUT to be never
				 * set simultaneously with POLLHUP.
				 */
				if ((fds->revents & POLLHUP) != 0)
					fds->revents &= ~POLLOUT;

				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	int error;

	precision = 0;	/* stupid gcc! */
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000)
			return (EINVAL);
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		selfdalloc(td, NULL);
		error = sopoll(so, events, NULL, td);
		/* error here is actually the ready events. */
		if (error)
			return (0);
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
	}
	seltdclear(td);
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}
/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp->st_free1 == NULL)
		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free1->sf_td = stp;
	stp->st_free1->sf_cookie = cookie;
	if (stp->st_free2 == NULL)
		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free2->sf_td = stp;
	stp->st_free2->sf_cookie = cookie;
}

static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	if (sfp->sf_si != NULL) {
		mtx_lock(sfp->sf_mtx);
		if (sfp->sf_si != NULL) {
			TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
			refcount_release(&sfp->sf_refs);
		}
		mtx_unlock(sfp->sf_mtx);
	}
	if (refcount_release(&sfp->sf_refs))
		uma_zfree(selfd_zone, sfp);
}

/* Drain the waiters tied to all the selfd belonging to the specified selinfo. */
void
seldrain(struct selinfo *sip)
{

	/*
	 * This functionality is already provided by doselwakeup(), so
	 * it is enough to call it.  Eventually the caller should take
	 * care to avoid races between a thread calling select()/poll()
	 * and file descriptor detaching, but, again, the races are just
	 * the same as for selwakeup().
	 */
	doselwakeup(sip, -1);
}

/*
 * Record a select request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct selfd *sfp;
	struct seltd *stp;
	struct mtx *mtxp;

	stp = selector->td_sel;
	/*
	 * Don't record when doing a rescan.
	 */
	if (stp->st_flags & SELTD_RESCAN)
		return;
	/*
	 * Grab one of the preallocated descriptors.
	 */
	sfp = NULL;
	if ((sfp = stp->st_free1) != NULL)
		stp->st_free1 = NULL;
	else if ((sfp = stp->st_free2) != NULL)
		stp->st_free2 = NULL;
	else
		panic("selrecord: No free selfd on selq");
	mtxp = sip->si_mtx;
	if (mtxp == NULL)
		mtxp = mtx_pool_find(mtxpool_select, sip);
	/*
	 * Initialize the sfp and queue it in the thread.
	 */
	sfp->sf_si = sip;
	sfp->sf_mtx = mtxp;
	refcount_init(&sfp->sf_refs, 2);
	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
	/*
	 * Now that we've locked the sip, check for initialization.
	 */
	mtx_lock(mtxp);
	if (sip->si_mtx == NULL) {
		sip->si_mtx = mtxp;
		TAILQ_INIT(&sip->si_tdlist);
	}
	/*
	 * Add this thread to the list of selfds listening on this selinfo.
	 */
	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
	mtx_unlock(sip->si_mtx);
}

/* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{
	doselwakeup(sip, pri);
}
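/*
 * For illustration, a minimal driver poll method following the
 * selrecord()/selwakeup() protocol might look like the sketch below
 * (foo_softc, sc_sel and sc_ready are hypothetical names):
 *
 *	static int
 *	foo_poll(struct cdev *dev, int events, struct thread *td)
 *	{
 *		struct foo_softc *sc = dev->si_drv1;
 *		int revents = 0;
 *
 *		if (events & (POLLIN | POLLRDNORM)) {
 *			if (sc->sc_ready)
 *				revents = events & (POLLIN | POLLRDNORM);
 *			else
 *				selrecord(td, &sc->sc_sel);
 *		}
 *		return (revents);
 *	}
 *
 * The interrupt path would then call selwakeup(&sc->sc_sel) once data
 * arrives, which lands in doselwakeup() below.
 */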
/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(struct selinfo *sip, int pri)
{
	struct selfd *sfp;
	struct selfd *sfn;
	struct seltd *stp;

	/* If it's not initialized there can't be any waiters. */
	if (sip->si_mtx == NULL)
		return;
	/*
	 * Locking the selinfo locks all selfds associated with it.
	 */
	mtx_lock(sip->si_mtx);
	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
		/*
		 * Once we remove this sfp from the list and clear the
		 * sf_si, seltdclear will know to ignore this si.
		 */
		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
		sfp->sf_si = NULL;
		stp = sfp->sf_td;
		mtx_lock(&stp->st_mtx);
		stp->st_flags |= SELTD_PENDING;
		cv_broadcastpri(&stp->st_wait, pri);
		mtx_unlock(&stp->st_mtx);
		if (refcount_release(&sfp->sf_refs))
			uma_zfree(selfd_zone, sfp);
	}
	mtx_unlock(sip->si_mtx);
}

static void
seltdinit(struct thread *td)
{
	struct seltd *stp;

	if ((stp = td->td_sel) != NULL)
		goto out;
	td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
	cv_init(&stp->st_wait, "select");
out:
	stp->st_flags = 0;
	STAILQ_INIT(&stp->st_selq);
}

static int
seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
{
	struct seltd *stp;
	int error;

	stp = td->td_sel;
	/*
	 * An event of interest may occur while we do not hold the seltd
	 * locked so check the pending flag before we sleep.
	 */
	mtx_lock(&stp->st_mtx);
	/*
	 * Any further calls to selrecord will be a rescan.
	 */
	stp->st_flags |= SELTD_RESCAN;
	if (stp->st_flags & SELTD_PENDING) {
		mtx_unlock(&stp->st_mtx);
		return (0);
	}
	if (sbt == 0)
		error = EWOULDBLOCK;
	else if (sbt != -1)
		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
		    sbt, precision, C_ABSOLUTE);
	else
		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
	mtx_unlock(&stp->st_mtx);

	return (error);
}

void
seltdfini(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp == NULL)
		return;
	if (stp->st_free1)
		uma_zfree(selfd_zone, stp->st_free1);
	if (stp->st_free2)
		uma_zfree(selfd_zone, stp->st_free2);
	td->td_sel = NULL;
	free(stp, M_SELECT);
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 */
static void
seltdclear(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;

	stp = td->td_sel;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
		selfdfree(stp, sfp);
	stp->st_flags = 0;
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{

	selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}

/*
 * Set up a syscall return value that follows the convention specified for
 * posix_* functions.
 */
int
kern_posix_error(struct thread *td, int error)
{

	if (error <= 0)
		return (error);
	td->td_errno = error;
	td->td_pflags |= TDP_NERRNO;
	td->td_retval[0] = error;
	return (0);
}
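/*
 * For illustration: posix_* interfaces such as posix_fallocate(2)
 * return the error number directly instead of -1 with errno, so their
 * kernel implementations finish with
 * return (kern_posix_error(td, error)); a zero or negative error keeps
 * the usual convention, while a positive one is stored in td_retval[0]
 * and reported as a successful syscall.
 */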