/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_capsicum.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capsicum.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/eventfd.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/specialfd.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/unistd.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>

/*
 * The following macro defines how many bytes will be allocated on the
 * stack, instead of with malloc(), when passing ioctl data structures
 * between userspace and the kernel.  Some ioctls with small data
 * structures are used very frequently, and the small on-stack buffer
 * gives a significant speedup for those requests.  The value of this
 * define should be greater than or equal to 64 bytes and should also
 * be a power of two.  The data is currently hard-aligned to an 8-byte
 * boundary on the stack, which should be sufficient for all supported
 * platforms.
 */
#define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
#define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */
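
/*
 * For illustration: a request such as FIONBIO carries only an int
 * (IOCPARM_LEN() == sizeof(int)), so its argument is copied through the
 * on-stack buffer in sys_ioctl() below and no malloc() is needed; only
 * commands with more than SYS_IOCTL_SMALL_SIZE bytes of argument data
 * fall back to a heap allocation.
 */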

#ifdef __LP64__
static int iosize_max_clamp = 0;
SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
    &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
static int devfs_iosize_max_clamp = 1;
SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
    &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");
#endif

/*
 * Assert that the return value of read(2) and write(2) syscalls fits
 * into a register.  If not, an architecture will need to provide the
 * usermode wrappers to reconstruct the result.
 */
CTASSERT(sizeof(register_t) >= sizeof(size_t));

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
		    u_int);
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	pollrescan(struct thread *);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	selrescan(struct thread *, fd_mask **, fd_mask **);
static void	selfdalloc(struct thread *, void *);
static void	selfdfree(struct seltd *, struct selfd *);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
static void	seltdinit(struct thread *);
static int	seltdwait(struct thread *, sbintime_t, sbintime_t);
static void	seltdclear(struct thread *);

/*
 * One seltd allocated per thread, on demand.
 *
 * t - protected by st_mtx
 * k - only accessed by curthread or read-only
 */
struct seltd {
	STAILQ_HEAD(, selfd) st_selq;	/* (k) List of selfds. */
	struct selfd *st_free1;		/* (k) free fd for read set. */
	struct selfd *st_free2;		/* (k) free fd for write set. */
	struct mtx st_mtx;		/* Protects struct seltd */
	struct cv st_wait;		/* (t) Wait channel. */
	int st_flags;			/* (t) SELTD_ flags. */
};

#define	SELTD_PENDING	0x0001		/* We have pending events. */
#define	SELTD_RESCAN	0x0002		/* Doing a rescan. */

/*
 * One selfd allocated per thread per file descriptor.
 *	f - protected by sf_mtx
 */
struct selfd {
	STAILQ_ENTRY(selfd) sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd) sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo *sf_si;		/* (f) selinfo when linked. */
	struct mtx *sf_mtx;		/* Pointer to selinfo mtx. */
	struct seltd *sf_td;		/* (k) owning seltd. */
	void *sf_cookie;		/* (k) fd or pollfd. */
};

MALLOC_DEFINE(M_SELFD, "selfd", "selfd");
static struct mtx_pool *mtxpool_select;
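
/*
 * In outline (a sketch of the lifecycle, not an API contract):
 * selfdalloc() preallocates selfds before fo_poll() is issued,
 * selrecord() links one onto both the thread's st_selq and the
 * object's si_tdlist, doselwakeup() clears sf_si with a release
 * store after signalling the thread, and selfdfree() observes
 * that store with an acquire load to decide whether the selfd
 * must still be unlinked from the selinfo.
 */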

#ifdef __LP64__
size_t
devfs_iosize_max(void)
{

	return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
	    INT_MAX : SSIZE_MAX);
}

size_t
iosize_max(void)
{

	return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
	    INT_MAX : SSIZE_MAX);
}
#endif

#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
sys_read(struct thread *td, struct read_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pread(struct thread *td, struct pread_args *uap)
{

	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}

int
kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, fd, &auio, offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap)
{

	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}
#endif

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &cap_read_rights, &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &cap_pread_rights, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 &&
	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed-in uio, offset, and flags.
 */
static int
dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	AUDIT_ARG_FD(fd);

	/* Finish zero-length reads right here. */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return (0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
int
sys_write(struct thread *td, struct write_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pwrite(struct thread *td, struct pwrite_args *uap)
{

	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}

int
kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte,
    off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, fd, &auio, offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap)
{

	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}
#endif

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &cap_write_rights, &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &cap_pwrite_rights, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 &&
	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed-in uio, offset, and flags.
 */
static int
dofilewrite(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	AUDIT_ARG_FD(fd);
	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	error = fo_write(fp, auio, td->td_ucred, flags, td);
	/*
	 * The socket layer is responsible for special error handling,
	 * see sousrsend().
	 */
	if (error != 0 && fp->f_type != DTYPE_SOCKET) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Truncate a file given a file descriptor.
 *
 * Can't use fget_write() here, since we must return EINVAL and not
 * EBADF if the descriptor isn't writable.
 */
int
kern_ftruncate(struct thread *td, int fd, off_t length)
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);
	if (length < 0)
		return (EINVAL);
	error = fget(td, fd, &cap_ftruncate_rights, &fp);
	if (error)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if (!(fp->f_flag & FWRITE)) {
		fdrop(fp, td);
		return (EINVAL);
	}
	error = fo_truncate(fp, length, td->td_ucred, td);
	fdrop(fp, td);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
	int	fd;
	int	pad;
	off_t	length;
};
#endif
int
sys_ftruncate(struct thread *td, struct ftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
	int	fd;
	long	length;
};
#endif
int
oftruncate(struct thread *td, struct oftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
	uint32_t com;
	int arg, error;
	u_int size;
	caddr_t data;

#ifdef INVARIANTS
	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
	}
#endif
	com = (uint32_t)uap->com;

	/*
	 * Interpret the high-order word to find the amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (com & IOC_VOID) {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		} else {
			if (size > SYS_IOCTL_SMALL_SIZE)
				data = malloc((u_long)size, M_IOCTLOPS,
				    M_WAITOK);
			else
				data = smalldata;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error != 0)
			goto out;
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

out:
	if (size > SYS_IOCTL_SMALL_SIZE)
		free(data, M_IOCTLOPS);
	return (error);
}
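
/*
 * For example (values from sys/filio.h): FIONREAD is defined as
 * _IOR('f', 127, int), so IOC_OUT is set and IOCPARM_LEN(com) is
 * sizeof(int); sys_ioctl() above zeroes a 4-byte buffer, calls
 * kern_ioctl(), and copies the result back out to uap->data.
 */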

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error, tmp, locked;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(com);

	fdp = td->td_proc->p_fd;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		locked = LA_XLOCKED;
		break;
	default:
#ifdef CAPABILITIES
		FILEDESC_SLOCK(fdp);
		locked = LA_SLOCKED;
#else
		locked = LA_UNLOCKED;
#endif
		break;
	}

#ifdef CAPABILITIES
	if ((fp = fget_noref(fdp, fd)) == NULL) {
		error = EBADF;
		goto out;
	}
	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
		fp = NULL;	/* fhold() was not called yet */
		goto out;
	}
	if (!fhold(fp)) {
		error = EBADF;
		fp = NULL;
		goto out;
	}
	if (locked == LA_SLOCKED) {
		FILEDESC_SUNLOCK(fdp);
		locked = LA_UNLOCKED;
	}
#else
	error = fget(td, fd, &cap_ioctl_rights, &fp);
	if (error != 0) {
		fp = NULL;
		goto out;
	}
#endif
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	switch (com) {
	case FIONCLEX:
		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
		goto out;
	case FIOCLEX:
		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
		goto out;
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	switch (locked) {
	case LA_XLOCKED:
		FILEDESC_XUNLOCK(fdp);
		break;
#ifdef CAPABILITIES
	case LA_SLOCKED:
		FILEDESC_SUNLOCK(fdp);
		break;
#endif
	default:
		FILEDESC_UNLOCK_ASSERT(fdp);
		break;
	}
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}

int
sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
{
	int error;

	error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len);
	return (kern_posix_error(td, error));
}

int
kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);
	if (offset < 0 || len <= 0)
		return (EINVAL);
	/* Check for wrap. */
	if (offset > OFF_MAX - len)
		return (EFBIG);
	error = fget(td, fd, &cap_pwrite_rights, &fp);
	if (error != 0)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
		error = ESPIPE;
		goto out;
	}
	if ((fp->f_flag & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}

	error = fo_fallocate(fp, offset, len, td);
out:
	fdrop(fp, td);
	return (error);
}

int
sys_fspacectl(struct thread *td, struct fspacectl_args *uap)
{
	struct spacectl_range rqsr, rmsr;
	int error, cerror;

	error = copyin(uap->rqsr, &rqsr, sizeof(rqsr));
	if (error != 0)
		return (error);

	error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags,
	    &rmsr);
	if (uap->rmsr != NULL) {
		cerror = copyout(&rmsr, uap->rmsr, sizeof(rmsr));
		if (error == 0)
			error = cerror;
	}
	return (error);
}

int
kern_fspacectl(struct thread *td, int fd, int cmd,
    const struct spacectl_range *rqsr, int flags, struct spacectl_range *rmsrp)
{
	struct file *fp;
	struct spacectl_range rmsr;
	int error;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(cmd);
	AUDIT_ARG_FFLAGS(flags);

	if (rqsr == NULL)
		return (EINVAL);
	rmsr = *rqsr;
	if (rmsrp != NULL)
		*rmsrp = rmsr;

	if (cmd != SPACECTL_DEALLOC ||
	    rqsr->r_offset < 0 || rqsr->r_len <= 0 ||
	    rqsr->r_offset > OFF_MAX - rqsr->r_len ||
	    (flags & ~SPACECTL_F_SUPPORTED) != 0)
		return (EINVAL);

	error = fget_write(td, fd, &cap_pwrite_rights, &fp);
	if (error != 0)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
		error = ESPIPE;
		goto out;
	}
	if ((fp->f_flag & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}

	error = fo_fspacectl(fp, cmd, &rmsr.r_offset, &rmsr.r_len, flags,
	    td->td_ucred, td);
	/* fspacectl is not restarted after signals if the file is modified. */
	if (rmsr.r_len != rqsr->r_len && (error == ERESTART ||
	    error == EINTR || error == EWOULDBLOCK))
		error = 0;
	if (rmsrp != NULL)
		*rmsrp = rmsr;
out:
	fdrop(fp, td);
	return (error);
}

int
kern_specialfd(struct thread *td, int type, void *arg)
{
	struct file *fp;
	struct specialfd_eventfd *ae;
	int error, fd, fflags;

	fflags = 0;
	error = falloc_noinstall(td, &fp);
	if (error != 0)
		return (error);

	switch (type) {
	case SPECIALFD_EVENTFD:
		ae = arg;
		if ((ae->flags & EFD_CLOEXEC) != 0)
			fflags |= O_CLOEXEC;
		error = eventfd_create_file(td, fp, ae->initval, ae->flags);
		break;
	default:
		error = EINVAL;
		break;
	}

	if (error == 0)
		error = finstall(td, fp, &fd, fflags, NULL);
	fdrop(fp, td);
	if (error == 0)
		td->td_retval[0] = fd;
	return (error);
}

int
sys___specialfd(struct thread *td, struct __specialfd_args *args)
{
	struct specialfd_eventfd ae;
	int error;

	switch (args->type) {
	case SPECIALFD_EVENTFD:
		if (args->len != sizeof(struct specialfd_eventfd)) {
			error = EINVAL;
			break;
		}
		error = copyin(args->req, &ae, sizeof(ae));
		if (error != 0)
			break;
		if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK |
		    EFD_SEMAPHORE)) != 0) {
			error = EINVAL;
			break;
		}
		error = kern_specialfd(td, args->type, &ae);
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}

int
poll_no_poll(int events)
{
	/*
	 * Return true for read/write.  If the user asked for something
	 * special, return POLLNVAL, so that clients have a way of
	 * determining reliably whether or not the extended
	 * functionality is present without hard-coding knowledge
	 * of specific filesystem implementations.
	 */
	if (events & ~POLLSTANDARD)
		return (POLLNVAL);

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

int
sys_pselect(struct thread *td, struct pselect_args *uap)
{
	struct timespec ts;
	struct timeval tv, *tvp;
	sigset_t set, *uset;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error != 0)
			return (error);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		tvp = &tv;
	} else
		tvp = NULL;
	if (uap->sm != NULL) {
		error = copyin(uap->sm, &set, sizeof(set));
		if (error != 0)
			return (error);
		uset = &set;
	} else
		uset = NULL;
	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    uset, NFDBITS));
}

int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
	int error;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error != 0)
			return (error);
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring the
		 * old sigmask.
		 */
		ast_sched(td, TDA_SIGSUSPEND);
	}
	error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
int
sys_select(struct thread *td, struct select_args *uap)
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    NFDBITS));
}

/*
 * In the unlikely case when the user specified n greater than the last
 * open file descriptor, check that no bits are set after the last
 * valid fd.  We must return EBADF if any is set.
 *
 * There are applications that rely on this behaviour.
 *
 * nd is fd_nfiles.
 */
static int
select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
{
	char *addr, *oaddr;
	int b, i, res;
	uint8_t bits;

	if (nd >= ndu || fd_in == NULL)
		return (0);

	oaddr = NULL;
	bits = 0;	/* silence gcc */
	for (i = nd; i < ndu; i++) {
		b = i / NBBY;
#if BYTE_ORDER == LITTLE_ENDIAN
		addr = (char *)fd_in + b;
#else
		addr = (char *)fd_in;
		if (abi_nfdbits == NFDBITS) {
			addr += rounddown(b, sizeof(fd_mask)) +
			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
		} else {
			addr += rounddown(b, sizeof(uint32_t)) +
			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
		}
#endif
		if (addr != oaddr) {
			res = fubyte(addr);
			if (res == -1)
				return (EFAULT);
			oaddr = addr;
			bits = res;
		}
		if ((bits & (1 << (i % NBBY))) != 0)
			return (EBADF);
	}
	return (0);
}
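
/*
 * For example: if an application calls select() with nd == 1024 while
 * the process has only 20 descriptor slots (fd_nfiles), the bytes
 * covering bits 20..1023 of each input set are fetched with fubyte()
 * and any set bit yields EBADF, matching the historical behaviour.
 */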

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
	int error, lf, ndu;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	ndu = nd;
	lf = fdp->fd_nfiles;
	if (nd > lf)
		nd = lf;

	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL) {					\
			ibits[x] = NULL;				\
			obits[x] = NULL;				\
		} else {						\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpubytes);	\
			if (error != 0)					\
				goto done;				\
			if (ncpbytes != ncpubytes)			\
				bzero((char *)ibits[x] + ncpubytes,	\
				    ncpbytes - ncpubytes);		\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits

#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
	/*
	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
	 * we are running under 32-bit emulation. This should be
	 * more generic.
	 */
#define	swizzle_fdset(bits)						\
	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
		int i;							\
		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
	}
#else
#define	swizzle_fdset(bits)
#endif

	/* Make sure the bit order makes it through an ABI transition. */
	swizzle_fdset(ibits[0]);
	swizzle_fdset(ibits[1]);
	swizzle_fdset(ibits[2]);

	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	precision = 0;
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000) {
			error = EINVAL;
			goto done;
		}
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = selscan(td, ibits, obits, nd);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
		error = selrescan(td, ibits, obits);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;

	/* Swizzle the bit order back, if necessary. */
	swizzle_fdset(obits[0]);
	swizzle_fdset(obits[1]);
	swizzle_fdset(obits[2]);
#undef	swizzle_fdset

#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

/*
 * Convert a select bit set to poll flags.
 *
 * The backend always returns POLLHUP/POLLERR if appropriate and we
 * return this as a set bit in any set.
 */
static const int select_flags[3] = {
    POLLRDNORM | POLLHUP | POLLERR,
    POLLWRNORM | POLLHUP | POLLERR,
    POLLRDBAND | POLLERR
};

/*
 * Compute the fo_poll flags required for a fd given by the index and
 * bit position in the fd_mask array.
 */
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
	int flags;
	int msk;

	flags = 0;
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		flags |= select_flags[msk];
	}
	return (flags);
}

/*
 * Set the appropriate output bits given a mask of fired events and the
 * input bits originally requested.
 */
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
	int msk;
	int n;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		if ((events & select_flags[msk]) == 0)
			continue;
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		/*
		 * XXX Check for a duplicate set.  This can occur because a
		 * socket calls selrecord() twice for each poll() call
		 * resulting in two selfds per real fd.  selrescan() will
		 * call selsetbits twice as a result.
		 */
		if ((obits[msk][idx] & bit) != 0)
			continue;
		obits[msk][idx] |= bit;
		n++;
	}

	return (n);
}
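
/*
 * Illustrative arithmetic (assuming the usual 64-bit fd_mask, so that
 * NFDBITS == 64): descriptor 65 maps to idx == 65 / 64 == 1 and
 * bit == (fd_mask)1 << (65 % 64) == 0x2, i.e. the second bit of the
 * second mask word, which is how selrescan() and selscan() below
 * address the ibits/obits arrays.
 */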

/*
 * Traverse the list of fds attached to this thread's seltd and check for
 * completion.
 */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
	struct filedesc *fdp;
	struct selinfo *si;
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct file *fp;
	fd_mask bit;
	int fd, ev, n, idx;
	int error;
	bool only_user;

	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	n = 0;
	only_user = FILEDESC_IS_ONLY_USER(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (int)(uintptr_t)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared, the event didn't fire. */
		if (si != NULL)
			continue;
		if (only_user)
			error = fget_only_user(fdp, fd, &cap_event_rights,
			    &fp);
		else
			error = fget_unlocked(td, fd, &cap_event_rights, &fp);
		if (__predict_false(error != 0))
			return (error);
		idx = fd / NFDBITS;
		bit = (fd_mask)1 << (fd % NFDBITS);
		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
		if (only_user)
			fput_only_user(fdp, fp);
		else
			fdrop(fp, td);
		if (ev != 0)
			n += selsetbits(ibits, obits, idx, bit, ev);
	}
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

/*
 * Perform the initial file descriptor scan and register ourselves with
 * each selinfo.
 */
static int
selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd)
{
	struct filedesc *fdp;
	struct file *fp;
	fd_mask bit;
	int ev, flags, end, fd;
	int n, idx;
	int error;
	bool only_user;

	fdp = td->td_proc->p_fd;
	n = 0;
	only_user = FILEDESC_IS_ONLY_USER(fdp);
	for (idx = 0, fd = 0; fd < nfd; idx++) {
		end = imin(fd + NFDBITS, nfd);
		for (bit = 1; fd < end; bit <<= 1, fd++) {
			/* Compute the list of events we're interested in. */
			flags = selflags(ibits, idx, bit);
			if (flags == 0)
				continue;
			if (only_user)
				error = fget_only_user(fdp, fd,
				    &cap_event_rights, &fp);
			else
				error = fget_unlocked(td, fd,
				    &cap_event_rights, &fp);
			if (__predict_false(error != 0))
				return (error);
			selfdalloc(td, (void *)(uintptr_t)fd);
			ev = fo_poll(fp, flags, td->td_ucred, td);
			if (only_user)
				fput_only_user(fdp, fp);
			else
				fdrop(fp, td);
			if (ev != 0)
				n += selsetbits(ibits, obits, idx, bit, ev);
		}
	}

	td->td_retval[0] = n;
	return (0);
}

int
sys_poll(struct thread *td, struct poll_args *uap)
{
	struct timespec ts, *tsp;

	if (uap->timeout != INFTIM) {
		if (uap->timeout < 0)
			return (EINVAL);
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000000;
		tsp = &ts;
	} else
		tsp = NULL;

	return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL));
}

/*
 * kfds points to an array in the kernel.
 */
int
kern_poll_kfds(struct thread *td, struct pollfd *kfds, u_int nfds,
    struct timespec *tsp, sigset_t *uset)
{
	sbintime_t sbt, precision, tmp;
	time_t over;
	struct timespec ts;
	int error;

	precision = 0;
	if (tsp != NULL) {
		if (!timespecvalid_interval(tsp))
			return (EINVAL);
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
			sbt = 0;
		else {
			ts = *tsp;
			if (ts.tv_sec > INT32_MAX / 2) {
				over = ts.tv_sec - INT32_MAX / 2;
				ts.tv_sec -= over;
			} else
				over = 0;
			tmp = tstosbt(ts);
			precision = tmp;
			precision >>= tc_precexp;
			if (TIMESEL(&sbt, tmp))
				sbt += tc_tick_sbt;
			sbt += tmp;
		}
	} else
		sbt = -1;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error)
			return (error);
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring the
		 * old sigmask.
		 */
		ast_sched(td, TDA_SIGSUSPEND);
	}

	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = pollscan(td, kfds, nfds);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, sbt, precision);
		if (error)
			break;
		error = pollrescan(td);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	return (error);
}

int
sys_ppoll(struct thread *td, struct ppoll_args *uap)
{
	struct timespec ts, *tsp;
	sigset_t set, *ssp;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;
	if (uap->set != NULL) {
		error = copyin(uap->set, &set, sizeof(set));
		if (error)
			return (error);
		ssp = &set;
	} else
		ssp = NULL;
	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
}

/*
 * ufds points to an array in user space.
 */
int
kern_poll(struct thread *td, struct pollfd *ufds, u_int nfds,
    struct timespec *tsp, sigset_t *set)
{
	struct pollfd *kfds;
	struct pollfd stackfds[32];
	int error;

	if (kern_poll_maxfds(nfds))
		return (EINVAL);
	if (nfds > nitems(stackfds))
		kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK);
	else
		kfds = stackfds;
	error = copyin(ufds, kfds, nfds * sizeof(*kfds));
	if (error != 0)
		goto out;

	error = kern_poll_kfds(td, kfds, nfds, tsp, set);
	if (error == 0)
		error = pollout(td, kfds, ufds, nfds);

out:
	if (nfds > nitems(stackfds))
		free(kfds, M_TEMP);
	return (error);
}

bool
kern_poll_maxfds(u_int nfds)
{

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the system-wide limits.  We want to be
	 * reasonably safe, but not overly restrictive.
	 */
	return (nfds > maxfilesperproc && nfds > FD_SETSIZE);
}

static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
	int n, error;
	bool only_user;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	only_user = FILEDESC_IS_ONLY_USER(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared, the event didn't fire. */
		if (si != NULL)
			continue;
		if (only_user)
			error = fget_only_user(fdp, fd->fd,
			    &cap_event_rights, &fp);
		else
			error = fget_unlocked(td, fd->fd,
			    &cap_event_rights, &fp);
		if (__predict_false(error != 0)) {
			fd->revents = POLLNVAL;
			n++;
			continue;
		}
		/*
		 * Note: the backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (only_user)
			fput_only_user(fdp, fp);
		else
			fdrop(fp, td);
		if (fd->revents != 0)
			n++;
	}
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

static int
pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
{
	int error = 0;
	u_int i = 0;
	u_int n = 0;

	for (i = 0; i < nfd; i++) {
		error = copyout(&fds->revents, &ufds->revents,
		    sizeof(ufds->revents));
		if (error)
			return (error);
		if (fds->revents != 0)
			n++;
		fds++;
		ufds++;
	}
	td->td_retval[0] = n;
	return (0);
}

static int
pollscan(struct thread *td, struct pollfd *fds, u_int nfd)
{
	struct filedesc *fdp;
	struct file *fp;
	int i, n, error;
	bool only_user;

	n = 0;
	fdp = td->td_proc->p_fd;
	only_user = FILEDESC_IS_ONLY_USER(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd < 0) {
			fds->revents = 0;
			continue;
		}
		if (only_user)
			error = fget_only_user(fdp, fds->fd,
			    &cap_event_rights, &fp);
		else
			error = fget_unlocked(td, fds->fd,
			    &cap_event_rights, &fp);
		if (__predict_false(error != 0)) {
			fds->revents = POLLNVAL;
			n++;
			continue;
		}
		/*
		 * Note: the backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		selfdalloc(td, fds);
		fds->revents = fo_poll(fp, fds->events,
		    td->td_ucred, td);
		if (only_user)
			fput_only_user(fdp, fp);
		else
			fdrop(fp, td);
		/*
		 * POSIX requires that POLLOUT never be set
		 * simultaneously with POLLHUP.
		 */
		if ((fds->revents & POLLHUP) != 0)
			fds->revents &= ~POLLOUT;

		if (fds->revents != 0)
			n++;
	}
	td->td_retval[0] = n;
	return (0);
}

/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	int error;

	precision = 0;	/* stupid gcc! */
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000)
			return (EINVAL);
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		selfdalloc(td, NULL);
		if (sopoll(so, events, NULL, td) != 0) {
			error = 0;
			break;
		}
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
	}
	seltdclear(td);
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}

/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp->st_free1 == NULL)
		stp->st_free1 = malloc(sizeof(*stp->st_free1), M_SELFD,
		    M_WAITOK | M_ZERO);
	stp->st_free1->sf_td = stp;
	stp->st_free1->sf_cookie = cookie;
	if (stp->st_free2 == NULL)
		stp->st_free2 = malloc(sizeof(*stp->st_free2), M_SELFD,
		    M_WAITOK | M_ZERO);
	stp->st_free2->sf_td = stp;
	stp->st_free2->sf_cookie = cookie;
}

static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	/*
	 * Paired with doselwakeup.
	 */
	if (atomic_load_acq_ptr((uintptr_t *)&sfp->sf_si) != (uintptr_t)NULL) {
		mtx_lock(sfp->sf_mtx);
		if (sfp->sf_si != NULL) {
			TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
		}
		mtx_unlock(sfp->sf_mtx);
	}
	free(sfp, M_SELFD);
}

/* Drain the waiters tied to all the selfds belonging to the specified selinfo. */
void
seldrain(struct selinfo *sip)
{

	/*
	 * This feature is already provided by doselwakeup(), thus it is
	 * enough to call it.
	 * Eventually, the caller should take care to avoid races
	 * between a thread calling select()/poll() and file descriptor
	 * detaching, but, again, the races are just the same as for
	 * selwakeup().
	 */
	doselwakeup(sip, -1);
}

/*
 * Record a select request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct selfd *sfp;
	struct seltd *stp;
	struct mtx *mtxp;

	stp = selector->td_sel;
	/*
	 * Don't record when doing a rescan.
	 */
	if (stp->st_flags & SELTD_RESCAN)
		return;
	/*
	 * Grab one of the preallocated descriptors.
	 */
	sfp = NULL;
	if ((sfp = stp->st_free1) != NULL)
		stp->st_free1 = NULL;
	else if ((sfp = stp->st_free2) != NULL)
		stp->st_free2 = NULL;
	else
		panic("selrecord: No free selfd on selq");
	mtxp = sip->si_mtx;
	if (mtxp == NULL)
		mtxp = mtx_pool_find(mtxpool_select, sip);
	/*
	 * Initialize the sfp and queue it in the thread.
	 */
	sfp->sf_si = sip;
	sfp->sf_mtx = mtxp;
	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
	/*
	 * Now that we've locked the sip, check for initialization.
	 */
	mtx_lock(mtxp);
	if (sip->si_mtx == NULL) {
		sip->si_mtx = mtxp;
		TAILQ_INIT(&sip->si_tdlist);
	}
	/*
	 * Add this thread to the list of selfds listening on this selinfo.
	 */
	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
	mtx_unlock(sip->si_mtx);
}

/* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{
	doselwakeup(sip, pri);
}
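
/*
 * Typical driver usage (a sketch only; 'sc' and data_available() are
 * hypothetical, sc_rsel is a selinfo embedded in a driver softc):
 *
 *	static int
 *	foo_poll(struct cdev *dev, int events, struct thread *td)
 *	{
 *		int revents = 0;
 *
 *		if (data_available(sc))
 *			revents = events & (POLLIN | POLLRDNORM);
 *		else
 *			selrecord(td, &sc->sc_rsel);
 *		return (revents);
 *	}
 *
 * and the interrupt path calls selwakeuppri(&sc->sc_rsel, PZERO) (or
 * plain selwakeup()) once data arrives, which lands in doselwakeup()
 * below.
 */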

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(struct selinfo *sip, int pri)
{
	struct selfd *sfp;
	struct selfd *sfn;
	struct seltd *stp;

	/* If it's not initialized, there can't be any waiters. */
	if (sip->si_mtx == NULL)
		return;
	/*
	 * Locking the selinfo locks all selfds associated with it.
	 */
	mtx_lock(sip->si_mtx);
	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
		/*
		 * Once we remove this sfp from the list and clear the
		 * sf_si, seltdclear will know to ignore this si.
		 */
		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
		stp = sfp->sf_td;
		mtx_lock(&stp->st_mtx);
		stp->st_flags |= SELTD_PENDING;
		cv_broadcastpri(&stp->st_wait, pri);
		mtx_unlock(&stp->st_mtx);
		/*
		 * Paired with selfdfree.
		 *
		 * Storing this only after the wakeup provides an invariant
		 * that stp is not used after selfdfree returns.
		 */
		atomic_store_rel_ptr((uintptr_t *)&sfp->sf_si,
		    (uintptr_t)NULL);
	}
	mtx_unlock(sip->si_mtx);
}

static void
seltdinit(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp != NULL) {
		MPASS(stp->st_flags == 0);
		MPASS(STAILQ_EMPTY(&stp->st_selq));
		return;
	}
	stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK | M_ZERO);
	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
	cv_init(&stp->st_wait, "select");
	stp->st_flags = 0;
	STAILQ_INIT(&stp->st_selq);
	td->td_sel = stp;
}

static int
seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
{
	struct seltd *stp;
	int error;

	stp = td->td_sel;
	/*
	 * An event of interest may occur while we do not hold the seltd
	 * locked, so check the pending flag before we sleep.
	 */
	mtx_lock(&stp->st_mtx);
	/*
	 * Any further calls to selrecord will be a rescan.
	 */
	stp->st_flags |= SELTD_RESCAN;
	if (stp->st_flags & SELTD_PENDING) {
		mtx_unlock(&stp->st_mtx);
		return (0);
	}
	if (sbt == 0)
		error = EWOULDBLOCK;
	else if (sbt != -1)
		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
		    sbt, precision, C_ABSOLUTE);
	else
		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
	mtx_unlock(&stp->st_mtx);

	return (error);
}

void
seltdfini(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp == NULL)
		return;
	MPASS(stp->st_flags == 0);
	MPASS(STAILQ_EMPTY(&stp->st_selq));
	if (stp->st_free1)
		free(stp->st_free1, M_SELFD);
	if (stp->st_free2)
		free(stp->st_free2, M_SELFD);
	td->td_sel = NULL;
	cv_destroy(&stp->st_wait);
	mtx_destroy(&stp->st_mtx);
	free(stp, M_SELECT);
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 */
static void
seltdclear(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;

	stp = td->td_sel;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
		selfdfree(stp, sfp);
	stp->st_flags = 0;
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{

	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}

/*
 * Set up a syscall return value that follows the convention specified for
 * posix_* functions.
 */
int
kern_posix_error(struct thread *td, int error)
{

	if (error <= 0)
		return (error);
	td->td_errno = error;
	td->td_pflags |= TDP_NERRNO;
	td->td_retval[0] = error;
	return (0);
}
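
/*
 * For instance, posix_fallocate(2) is specified to return the error
 * number directly rather than setting errno, so the result of
 * kern_posix_fallocate() above is filtered through this helper: the
 * error lands in td_retval[0] and the syscall itself appears to
 * succeed.
 */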

int
kcmp_cmp(uintptr_t a, uintptr_t b)
{
	if (a == b)
		return (0);
	else if (a < b)
		return (1);
	return (2);
}

static int
kcmp_pget(struct thread *td, pid_t pid, struct proc **pp)
{
	if (pid == td->td_proc->p_pid) {
		*pp = td->td_proc;
		return (0);
	}
	return (pget(pid, PGET_CANDEBUG | PGET_NOTWEXIT | PGET_HOLD, pp));
}

int
kern_kcmp(struct thread *td, pid_t pid1, pid_t pid2, int type,
    uintptr_t idx1, uintptr_t idx2)
{
	struct proc *p1, *p2;
	struct file *fp1, *fp2;
	int error, res;

	res = -1;
	p1 = p2 = NULL;
	error = kcmp_pget(td, pid1, &p1);
	if (error == 0)
		error = kcmp_pget(td, pid2, &p2);
	if (error != 0)
		goto out;

	switch (type) {
	case KCMP_FILE:
	case KCMP_FILEOBJ:
		error = fget_remote(td, p1, idx1, &fp1);
		if (error == 0) {
			error = fget_remote(td, p2, idx2, &fp2);
			if (error == 0) {
				if (type == KCMP_FILEOBJ)
					res = fo_cmp(fp1, fp2, td);
				else
					res = kcmp_cmp((uintptr_t)fp1,
					    (uintptr_t)fp2);
				fdrop(fp2, td);
			}
			fdrop(fp1, td);
		}
		break;
	case KCMP_FILES:
		res = kcmp_cmp((uintptr_t)p1->p_fd, (uintptr_t)p2->p_fd);
		break;
	case KCMP_SIGHAND:
		res = kcmp_cmp((uintptr_t)p1->p_sigacts,
		    (uintptr_t)p2->p_sigacts);
		break;
	case KCMP_VM:
		res = kcmp_cmp((uintptr_t)p1->p_vmspace,
		    (uintptr_t)p2->p_vmspace);
		break;
	default:
		error = EINVAL;
		break;
	}

out:
	if (p1 != NULL && p1 != td->td_proc)
		PRELE(p1);
	if (p2 != NULL && p2 != td->td_proc)
		PRELE(p2);

	td->td_retval[0] = res;
	return (error);
}

int
sys_kcmp(struct thread *td, struct kcmp_args *uap)
{
	return (kern_kcmp(td, uap->pid1, uap->pid2, uap->type,
	    uap->idx1, uap->idx2));
}

int
file_kcmp_generic(struct file *fp1, struct file *fp2, struct thread *td)
{
	if (fp1->f_type != fp2->f_type)
		return (3);
	return (kcmp_cmp((uintptr_t)fp1->f_data, (uintptr_t)fp2->f_data));
}