1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 37 #include "opt_capsicum.h" 38 #include "opt_ktrace.h" 39 40 #define EXTERR_CATEGORY EXTERR_CAT_GENIO 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/sysproto.h> 44 #include <sys/capsicum.h> 45 #include <sys/exterrvar.h> 46 #include <sys/filedesc.h> 47 #include <sys/filio.h> 48 #include <sys/fcntl.h> 49 #include <sys/file.h> 50 #include <sys/inotify.h> 51 #include <sys/lock.h> 52 #include <sys/proc.h> 53 #include <sys/signalvar.h> 54 #include <sys/protosw.h> 55 #include <sys/socketvar.h> 56 #include <sys/uio.h> 57 #include <sys/eventfd.h> 58 #include <sys/kernel.h> 59 #include <sys/ktr.h> 60 #include <sys/limits.h> 61 #include <sys/malloc.h> 62 #include <sys/poll.h> 63 #include <sys/resourcevar.h> 64 #include <sys/selinfo.h> 65 #include <sys/sleepqueue.h> 66 #include <sys/specialfd.h> 67 #include <sys/syscallsubr.h> 68 #include <sys/sysctl.h> 69 #include <sys/sysent.h> 70 #include <sys/vnode.h> 71 #include <sys/unistd.h> 72 #include <sys/bio.h> 73 #include <sys/buf.h> 74 #include <sys/condvar.h> 75 #ifdef KTRACE 76 #include <sys/ktrace.h> 77 #endif 78 79 #include <security/audit/audit.h> 80 81 /* 82 * The following macro defines how many bytes will be allocated on 83 * the stack instead of from the heap when copying IOCTL data 84 * structures between userspace and the kernel. Some IOCTLs with 85 * small data structures are used very frequently, and this small 86 * buffer on the stack gives a significant speedup for 87 * those requests.
The value of this define should be greater than or equal 88 * to 64 bytes and should also be a power of two. The data structure is 89 * currently hard-aligned to an 8-byte boundary on the stack. This 90 * should currently be sufficient for all supported platforms. 91 */ 92 #define SYS_IOCTL_SMALL_SIZE 128 /* bytes */ 93 #define SYS_IOCTL_SMALL_ALIGN 8 /* bytes */ 94 95 #ifdef __LP64__ 96 static int iosize_max_clamp = 0; 97 SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW, 98 &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX"); 99 static int devfs_iosize_max_clamp = 1; 100 SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW, 101 &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices"); 102 #endif 103 104 /* 105 * Assert that the return value of read(2) and write(2) syscalls fits 106 * into a register. If not, an architecture will need to provide the 107 * usermode wrappers to reconstruct the result. 108 */ 109 CTASSERT(sizeof(register_t) >= sizeof(size_t)); 110 111 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 112 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 113 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 114 115 static int pollout(struct thread *, struct pollfd *, struct pollfd *, 116 u_int); 117 static int pollscan(struct thread *, struct pollfd *, u_int); 118 static int pollrescan(struct thread *); 119 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 120 static int selrescan(struct thread *, fd_mask **, fd_mask **); 121 static void selfdalloc(struct thread *, void *); 122 static void selfdfree(struct seltd *, struct selfd *); 123 static int dofileread(struct thread *, int, struct file *, struct uio *, 124 off_t, int); 125 static int dofilewrite(struct thread *, int, struct file *, struct uio *, 126 off_t, int); 127 static void doselwakeup(struct selinfo *, int); 128 static void seltdinit(struct thread *); 129 static int seltdwait(struct thread *, sbintime_t, sbintime_t); 130 static void seltdclear(struct thread *); 131 132 /* 133 * One seltd per thread, allocated on demand. 134 * 135 * t - protected by st_mtx 136 * k - Only accessed by curthread or read-only 137 */ 138 struct seltd { 139 STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */ 140 struct selfd *st_free1; /* (k) free fd for read set. */ 141 struct selfd *st_free2; /* (k) free fd for write set. */ 142 struct mtx st_mtx; /* Protects struct seltd */ 143 struct cv st_wait; /* (t) Wait channel. */ 144 int st_flags; /* (t) SELTD_ flags. */ 145 }; 146 147 #define SELTD_PENDING 0x0001 /* We have pending events. */ 148 #define SELTD_RESCAN 0x0002 /* Doing a rescan. */ 149 150 /* 151 * One selfd allocated per-thread per-file-descriptor. 152 * f - protected by sf_mtx 153 */ 154 struct selfd { 155 STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */ 156 TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */ 157 struct selinfo *sf_si; /* (f) selinfo when linked. */ 158 struct mtx *sf_mtx; /* Pointer to selinfo mtx. */ 159 struct seltd *sf_td; /* (k) owning seltd. */ 160 void *sf_cookie; /* (k) fd or pollfd. */ 161 }; 162 163 MALLOC_DEFINE(M_SELFD, "selfd", "selfd"); 164 static struct mtx_pool *mtxpool_select; 165 166 #ifdef __LP64__ 167 size_t 168 devfs_iosize_max(void) 169 { 170 171 return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ? 172 INT_MAX : SSIZE_MAX); 173 } 174 175 size_t 176 iosize_max(void) 177 { 178 179 return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
180 INT_MAX : SSIZE_MAX); 181 } 182 #endif 183 184 #ifndef _SYS_SYSPROTO_H_ 185 struct read_args { 186 int fd; 187 void *buf; 188 size_t nbyte; 189 }; 190 #endif 191 int 192 sys_read(struct thread *td, struct read_args *uap) 193 { 194 struct uio auio; 195 struct iovec aiov; 196 int error; 197 198 if (uap->nbyte > IOSIZE_MAX) 199 return (EXTERROR(EINVAL, "length > iosize_max")); 200 aiov.iov_base = uap->buf; 201 aiov.iov_len = uap->nbyte; 202 auio.uio_iov = &aiov; 203 auio.uio_iovcnt = 1; 204 auio.uio_resid = uap->nbyte; 205 auio.uio_segflg = UIO_USERSPACE; 206 error = kern_readv(td, uap->fd, &auio); 207 return (error); 208 } 209 210 /* 211 * Positioned read system call 212 */ 213 #ifndef _SYS_SYSPROTO_H_ 214 struct pread_args { 215 int fd; 216 void *buf; 217 size_t nbyte; 218 int pad; 219 off_t offset; 220 }; 221 #endif 222 int 223 sys_pread(struct thread *td, struct pread_args *uap) 224 { 225 226 return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); 227 } 228 229 int 230 kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset) 231 { 232 struct uio auio; 233 struct iovec aiov; 234 int error; 235 236 if (nbyte > IOSIZE_MAX) 237 return (EXTERROR(EINVAL, "length > iosize_max")); 238 aiov.iov_base = buf; 239 aiov.iov_len = nbyte; 240 auio.uio_iov = &aiov; 241 auio.uio_iovcnt = 1; 242 auio.uio_resid = nbyte; 243 auio.uio_segflg = UIO_USERSPACE; 244 error = kern_preadv(td, fd, &auio, offset); 245 return (error); 246 } 247 248 #if defined(COMPAT_FREEBSD6) 249 int 250 freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap) 251 { 252 253 return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); 254 } 255 #endif 256 257 /* 258 * Scatter read system call. 259 */ 260 #ifndef _SYS_SYSPROTO_H_ 261 struct readv_args { 262 int fd; 263 struct iovec *iovp; 264 u_int iovcnt; 265 }; 266 #endif 267 int 268 sys_readv(struct thread *td, struct readv_args *uap) 269 { 270 struct uio *auio; 271 int error; 272 273 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 274 if (error) 275 return (error); 276 error = kern_readv(td, uap->fd, auio); 277 freeuio(auio); 278 return (error); 279 } 280 281 int 282 kern_readv(struct thread *td, int fd, struct uio *auio) 283 { 284 struct file *fp; 285 int error; 286 287 error = fget_read(td, fd, &cap_read_rights, &fp); 288 if (error) 289 return (error); 290 error = dofileread(td, fd, fp, auio, (off_t)-1, 0); 291 fdrop(fp, td); 292 return (error); 293 } 294 295 /* 296 * Scatter positioned read system call. 
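 *
 * Illustrative userspace sketch (not part of the kernel; the fd and
 * buffers are made up): preadv(2) fills several buffers from an
 * explicit file offset without moving the descriptor's seek position.
 *
 *	char hdr[16], body[4096];
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr, .iov_len = sizeof(hdr) },
 *		{ .iov_base = body, .iov_len = sizeof(body) },
 *	};
 *	ssize_t n = preadv(fd, iov, 2, (off_t)0);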
297 */ 298 #ifndef _SYS_SYSPROTO_H_ 299 struct preadv_args { 300 int fd; 301 struct iovec *iovp; 302 u_int iovcnt; 303 off_t offset; 304 }; 305 #endif 306 int 307 sys_preadv(struct thread *td, struct preadv_args *uap) 308 { 309 struct uio *auio; 310 int error; 311 312 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 313 if (error) 314 return (error); 315 error = kern_preadv(td, uap->fd, auio, uap->offset); 316 freeuio(auio); 317 return (error); 318 } 319 320 int 321 kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset) 322 { 323 struct file *fp; 324 int error; 325 326 error = fget_read(td, fd, &cap_pread_rights, &fp); 327 if (error) 328 return (error); 329 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 330 error = ESPIPE; 331 else if (offset < 0 && 332 (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) 333 error = EXTERROR(EINVAL, "neg offset"); 334 else 335 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); 336 fdrop(fp, td); 337 return (error); 338 } 339 340 /* 341 * Common code for readv and preadv that reads data in 342 * from a file using the passed in uio, offset, and flags. 343 */ 344 static int 345 dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio, 346 off_t offset, int flags) 347 { 348 ssize_t cnt; 349 int error; 350 #ifdef KTRACE 351 struct uio *ktruio = NULL; 352 #endif 353 354 AUDIT_ARG_FD(fd); 355 356 /* Finish zero length reads right here */ 357 if (auio->uio_resid == 0) { 358 td->td_retval[0] = 0; 359 return (0); 360 } 361 auio->uio_rw = UIO_READ; 362 auio->uio_offset = offset; 363 auio->uio_td = td; 364 #ifdef KTRACE 365 if (KTRPOINT(td, KTR_GENIO)) 366 ktruio = cloneuio(auio); 367 #endif 368 cnt = auio->uio_resid; 369 if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { 370 if (auio->uio_resid != cnt && (error == ERESTART || 371 error == EINTR || error == EWOULDBLOCK)) 372 error = 0; 373 } 374 cnt -= auio->uio_resid; 375 #ifdef KTRACE 376 if (ktruio != NULL) { 377 ktruio->uio_resid = cnt; 378 ktrgenio(fd, UIO_READ, ktruio, error); 379 } 380 #endif 381 td->td_retval[0] = cnt; 382 return (error); 383 } 384 385 #ifndef _SYS_SYSPROTO_H_ 386 struct write_args { 387 int fd; 388 const void *buf; 389 size_t nbyte; 390 }; 391 #endif 392 int 393 sys_write(struct thread *td, struct write_args *uap) 394 { 395 struct uio auio; 396 struct iovec aiov; 397 int error; 398 399 if (uap->nbyte > IOSIZE_MAX) 400 return (EXTERROR(EINVAL, "length > iosize_max")); 401 aiov.iov_base = (void *)(uintptr_t)uap->buf; 402 aiov.iov_len = uap->nbyte; 403 auio.uio_iov = &aiov; 404 auio.uio_iovcnt = 1; 405 auio.uio_resid = uap->nbyte; 406 auio.uio_segflg = UIO_USERSPACE; 407 error = kern_writev(td, uap->fd, &auio); 408 return (error); 409 } 410 411 /* 412 * Positioned write system call. 
413 */ 414 #ifndef _SYS_SYSPROTO_H_ 415 struct pwrite_args { 416 int fd; 417 const void *buf; 418 size_t nbyte; 419 int pad; 420 off_t offset; 421 }; 422 #endif 423 int 424 sys_pwrite(struct thread *td, struct pwrite_args *uap) 425 { 426 427 return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); 428 } 429 430 int 431 kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte, 432 off_t offset) 433 { 434 struct uio auio; 435 struct iovec aiov; 436 int error; 437 438 if (nbyte > IOSIZE_MAX) 439 return (EXTERROR(EINVAL, "length > iosize_max")); 440 aiov.iov_base = (void *)(uintptr_t)buf; 441 aiov.iov_len = nbyte; 442 auio.uio_iov = &aiov; 443 auio.uio_iovcnt = 1; 444 auio.uio_resid = nbyte; 445 auio.uio_segflg = UIO_USERSPACE; 446 error = kern_pwritev(td, fd, &auio, offset); 447 return (error); 448 } 449 450 #if defined(COMPAT_FREEBSD6) 451 int 452 freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap) 453 { 454 455 return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); 456 } 457 #endif 458 459 /* 460 * Gather write system call. 461 */ 462 #ifndef _SYS_SYSPROTO_H_ 463 struct writev_args { 464 int fd; 465 struct iovec *iovp; 466 u_int iovcnt; 467 }; 468 #endif 469 int 470 sys_writev(struct thread *td, struct writev_args *uap) 471 { 472 struct uio *auio; 473 int error; 474 475 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 476 if (error) 477 return (error); 478 error = kern_writev(td, uap->fd, auio); 479 freeuio(auio); 480 return (error); 481 } 482 483 int 484 kern_writev(struct thread *td, int fd, struct uio *auio) 485 { 486 struct file *fp; 487 int error; 488 489 error = fget_write(td, fd, &cap_write_rights, &fp); 490 if (error) 491 return (error); 492 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); 493 fdrop(fp, td); 494 return (error); 495 } 496 497 /* 498 * Gather positioned write system call. 499 */ 500 #ifndef _SYS_SYSPROTO_H_ 501 struct pwritev_args { 502 int fd; 503 struct iovec *iovp; 504 u_int iovcnt; 505 off_t offset; 506 }; 507 #endif 508 int 509 sys_pwritev(struct thread *td, struct pwritev_args *uap) 510 { 511 struct uio *auio; 512 int error; 513 514 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 515 if (error) 516 return (error); 517 error = kern_pwritev(td, uap->fd, auio, uap->offset); 518 freeuio(auio); 519 return (error); 520 } 521 522 int 523 kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset) 524 { 525 struct file *fp; 526 int error; 527 528 error = fget_write(td, fd, &cap_pwrite_rights, &fp); 529 if (error) 530 return (error); 531 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 532 error = ESPIPE; 533 else if (offset < 0 && 534 (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) 535 error = EXTERROR(EINVAL, "neg offset"); 536 else 537 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); 538 fdrop(fp, td); 539 return (error); 540 } 541 542 /* 543 * Common code for writev and pwritev that writes data to 544 * a file using the passed in uio, offset, and flags. 
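 *
 * A write interrupted by a signal (ERESTART/EINTR/EWOULDBLOCK) after
 * transferring some data is reported as a short write instead of an
 * error, and EPIPE raises SIGPIPE here for everything except sockets,
 * which handle that in sousrsend().  Illustrative userspace gather
 * write (buffer names made up):
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr, .iov_len = hdrlen },
 *		{ .iov_base = payload, .iov_len = paylen },
 *	};
 *	ssize_t n = writev(fd, iov, 2);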
545 */ 546 static int 547 dofilewrite(struct thread *td, int fd, struct file *fp, struct uio *auio, 548 off_t offset, int flags) 549 { 550 ssize_t cnt; 551 int error; 552 #ifdef KTRACE 553 struct uio *ktruio = NULL; 554 #endif 555 556 AUDIT_ARG_FD(fd); 557 auio->uio_rw = UIO_WRITE; 558 auio->uio_td = td; 559 auio->uio_offset = offset; 560 #ifdef KTRACE 561 if (KTRPOINT(td, KTR_GENIO)) 562 ktruio = cloneuio(auio); 563 #endif 564 cnt = auio->uio_resid; 565 error = fo_write(fp, auio, td->td_ucred, flags, td); 566 /* 567 * Socket layer is responsible for special error handling, 568 * see sousrsend(). 569 */ 570 if (error != 0 && fp->f_type != DTYPE_SOCKET) { 571 if (auio->uio_resid != cnt && (error == ERESTART || 572 error == EINTR || error == EWOULDBLOCK)) 573 error = 0; 574 if (error == EPIPE) { 575 PROC_LOCK(td->td_proc); 576 tdsignal(td, SIGPIPE); 577 PROC_UNLOCK(td->td_proc); 578 } 579 } 580 cnt -= auio->uio_resid; 581 #ifdef KTRACE 582 if (ktruio != NULL) { 583 if (error == 0) 584 ktruio->uio_resid = cnt; 585 ktrgenio(fd, UIO_WRITE, ktruio, error); 586 } 587 #endif 588 td->td_retval[0] = cnt; 589 return (error); 590 } 591 592 /* 593 * Truncate a file given a file descriptor. 594 * 595 * Can't use fget_write() here, since must return EINVAL and not EBADF if the 596 * descriptor isn't writable. 597 */ 598 int 599 kern_ftruncate(struct thread *td, int fd, off_t length) 600 { 601 struct file *fp; 602 int error; 603 604 AUDIT_ARG_FD(fd); 605 if (length < 0) 606 return (EXTERROR(EINVAL, "negative length")); 607 error = fget(td, fd, &cap_ftruncate_rights, &fp); 608 if (error) 609 return (error); 610 AUDIT_ARG_FILE(td->td_proc, fp); 611 if (!(fp->f_flag & FWRITE)) { 612 fdrop(fp, td); 613 return (EXTERROR(EINVAL, "non-writable")); 614 } 615 error = fo_truncate(fp, length, td->td_ucred, td); 616 fdrop(fp, td); 617 return (error); 618 } 619 620 #ifndef _SYS_SYSPROTO_H_ 621 struct ftruncate_args { 622 int fd; 623 int pad; 624 off_t length; 625 }; 626 #endif 627 int 628 sys_ftruncate(struct thread *td, struct ftruncate_args *uap) 629 { 630 631 return (kern_ftruncate(td, uap->fd, uap->length)); 632 } 633 634 #if defined(COMPAT_43) 635 #ifndef _SYS_SYSPROTO_H_ 636 struct oftruncate_args { 637 int fd; 638 long length; 639 }; 640 #endif 641 int 642 oftruncate(struct thread *td, struct oftruncate_args *uap) 643 { 644 645 return (kern_ftruncate(td, uap->fd, uap->length)); 646 } 647 #endif /* COMPAT_43 */ 648 649 #ifndef _SYS_SYSPROTO_H_ 650 struct ioctl_args { 651 int fd; 652 u_long com; 653 caddr_t data; 654 }; 655 #endif 656 /* ARGSUSED */ 657 int 658 sys_ioctl(struct thread *td, struct ioctl_args *uap) 659 { 660 u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN); 661 uint32_t com; 662 int arg, error; 663 u_int size; 664 caddr_t data; 665 666 #ifdef INVARIANTS 667 if (uap->com > 0xffffffff) { 668 printf( 669 "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", 670 td->td_proc->p_pid, td->td_name, uap->com); 671 } 672 #endif 673 com = (uint32_t)uap->com; 674 675 /* 676 * Interpret high order word to find amount of data to be 677 * copied to/from the user's address space. 
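 *
 * The encoding is described in <sys/ioccom.h>: the low 16 bits hold
 * the command group and number, the next 13 bits the parameter
 * length, and the top three bits the direction (IOC_VOID, IOC_OUT,
 * IOC_IN).  Worked example: FIONBIO is _IOW('f', 126, int), so
 * IOCPARM_LEN(FIONBIO) == sizeof(int) and (FIONBIO & IOC_IN) != 0,
 * meaning an int is copied in from userspace below.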
678 */ 679 size = IOCPARM_LEN(com); 680 if ((size > IOCPARM_MAX) || 681 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || 682 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) 683 ((com & IOC_OUT) && size == 0) || 684 #else 685 ((com & (IOC_IN | IOC_OUT)) && size == 0) || 686 #endif 687 ((com & IOC_VOID) && size > 0 && size != sizeof(int))) 688 return (ENOTTY); 689 690 if (size > 0) { 691 if (com & IOC_VOID) { 692 /* Integer argument. */ 693 arg = (intptr_t)uap->data; 694 data = (void *)&arg; 695 size = 0; 696 } else { 697 if (size > SYS_IOCTL_SMALL_SIZE) 698 data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 699 else 700 data = smalldata; 701 } 702 } else 703 data = (void *)&uap->data; 704 if (com & IOC_IN) { 705 error = copyin(uap->data, data, (u_int)size); 706 if (error != 0) 707 goto out; 708 } else if (com & IOC_OUT) { 709 /* 710 * Zero the buffer so the user always 711 * gets back something deterministic. 712 */ 713 bzero(data, size); 714 } 715 716 error = kern_ioctl(td, uap->fd, com, data); 717 718 if (error == 0 && (com & IOC_OUT)) 719 error = copyout(data, uap->data, (u_int)size); 720 721 out: 722 if (size > SYS_IOCTL_SMALL_SIZE) 723 free(data, M_IOCTLOPS); 724 return (error); 725 } 726 727 int 728 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) 729 { 730 struct file *fp; 731 struct filedesc *fdp; 732 int error, tmp, locked; 733 734 AUDIT_ARG_FD(fd); 735 AUDIT_ARG_CMD(com); 736 737 fdp = td->td_proc->p_fd; 738 739 switch (com) { 740 case FIONCLEX: 741 case FIOCLEX: 742 FILEDESC_XLOCK(fdp); 743 locked = LA_XLOCKED; 744 break; 745 default: 746 #ifdef CAPABILITIES 747 FILEDESC_SLOCK(fdp); 748 locked = LA_SLOCKED; 749 #else 750 locked = LA_UNLOCKED; 751 #endif 752 break; 753 } 754 755 #ifdef CAPABILITIES 756 if ((fp = fget_noref(fdp, fd)) == NULL) { 757 error = EBADF; 758 goto out; 759 } 760 if ((error = cap_ioctl_check(fdp, fd, com)) != 0) { 761 fp = NULL; /* fhold() was not called yet */ 762 goto out; 763 } 764 if (!fhold(fp)) { 765 error = EBADF; 766 fp = NULL; 767 goto out; 768 } 769 if (locked == LA_SLOCKED) { 770 FILEDESC_SUNLOCK(fdp); 771 locked = LA_UNLOCKED; 772 } 773 #else 774 error = fget(td, fd, &cap_ioctl_rights, &fp); 775 if (error != 0) { 776 fp = NULL; 777 goto out; 778 } 779 #endif 780 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 781 error = EBADF; 782 goto out; 783 } 784 785 switch (com) { 786 case FIONCLEX: 787 fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE; 788 goto out; 789 case FIOCLEX: 790 fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE; 791 goto out; 792 case FIONBIO: 793 if ((tmp = *(int *)data)) 794 atomic_set_int(&fp->f_flag, FNONBLOCK); 795 else 796 atomic_clear_int(&fp->f_flag, FNONBLOCK); 797 data = (void *)&tmp; 798 break; 799 case FIOASYNC: 800 if ((tmp = *(int *)data)) 801 atomic_set_int(&fp->f_flag, FASYNC); 802 else 803 atomic_clear_int(&fp->f_flag, FASYNC); 804 data = (void *)&tmp; 805 break; 806 } 807 808 error = fo_ioctl(fp, com, data, td->td_ucred, td); 809 out: 810 switch (locked) { 811 case LA_XLOCKED: 812 FILEDESC_XUNLOCK(fdp); 813 break; 814 #ifdef CAPABILITIES 815 case LA_SLOCKED: 816 FILEDESC_SUNLOCK(fdp); 817 break; 818 #endif 819 default: 820 FILEDESC_UNLOCK_ASSERT(fdp); 821 break; 822 } 823 if (fp != NULL) 824 fdrop(fp, td); 825 return (error); 826 } 827 828 int 829 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap) 830 { 831 int error; 832 833 error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len); 834 return (kern_posix_error(td, error)); 835 } 836 837 int 838 
kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len) 839 { 840 struct file *fp; 841 int error; 842 843 AUDIT_ARG_FD(fd); 844 if (offset < 0) 845 return (EXTERROR(EINVAL, "negative offset")); 846 if (len <= 0) 847 return (EXTERROR(EINVAL, "negative length")); 848 /* Check for wrap. */ 849 if (offset > OFF_MAX - len) 850 return (EFBIG); 852 error = fget(td, fd, &cap_pwrite_rights, &fp); 853 if (error != 0) 854 return (error); 855 AUDIT_ARG_FILE(td->td_proc, fp); 856 if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { 857 error = ESPIPE; 858 goto out; 859 } 860 if ((fp->f_flag & FWRITE) == 0) { 861 error = EBADF; 862 goto out; 863 } 864 865 error = fo_fallocate(fp, offset, len, td); 866 out: 867 fdrop(fp, td); 868 return (error); 869 } 870 871 int 872 sys_fspacectl(struct thread *td, struct fspacectl_args *uap) 873 { 874 struct spacectl_range rqsr, rmsr; 875 int error, cerror; 876 877 error = copyin(uap->rqsr, &rqsr, sizeof(rqsr)); 878 if (error != 0) 879 return (error); 880 881 error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags, 882 &rmsr); 883 if (uap->rmsr != NULL) { 884 cerror = copyout(&rmsr, uap->rmsr, sizeof(rmsr)); 885 if (error == 0) 886 error = cerror; 887 } 888 return (error); 889 } 890 891 int 892 kern_fspacectl(struct thread *td, int fd, int cmd, 893 const struct spacectl_range *rqsr, int flags, struct spacectl_range *rmsrp) 894 { 895 struct file *fp; 896 struct spacectl_range rmsr; 897 int error; 898 899 AUDIT_ARG_FD(fd); 900 AUDIT_ARG_CMD(cmd); 901 AUDIT_ARG_FFLAGS(flags); 902 903 if (rqsr == NULL) 904 return (EXTERROR(EINVAL, "no range")); 905 rmsr = *rqsr; 906 if (rmsrp != NULL) 907 *rmsrp = rmsr; 908 909 if (cmd != SPACECTL_DEALLOC) 910 return (EXTERROR(EINVAL, "cmd", cmd)); 911 if (rqsr->r_offset < 0) 912 return (EXTERROR(EINVAL, "neg offset")); 913 if (rqsr->r_len <= 0) 914 return (EXTERROR(EINVAL, "neg len")); 915 if (rqsr->r_offset > OFF_MAX - rqsr->r_len) 916 return (EXTERROR(EINVAL, "offset too large")); 917 if ((flags & ~SPACECTL_F_SUPPORTED) != 0) 918 return (EXTERROR(EINVAL, "reserved flags", flags)); 919 920 error = fget_write(td, fd, &cap_pwrite_rights, &fp); 921 if (error != 0) 922 return (error); 923 AUDIT_ARG_FILE(td->td_proc, fp); 924 if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { 925 error = ESPIPE; 926 goto out; 927 } 928 if ((fp->f_flag & FWRITE) == 0) { 929 error = EBADF; 930 goto out; 931 } 932 933 error = fo_fspacectl(fp, cmd, &rmsr.r_offset, &rmsr.r_len, flags, 934 td->td_ucred, td); 935 /* fspacectl is not restarted after signals if the file is modified.
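 * Instead, the range actually processed is reported back through rmsr,
 * so an illustrative userspace caller (variable names made up) can
 * simply resubmit the remainder until the whole range is deallocated:
 *
 *	struct spacectl_range r = { .r_offset = off, .r_len = len };
 *	while (r.r_len > 0 &&
 *	    fspacectl(fd, SPACECTL_DEALLOC, &r, 0, &r) == 0)
 *		;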
*/ 936 if (rmsr.r_len != rqsr->r_len && (error == ERESTART || 937 error == EINTR || error == EWOULDBLOCK)) 938 error = 0; 939 if (rmsrp != NULL) 940 *rmsrp = rmsr; 941 out: 942 fdrop(fp, td); 943 return (error); 944 } 945 946 int 947 kern_specialfd(struct thread *td, int type, void *arg) 948 { 949 struct file *fp; 950 int error, fd, fflags; 951 952 fflags = 0; 953 error = falloc_noinstall(td, &fp); 954 if (error != 0) 955 return (error); 956 957 switch (type) { 958 case SPECIALFD_EVENTFD: { 959 struct specialfd_eventfd *ae; 960 961 ae = arg; 962 if ((ae->flags & EFD_CLOEXEC) != 0) 963 fflags |= O_CLOEXEC; 964 error = eventfd_create_file(td, fp, ae->initval, ae->flags); 965 break; 966 } 967 case SPECIALFD_INOTIFY: { 968 struct specialfd_inotify *si; 969 970 si = arg; 971 error = inotify_create_file(td, fp, si->flags, &fflags); 972 break; 973 } 974 default: 975 error = EXTERROR(EINVAL, "invalid type", type); 976 break; 977 } 978 979 if (error == 0) 980 error = finstall(td, fp, &fd, fflags, NULL); 981 fdrop(fp, td); 982 if (error == 0) 983 td->td_retval[0] = fd; 984 return (error); 985 } 986 987 int 988 sys___specialfd(struct thread *td, struct __specialfd_args *args) 989 { 990 int error; 991 992 switch (args->type) { 993 case SPECIALFD_EVENTFD: { 994 struct specialfd_eventfd ae; 995 996 if (args->len != sizeof(struct specialfd_eventfd)) { 997 error = EXTERROR(EINVAL, "eventfd params ABI"); 998 break; 999 } 1000 error = copyin(args->req, &ae, sizeof(ae)); 1001 if (error != 0) 1002 break; 1003 if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | 1004 EFD_SEMAPHORE)) != 0) { 1005 error = EXTERROR(EINVAL, "reserved flag"); 1006 break; 1007 } 1008 error = kern_specialfd(td, args->type, &ae); 1009 break; 1010 } 1011 case SPECIALFD_INOTIFY: { 1012 struct specialfd_inotify si; 1013 1014 if (args->len != sizeof(si)) { 1015 error = EINVAL; 1016 break; 1017 } 1018 error = copyin(args->req, &si, sizeof(si)); 1019 if (error != 0) 1020 break; 1021 error = kern_specialfd(td, args->type, &si); 1022 break; 1023 } 1024 default: 1025 error = EXTERROR(EINVAL, "unknown type", args->type); 1026 break; 1027 } 1028 return (error); 1029 } 1030 1031 int 1032 poll_no_poll(int events) 1033 { 1034 /* 1035 * Return true for read/write. If the user asked for something 1036 * special, return POLLNVAL, so that clients have a way of 1037 * determining reliably whether or not the extended 1038 * functionality is present without hard-coding knowledge 1039 * of specific filesystem implementations. 
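 *
 * For example, a caller asking for an extended event such as
 * POLLINIGNEOF on such a file gets POLLNVAL back in revents, while a
 * plain POLLIN | POLLOUT request is simply reported as ready below.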
1040 */ 1041 if (events & ~POLLSTANDARD) 1042 return (POLLNVAL); 1043 1044 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 1045 } 1046 1047 int 1048 sys_pselect(struct thread *td, struct pselect_args *uap) 1049 { 1050 struct timespec ts; 1051 struct timeval tv, *tvp; 1052 sigset_t set, *uset; 1053 int error; 1054 1055 if (uap->ts != NULL) { 1056 error = copyin(uap->ts, &ts, sizeof(ts)); 1057 if (error != 0) 1058 return (error); 1059 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1060 tvp = &tv; 1061 } else 1062 tvp = NULL; 1063 if (uap->sm != NULL) { 1064 error = copyin(uap->sm, &set, sizeof(set)); 1065 if (error != 0) 1066 return (error); 1067 uset = &set; 1068 } else 1069 uset = NULL; 1070 return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, 1071 uset, NFDBITS)); 1072 } 1073 1074 int 1075 kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex, 1076 struct timeval *tvp, sigset_t *uset, int abi_nfdbits) 1077 { 1078 int error; 1079 1080 if (uset != NULL) { 1081 error = kern_sigprocmask(td, SIG_SETMASK, uset, 1082 &td->td_oldsigmask, 0); 1083 if (error != 0) 1084 return (error); 1085 td->td_pflags |= TDP_OLDMASK; 1086 } 1087 error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits); 1088 if (uset != NULL) { 1089 /* 1090 * Make sure that ast() is called on return to 1091 * usermode and TDP_OLDMASK is cleared, restoring old 1092 * sigmask. If we didn't get interrupted, then the caller is 1093 * likely not expecting a signal to hit that should normally be 1094 * blocked by its signal mask, so we restore the mask before 1095 * any signals could be delivered. 1096 */ 1097 if (error == EINTR) { 1098 ast_sched(td, TDA_SIGSUSPEND); 1099 } else { 1100 /* *select(2) should never restart. */ 1101 MPASS(error != ERESTART); 1102 ast_sched(td, TDA_PSELECT); 1103 } 1104 } 1105 1106 return (error); 1107 } 1108 1109 #ifndef _SYS_SYSPROTO_H_ 1110 struct select_args { 1111 int nd; 1112 fd_set *in, *ou, *ex; 1113 struct timeval *tv; 1114 }; 1115 #endif 1116 int 1117 sys_select(struct thread *td, struct select_args *uap) 1118 { 1119 struct timeval tv, *tvp; 1120 int error; 1121 1122 if (uap->tv != NULL) { 1123 error = copyin(uap->tv, &tv, sizeof(tv)); 1124 if (error) 1125 return (error); 1126 tvp = &tv; 1127 } else 1128 tvp = NULL; 1129 1130 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, 1131 NFDBITS)); 1132 } 1133 1134 /* 1135 * In the unlikely case when the user specified n greater than the last 1136 * open file descriptor, check that no bits are set after the last 1137 * valid fd. We must return EBADF if any is set. 1138 * 1139 * There are applications that rely on this behaviour. 1140 * 1141 * nd is fd_nfiles.
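 *
 * Worked example: with fd_nfiles == 20, select(25, ...) scans bits
 * 20..24 of each passed set; a stray bit at fd 23 makes the call fail
 * with EBADF instead of the bit being silently ignored.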
1142 */ 1143 static int 1144 select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits) 1145 { 1146 char *addr, *oaddr; 1147 int b, i, res; 1148 uint8_t bits; 1149 1150 if (nd >= ndu || fd_in == NULL) 1151 return (0); 1152 1153 oaddr = NULL; 1154 bits = 0; /* silence gcc */ 1155 for (i = nd; i < ndu; i++) { 1156 b = i / NBBY; 1157 #if BYTE_ORDER == LITTLE_ENDIAN 1158 addr = (char *)fd_in + b; 1159 #else 1160 addr = (char *)fd_in; 1161 if (abi_nfdbits == NFDBITS) { 1162 addr += rounddown(b, sizeof(fd_mask)) + 1163 sizeof(fd_mask) - 1 - b % sizeof(fd_mask); 1164 } else { 1165 addr += rounddown(b, sizeof(uint32_t)) + 1166 sizeof(uint32_t) - 1 - b % sizeof(uint32_t); 1167 } 1168 #endif 1169 if (addr != oaddr) { 1170 res = fubyte(addr); 1171 if (res == -1) 1172 return (EFAULT); 1173 oaddr = addr; 1174 bits = res; 1175 } 1176 if ((bits & (1 << (i % NBBY))) != 0) 1177 return (EBADF); 1178 } 1179 return (0); 1180 } 1181 1182 int 1183 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, 1184 fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits) 1185 { 1186 struct filedesc *fdp; 1187 /* 1188 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 1189 * infds with the new FD_SETSIZE of 1024, and more than enough for 1190 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 1191 * of 256. 1192 */ 1193 fd_mask s_selbits[howmany(2048, NFDBITS)]; 1194 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 1195 struct timeval rtv; 1196 sbintime_t asbt, precision, rsbt; 1197 u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; 1198 int error, lf, ndu; 1199 1200 if (nd < 0) 1201 return (EXTERROR(EINVAL, "negative ndescs")); 1202 fdp = td->td_proc->p_fd; 1203 ndu = nd; 1204 lf = fdp->fd_nfiles; 1205 if (nd > lf) 1206 nd = lf; 1207 1208 error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits); 1209 if (error != 0) 1210 return (error); 1211 error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits); 1212 if (error != 0) 1213 return (error); 1214 error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits); 1215 if (error != 0) 1216 return (error); 1217 1218 /* 1219 * Allocate just enough bits for the non-null fd_sets. Use the 1220 * preallocated auto buffer if possible. 1221 */ 1222 nfdbits = roundup(nd, NFDBITS); 1223 ncpbytes = nfdbits / NBBY; 1224 ncpubytes = roundup(nd, abi_nfdbits) / NBBY; 1225 nbufbytes = 0; 1226 if (fd_in != NULL) 1227 nbufbytes += 2 * ncpbytes; 1228 if (fd_ou != NULL) 1229 nbufbytes += 2 * ncpbytes; 1230 if (fd_ex != NULL) 1231 nbufbytes += 2 * ncpbytes; 1232 if (nbufbytes <= sizeof s_selbits) 1233 selbits = &s_selbits[0]; 1234 else 1235 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 1236 1237 /* 1238 * Assign pointers into the bit buffers and fetch the input bits. 1239 * Put the output buffers together so that they can be bzeroed 1240 * together. 
1241 */ 1242 sbp = selbits; 1243 #define getbits(name, x) \ 1244 do { \ 1245 if (name == NULL) { \ 1246 ibits[x] = NULL; \ 1247 obits[x] = NULL; \ 1248 } else { \ 1249 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 1250 obits[x] = sbp; \ 1251 sbp += ncpbytes / sizeof *sbp; \ 1252 error = copyin(name, ibits[x], ncpubytes); \ 1253 if (error != 0) \ 1254 goto done; \ 1255 if (ncpbytes != ncpubytes) \ 1256 bzero((char *)ibits[x] + ncpubytes, \ 1257 ncpbytes - ncpubytes); \ 1258 } \ 1259 } while (0) 1260 getbits(fd_in, 0); 1261 getbits(fd_ou, 1); 1262 getbits(fd_ex, 2); 1263 #undef getbits 1264 1265 #if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__) 1266 /* 1267 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS, 1268 * we are running under 32-bit emulation. This should be more 1269 * generic. 1270 */ 1271 #define swizzle_fdset(bits) \ 1272 if (abi_nfdbits != NFDBITS && bits != NULL) { \ 1273 int i; \ 1274 for (i = 0; i < ncpbytes / sizeof *sbp; i++) \ 1275 bits[i] = (bits[i] >> 32) | (bits[i] << 32); \ 1276 } 1277 #else 1278 #define swizzle_fdset(bits) 1279 #endif 1280 1281 /* Make sure the bit order makes it through an ABI transition */ 1282 swizzle_fdset(ibits[0]); 1283 swizzle_fdset(ibits[1]); 1284 swizzle_fdset(ibits[2]); 1285 1286 if (nbufbytes != 0) 1287 bzero(selbits, nbufbytes / 2); 1288 1289 precision = 0; 1290 if (tvp != NULL) { 1291 rtv = *tvp; 1292 if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || 1293 rtv.tv_usec >= 1000000) { 1294 error = EXTERROR(EINVAL, "invalid timeval"); 1295 goto done; 1296 } 1297 if (!timevalisset(&rtv)) 1298 asbt = 0; 1299 else if (rtv.tv_sec <= INT32_MAX) { 1300 rsbt = tvtosbt(rtv); 1301 precision = rsbt; 1302 precision >>= tc_precexp; 1303 if (TIMESEL(&asbt, rsbt)) 1304 asbt += tc_tick_sbt; 1305 if (asbt <= SBT_MAX - rsbt) 1306 asbt += rsbt; 1307 else 1308 asbt = -1; 1309 } else 1310 asbt = -1; 1311 } else 1312 asbt = -1; 1313 seltdinit(td); 1314 /* Iterate until the timeout expires or descriptors become ready. */ 1315 for (;;) { 1316 error = selscan(td, ibits, obits, nd); 1317 if (error || td->td_retval[0] != 0) 1318 break; 1319 error = seltdwait(td, asbt, precision); 1320 if (error) 1321 break; 1322 error = selrescan(td, ibits, obits); 1323 if (error || td->td_retval[0] != 0) 1324 break; 1325 } 1326 seltdclear(td); 1327 1328 done: 1329 /* select is not restarted after signals... */ 1330 if (error == ERESTART) 1331 error = EINTR; 1332 if (error == EWOULDBLOCK) 1333 error = 0; 1334 1335 /* swizzle bit order back, if necessary */ 1336 swizzle_fdset(obits[0]); 1337 swizzle_fdset(obits[1]); 1338 swizzle_fdset(obits[2]); 1339 #undef swizzle_fdset 1340 1341 #define putbits(name, x) \ 1342 if (name && (error2 = copyout(obits[x], name, ncpubytes))) \ 1343 error = error2; 1344 if (error == 0) { 1345 int error2; 1346 1347 putbits(fd_in, 0); 1348 putbits(fd_ou, 1); 1349 putbits(fd_ex, 2); 1350 #undef putbits 1351 } 1352 if (selbits != &s_selbits[0]) 1353 free(selbits, M_SELECT); 1354 1355 return (error); 1356 } 1357 /* 1358 * Convert a select bit set to poll flags. 1359 * 1360 * The backend always returns POLLHUP/POLLERR if appropriate and we 1361 * return this as a set bit in any set. 1362 */ 1363 static const int select_flags[3] = { 1364 POLLRDNORM | POLLHUP | POLLERR, 1365 POLLWRNORM | POLLHUP | POLLERR, 1366 POLLRDBAND | POLLERR 1367 }; 1368 1369 /* 1370 * Compute the fo_poll flags required for a fd given by the index and 1371 * bit position in the fd_mask array. 
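 *
 * Worked example (assuming a 64-bit fd_mask, so NFDBITS == 64): fd 67
 * maps to idx == 67 / 64 == 1 and bit == (fd_mask)1 << (67 % 64).  If
 * only the read set has that bit on, the result is select_flags[0],
 * i.e. POLLRDNORM | POLLHUP | POLLERR.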
1372 */ 1373 static __inline int 1374 selflags(fd_mask **ibits, int idx, fd_mask bit) 1375 { 1376 int flags; 1377 int msk; 1378 1379 flags = 0; 1380 for (msk = 0; msk < 3; msk++) { 1381 if (ibits[msk] == NULL) 1382 continue; 1383 if ((ibits[msk][idx] & bit) == 0) 1384 continue; 1385 flags |= select_flags[msk]; 1386 } 1387 return (flags); 1388 } 1389 1390 /* 1391 * Set the appropriate output bits given a mask of fired events and the 1392 * input bits originally requested. 1393 */ 1394 static __inline int 1395 selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events) 1396 { 1397 int msk; 1398 int n; 1399 1400 n = 0; 1401 for (msk = 0; msk < 3; msk++) { 1402 if ((events & select_flags[msk]) == 0) 1403 continue; 1404 if (ibits[msk] == NULL) 1405 continue; 1406 if ((ibits[msk][idx] & bit) == 0) 1407 continue; 1408 /* 1409 * XXX Check for a duplicate set. This can occur because a 1410 * socket calls selrecord() twice for each poll() call 1411 * resulting in two selfds per real fd. selrescan() will 1412 * call selsetbits twice as a result. 1413 */ 1414 if ((obits[msk][idx] & bit) != 0) 1415 continue; 1416 obits[msk][idx] |= bit; 1417 n++; 1418 } 1419 1420 return (n); 1421 } 1422 1423 /* 1424 * Traverse the list of fds attached to this thread's seltd and check for 1425 * completion. 1426 */ 1427 static int 1428 selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) 1429 { 1430 struct filedesc *fdp; 1431 struct selinfo *si; 1432 struct seltd *stp; 1433 struct selfd *sfp; 1434 struct selfd *sfn; 1435 struct file *fp; 1436 fd_mask bit; 1437 int fd, ev, n, idx; 1438 int error; 1439 bool only_user; 1440 1441 fdp = td->td_proc->p_fd; 1442 stp = td->td_sel; 1443 n = 0; 1444 only_user = FILEDESC_IS_ONLY_USER(fdp); 1445 STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { 1446 fd = (int)(uintptr_t)sfp->sf_cookie; 1447 si = sfp->sf_si; 1448 selfdfree(stp, sfp); 1449 /* If the selinfo wasn't cleared the event didn't fire. */ 1450 if (si != NULL) 1451 continue; 1452 if (only_user) 1453 error = fget_only_user(fdp, fd, &cap_event_rights, &fp); 1454 else 1455 error = fget_unlocked(td, fd, &cap_event_rights, &fp); 1456 if (__predict_false(error != 0)) 1457 return (error); 1458 idx = fd / NFDBITS; 1459 bit = (fd_mask)1 << (fd % NFDBITS); 1460 ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); 1461 if (only_user) 1462 fput_only_user(fdp, fp); 1463 else 1464 fdrop(fp, td); 1465 if (ev != 0) 1466 n += selsetbits(ibits, obits, idx, bit, ev); 1467 } 1468 stp->st_flags = 0; 1469 td->td_retval[0] = n; 1470 return (0); 1471 } 1472 1473 /* 1474 * Perform the initial filedescriptor scan and register ourselves with 1475 * each selinfo. 1476 */ 1477 static int 1478 selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd) 1479 { 1480 struct filedesc *fdp; 1481 struct file *fp; 1482 fd_mask bit; 1483 int ev, flags, end, fd; 1484 int n, idx; 1485 int error; 1486 bool only_user; 1487 1488 fdp = td->td_proc->p_fd; 1489 n = 0; 1490 only_user = FILEDESC_IS_ONLY_USER(fdp); 1491 for (idx = 0, fd = 0; fd < nfd; idx++) { 1492 end = imin(fd + NFDBITS, nfd); 1493 for (bit = 1; fd < end; bit <<= 1, fd++) { 1494 /* Compute the list of events we're interested in. 
*/ 1495 flags = selflags(ibits, idx, bit); 1496 if (flags == 0) 1497 continue; 1498 if (only_user) 1499 error = fget_only_user(fdp, fd, &cap_event_rights, &fp); 1500 else 1501 error = fget_unlocked(td, fd, &cap_event_rights, &fp); 1502 if (__predict_false(error != 0)) 1503 return (error); 1504 selfdalloc(td, (void *)(uintptr_t)fd); 1505 ev = fo_poll(fp, flags, td->td_ucred, td); 1506 if (only_user) 1507 fput_only_user(fdp, fp); 1508 else 1509 fdrop(fp, td); 1510 if (ev != 0) 1511 n += selsetbits(ibits, obits, idx, bit, ev); 1512 } 1513 } 1514 1515 td->td_retval[0] = n; 1516 return (0); 1517 } 1518 1519 int 1520 sys_poll(struct thread *td, struct poll_args *uap) 1521 { 1522 struct timespec ts, *tsp; 1523 1524 if (uap->timeout != INFTIM) { 1525 if (uap->timeout < 0) 1526 return (EXTERROR(EINVAL, "invalid timeout")); 1527 ts.tv_sec = uap->timeout / 1000; 1528 ts.tv_nsec = (uap->timeout % 1000) * 1000000; 1529 tsp = &ts; 1530 } else 1531 tsp = NULL; 1532 1533 return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL)); 1534 } 1535 1536 /* 1537 * kfds points to an array in the kernel. 1538 */ 1539 int 1540 kern_poll_kfds(struct thread *td, struct pollfd *kfds, u_int nfds, 1541 struct timespec *tsp, sigset_t *uset) 1542 { 1543 sbintime_t sbt, precision, tmp; 1544 time_t over; 1545 struct timespec ts; 1546 int error; 1547 1548 precision = 0; 1549 if (tsp != NULL) { 1550 if (!timespecvalid_interval(tsp)) 1551 return (EXTERROR(EINVAL, "invalid timespec")); 1552 if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) 1553 sbt = 0; 1554 else { 1555 ts = *tsp; 1556 if (ts.tv_sec > INT32_MAX / 2) { 1557 over = ts.tv_sec - INT32_MAX / 2; 1558 ts.tv_sec -= over; 1559 } else 1560 over = 0; 1561 tmp = tstosbt(ts); 1562 precision = tmp; 1563 precision >>= tc_precexp; 1564 if (TIMESEL(&sbt, tmp)) 1565 sbt += tc_tick_sbt; 1566 sbt += tmp; 1567 } 1568 } else 1569 sbt = -1; 1570 1571 if (uset != NULL) { 1572 error = kern_sigprocmask(td, SIG_SETMASK, uset, 1573 &td->td_oldsigmask, 0); 1574 if (error) 1575 return (error); 1576 td->td_pflags |= TDP_OLDMASK; 1577 } 1578 1579 seltdinit(td); 1580 /* Iterate until the timeout expires or descriptors become ready. */ 1581 for (;;) { 1582 error = pollscan(td, kfds, nfds); 1583 if (error || td->td_retval[0] != 0) 1584 break; 1585 error = seltdwait(td, sbt, precision); 1586 if (error) 1587 break; 1588 error = pollrescan(td); 1589 if (error || td->td_retval[0] != 0) 1590 break; 1591 } 1592 seltdclear(td); 1593 1594 /* poll is not restarted after signals... */ 1595 if (error == ERESTART) 1596 error = EINTR; 1597 if (error == EWOULDBLOCK) 1598 error = 0; 1599 1600 if (uset != NULL) { 1601 /* 1602 * Make sure that ast() is called on return to 1603 * usermode and TDP_OLDMASK is cleared, restoring old 1604 * sigmask. If we didn't get interrupted, then the caller is 1605 * likely not expecting a signal to hit that should normally be 1606 * blocked by its signal mask, so we restore the mask before 1607 * any signals could be delivered. 
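 *
 * Illustrative userspace counterpart: a process that keeps SIGTERM
 * blocked and wants it deliverable only while waiting depends on this
 * atomic mask switch; with separate sigprocmask(2) and poll(2) calls
 * the signal could arrive in the window between them.
 *
 *	sigset_t waitmask;
 *	sigemptyset(&waitmask);
 *	int n = ppoll(fds, nfds, NULL, &waitmask);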
1608 */ 1609 if (error == EINTR) 1610 ast_sched(td, TDA_SIGSUSPEND); 1611 else 1612 ast_sched(td, TDA_PSELECT); 1613 } 1614 1615 return (error); 1616 } 1617 1618 int 1619 sys_ppoll(struct thread *td, struct ppoll_args *uap) 1620 { 1621 struct timespec ts, *tsp; 1622 sigset_t set, *ssp; 1623 int error; 1624 1625 if (uap->ts != NULL) { 1626 error = copyin(uap->ts, &ts, sizeof(ts)); 1627 if (error) 1628 return (error); 1629 tsp = &ts; 1630 } else 1631 tsp = NULL; 1632 if (uap->set != NULL) { 1633 error = copyin(uap->set, &set, sizeof(set)); 1634 if (error) 1635 return (error); 1636 ssp = &set; 1637 } else 1638 ssp = NULL; 1639 return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp)); 1640 } 1641 1642 /* 1643 * ufds points to an array in user space. 1644 */ 1645 int 1646 kern_poll(struct thread *td, struct pollfd *ufds, u_int nfds, 1647 struct timespec *tsp, sigset_t *set) 1648 { 1649 struct pollfd *kfds; 1650 struct pollfd stackfds[32]; 1651 int error; 1652 1653 if (kern_poll_maxfds(nfds)) 1654 return (EXTERROR(EINVAL, "too large nfds")); 1655 if (nfds > nitems(stackfds)) 1656 kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK); 1657 else 1658 kfds = stackfds; 1659 error = copyin(ufds, kfds, nfds * sizeof(*kfds)); 1660 if (error != 0) 1661 goto out; 1662 1663 error = kern_poll_kfds(td, kfds, nfds, tsp, set); 1664 if (error == 0) 1665 error = pollout(td, kfds, ufds, nfds); 1666 #ifdef KTRACE 1667 if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) 1668 ktrstructarray("pollfd", UIO_USERSPACE, ufds, nfds, 1669 sizeof(*ufds)); 1670 #endif 1671 1672 out: 1673 if (nfds > nitems(stackfds)) 1674 free(kfds, M_TEMP); 1675 return (error); 1676 } 1677 1678 bool 1679 kern_poll_maxfds(u_int nfds) 1680 { 1681 1682 /* 1683 * This is kinda bogus. We have fd limits, but that is not 1684 * really related to the size of the pollfd array. Make sure 1685 * we let the process use at least FD_SETSIZE entries and at 1686 * least enough for the system-wide limits. We want to be reasonably 1687 * safe, but not overly restrictive. 1688 */ 1689 return (nfds > maxfilesperproc && nfds > FD_SETSIZE); 1690 } 1691 1692 static int 1693 pollrescan(struct thread *td) 1694 { 1695 struct seltd *stp; 1696 struct selfd *sfp; 1697 struct selfd *sfn; 1698 struct selinfo *si; 1699 struct filedesc *fdp; 1700 struct file *fp; 1701 struct pollfd *fd; 1702 int n, error; 1703 bool only_user; 1704 1705 n = 0; 1706 fdp = td->td_proc->p_fd; 1707 stp = td->td_sel; 1708 only_user = FILEDESC_IS_ONLY_USER(fdp); 1709 STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { 1710 fd = (struct pollfd *)sfp->sf_cookie; 1711 si = sfp->sf_si; 1712 selfdfree(stp, sfp); 1713 /* If the selinfo wasn't cleared the event didn't fire. */ 1714 if (si != NULL) 1715 continue; 1716 if (only_user) 1717 error = fget_only_user(fdp, fd->fd, &cap_event_rights, &fp); 1718 else 1719 error = fget_unlocked(td, fd->fd, &cap_event_rights, &fp); 1720 if (__predict_false(error != 0)) { 1721 fd->revents = POLLNVAL; 1722 n++; 1723 continue; 1724 } 1725 /* 1726 * Note: backend also returns POLLHUP and 1727 * POLLERR if appropriate. 
1728 */ 1729 fd->revents = fo_poll(fp, fd->events, td->td_ucred, td); 1730 if (only_user) 1731 fput_only_user(fdp, fp); 1732 else 1733 fdrop(fp, td); 1734 if (fd->revents != 0) 1735 n++; 1736 } 1737 stp->st_flags = 0; 1738 td->td_retval[0] = n; 1739 return (0); 1740 } 1741 1742 static int 1743 pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd) 1744 { 1745 int error = 0; 1746 u_int i = 0; 1747 u_int n = 0; 1748 1749 for (i = 0; i < nfd; i++) { 1750 error = copyout(&fds->revents, &ufds->revents, 1751 sizeof(ufds->revents)); 1752 if (error) 1753 return (error); 1754 if (fds->revents != 0) 1755 n++; 1756 fds++; 1757 ufds++; 1758 } 1759 td->td_retval[0] = n; 1760 return (0); 1761 } 1762 1763 static int 1764 pollscan(struct thread *td, struct pollfd *fds, u_int nfd) 1765 { 1766 struct filedesc *fdp; 1767 struct file *fp; 1768 int i, n, error; 1769 bool only_user; 1770 1771 n = 0; 1772 fdp = td->td_proc->p_fd; 1773 only_user = FILEDESC_IS_ONLY_USER(fdp); 1774 for (i = 0; i < nfd; i++, fds++) { 1775 if (fds->fd < 0) { 1776 fds->revents = 0; 1777 continue; 1778 } 1779 if (only_user) 1780 error = fget_only_user(fdp, fds->fd, &cap_event_rights, &fp); 1781 else 1782 error = fget_unlocked(td, fds->fd, &cap_event_rights, &fp); 1783 if (__predict_false(error != 0)) { 1784 fds->revents = POLLNVAL; 1785 n++; 1786 continue; 1787 } 1788 /* 1789 * Note: backend also returns POLLHUP and 1790 * POLLERR if appropriate. 1791 */ 1792 selfdalloc(td, fds); 1793 fds->revents = fo_poll(fp, fds->events, 1794 td->td_ucred, td); 1795 if (only_user) 1796 fput_only_user(fdp, fp); 1797 else 1798 fdrop(fp, td); 1799 /* 1800 * POSIX requires POLLOUT to be never 1801 * set simultaneously with POLLHUP. 1802 */ 1803 if ((fds->revents & POLLHUP) != 0) 1804 fds->revents &= ~POLLOUT; 1805 1806 if (fds->revents != 0) 1807 n++; 1808 } 1809 td->td_retval[0] = n; 1810 return (0); 1811 } 1812 1813 /* 1814 * XXX This was created specifically to support netncp and netsmb. This 1815 * allows the caller to specify a socket to wait for events on. It returns 1816 * 0 if any events matched and an error otherwise. There is no way to 1817 * determine which events fired. 1818 */ 1819 int 1820 selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) 1821 { 1822 struct timeval rtv; 1823 sbintime_t asbt, precision, rsbt; 1824 int error; 1825 1826 precision = 0; /* stupid gcc! */ 1827 if (tvp != NULL) { 1828 rtv = *tvp; 1829 if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || 1830 rtv.tv_usec >= 1000000) 1831 return (EXTERROR(EINVAL, "invalid timeval")); 1832 if (!timevalisset(&rtv)) 1833 asbt = 0; 1834 else if (rtv.tv_sec <= INT32_MAX) { 1835 rsbt = tvtosbt(rtv); 1836 precision = rsbt; 1837 precision >>= tc_precexp; 1838 if (TIMESEL(&asbt, rsbt)) 1839 asbt += tc_tick_sbt; 1840 if (asbt <= SBT_MAX - rsbt) 1841 asbt += rsbt; 1842 else 1843 asbt = -1; 1844 } else 1845 asbt = -1; 1846 } else 1847 asbt = -1; 1848 seltdinit(td); 1849 /* 1850 * Iterate until the timeout expires or the socket becomes ready. 1851 */ 1852 for (;;) { 1853 selfdalloc(td, NULL); 1854 if (so->so_proto->pr_sopoll(so, events, td) != 0) { 1855 error = 0; 1856 break; 1857 } 1858 error = seltdwait(td, asbt, precision); 1859 if (error) 1860 break; 1861 } 1862 seltdclear(td); 1863 /* XXX Duplicates ncp/smb behavior. */ 1864 if (error == ERESTART) 1865 error = 0; 1866 return (error); 1867 } 1868 1869 /* 1870 * Preallocate two selfds associated with 'cookie'. Some fo_poll routines 1871 * have two select sets, one for read and another for write. 
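 *
 * Illustrative driver-side sketch (the softc and its selinfo fields
 * are made up): an fo_poll implementation may call selrecord() once
 * per direction, consuming both preallocated selfds.
 *
 *	if ((events & (POLLIN | POLLRDNORM)) != 0 && !rx_ready(sc))
 *		selrecord(td, &sc->sc_rsel);
 *	if ((events & (POLLOUT | POLLWRNORM)) != 0 && !tx_ready(sc))
 *		selrecord(td, &sc->sc_wsel);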
1872 */ 1873 static void 1874 selfdalloc(struct thread *td, void *cookie) 1875 { 1876 struct seltd *stp; 1877 1878 stp = td->td_sel; 1879 if (stp->st_free1 == NULL) 1880 stp->st_free1 = malloc(sizeof(*stp->st_free1), M_SELFD, M_WAITOK|M_ZERO); 1881 stp->st_free1->sf_td = stp; 1882 stp->st_free1->sf_cookie = cookie; 1883 if (stp->st_free2 == NULL) 1884 stp->st_free2 = malloc(sizeof(*stp->st_free2), M_SELFD, M_WAITOK|M_ZERO); 1885 stp->st_free2->sf_td = stp; 1886 stp->st_free2->sf_cookie = cookie; 1887 } 1888 1889 static void 1890 selfdfree(struct seltd *stp, struct selfd *sfp) 1891 { 1892 STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link); 1893 /* 1894 * Paired with doselwakeup. 1895 */ 1896 if (atomic_load_acq_ptr((uintptr_t *)&sfp->sf_si) != (uintptr_t)NULL) { 1897 mtx_lock(sfp->sf_mtx); 1898 if (sfp->sf_si != NULL) { 1899 TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads); 1900 } 1901 mtx_unlock(sfp->sf_mtx); 1902 } 1903 free(sfp, M_SELFD); 1904 } 1905 1906 /* Drain the waiters tied to all the selfd belonging to the specified selinfo. */ 1907 void 1908 seldrain(struct selinfo *sip) 1909 { 1910 1911 /* 1912 * This is already handled by doselwakeup(), so calling it is 1913 * sufficient. 1914 * Eventually the caller should take care to avoid races 1915 * between a thread calling select()/poll() and file descriptor 1916 * detaching, but, again, those races are just the same as for 1917 * selwakeup(). 1918 */ 1919 doselwakeup(sip, -1); 1920 } 1921 1922 /* 1923 * Record a select request. 1924 */ 1925 void 1926 selrecord(struct thread *selector, struct selinfo *sip) 1927 { 1928 struct selfd *sfp; 1929 struct seltd *stp; 1930 struct mtx *mtxp; 1931 1932 stp = selector->td_sel; 1933 /* 1934 * Don't record when doing a rescan. 1935 */ 1936 if (stp->st_flags & SELTD_RESCAN) 1937 return; 1938 /* 1939 * Grab one of the preallocated descriptors. 1940 */ 1942 if ((sfp = stp->st_free1) != NULL) 1943 stp->st_free1 = NULL; 1944 else if ((sfp = stp->st_free2) != NULL) 1945 stp->st_free2 = NULL; 1946 else 1947 panic("selrecord: No free selfd on selq"); 1948 mtxp = sip->si_mtx; 1949 if (mtxp == NULL) 1950 mtxp = mtx_pool_find(mtxpool_select, sip); 1951 /* 1952 * Initialize the sfp and queue it in the thread. 1953 */ 1954 sfp->sf_si = sip; 1955 sfp->sf_mtx = mtxp; 1956 STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); 1957 /* 1958 * Lock the sip and check whether it still needs initialization. 1959 */ 1960 mtx_lock(mtxp); 1961 if (sip->si_mtx == NULL) { 1962 sip->si_mtx = mtxp; 1963 TAILQ_INIT(&sip->si_tdlist); 1964 } 1965 /* 1966 * Add this thread to the list of selfds listening on this selinfo. 1967 */ 1968 TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); 1969 mtx_unlock(sip->si_mtx); 1970 } 1971 1972 /* Wake up a selecting thread. */ 1973 void 1974 selwakeup(struct selinfo *sip) 1975 { 1976 doselwakeup(sip, -1); 1977 } 1978 1979 /* Wake up a selecting thread, and set its priority. */ 1980 void 1981 selwakeuppri(struct selinfo *sip, int pri) 1982 { 1983 doselwakeup(sip, pri); 1984 } 1985 1986 /* 1987 * Do a wakeup when a selectable event occurs. 1988 */ 1989 static void 1990 doselwakeup(struct selinfo *sip, int pri) 1991 { 1992 struct selfd *sfp; 1993 struct selfd *sfn; 1994 struct seltd *stp; 1995 1996 /* If it's not initialized there can't be any waiters. */ 1997 if (sip->si_mtx == NULL) 1998 return; 1999 /* 2000 * Locking the selinfo locks all selfds associated with it.
2001 */ 2002 mtx_lock(sip->si_mtx); 2003 TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { 2004 /* 2005 * Once we remove this sfp from the list and clear the 2006 * sf_si seltdclear will know to ignore this si. 2007 */ 2008 TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); 2009 stp = sfp->sf_td; 2010 mtx_lock(&stp->st_mtx); 2011 stp->st_flags |= SELTD_PENDING; 2012 cv_broadcastpri(&stp->st_wait, pri); 2013 mtx_unlock(&stp->st_mtx); 2014 /* 2015 * Paired with selfdfree. 2016 * 2017 * Storing this only after the wakeup provides an invariant that 2018 * stp is not used after selfdfree returns. 2019 */ 2020 atomic_store_rel_ptr((uintptr_t *)&sfp->sf_si, (uintptr_t)NULL); 2021 } 2022 mtx_unlock(sip->si_mtx); 2023 } 2024 2025 static void 2026 seltdinit(struct thread *td) 2027 { 2028 struct seltd *stp; 2029 2030 stp = td->td_sel; 2031 if (stp != NULL) { 2032 MPASS(stp->st_flags == 0); 2033 MPASS(STAILQ_EMPTY(&stp->st_selq)); 2034 return; 2035 } 2036 stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); 2037 mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); 2038 cv_init(&stp->st_wait, "select"); 2039 stp->st_flags = 0; 2040 STAILQ_INIT(&stp->st_selq); 2041 td->td_sel = stp; 2042 } 2043 2044 static int 2045 seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision) 2046 { 2047 struct seltd *stp; 2048 int error; 2049 2050 stp = td->td_sel; 2051 /* 2052 * An event of interest may occur while we do not hold the seltd 2053 * locked so check the pending flag before we sleep. 2054 */ 2055 mtx_lock(&stp->st_mtx); 2056 /* 2057 * Any further calls to selrecord will be a rescan. 2058 */ 2059 stp->st_flags |= SELTD_RESCAN; 2060 if (stp->st_flags & SELTD_PENDING) { 2061 mtx_unlock(&stp->st_mtx); 2062 return (0); 2063 } 2064 if (sbt == 0) 2065 error = EWOULDBLOCK; 2066 else if (sbt != -1) 2067 error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx, 2068 sbt, precision, C_ABSOLUTE); 2069 else 2070 error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); 2071 mtx_unlock(&stp->st_mtx); 2072 2073 return (error); 2074 } 2075 2076 void 2077 seltdfini(struct thread *td) 2078 { 2079 struct seltd *stp; 2080 2081 stp = td->td_sel; 2082 if (stp == NULL) 2083 return; 2084 MPASS(stp->st_flags == 0); 2085 MPASS(STAILQ_EMPTY(&stp->st_selq)); 2086 if (stp->st_free1) 2087 free(stp->st_free1, M_SELFD); 2088 if (stp->st_free2) 2089 free(stp->st_free2, M_SELFD); 2090 td->td_sel = NULL; 2091 cv_destroy(&stp->st_wait); 2092 mtx_destroy(&stp->st_mtx); 2093 free(stp, M_SELECT); 2094 } 2095 2096 /* 2097 * Remove the references to the thread from all of the objects we were 2098 * polling. 2099 */ 2100 static void 2101 seltdclear(struct thread *td) 2102 { 2103 struct seltd *stp; 2104 struct selfd *sfp; 2105 struct selfd *sfn; 2106 2107 stp = td->td_sel; 2108 STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) 2109 selfdfree(stp, sfp); 2110 stp->st_flags = 0; 2111 } 2112 2113 static void selectinit(void *); 2114 SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); 2115 static void 2116 selectinit(void *dummy __unused) 2117 { 2118 2119 mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF); 2120 } 2121 2122 /* 2123 * Set up a syscall return value that follows the convention specified for 2124 * posix_* functions. 
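 *
 * Such functions return the error number directly instead of -1 with
 * errno set.  Illustrative userspace caller:
 *
 *	int rc = posix_fallocate(fd, 0, len);
 *	if (rc != 0)
 *		errx(1, "posix_fallocate: %s", strerror(rc));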
2125 */ 2126 int 2127 kern_posix_error(struct thread *td, int error) 2128 { 2129 2130 if (error <= 0) 2131 return (error); 2132 td->td_errno = error; 2133 td->td_pflags |= TDP_NERRNO; 2134 td->td_retval[0] = error; 2135 return (0); 2136 } 2137 2138 int 2139 kcmp_cmp(uintptr_t a, uintptr_t b) 2140 { 2141 if (a == b) 2142 return (0); 2143 else if (a < b) 2144 return (1); 2145 return (2); 2146 } 2147 2148 static int 2149 kcmp_pget(struct thread *td, pid_t pid, struct proc **pp) 2150 { 2151 int error; 2152 2153 if (pid == td->td_proc->p_pid) { 2154 *pp = td->td_proc; 2155 return (0); 2156 } 2157 error = pget(pid, PGET_NOTID | PGET_CANDEBUG | PGET_NOTWEXIT | 2158 PGET_HOLD, pp); 2159 MPASS(*pp != td->td_proc); 2160 return (error); 2161 } 2162 2163 int 2164 kern_kcmp(struct thread *td, pid_t pid1, pid_t pid2, int type, 2165 uintptr_t idx1, uintptr_t idx2) 2166 { 2167 struct proc *p1, *p2; 2168 struct file *fp1, *fp2; 2169 int error, res; 2170 2171 res = -1; 2172 p1 = p2 = NULL; 2173 error = kcmp_pget(td, pid1, &p1); 2174 if (error == 0) 2175 error = kcmp_pget(td, pid2, &p2); 2176 if (error != 0) 2177 goto out; 2178 2179 switch (type) { 2180 case KCMP_FILE: 2181 case KCMP_FILEOBJ: 2182 error = fget_remote(td, p1, idx1, &fp1); 2183 if (error == 0) { 2184 error = fget_remote(td, p2, idx2, &fp2); 2185 if (error == 0) { 2186 if (type == KCMP_FILEOBJ) 2187 res = fo_cmp(fp1, fp2, td); 2188 else 2189 res = kcmp_cmp((uintptr_t)fp1, 2190 (uintptr_t)fp2); 2191 fdrop(fp2, td); 2192 } 2193 fdrop(fp1, td); 2194 } 2195 break; 2196 case KCMP_FILES: 2197 res = kcmp_cmp((uintptr_t)p1->p_fd, (uintptr_t)p2->p_fd); 2198 break; 2199 case KCMP_SIGHAND: 2200 res = kcmp_cmp((uintptr_t)p1->p_sigacts, 2201 (uintptr_t)p2->p_sigacts); 2202 break; 2203 case KCMP_VM: 2204 res = kcmp_cmp((uintptr_t)p1->p_vmspace, 2205 (uintptr_t)p2->p_vmspace); 2206 break; 2207 default: 2208 error = EXTERROR(EINVAL, "unknown op"); 2209 break; 2210 } 2211 2212 out: 2213 if (p1 != NULL && p1 != td->td_proc) 2214 PRELE(p1); 2215 if (p2 != NULL && p2 != td->td_proc) 2216 PRELE(p2); 2217 2218 td->td_retval[0] = res; 2219 return (error); 2220 } 2221 2222 int 2223 sys_kcmp(struct thread *td, struct kcmp_args *uap) 2224 { 2225 return (kern_kcmp(td, uap->pid1, uap->pid2, uap->type, 2226 uap->idx1, uap->idx2)); 2227 } 2228 2229 int 2230 file_kcmp_generic(struct file *fp1, struct file *fp2, struct thread *td) 2231 { 2232 if (fp1->f_type != fp2->f_type) 2233 return (3); 2234 return (kcmp_cmp((uintptr_t)fp1->f_data, (uintptr_t)fp2->f_data)); 2235 } 2236 2237 int 2238 exterr_to_ue(struct thread *td, struct uexterror *ue) 2239 { 2240 if ((td->td_pflags2 & TDP2_EXTERR) == 0) 2241 return (ENOENT); 2242 2243 memset(ue, 0, sizeof(*ue)); 2244 ue->error = td->td_kexterr.error; 2245 ue->cat = td->td_kexterr.cat; 2246 ue->src_line = td->td_kexterr.src_line; 2247 ue->p1 = td->td_kexterr.p1; 2248 ue->p2 = td->td_kexterr.p2; 2249 if (td->td_kexterr.msg != NULL) 2250 strlcpy(ue->msg, td->td_kexterr.msg, sizeof(ue->msg)); 2251 return (0); 2252 } 2253 2254 void 2255 exterr_copyout(struct thread *td) 2256 { 2257 struct uexterror ue; 2258 ksiginfo_t ksi; 2259 void *uloc; 2260 size_t sz; 2261 int error; 2262 2263 MPASS((td->td_pflags2 & TDP2_UEXTERR) != 0); 2264 2265 uloc = (char *)td->td_exterr_ptr + __offsetof(struct uexterror, 2266 error); 2267 error = exterr_to_ue(td, &ue); 2268 if (error != 0) { 2269 ue.error = 0; 2270 sz = sizeof(ue.error); 2271 } else { 2272 sz = sizeof(ue) - __offsetof(struct uexterror, error); 2273 } 2274 error = copyout(&ue.error, uloc, sz); 2275 
if (error != 0) { 2276 td->td_pflags2 &= ~TDP2_UEXTERR; 2277 ksiginfo_init_trap(&ksi); 2278 ksi.ksi_signo = SIGSEGV; 2279 ksi.ksi_code = SEGV_ACCERR; 2280 ksi.ksi_addr = uloc; 2281 trapsignal(td, &ksi); 2282 } 2283 } 2284 2285 int 2286 sys_exterrctl(struct thread *td, struct exterrctl_args *uap) 2287 { 2288 uint32_t ver; 2289 int error; 2290 2291 if ((uap->flags & ~(EXTERRCTLF_FORCE)) != 0) 2292 return (EINVAL); 2293 switch (uap->op) { 2294 case EXTERRCTL_ENABLE: 2295 if ((td->td_pflags2 & TDP2_UEXTERR) != 0 && 2296 (uap->flags & EXTERRCTLF_FORCE) == 0) 2297 return (EBUSY); 2298 td->td_pflags2 &= ~TDP2_UEXTERR; 2299 error = copyin(uap->ptr, &ver, sizeof(ver)); 2300 if (error != 0) 2301 return (error); 2302 if (ver != UEXTERROR_VER) 2303 return (EINVAL); 2304 td->td_pflags2 |= TDP2_UEXTERR; 2305 td->td_exterr_ptr = uap->ptr; 2306 return (0); 2307 case EXTERRCTL_DISABLE: 2308 if ((td->td_pflags2 & TDP2_UEXTERR) == 0) 2309 return (EINVAL); 2310 td->td_pflags2 &= ~TDP2_UEXTERR; 2311 return (0); 2312 case EXTERRCTL_UD: 2313 /* 2314 * Important: this code must always return EINVAL and never any 2315 * extended error, for testing purposes. 2316 */ 2317 /* FALLTHROUGH */ 2318 default: 2319 return (EINVAL); 2320 } 2321 } 2322 2323 int 2324 exterr_set(int eerror, int category, const char *mmsg, uintptr_t pp1, 2325 uintptr_t pp2, int line) 2326 { 2327 struct thread *td; 2328 2329 td = curthread; 2330 if ((td->td_pflags2 & TDP2_UEXTERR) != 0) { 2331 td->td_pflags2 |= TDP2_EXTERR; 2332 td->td_kexterr.error = eerror; 2333 td->td_kexterr.cat = category; 2334 td->td_kexterr.msg = mmsg; 2335 td->td_kexterr.p1 = pp1; 2336 td->td_kexterr.p2 = pp2; 2337 td->td_kexterr.src_line = line; 2338 ktrexterr(td); 2339 } 2340 return (eerror); 2341 } 2342
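/*
 * Illustrative userspace sketch of the extended error facility above
 * (not part of the kernel; the exact struct uexterror layout is an
 * assumption here, see <sys/exterrvar.h>): the caller registers a
 * buffer whose leading version field must be UEXTERROR_VER, and on
 * return from a failing syscall the kernel fills in the remaining
 * fields via exterr_copyout().
 *
 *	static struct uexterror ue = { .ver = UEXTERROR_VER };
 *	if (exterrctl(EXTERRCTL_ENABLE, 0, &ue) != 0)
 *		err(1, "exterrctl");
 *	if (write(fd, buf, len) == -1 && ue.error != 0)
 *		warnx("exterr cat %u, src line %u: %s",
 *		    ue.cat, ue.src_line, ue.msg);
 */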