/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_capsicum.h"
#include "opt_ktrace.h"

#define	EXTERR_CATEGORY	EXTERR_CAT_GENIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capsicum.h>
#include <sys/exterrvar.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/inotify.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/eventfd.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/specialfd.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/unistd.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>

/*
 * The following macro defines how many bytes will be allocated on the
 * stack instead of from the heap when passing ioctl data structures
 * from userspace to the kernel.  Some ioctls with small data
 * structures are used very frequently, and this small on-stack buffer
 * gives a significant speedup for those requests.  The value of this
 * define should be greater than or equal to 64 bytes and should also
 * be a power of two.  The data structure is currently hard-aligned to
 * an 8-byte boundary on the stack.  This should currently be
 * sufficient for all supported platforms.
 */
#define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
#define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */
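
/*
 * Minimal compile-time sketch of the constraints documented above,
 * assuming powerof2() from <sys/param.h>: if either constraint is
 * violated the build fails here instead of misusing the on-stack
 * buffer in sys_ioctl().
 */
CTASSERT(SYS_IOCTL_SMALL_SIZE >= 64 && powerof2(SYS_IOCTL_SMALL_SIZE));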

#ifdef __LP64__
static int iosize_max_clamp = 0;
SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
    &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
static int devfs_iosize_max_clamp = 1;
SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
    &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");
#endif

/*
 * Assert that the return value of read(2) and write(2) syscalls fits
 * into a register.  If not, an architecture will need to provide the
 * usermode wrappers to reconstruct the result.
 */
CTASSERT(sizeof(register_t) >= sizeof(size_t));

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
		    u_int);
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	pollrescan(struct thread *);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	selrescan(struct thread *, fd_mask **, fd_mask **);
static void	selfdalloc(struct thread *, void *);
static void	selfdfree(struct seltd *, struct selfd *);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
static void	seltdinit(struct thread *);
static int	seltdwait(struct thread *, sbintime_t, sbintime_t);
static void	seltdclear(struct thread *);

/*
 * One seltd per-thread allocated on demand as needed.
 *
 *	t - protected by st_mtx
 *	k - Only accessed by curthread or read-only
 */
struct seltd {
	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
	struct selfd		*st_free1;	/* (k) free fd for read set. */
	struct selfd		*st_free2;	/* (k) free fd for write set. */
	struct mtx		st_mtx;		/* Protects struct seltd */
	struct cv		st_wait;	/* (t) Wait channel. */
	int			st_flags;	/* (t) SELTD_ flags. */
};

#define	SELTD_PENDING	0x0001			/* We have pending events. */
#define	SELTD_RESCAN	0x0002			/* Doing a rescan. */

/*
 * One selfd allocated per-thread per-file-descriptor.
 *	f - protected by sf_mtx
 */
struct selfd {
	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
	struct seltd		*sf_td;		/* (k) owning seltd. */
	void			*sf_cookie;	/* (k) fd or pollfd. */
};

MALLOC_DEFINE(M_SELFD, "selfd", "selfd");
static struct mtx_pool *mtxpool_select;

#ifdef __LP64__
size_t
devfs_iosize_max(void)
{

	return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
	    INT_MAX : SSIZE_MAX);
}

size_t
iosize_max(void)
{

	return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
	    INT_MAX : SSIZE_MAX);
}
#endif

#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
sys_read(struct thread *td, struct read_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EXTERROR(EINVAL, "length > iosize_max"));
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned read system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pread(struct thread *td, struct pread_args *uap)
{

	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}

int
kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (nbyte > IOSIZE_MAX)
		return (EXTERROR(EINVAL, "length > iosize_max"));
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, fd, &auio, offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap)
{

	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}
#endif

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	freeuio(auio);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &cap_read_rights, &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	freeuio(auio);
	return (error);
}

int
kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &cap_pread_rights, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 &&
	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
		error = EXTERROR(EINVAL, "neg offset");
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 */
static int
dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	AUDIT_ARG_FD(fd);

	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return (0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
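
/*
 * Behavioral note (illustrative): if a signal interrupts a transfer
 * after some bytes were already copied, dofileread() above suppresses
 * ERESTART/EINTR/EWOULDBLOCK and reports the short count as success,
 * matching the POSIX rule that a partial read is not an error.
 */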

#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
int
sys_write(struct thread *td, struct write_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EXTERROR(EINVAL, "length > iosize_max"));
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pwrite(struct thread *td, struct pwrite_args *uap)
{

	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}

int
kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte,
    off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (nbyte > IOSIZE_MAX)
		return (EXTERROR(EINVAL, "length > iosize_max"));
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, fd, &auio, offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap)
{

	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}
#endif

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	freeuio(auio);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &cap_write_rights, &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	freeuio(auio);
	return (error);
}

int
kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &cap_pwrite_rights, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 &&
	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
		error = EXTERROR(EINVAL, "neg offset");
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}
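
/*
 * Behavioral note (illustrative): a write(2) to a pipe whose read end
 * is closed fails with EPIPE, and dofilewrite() below also delivers
 * SIGPIPE to the writing thread.  Sockets are exempted from this path;
 * they handle the equivalent cases themselves in sousrsend().
 */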

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 */
static int
dofilewrite(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	AUDIT_ARG_FD(fd);
	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	error = fo_write(fp, auio, td->td_ucred, flags, td);
	/*
	 * Socket layer is responsible for special error handling,
	 * see sousrsend().
	 */
	if (error != 0 && fp->f_type != DTYPE_SOCKET) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		if (error == 0)
			ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Truncate a file given a file descriptor.
 *
 * Can't use fget_write() here, since must return EINVAL and not EBADF if the
 * descriptor isn't writable.
 */
int
kern_ftruncate(struct thread *td, int fd, off_t length)
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);
	if (length < 0)
		return (EXTERROR(EINVAL, "negative length"));
	error = fget(td, fd, &cap_ftruncate_rights, &fp);
	if (error)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if (!(fp->f_flag & FWRITE)) {
		fdrop(fp, td);
		return (EXTERROR(EINVAL, "non-writable"));
	}
	error = fo_truncate(fp, length, td->td_ucred, td);
	fdrop(fp, td);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
	int	fd;
	int	pad;
	off_t	length;
};
#endif
int
sys_ftruncate(struct thread *td, struct ftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
	int	fd;
	long	length;
};
#endif
int
oftruncate(struct thread *td, struct oftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
	uint32_t com;
	int arg, error;
	u_int size;
	caddr_t data;

#ifdef INVARIANTS
	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
	}
#endif
	com = (uint32_t)uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (com & IOC_VOID) {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		} else {
			if (size > SYS_IOCTL_SMALL_SIZE)
				data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
			else
				data = smalldata;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error != 0)
			goto out;
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

out:
	if (size > SYS_IOCTL_SMALL_SIZE)
		free(data, M_IOCTLOPS);
	return (error);
}

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error, f_flag, tmp, locked;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(com);

	fdp = td->td_proc->p_fd;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		locked = LA_XLOCKED;
		break;
	default:
#ifdef CAPABILITIES
		FILEDESC_SLOCK(fdp);
		locked = LA_SLOCKED;
#else
		locked = LA_UNLOCKED;
#endif
		break;
	}

#ifdef CAPABILITIES
	if ((fp = fget_noref(fdp, fd)) == NULL) {
		error = EBADF;
		goto out;
	}
	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
		fp = NULL;	/* fhold() was not called yet */
		goto out;
	}
	if (!fhold(fp)) {
		error = EBADF;
		fp = NULL;
		goto out;
	}
	if (locked == LA_SLOCKED) {
		FILEDESC_SUNLOCK(fdp);
		locked = LA_UNLOCKED;
	}
#else
	error = fget(td, fd, &cap_ioctl_rights, &fp);
	if (error != 0) {
		fp = NULL;
		goto out;
	}
#endif
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	f_flag = 0;
	switch (com) {
	case FIONCLEX:
		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
		break;
	case FIOCLEX:
		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
		break;
	case FIONBIO:
	case FIOASYNC:
		f_flag = com == FIONBIO ? FNONBLOCK : FASYNC;
		tmp = *(int *)data;
		fsetfl_lock(fp);
		if (((fp->f_flag & f_flag) != 0) != (tmp != 0)) {
			error = fo_ioctl(fp, com, (void *)&tmp, td->td_ucred,
			    td);
			if (error == 0) {
				if (tmp != 0)
					atomic_set_int(&fp->f_flag, f_flag);
				else
					atomic_clear_int(&fp->f_flag, f_flag);
			}
		}
		fsetfl_unlock(fp);
		break;
	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		break;
	}

out:
	switch (locked) {
	case LA_XLOCKED:
		FILEDESC_XUNLOCK(fdp);
		break;
#ifdef CAPABILITIES
	case LA_SLOCKED:
		FILEDESC_SUNLOCK(fdp);
		break;
#endif
	default:
		FILEDESC_UNLOCK_ASSERT(fdp);
		break;
	}
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}
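
/*
 * Encoding example (illustrative): FIONBIO is defined as
 * _IOW('f', 126, int), so IOCPARM_LEN(com) == sizeof(int) and IOC_IN is
 * set; sys_ioctl() copies the int in from userspace before kern_ioctl()
 * runs.  FIOCLEX is _IO('f', 97): IOC_VOID with no data to copy.
 */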

int
sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
{
	int error;

	error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len);
	return (kern_posix_error(td, error));
}

int
kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);
	if (offset < 0)
		return (EXTERROR(EINVAL, "negative offset"));
	if (len <= 0)
		return (EXTERROR(EINVAL, "non-positive length"));
	/* Check for wrap. */
	if (offset > OFF_MAX - len)
		return (EFBIG);
	error = fget(td, fd, &cap_pwrite_rights, &fp);
	if (error != 0)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
		error = ESPIPE;
		goto out;
	}
	if ((fp->f_flag & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}

	error = fo_fallocate(fp, offset, len, td);
out:
	fdrop(fp, td);
	return (error);
}

int
sys_fspacectl(struct thread *td, struct fspacectl_args *uap)
{
	struct spacectl_range rqsr, rmsr;
	int error, cerror;

	error = copyin(uap->rqsr, &rqsr, sizeof(rqsr));
	if (error != 0)
		return (error);

	error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags,
	    &rmsr);
	if (uap->rmsr != NULL) {
		cerror = copyout(&rmsr, uap->rmsr, sizeof(rmsr));
		if (error == 0)
			error = cerror;
	}
	return (error);
}

int
kern_fspacectl(struct thread *td, int fd, int cmd,
    const struct spacectl_range *rqsr, int flags, struct spacectl_range *rmsrp)
{
	struct file *fp;
	struct spacectl_range rmsr;
	int error;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(cmd);
	AUDIT_ARG_FFLAGS(flags);

	if (rqsr == NULL)
		return (EXTERROR(EINVAL, "no range"));
	rmsr = *rqsr;
	if (rmsrp != NULL)
		*rmsrp = rmsr;

	if (cmd != SPACECTL_DEALLOC)
		return (EXTERROR(EINVAL, "cmd", cmd));
	if (rqsr->r_offset < 0)
		return (EXTERROR(EINVAL, "neg offset"));
	if (rqsr->r_len <= 0)
		return (EXTERROR(EINVAL, "neg len"));
	if (rqsr->r_offset > OFF_MAX - rqsr->r_len)
		return (EXTERROR(EINVAL, "offset too large"));
	if ((flags & ~SPACECTL_F_SUPPORTED) != 0)
		return (EXTERROR(EINVAL, "reserved flags", flags));

	error = fget_write(td, fd, &cap_pwrite_rights, &fp);
	if (error != 0)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
		error = ESPIPE;
		goto out;
	}
	if ((fp->f_flag & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}

	error = fo_fspacectl(fp, cmd, &rmsr.r_offset, &rmsr.r_len, flags,
	    td->td_ucred, td);
	/* fspacectl is not restarted after signals if the file is modified. */
	if (rmsr.r_len != rqsr->r_len && (error == ERESTART ||
	    error == EINTR || error == EWOULDBLOCK))
		error = 0;
	if (rmsrp != NULL)
		*rmsrp = rmsr;
out:
	fdrop(fp, td);
	return (error);
}

int
kern_specialfd(struct thread *td, int type, void *arg)
{
	struct file *fp;
	int error, fd, fflags;

	fflags = 0;
	error = falloc_noinstall(td, &fp);
	if (error != 0)
		return (error);

	switch (type) {
	case SPECIALFD_EVENTFD: {
		struct specialfd_eventfd *ae;

		ae = arg;
		if ((ae->flags & EFD_CLOEXEC) != 0)
			fflags |= O_CLOEXEC;
		error = eventfd_create_file(td, fp, ae->initval, ae->flags);
		break;
	}
	case SPECIALFD_INOTIFY: {
		struct specialfd_inotify *si;

		si = arg;
		error = inotify_create_file(td, fp, si->flags, &fflags);
		break;
	}
	default:
		error = EXTERROR(EINVAL, "invalid type", type);
		break;
	}

	if (error == 0)
		error = finstall(td, fp, &fd, fflags, NULL);
	fdrop(fp, td);
	if (error == 0)
		td->td_retval[0] = fd;
	return (error);
}

int
sys___specialfd(struct thread *td, struct __specialfd_args *args)
{
	int error;

	switch (args->type) {
	case SPECIALFD_EVENTFD: {
		struct specialfd_eventfd ae;

		if (args->len != sizeof(struct specialfd_eventfd)) {
			error = EXTERROR(EINVAL, "eventfd params ABI");
			break;
		}
		error = copyin(args->req, &ae, sizeof(ae));
		if (error != 0)
			break;
		if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK |
		    EFD_SEMAPHORE)) != 0) {
			error = EXTERROR(EINVAL, "reserved flag");
			break;
		}
		error = kern_specialfd(td, args->type, &ae);
		break;
	}
	case SPECIALFD_INOTIFY: {
		struct specialfd_inotify si;

		if (args->len != sizeof(si)) {
			error = EINVAL;
			break;
		}
		error = copyin(args->req, &si, sizeof(si));
		if (error != 0)
			break;
		error = kern_specialfd(td, args->type, &si);
		break;
	}
	default:
		error = EXTERROR(EINVAL, "unknown type", args->type);
		break;
	}
	return (error);
}

int
poll_no_poll(int events)
{
	/*
	 * Return true for read/write.  If the user asked for something
	 * special, return POLLNVAL, so that clients have a way of
	 * determining reliably whether or not the extended
	 * functionality is present without hard-coding knowledge
	 * of specific filesystem implementations.
	 */
	if (events & ~POLLSTANDARD)
		return (POLLNVAL);

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
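
/*
 * Usage note (illustrative): this helper backs default poll methods for
 * objects with no poll support of their own, e.g. vop_nopoll(); a
 * caller asking for POLLPRI there gets POLLNVAL back rather than a
 * silently ignored request.
 */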

int
sys_pselect(struct thread *td, struct pselect_args *uap)
{
	struct timespec ts;
	struct timeval tv, *tvp;
	sigset_t set, *uset;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error != 0)
			return (error);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		tvp = &tv;
	} else
		tvp = NULL;
	if (uap->sm != NULL) {
		error = copyin(uap->sm, &set, sizeof(set));
		if (error != 0)
			return (error);
		uset = &set;
	} else
		uset = NULL;
	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    uset, NFDBITS));
}

int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
	int error;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error != 0)
			return (error);
		td->td_pflags |= TDP_OLDMASK;
	}
	error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
	if (uset != NULL) {
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.  If we didn't get interrupted, then the caller is
		 * likely not expecting a signal to hit that should normally be
		 * blocked by its signal mask, so we restore the mask before
		 * any signals could be delivered.
		 */
		if (error == EINTR) {
			ast_sched(td, TDA_SIGSUSPEND);
		} else {
			/* *select(2) should never restart. */
			MPASS(error != ERESTART);
			ast_sched(td, TDA_PSELECT);
		}
	}

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
int
sys_select(struct thread *td, struct select_args *uap)
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    NFDBITS));
}

/*
 * In the unlikely case when the user specified nd greater than the last
 * open file descriptor, check that no bits are set after the last
 * valid fd.  We must return EBADF if any is set.
 *
 * There are applications that rely on this behaviour.
 *
 * nd is fd_nfiles.
 */
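/*
 * Concrete case (illustrative): if fd_nfiles is 20 and the caller
 * passes nd = 64 with bit 50 set in an input set, select(2) must fail
 * with EBADF even though descriptor 50 was never examined.
 */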
static int
select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
{
	char *addr, *oaddr;
	int b, i, res;
	uint8_t bits;

	if (nd >= ndu || fd_in == NULL)
		return (0);

	oaddr = NULL;
	bits = 0; /* silence gcc */
	for (i = nd; i < ndu; i++) {
		b = i / NBBY;
#if BYTE_ORDER == LITTLE_ENDIAN
		addr = (char *)fd_in + b;
#else
		addr = (char *)fd_in;
		if (abi_nfdbits == NFDBITS) {
			addr += rounddown(b, sizeof(fd_mask)) +
			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
		} else {
			addr += rounddown(b, sizeof(uint32_t)) +
			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
		}
#endif
		if (addr != oaddr) {
			res = fubyte(addr);
			if (res == -1)
				return (EFAULT);
			oaddr = addr;
			bits = res;
		}
		if ((bits & (1 << (i % NBBY))) != 0)
			return (EBADF);
	}
	return (0);
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
	int error, lf, ndu;

	if (nd < 0)
		return (EXTERROR(EINVAL, "negative ndescs"));
	fdp = td->td_proc->p_fd;
	ndu = nd;
	lf = fdp->fd_nfiles;
	if (nd > lf)
		nd = lf;

	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL) {					\
			ibits[x] = NULL;				\
			obits[x] = NULL;				\
		} else {						\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpubytes);	\
			if (error != 0)					\
				goto done;				\
			if (ncpbytes != ncpubytes)			\
				bzero((char *)ibits[x] + ncpubytes,	\
				    ncpbytes - ncpubytes);		\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits

#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
	/*
	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
	 * we are running under 32-bit emulation. This should be
	 * more generic.
	 */
#define	swizzle_fdset(bits)						\
	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
		int i;							\
		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
	}
#else
#define	swizzle_fdset(bits)
#endif

	/* Make sure the bit order makes it through an ABI transition */
	swizzle_fdset(ibits[0]);
	swizzle_fdset(ibits[1]);
	swizzle_fdset(ibits[2]);

	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	precision = 0;
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000) {
			error = EXTERROR(EINVAL, "invalid timeval");
			goto done;
		}
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = selscan(td, ibits, obits, nd);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
		error = selrescan(td, ibits, obits);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;

	/* swizzle bit order back, if necessary */
	swizzle_fdset(obits[0]);
	swizzle_fdset(obits[1]);
	swizzle_fdset(obits[2]);
#undef swizzle_fdset

#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
/*
 * Convert a select bit set to poll flags.
 *
 * The backend always returns POLLHUP/POLLERR if appropriate and we
 * return this as a set bit in any set.
 */
static const int select_flags[3] = {
    POLLRDNORM | POLLHUP | POLLERR,
    POLLWRNORM | POLLHUP | POLLERR,
    POLLRDBAND | POLLERR
};
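
/*
 * Index example (illustrative, LP64 where NFDBITS == 64): fd 70 lives
 * at idx = 70 / NFDBITS = 1 with bit = (fd_mask)1 << (70 % NFDBITS) =
 * 1 << 6, so a lit bit in ibits[1][1] requests the write-set events
 * select_flags[1] for fd 70.
 */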

/*
 * Compute the fo_poll flags required for a fd given by the index and
 * bit position in the fd_mask array.
 */
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
	int flags;
	int msk;

	flags = 0;
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		flags |= select_flags[msk];
	}
	return (flags);
}

/*
 * Set the appropriate output bits given a mask of fired events and the
 * input bits originally requested.
 */
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
	int msk;
	int n;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		if ((events & select_flags[msk]) == 0)
			continue;
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		/*
		 * XXX Check for a duplicate set.  This can occur because a
		 * socket calls selrecord() twice for each poll() call
		 * resulting in two selfds per real fd.  selrescan() will
		 * call selsetbits twice as a result.
		 */
		if ((obits[msk][idx] & bit) != 0)
			continue;
		obits[msk][idx] |= bit;
		n++;
	}

	return (n);
}

/*
 * Traverse the list of fds attached to this thread's seltd and check for
 * completion.
 */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
	struct filedesc *fdp;
	struct selinfo *si;
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct file *fp;
	fd_mask bit;
	int fd, ev, n, idx;
	int error;
	bool only_user;

	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	n = 0;
	only_user = FILEDESC_IS_ONLY_USER(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (int)(uintptr_t)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		if (only_user)
			error = fget_only_user(fdp, fd, &cap_event_rights, &fp);
		else
			error = fget_unlocked(td, fd, &cap_event_rights, &fp);
		if (__predict_false(error != 0))
			return (error);
		idx = fd / NFDBITS;
		bit = (fd_mask)1 << (fd % NFDBITS);
		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
		if (only_user)
			fput_only_user(fdp, fp);
		else
			fdrop(fp, td);
		if (ev != 0)
			n += selsetbits(ibits, obits, idx, bit, ev);
	}
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

/*
 * Perform the initial file descriptor scan and register ourselves with
 * each selinfo.
 */
static int
selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd)
{
	struct filedesc *fdp;
	struct file *fp;
	fd_mask bit;
	int ev, flags, end, fd;
	int n, idx;
	int error;
	bool only_user;

	fdp = td->td_proc->p_fd;
	n = 0;
	only_user = FILEDESC_IS_ONLY_USER(fdp);
	for (idx = 0, fd = 0; fd < nfd; idx++) {
		end = imin(fd + NFDBITS, nfd);
		for (bit = 1; fd < end; bit <<= 1, fd++) {
			/* Compute the list of events we're interested in. */
			flags = selflags(ibits, idx, bit);
			if (flags == 0)
				continue;
			if (only_user)
				error = fget_only_user(fdp, fd, &cap_event_rights, &fp);
			else
				error = fget_unlocked(td, fd, &cap_event_rights, &fp);
			if (__predict_false(error != 0))
				return (error);
			selfdalloc(td, (void *)(uintptr_t)fd);
			ev = fo_poll(fp, flags, td->td_ucred, td);
			if (only_user)
				fput_only_user(fdp, fp);
			else
				fdrop(fp, td);
			if (ev != 0)
				n += selsetbits(ibits, obits, idx, bit, ev);
		}
	}

	td->td_retval[0] = n;
	return (0);
}

int
sys_poll(struct thread *td, struct poll_args *uap)
{
	struct timespec ts, *tsp;

	if (uap->timeout != INFTIM) {
		if (uap->timeout < 0)
			return (EXTERROR(EINVAL, "invalid timeout"));
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000000;
		tsp = &ts;
	} else
		tsp = NULL;

	return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL));
}

/*
 * kfds points to an array in the kernel.
 */
int
kern_poll_kfds(struct thread *td, struct pollfd *kfds, u_int nfds,
    struct timespec *tsp, sigset_t *uset)
{
	sbintime_t sbt, precision, tmp;
	time_t over;
	struct timespec ts;
	int error;

	precision = 0;
	if (tsp != NULL) {
		if (!timespecvalid_interval(tsp))
			return (EXTERROR(EINVAL, "invalid timespec"));
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
			sbt = 0;
		else {
			ts = *tsp;
			if (ts.tv_sec > INT32_MAX / 2) {
				over = ts.tv_sec - INT32_MAX / 2;
				ts.tv_sec -= over;
			} else
				over = 0;
			tmp = tstosbt(ts);
			precision = tmp;
			precision >>= tc_precexp;
			if (TIMESEL(&sbt, tmp))
				sbt += tc_tick_sbt;
			sbt += tmp;
		}
	} else
		sbt = -1;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error)
			return (error);
		td->td_pflags |= TDP_OLDMASK;
	}

	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = pollscan(td, kfds, nfds);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, sbt, precision);
		if (error)
			break;
		error = pollrescan(td);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;

	if (uset != NULL) {
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.  If we didn't get interrupted, then the caller is
		 * likely not expecting a signal to hit that should normally be
		 * blocked by its signal mask, so we restore the mask before
		 * any signals could be delivered.
		 */
		if (error == EINTR)
			ast_sched(td, TDA_SIGSUSPEND);
		else
			ast_sched(td, TDA_PSELECT);
	}

	return (error);
}

int
sys_ppoll(struct thread *td, struct ppoll_args *uap)
{
	struct timespec ts, *tsp;
	sigset_t set, *ssp;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;
	if (uap->set != NULL) {
		error = copyin(uap->set, &set, sizeof(set));
		if (error)
			return (error);
		ssp = &set;
	} else
		ssp = NULL;
	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
}

/*
 * ufds points to an array in user space.
 */
int
kern_poll(struct thread *td, struct pollfd *ufds, u_int nfds,
    struct timespec *tsp, sigset_t *set)
{
	struct pollfd *kfds;
	struct pollfd stackfds[32];
	int error;

	if (kern_poll_maxfds(nfds))
		return (EXTERROR(EINVAL, "too large nfds"));
	if (nfds > nitems(stackfds))
		kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK);
	else
		kfds = stackfds;
	error = copyin(ufds, kfds, nfds * sizeof(*kfds));
	if (error != 0)
		goto out;

	error = kern_poll_kfds(td, kfds, nfds, tsp, set);
	if (error == 0)
		error = pollout(td, kfds, ufds, nfds);
#ifdef KTRACE
	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
		ktrstructarray("pollfd", UIO_USERSPACE, ufds, nfds,
		    sizeof(*ufds));
#endif

out:
	if (nfds > nitems(stackfds))
		free(kfds, M_TEMP);
	return (error);
}

bool
kern_poll_maxfds(u_int nfds)
{

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the system-wide limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	return (nfds > maxfilesperproc && nfds > FD_SETSIZE);
}
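
/*
 * Boundary example (illustrative): with FD_SETSIZE at 1024, nfds = 1024
 * always passes the check above regardless of maxfilesperproc, while a
 * request exceeding both limits makes kern_poll() fail with EINVAL.
 */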

static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
	int n, error;
	bool only_user;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	only_user = FILEDESC_IS_ONLY_USER(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		if (only_user)
			error = fget_only_user(fdp, fd->fd, &cap_event_rights, &fp);
		else
			error = fget_unlocked(td, fd->fd, &cap_event_rights, &fp);
		if (__predict_false(error != 0)) {
			fd->revents = POLLNVAL;
			n++;
			continue;
		}
		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (only_user)
			fput_only_user(fdp, fp);
		else
			fdrop(fp, td);
		if (fd->revents != 0)
			n++;
	}
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

static int
pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
{
	int error = 0;
	u_int i = 0;
	u_int n = 0;

	for (i = 0; i < nfd; i++) {
		error = copyout(&fds->revents, &ufds->revents,
		    sizeof(ufds->revents));
		if (error)
			return (error);
		if (fds->revents != 0)
			n++;
		fds++;
		ufds++;
	}
	td->td_retval[0] = n;
	return (0);
}

static int
pollscan(struct thread *td, struct pollfd *fds, u_int nfd)
{
	struct filedesc *fdp;
	struct file *fp;
	int i, n, error;
	bool only_user;

	n = 0;
	fdp = td->td_proc->p_fd;
	only_user = FILEDESC_IS_ONLY_USER(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd < 0) {
			fds->revents = 0;
			continue;
		}
		if (only_user)
			error = fget_only_user(fdp, fds->fd, &cap_event_rights, &fp);
		else
			error = fget_unlocked(td, fds->fd, &cap_event_rights, &fp);
		if (__predict_false(error != 0)) {
			fds->revents = POLLNVAL;
			n++;
			continue;
		}
		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		selfdalloc(td, fds);
		fds->revents = fo_poll(fp, fds->events,
		    td->td_ucred, td);
		if (only_user)
			fput_only_user(fdp, fp);
		else
			fdrop(fp, td);
		/*
		 * POSIX requires POLLOUT to be never
		 * set simultaneously with POLLHUP.
		 */
		if ((fds->revents & POLLHUP) != 0)
			fds->revents &= ~POLLOUT;

		if (fds->revents != 0)
			n++;
	}
	td->td_retval[0] = n;
	return (0);
}

/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	int error;

	precision = 0;	/* stupid gcc! */
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000)
			return (EXTERROR(EINVAL, "invalid timeval"));
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		selfdalloc(td, NULL);
		if (so->so_proto->pr_sopoll(so, events, td) != 0) {
			error = 0;
			break;
		}
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
	}
	seltdclear(td);
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}
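
/*
 * A socket is the classic example of the dual-set case handled next:
 * one poll of a socket may selrecord() on both its receive and send
 * selinfo, consuming both preallocated selfds.
 */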

/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp->st_free1 == NULL)
		stp->st_free1 = malloc(sizeof(*stp->st_free1), M_SELFD, M_WAITOK|M_ZERO);
	stp->st_free1->sf_td = stp;
	stp->st_free1->sf_cookie = cookie;
	if (stp->st_free2 == NULL)
		stp->st_free2 = malloc(sizeof(*stp->st_free2), M_SELFD, M_WAITOK|M_ZERO);
	stp->st_free2->sf_td = stp;
	stp->st_free2->sf_cookie = cookie;
}

static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	/*
	 * Paired with doselwakeup.
	 */
	if (atomic_load_acq_ptr((uintptr_t *)&sfp->sf_si) != (uintptr_t)NULL) {
		mtx_lock(sfp->sf_mtx);
		if (sfp->sf_si != NULL) {
			TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
		}
		mtx_unlock(sfp->sf_mtx);
	}
	free(sfp, M_SELFD);
}

/* Drain the waiters tied to all the selfd belonging to the specified selinfo. */
void
seldrain(struct selinfo *sip)
{

	/*
	 * This feature is already provided by doselwakeup(), thus it is
	 * enough to call it here.
	 * Eventually, the context should take care to avoid races
	 * between a thread calling select()/poll() and file descriptor
	 * detaching, but, again, the races are just the same as for
	 * selwakeup().
	 */
	doselwakeup(sip, -1);
}

/*
 * Record a select request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct selfd *sfp;
	struct seltd *stp;
	struct mtx *mtxp;

	stp = selector->td_sel;
	/*
	 * Don't record when doing a rescan.
	 */
	if (stp->st_flags & SELTD_RESCAN)
		return;
	/*
	 * Grab one of the preallocated descriptors.
	 */
	sfp = NULL;
	if ((sfp = stp->st_free1) != NULL)
		stp->st_free1 = NULL;
	else if ((sfp = stp->st_free2) != NULL)
		stp->st_free2 = NULL;
	else
		panic("selrecord: No free selfd on selq");
	mtxp = sip->si_mtx;
	if (mtxp == NULL)
		mtxp = mtx_pool_find(mtxpool_select, sip);
	/*
	 * Initialize the sfp and queue it in the thread.
	 */
	sfp->sf_si = sip;
	sfp->sf_mtx = mtxp;
	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
	/*
	 * Now that we've locked the sip, check for initialization.
	 */
	mtx_lock(mtxp);
	if (sip->si_mtx == NULL) {
		sip->si_mtx = mtxp;
		TAILQ_INIT(&sip->si_tdlist);
	}
	/*
	 * Add this thread to the list of selfds listening on this selinfo.
	 */
	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
	mtx_unlock(sip->si_mtx);
}

/* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(struct selinfo *sip, int pri)
{
	struct selfd *sfp;
	struct selfd *sfn;
	struct seltd *stp;

	/* If it's not initialized there can't be any waiters. */
	if (sip->si_mtx == NULL)
		return;
	/*
	 * Locking the selinfo locks all selfds associated with it.
	 */
	mtx_lock(sip->si_mtx);
	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
		/*
		 * Once we remove this sfp from the list and clear the
		 * sf_si, seltdclear will know to ignore this si.
		 */
		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
		stp = sfp->sf_td;
		mtx_lock(&stp->st_mtx);
		stp->st_flags |= SELTD_PENDING;
		cv_broadcastpri(&stp->st_wait, pri);
		mtx_unlock(&stp->st_mtx);
		/*
		 * Paired with selfdfree.
		 *
		 * Storing this only after the wakeup provides an invariant that
		 * stp is not used after selfdfree returns.
		 */
		atomic_store_rel_ptr((uintptr_t *)&sfp->sf_si, (uintptr_t)NULL);
	}
	mtx_unlock(sip->si_mtx);
}

static void
seltdinit(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp != NULL) {
		MPASS(stp->st_flags == 0);
		MPASS(STAILQ_EMPTY(&stp->st_selq));
		return;
	}
	stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
	cv_init(&stp->st_wait, "select");
	stp->st_flags = 0;
	STAILQ_INIT(&stp->st_selq);
	td->td_sel = stp;
}

static int
seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
{
	struct seltd *stp;
	int error;

	stp = td->td_sel;
	/*
	 * An event of interest may occur while we do not hold the seltd
	 * locked so check the pending flag before we sleep.
	 */
	mtx_lock(&stp->st_mtx);
	/*
	 * Any further calls to selrecord will be a rescan.
	 */
	stp->st_flags |= SELTD_RESCAN;
	if (stp->st_flags & SELTD_PENDING) {
		mtx_unlock(&stp->st_mtx);
		return (0);
	}
	if (sbt == 0)
		error = EWOULDBLOCK;
	else if (sbt != -1)
		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
		    sbt, precision, C_ABSOLUTE);
	else
		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
	mtx_unlock(&stp->st_mtx);

	return (error);
}

void
seltdfini(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp == NULL)
		return;
	MPASS(stp->st_flags == 0);
	MPASS(STAILQ_EMPTY(&stp->st_selq));
	if (stp->st_free1)
		free(stp->st_free1, M_SELFD);
	if (stp->st_free2)
		free(stp->st_free2, M_SELFD);
	td->td_sel = NULL;
	cv_destroy(&stp->st_wait);
	mtx_destroy(&stp->st_mtx);
	free(stp, M_SELECT);
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 */
static void
seltdclear(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;

	stp = td->td_sel;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
		selfdfree(stp, sfp);
	stp->st_flags = 0;
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{

	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}

/*
 * Set up a syscall return value that follows the convention specified for
 * posix_* functions.
 */
int
kern_posix_error(struct thread *td, int error)
{

	if (error <= 0)
		return (error);
	td->td_errno = error;
	td->td_pflags |= TDP_NERRNO;
	td->td_retval[0] = error;
	return (0);
}
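
/*
 * Usage note (illustrative): posix_fallocate(2) reports failure through
 * its return value rather than errno, so sys_posix_fallocate() routes
 * its result through kern_posix_error(); on failure the syscall itself
 * "succeeds" with td_retval[0] carrying the error number and TDP_NERRNO
 * marking it for the syscall return path.
 */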

int
kcmp_cmp(uintptr_t a, uintptr_t b)
{
	if (a == b)
		return (0);
	else if (a < b)
		return (1);
	return (2);
}

static int
kcmp_pget(struct thread *td, pid_t pid, struct proc **pp)
{
	int error;

	if (pid == td->td_proc->p_pid) {
		*pp = td->td_proc;
		return (0);
	}
	error = pget(pid, PGET_NOTID | PGET_CANDEBUG | PGET_NOTWEXIT |
	    PGET_HOLD, pp);
	MPASS(*pp != td->td_proc);
	return (error);
}

int
kern_kcmp(struct thread *td, pid_t pid1, pid_t pid2, int type,
    uintptr_t idx1, uintptr_t idx2)
{
	struct proc *p1, *p2;
	struct file *fp1, *fp2;
	int error, res;

	res = -1;
	p1 = p2 = NULL;
	error = kcmp_pget(td, pid1, &p1);
	if (error == 0)
		error = kcmp_pget(td, pid2, &p2);
	if (error != 0)
		goto out;

	switch (type) {
	case KCMP_FILE:
	case KCMP_FILEOBJ:
		error = fget_remote(td, p1, idx1, &fp1);
		if (error == 0) {
			error = fget_remote(td, p2, idx2, &fp2);
			if (error == 0) {
				if (type == KCMP_FILEOBJ)
					res = fo_cmp(fp1, fp2, td);
				else
					res = kcmp_cmp((uintptr_t)fp1,
					    (uintptr_t)fp2);
				fdrop(fp2, td);
			}
			fdrop(fp1, td);
		}
		break;
	case KCMP_FILES:
		res = kcmp_cmp((uintptr_t)p1->p_fd, (uintptr_t)p2->p_fd);
		break;
	case KCMP_SIGHAND:
		res = kcmp_cmp((uintptr_t)p1->p_sigacts,
		    (uintptr_t)p2->p_sigacts);
		break;
	case KCMP_VM:
		res = kcmp_cmp((uintptr_t)p1->p_vmspace,
		    (uintptr_t)p2->p_vmspace);
		break;
	default:
		error = EXTERROR(EINVAL, "unknown op");
		break;
	}

out:
	if (p1 != NULL && p1 != td->td_proc)
		PRELE(p1);
	if (p2 != NULL && p2 != td->td_proc)
		PRELE(p2);

	td->td_retval[0] = res;
	return (error);
}

int
sys_kcmp(struct thread *td, struct kcmp_args *uap)
{
	return (kern_kcmp(td, uap->pid1, uap->pid2, uap->type,
	    uap->idx1, uap->idx2));
}

int
file_kcmp_generic(struct file *fp1, struct file *fp2, struct thread *td)
{
	if (fp1->f_type != fp2->f_type)
		return (3);
	return (kcmp_cmp((uintptr_t)fp1->f_data, (uintptr_t)fp2->f_data));
}

int
exterr_to_ue(struct thread *td, struct uexterror *ue)
{
	if ((td->td_pflags2 & TDP2_EXTERR) == 0)
		return (ENOENT);

	memset(ue, 0, sizeof(*ue));
	ue->error = td->td_kexterr.error;
	ue->cat = td->td_kexterr.cat;
	ue->src_line = td->td_kexterr.src_line;
	ue->p1 = td->td_kexterr.p1;
	ue->p2 = td->td_kexterr.p2;
	if (td->td_kexterr.msg != NULL)
		strlcpy(ue->msg, td->td_kexterr.msg, sizeof(ue->msg));
	return (0);
}

void
exterr_copyout(struct thread *td)
{
	struct uexterror ue;
	ksiginfo_t ksi;
	void *uloc;
	size_t sz;
	int error;

	MPASS((td->td_pflags2 & TDP2_UEXTERR) != 0);

	uloc = (char *)td->td_exterr_ptr + __offsetof(struct uexterror,
	    error);
	error = exterr_to_ue(td, &ue);
	if (error != 0) {
		ue.error = 0;
		sz = sizeof(ue.error);
	} else {
		ktrexterr(td);
		sz = sizeof(ue) - __offsetof(struct uexterror, error);
	}
	error = copyout(&ue.error, uloc, sz);
	if (error != 0) {
		td->td_pflags2 &= ~TDP2_UEXTERR;
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGSEGV;
		ksi.ksi_code = SEGV_ACCERR;
		ksi.ksi_addr = uloc;
		trapsignal(td, &ksi);
	}
}

int
sys_exterrctl(struct thread *td, struct exterrctl_args *uap)
{
	uint32_t ver;
	int error;

	if ((uap->flags & ~(EXTERRCTLF_FORCE)) != 0)
		return (EINVAL);
	switch (uap->op) {
	case EXTERRCTL_ENABLE:
		if ((td->td_pflags2 & TDP2_UEXTERR) != 0 &&
		    (uap->flags & EXTERRCTLF_FORCE) == 0)
			return (EBUSY);
		td->td_pflags2 &= ~TDP2_UEXTERR;
		error = copyin(uap->ptr, &ver, sizeof(ver));
		if (error != 0)
			return (error);
		if (ver != UEXTERROR_VER)
			return (EINVAL);
		td->td_pflags2 |= TDP2_UEXTERR;
		td->td_exterr_ptr = uap->ptr;
		return (0);
	case EXTERRCTL_DISABLE:
		if ((td->td_pflags2 & TDP2_UEXTERR) == 0)
			return (EINVAL);
		td->td_pflags2 &= ~TDP2_UEXTERR;
		return (0);
	case EXTERRCTL_UD:
		/*
		 * Important: this code must always return EINVAL and never any
		 * extended error, for testing purposes.
		 */
		/* FALLTHROUGH */
	default:
		return (EINVAL);
	}
}

int
exterr_set(int eerror, int category, const char *mmsg, uintptr_t pp1,
    uintptr_t pp2, int line)
{
	struct thread *td;

	td = curthread;
	if ((td->td_pflags2 & TDP2_UEXTERR) != 0) {
		td->td_pflags2 |= TDP2_EXTERR;
		td->td_kexterr.error = eerror;
		td->td_kexterr.cat = category;
		td->td_kexterr.msg = mmsg;
		td->td_kexterr.p1 = pp1;
		td->td_kexterr.p2 = pp2;
		td->td_kexterr.src_line = line;
	}
	return (eerror);
}