/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capsicum.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>
/*
 * The following macro defines how many bytes will be allocated from
 * the stack instead of heap memory when passing the IOCTL data
 * structures between userspace and the kernel.  Some IOCTLs having
 * small data structures are used very frequently and this small
 * buffer on the stack gives a significant speedup for those requests.
 * The value of this define should be greater than or equal to 64
 * bytes and should also be a power of two.  The data structure is
 * currently hard-aligned to an 8-byte boundary on the stack.  This
 * should currently be sufficient for all supported platforms.
 */
#define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
#define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */

#ifdef __LP64__
static int iosize_max_clamp = 0;
SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
    &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
static int devfs_iosize_max_clamp = 1;
SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
    &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");
#endif

/*
 * Assert that the return value of read(2) and write(2) syscalls fits
 * into a register.  If not, an architecture will need to provide the
 * usermode wrappers to reconstruct the result.
 */
CTASSERT(sizeof(register_t) >= sizeof(size_t));

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
		    u_int);
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	pollrescan(struct thread *);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	selrescan(struct thread *, fd_mask **, fd_mask **);
static void	selfdalloc(struct thread *, void *);
static void	selfdfree(struct seltd *, struct selfd *);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
static void	seltdinit(struct thread *);
static int	seltdwait(struct thread *, sbintime_t, sbintime_t);
static void	seltdclear(struct thread *);

/*
 * One seltd per-thread allocated on demand as needed.
 *
 *	t - protected by st_mtx
 *	k - Only accessed by curthread or read-only
 */
struct seltd {
	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
	struct selfd		*st_free1;	/* (k) free fd for read set. */
	struct selfd		*st_free2;	/* (k) free fd for write set. */
	struct mtx		st_mtx;		/* Protects struct seltd */
	struct cv		st_wait;	/* (t) Wait channel. */
	int			st_flags;	/* (t) SELTD_ flags. */
};

#define	SELTD_PENDING	0x0001			/* We have pending events. */
#define	SELTD_RESCAN	0x0002			/* Doing a rescan. */

/*
 * One selfd allocated per-thread per-file-descriptor.
 *	f - protected by sf_mtx
 */
struct selfd {
	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
	struct seltd		*sf_td;		/* (k) owning seltd. */
	void			*sf_cookie;	/* (k) fd or pollfd. */
	u_int			sf_refs;
};

static uma_zone_t selfd_zone;
static struct mtx_pool *mtxpool_select;
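/*
 * Schematically, one seltd fans out into one selfd per polled file:
 *
 *	seltd (per thread)                selinfo (per selectable object)
 *	  st_selq ----------> selfd <---------- si_tdlist
 *	                        sf_td points back to the seltd
 *	                        sf_si points back to the selinfo
 */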
#ifdef __LP64__
size_t
devfs_iosize_max(void)
{

	return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
	    INT_MAX : SSIZE_MAX);
}

size_t
iosize_max(void)
{

	return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
	    INT_MAX : SSIZE_MAX);
}
#endif

#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
sys_read(struct thread *td, struct read_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned read system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pread(struct thread *td, struct pread_args *uap)
{

	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}

int
kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, fd, &auio, offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap)
{

	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}
#endif

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &cap_read_rights, &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}
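/*
 * The positioned variants only make sense on seekable descriptors: a
 * pipe or socket fails with ESPIPE below, and a negative offset is
 * rejected with EINVAL except on character devices, which may
 * interpret negative offsets themselves.
 */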
int
kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &cap_pread_rights, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 &&
	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed-in uio, offset, and flags.
 */
static int
dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	AUDIT_ARG_FD(fd);

	/* Finish zero-length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return (0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
int
sys_write(struct thread *td, struct write_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pwrite(struct thread *td, struct pwrite_args *uap)
{

	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}

int
kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte,
    off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, fd, &auio, offset);
	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap)
{

	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
}
#endif

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &cap_write_rights, &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}
/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &cap_pwrite_rights, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 &&
	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed-in uio, offset, and flags.
 */
static int
dofilewrite(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	AUDIT_ARG_FD(fd);
	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Truncate a file given a file descriptor.
 *
 * Can't use fget_write() here, since we must return EINVAL and not EBADF
 * if the descriptor isn't writable.
 */
int
kern_ftruncate(struct thread *td, int fd, off_t length)
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);
	if (length < 0)
		return (EINVAL);
	error = fget(td, fd, &cap_ftruncate_rights, &fp);
	if (error)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if (!(fp->f_flag & FWRITE)) {
		fdrop(fp, td);
		return (EINVAL);
	}
	error = fo_truncate(fp, length, td->td_ucred, td);
	fdrop(fp, td);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
	int	fd;
	int	pad;
	off_t	length;
};
#endif
int
sys_ftruncate(struct thread *td, struct ftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
	int	fd;
	long	length;
};
#endif
int
oftruncate(struct thread *td, struct oftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */
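/*
 * An ioctl command word packs the transfer direction and the argument
 * length next to the group and number (see <sys/ioccom.h>):
 *
 *	 31        29 28           16 15        8 7         0
 *	| in/out/void | param length |  group    |  number   |
 *
 * For example, FIONREAD is _IOR('f', 127, int), so IOC_OUT is set and
 * IOCPARM_LEN() recovers sizeof(int) for the copyout below.
 */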
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
	uint32_t com;
	int arg, error;
	u_int size;
	caddr_t data;

#ifdef INVARIANTS
	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
	}
#endif
	com = (uint32_t)uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (com & IOC_VOID) {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		} else {
			if (size > SYS_IOCTL_SMALL_SIZE)
				data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
			else
				data = smalldata;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error != 0)
			goto out;
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

out:
	if (size > SYS_IOCTL_SMALL_SIZE)
		free(data, M_IOCTLOPS);
	return (error);
}

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error, tmp, locked;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(com);

	fdp = td->td_proc->p_fd;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		locked = LA_XLOCKED;
		break;
	default:
#ifdef CAPABILITIES
		FILEDESC_SLOCK(fdp);
		locked = LA_SLOCKED;
#else
		locked = LA_UNLOCKED;
#endif
		break;
	}

#ifdef CAPABILITIES
	if ((fp = fget_locked(fdp, fd)) == NULL) {
		error = EBADF;
		goto out;
	}
	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
		fp = NULL;	/* fhold() was not called yet */
		goto out;
	}
	if (!fhold(fp)) {
		error = EBADF;
		fp = NULL;
		goto out;
	}
	if (locked == LA_SLOCKED) {
		FILEDESC_SUNLOCK(fdp);
		locked = LA_UNLOCKED;
	}
#else
	error = fget(td, fd, &cap_ioctl_rights, &fp);
	if (error != 0) {
		fp = NULL;
		goto out;
	}
#endif
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	switch (com) {
	case FIONCLEX:
		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
		goto out;
	case FIOCLEX:
		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
		goto out;
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	switch (locked) {
	case LA_XLOCKED:
		FILEDESC_XUNLOCK(fdp);
		break;
#ifdef CAPABILITIES
	case LA_SLOCKED:
		FILEDESC_SUNLOCK(fdp);
		break;
#endif
	default:
		FILEDESC_UNLOCK_ASSERT(fdp);
		break;
	}
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}

int
sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
{
	int error;

	error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len);
	return (kern_posix_error(td, error));
}

int
kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);
	if (offset < 0 || len <= 0)
		return (EINVAL);
	/* Check for wrap. */
	if (offset > OFF_MAX - len)
		return (EFBIG);
	error = fget(td, fd, &cap_pwrite_rights, &fp);
	if (error != 0)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
		error = ESPIPE;
		goto out;
	}
	if ((fp->f_flag & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}

	error = fo_fallocate(fp, offset, len, td);
out:
	fdrop(fp, td);
	return (error);
}

int
poll_no_poll(int events)
{
	/*
	 * Return true for read/write.  If the user asked for something
	 * special, return POLLNVAL, so that clients have a way of
	 * determining reliably whether or not the extended
	 * functionality is present without hard-coding knowledge
	 * of specific filesystem implementations.
	 */
	if (events & ~POLLSTANDARD)
		return (POLLNVAL);

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
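/*
 * pselect(2) exists to close the classic race between unblocking a
 * signal and entering select(2): the temporary mask is installed
 * before the descriptors are scanned, and the old mask is restored
 * only on return to usermode (see the TDP_OLDMASK handling below), so
 * a signal delivered while waiting cannot be missed.
 */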
int
sys_pselect(struct thread *td, struct pselect_args *uap)
{
	struct timespec ts;
	struct timeval tv, *tvp;
	sigset_t set, *uset;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error != 0)
			return (error);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		tvp = &tv;
	} else
		tvp = NULL;
	if (uap->sm != NULL) {
		error = copyin(uap->sm, &set, sizeof(set));
		if (error != 0)
			return (error);
		uset = &set;
	} else
		uset = NULL;
	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    uset, NFDBITS));
}

int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
	int error;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error != 0)
			return (error);
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}
	error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
int
sys_select(struct thread *td, struct select_args *uap)
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    NFDBITS));
}

/*
 * In the unlikely case when the user specified n greater than the last
 * open file descriptor, check that no bits are set after the last
 * valid fd.  We must return EBADF if any is set.
 *
 * There are applications that rely on this behaviour.
 *
 * nd is fd_nfiles.
 */
static int
select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
{
	char *addr, *oaddr;
	int b, i, res;
	uint8_t bits;

	if (nd >= ndu || fd_in == NULL)
		return (0);

	oaddr = NULL;
	bits = 0; /* silence gcc */
	for (i = nd; i < ndu; i++) {
		b = i / NBBY;
#if BYTE_ORDER == LITTLE_ENDIAN
		addr = (char *)fd_in + b;
#else
		addr = (char *)fd_in;
		if (abi_nfdbits == NFDBITS) {
			addr += rounddown(b, sizeof(fd_mask)) +
			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
		} else {
			addr += rounddown(b, sizeof(uint32_t)) +
			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
		}
#endif
		if (addr != oaddr) {
			res = fubyte(addr);
			if (res == -1)
				return (EFAULT);
			oaddr = addr;
			bits = res;
		}
		if ((bits & (1 << (i % NBBY))) != 0)
			return (EBADF);
	}
	return (0);
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
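	/*
	 * Worked out for LP64 (NFDBITS == 64): howmany(2048, 64) is 32
	 * fd_masks, i.e. 256 bytes, and a 1024-bit set costs 128 bytes
	 * for the input copy plus 128 bytes for the output copy, which
	 * matches the 2 * ncpbytes charged per non-NULL set below.
	 */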
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
	int error, lf, ndu;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	ndu = nd;
	lf = fdp->fd_nfiles;
	if (nd > lf)
		nd = lf;

	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL) {					\
			ibits[x] = NULL;				\
			obits[x] = NULL;				\
		} else {						\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpubytes);	\
			if (error != 0)					\
				goto done;				\
			if (ncpbytes != ncpubytes)			\
				bzero((char *)ibits[x] + ncpubytes,	\
				    ncpbytes - ncpubytes);		\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits

#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
	/*
	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
	 * we are running under 32-bit emulation.  This should be more
	 * generic.
	 */
#define swizzle_fdset(bits)						\
	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
		int i;							\
		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
	}
#else
#define swizzle_fdset(bits)
#endif
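	/*
	 * Example of the swizzle on a big-endian LP64 kernel: a 32-bit
	 * process setting fd 0 writes bit 0 of the first uint32_t,
	 * which the kernel reads as bit 32 of the first 64-bit fd_mask;
	 * rotating each mask by 32 bits moves it back to bit 0, where
	 * the native FD_ISSET() logic expects it.
	 */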
This should be more 1091 * generic. 1092 */ 1093 #define swizzle_fdset(bits) \ 1094 if (abi_nfdbits != NFDBITS && bits != NULL) { \ 1095 int i; \ 1096 for (i = 0; i < ncpbytes / sizeof *sbp; i++) \ 1097 bits[i] = (bits[i] >> 32) | (bits[i] << 32); \ 1098 } 1099 #else 1100 #define swizzle_fdset(bits) 1101 #endif 1102 1103 /* Make sure the bit order makes it through an ABI transition */ 1104 swizzle_fdset(ibits[0]); 1105 swizzle_fdset(ibits[1]); 1106 swizzle_fdset(ibits[2]); 1107 1108 if (nbufbytes != 0) 1109 bzero(selbits, nbufbytes / 2); 1110 1111 precision = 0; 1112 if (tvp != NULL) { 1113 rtv = *tvp; 1114 if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || 1115 rtv.tv_usec >= 1000000) { 1116 error = EINVAL; 1117 goto done; 1118 } 1119 if (!timevalisset(&rtv)) 1120 asbt = 0; 1121 else if (rtv.tv_sec <= INT32_MAX) { 1122 rsbt = tvtosbt(rtv); 1123 precision = rsbt; 1124 precision >>= tc_precexp; 1125 if (TIMESEL(&asbt, rsbt)) 1126 asbt += tc_tick_sbt; 1127 if (asbt <= SBT_MAX - rsbt) 1128 asbt += rsbt; 1129 else 1130 asbt = -1; 1131 } else 1132 asbt = -1; 1133 } else 1134 asbt = -1; 1135 seltdinit(td); 1136 /* Iterate until the timeout expires or descriptors become ready. */ 1137 for (;;) { 1138 error = selscan(td, ibits, obits, nd); 1139 if (error || td->td_retval[0] != 0) 1140 break; 1141 error = seltdwait(td, asbt, precision); 1142 if (error) 1143 break; 1144 error = selrescan(td, ibits, obits); 1145 if (error || td->td_retval[0] != 0) 1146 break; 1147 } 1148 seltdclear(td); 1149 1150 done: 1151 /* select is not restarted after signals... */ 1152 if (error == ERESTART) 1153 error = EINTR; 1154 if (error == EWOULDBLOCK) 1155 error = 0; 1156 1157 /* swizzle bit order back, if necessary */ 1158 swizzle_fdset(obits[0]); 1159 swizzle_fdset(obits[1]); 1160 swizzle_fdset(obits[2]); 1161 #undef swizzle_fdset 1162 1163 #define putbits(name, x) \ 1164 if (name && (error2 = copyout(obits[x], name, ncpubytes))) \ 1165 error = error2; 1166 if (error == 0) { 1167 int error2; 1168 1169 putbits(fd_in, 0); 1170 putbits(fd_ou, 1); 1171 putbits(fd_ex, 2); 1172 #undef putbits 1173 } 1174 if (selbits != &s_selbits[0]) 1175 free(selbits, M_SELECT); 1176 1177 return (error); 1178 } 1179 /* 1180 * Convert a select bit set to poll flags. 1181 * 1182 * The backend always returns POLLHUP/POLLERR if appropriate and we 1183 * return this as a set bit in any set. 1184 */ 1185 static int select_flags[3] = { 1186 POLLRDNORM | POLLHUP | POLLERR, 1187 POLLWRNORM | POLLHUP | POLLERR, 1188 POLLRDBAND | POLLERR 1189 }; 1190 1191 /* 1192 * Compute the fo_poll flags required for a fd given by the index and 1193 * bit position in the fd_mask array. 1194 */ 1195 static __inline int 1196 selflags(fd_mask **ibits, int idx, fd_mask bit) 1197 { 1198 int flags; 1199 int msk; 1200 1201 flags = 0; 1202 for (msk = 0; msk < 3; msk++) { 1203 if (ibits[msk] == NULL) 1204 continue; 1205 if ((ibits[msk][idx] & bit) == 0) 1206 continue; 1207 flags |= select_flags[msk]; 1208 } 1209 return (flags); 1210 } 1211 1212 /* 1213 * Set the appropriate output bits given a mask of fired events and the 1214 * input bits originally requested. 1215 */ 1216 static __inline int 1217 selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events) 1218 { 1219 int msk; 1220 int n; 1221 1222 n = 0; 1223 for (msk = 0; msk < 3; msk++) { 1224 if ((events & select_flags[msk]) == 0) 1225 continue; 1226 if (ibits[msk] == NULL) 1227 continue; 1228 if ((ibits[msk][idx] & bit) == 0) 1229 continue; 1230 /* 1231 * XXX Check for a duplicate set. 
/*
 * Convert a select bit set to poll flags.
 *
 * The backend always returns POLLHUP/POLLERR if appropriate and we
 * return this as a set bit in any set.
 */
static int select_flags[3] = {
    POLLRDNORM | POLLHUP | POLLERR,
    POLLWRNORM | POLLHUP | POLLERR,
    POLLRDBAND | POLLERR
};

/*
 * Compute the fo_poll flags required for a fd given by the index and
 * bit position in the fd_mask array.
 */
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
	int flags;
	int msk;

	flags = 0;
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		flags |= select_flags[msk];
	}
	return (flags);
}

/*
 * Set the appropriate output bits given a mask of fired events and the
 * input bits originally requested.
 */
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
	int msk;
	int n;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		if ((events & select_flags[msk]) == 0)
			continue;
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		/*
		 * XXX Check for a duplicate set.  This can occur because a
		 * socket calls selrecord() twice for each poll() call
		 * resulting in two selfds per real fd.  selrescan() will
		 * call selsetbits twice as a result.
		 */
		if ((obits[msk][idx] & bit) != 0)
			continue;
		obits[msk][idx] |= bit;
		n++;
	}

	return (n);
}

static __inline int
getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
{

	return (fget_unlocked(fdp, fd, &cap_event_rights, fpp));
}

/*
 * Traverse the list of fds attached to this thread's seltd and check for
 * completion.
 */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
	struct filedesc *fdp;
	struct selinfo *si;
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct file *fp;
	fd_mask bit;
	int fd, ev, n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	n = 0;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (int)(uintptr_t)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		error = getselfd_cap(fdp, fd, &fp);
		if (error)
			return (error);
		idx = fd / NFDBITS;
		bit = (fd_mask)1 << (fd % NFDBITS);
		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
		fdrop(fp, td);
		if (ev != 0)
			n += selsetbits(ibits, obits, idx, bit, ev);
	}
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

/*
 * Perform the initial filedescriptor scan and register ourselves with
 * each selinfo.
 */
static int
selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd)
{
	struct filedesc *fdp;
	struct file *fp;
	fd_mask bit;
	int ev, flags, end, fd;
	int n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	n = 0;
	for (idx = 0, fd = 0; fd < nfd; idx++) {
		end = imin(fd + NFDBITS, nfd);
		for (bit = 1; fd < end; bit <<= 1, fd++) {
			/* Compute the list of events we're interested in. */
			flags = selflags(ibits, idx, bit);
			if (flags == 0)
				continue;
			error = getselfd_cap(fdp, fd, &fp);
			if (error)
				return (error);
			selfdalloc(td, (void *)(uintptr_t)fd);
			ev = fo_poll(fp, flags, td->td_ucred, td);
			fdrop(fp, td);
			if (ev != 0)
				n += selsetbits(ibits, obits, idx, bit, ev);
		}
	}

	td->td_retval[0] = n;
	return (0);
}

int
sys_poll(struct thread *td, struct poll_args *uap)
{
	struct timespec ts, *tsp;

	if (uap->timeout != INFTIM) {
		if (uap->timeout < 0)
			return (EINVAL);
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000000;
		tsp = &ts;
	} else
		tsp = NULL;

	return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL));
}

int
kern_poll(struct thread *td, struct pollfd *ufds, u_int nfds,
    struct timespec *tsp, sigset_t *uset)
{
	struct pollfd *kfds;
	struct pollfd stackfds[32];
	sbintime_t sbt, precision, tmp;
	time_t over;
	struct timespec ts;
	int error;

	precision = 0;
	if (tsp != NULL) {
		if (tsp->tv_sec < 0)
			return (EINVAL);
		if (tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000)
			return (EINVAL);
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
			sbt = 0;
		else {
			ts = *tsp;
			if (ts.tv_sec > INT32_MAX / 2) {
				over = ts.tv_sec - INT32_MAX / 2;
				ts.tv_sec -= over;
			} else
				over = 0;
			tmp = tstosbt(ts);
			precision = tmp;
			precision >>= tc_precexp;
			if (TIMESEL(&sbt, tmp))
				sbt += tc_tick_sbt;
			sbt += tmp;
		}
	} else
		sbt = -1;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the system-wide limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
		return (EINVAL);
	if (nfds > nitems(stackfds))
		kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK);
	else
		kfds = stackfds;
	error = copyin(ufds, kfds, nfds * sizeof(*kfds));
	if (error)
		goto done;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error)
			goto done;
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}

	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = pollscan(td, kfds, nfds);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, sbt, precision);
		if (error)
			break;
		error = pollrescan(td);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = pollout(td, kfds, ufds, nfds);
		if (error)
			goto out;
	}
out:
	if (nfds > nitems(stackfds))
		free(kfds, M_TEMP);
	return (error);
}

int
sys_ppoll(struct thread *td, struct ppoll_args *uap)
{
	struct timespec ts, *tsp;
	sigset_t set, *ssp;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;
	if (uap->set != NULL) {
		error = copyin(uap->set, &set, sizeof(set));
		if (error)
			return (error);
		ssp = &set;
	} else
		ssp = NULL;
	/*
	 * fds is still a pointer to user space.  kern_poll() will
	 * take care of copying that array into kernel space.
	 */

	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
}
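/*
 * Look up the file backing a descriptor for poll(2) purposes.  No
 * reference is acquired; the callers hold the filedesc lock across the
 * scan, and a descriptor that fails the lookup (or, with CAPABILITIES,
 * the rights check) is reported as POLLNVAL rather than failing the
 * whole call.
 */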
#ifdef CAPABILITIES
static int
poll_fget(struct filedesc *fdp, int fd, struct file **fpp)
{
	const struct filedescent *fde;
	const struct fdescenttbl *fdt;
	const cap_rights_t *haverights;
	struct file *fp;
	int error;

	if (__predict_false(fd >= fdp->fd_nfiles))
		return (EBADF);

	fdt = fdp->fd_files;
	fde = &fdt->fdt_ofiles[fd];
	fp = fde->fde_file;
	if (__predict_false(fp == NULL))
		return (EBADF);
	haverights = cap_rights_fde_inline(fde);
	error = cap_check_inline(haverights, &cap_event_rights);
	if (__predict_false(error != 0))
		return (EBADF);
	*fpp = fp;
	return (0);
}
#else
static int
poll_fget(struct filedesc *fdp, int fd, struct file **fpp)
{
	struct file *fp;

	if (__predict_false(fd >= fdp->fd_nfiles))
		return (EBADF);

	fp = fdp->fd_ofiles[fd].fde_file;
	if (__predict_false(fp == NULL))
		return (EBADF);

	*fpp = fp;
	return (0);
}
#endif

static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
	int n;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	FILEDESC_SLOCK(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		if (poll_fget(fdp, fd->fd, &fp) != 0) {
			fd->revents = POLLNVAL;
			n++;
			continue;
		}
		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (fd->revents != 0)
			n++;
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

static int
pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
{
	int error = 0;
	u_int i = 0;
	u_int n = 0;

	for (i = 0; i < nfd; i++) {
		error = copyout(&fds->revents, &ufds->revents,
		    sizeof(ufds->revents));
		if (error)
			return (error);
		if (fds->revents != 0)
			n++;
		fds++;
		ufds++;
	}
	td->td_retval[0] = n;
	return (0);
}

static int
pollscan(struct thread *td, struct pollfd *fds, u_int nfd)
{
	struct filedesc *fdp;
	struct file *fp;
	int i, n;

	n = 0;
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd < 0) {
			fds->revents = 0;
			continue;
		}
		if (poll_fget(fdp, fds->fd, &fp) != 0) {
			fds->revents = POLLNVAL;
			n++;
			continue;
		}
		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		selfdalloc(td, fds);
		fds->revents = fo_poll(fp, fds->events,
		    td->td_ucred, td);
		/*
		 * POSIX requires POLLOUT to be never
		 * set simultaneously with POLLHUP.
		 */
		if ((fds->revents & POLLHUP) != 0)
			fds->revents &= ~POLLOUT;

		if (fds->revents != 0)
			n++;
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	int error;

	precision = 0;	/* stupid gcc! */
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000)
			return (EINVAL);
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		selfdalloc(td, NULL);
		error = sopoll(so, events, NULL, td);
		/* error here is actually the ready events. */
		if (error)
			return (0);
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
	}
	seltdclear(td);
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}

/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp->st_free1 == NULL)
		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free1->sf_td = stp;
	stp->st_free1->sf_cookie = cookie;
	if (stp->st_free2 == NULL)
		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free2->sf_td = stp;
	stp->st_free2->sf_cookie = cookie;
}

static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	if (sfp->sf_si != NULL) {
		mtx_lock(sfp->sf_mtx);
		if (sfp->sf_si != NULL) {
			TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
			refcount_release(&sfp->sf_refs);
		}
		mtx_unlock(sfp->sf_mtx);
	}
	if (refcount_release(&sfp->sf_refs))
		uma_zfree(selfd_zone, sfp);
}

/* Drain the waiters tied to all the selfd belonging the specified selinfo. */
void
seldrain(struct selinfo *sip)
{

	/*
	 * doselwakeup() already provides this functionality, so it is
	 * enough to reuse it here.  Eventually the caller should take
	 * care to avoid races between a thread calling select()/poll()
	 * and file descriptor detaching, but, again, those races are
	 * just the same as for selwakeup().
	 */
	doselwakeup(sip, -1);
}
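/*
 * A typical fo_poll consumer, sketched (foo_softc and its fields are
 * hypothetical): the method reports ready events directly and calls
 * selrecord() otherwise, while the producer side calls selwakeup()
 * (and seldrain() on detach) once data arrives:
 *
 *	static int
 *	foo_poll(struct file *fp, int events, struct ucred *cred,
 *	    struct thread *td)
 *	{
 *		struct foo_softc *sc = fp->f_data;
 *		int revents = 0;
 *
 *		mtx_lock(&sc->sc_mtx);
 *		if (sc->sc_ready)
 *			revents = events & (POLLIN | POLLRDNORM);
 *		else
 *			selrecord(td, &sc->sc_rsel);
 *		mtx_unlock(&sc->sc_mtx);
 *		return (revents);
 *	}
 */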
/*
 * Record a select request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct selfd *sfp;
	struct seltd *stp;
	struct mtx *mtxp;

	stp = selector->td_sel;
	/*
	 * Don't record when doing a rescan.
	 */
	if (stp->st_flags & SELTD_RESCAN)
		return;
	/*
	 * Grab one of the preallocated descriptors.
	 */
	sfp = NULL;
	if ((sfp = stp->st_free1) != NULL)
		stp->st_free1 = NULL;
	else if ((sfp = stp->st_free2) != NULL)
		stp->st_free2 = NULL;
	else
		panic("selrecord: No free selfd on selq");
	mtxp = sip->si_mtx;
	if (mtxp == NULL)
		mtxp = mtx_pool_find(mtxpool_select, sip);
	/*
	 * Initialize the sfp and queue it in the thread.
	 */
	sfp->sf_si = sip;
	sfp->sf_mtx = mtxp;
	refcount_init(&sfp->sf_refs, 2);
	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
	/*
	 * Now that we've locked the sip, check for initialization.
	 */
	mtx_lock(mtxp);
	if (sip->si_mtx == NULL) {
		sip->si_mtx = mtxp;
		TAILQ_INIT(&sip->si_tdlist);
	}
	/*
	 * Add this thread to the list of selfds listening on this selinfo.
	 */
	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
	mtx_unlock(sip->si_mtx);
}

/* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(struct selinfo *sip, int pri)
{
	struct selfd *sfp;
	struct selfd *sfn;
	struct seltd *stp;

	/* If it's not initialized there can't be any waiters. */
	if (sip->si_mtx == NULL)
		return;
	/*
	 * Locking the selinfo locks all selfds associated with it.
	 */
	mtx_lock(sip->si_mtx);
	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
		/*
		 * Once we remove this sfp from the list and clear the
		 * sf_si, seltdclear will know to ignore this si.
		 */
		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
		sfp->sf_si = NULL;
		stp = sfp->sf_td;
		mtx_lock(&stp->st_mtx);
		stp->st_flags |= SELTD_PENDING;
		cv_broadcastpri(&stp->st_wait, pri);
		mtx_unlock(&stp->st_mtx);
		if (refcount_release(&sfp->sf_refs))
			uma_zfree(selfd_zone, sfp);
	}
	mtx_unlock(sip->si_mtx);
}

static void
seltdinit(struct thread *td)
{
	struct seltd *stp;

	if ((stp = td->td_sel) != NULL)
		goto out;
	td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
	cv_init(&stp->st_wait, "select");
out:
	stp->st_flags = 0;
	STAILQ_INIT(&stp->st_selq);
}

static int
seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
{
	struct seltd *stp;
	int error;

	stp = td->td_sel;
	/*
	 * An event of interest may occur while we do not hold the seltd
	 * locked so check the pending flag before we sleep.
	 */
	mtx_lock(&stp->st_mtx);
	/*
	 * Any further calls to selrecord will be a rescan.
	 */
	stp->st_flags |= SELTD_RESCAN;
	if (stp->st_flags & SELTD_PENDING) {
		mtx_unlock(&stp->st_mtx);
		return (0);
	}
	if (sbt == 0)
		error = EWOULDBLOCK;
	else if (sbt != -1)
		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
		    sbt, precision, C_ABSOLUTE);
	else
		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
	mtx_unlock(&stp->st_mtx);

	return (error);
}

void
seltdfini(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp == NULL)
		return;
	if (stp->st_free1)
		uma_zfree(selfd_zone, stp->st_free1);
	if (stp->st_free2)
		uma_zfree(selfd_zone, stp->st_free2);
	td->td_sel = NULL;
	cv_destroy(&stp->st_wait);
	mtx_destroy(&stp->st_mtx);
	free(stp, M_SELECT);
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 */
static void
seltdclear(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;

	stp = td->td_sel;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
		selfdfree(stp, sfp);
	stp->st_flags = 0;
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{

	selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}
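/*
 * For example, posix_fallocate(2) is specified to return the error
 * number directly instead of -1 with errno set, so the error is
 * delivered through td_retval[0] with a zero syscall return; td_errno
 * plus TDP_NERRNO keep tracing facilities (e.g. ktrace) aware that the
 * operation actually failed.
 */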
/*
 * Set up a syscall return value that follows the convention specified for
 * posix_* functions.
 */
int
kern_posix_error(struct thread *td, int error)
{

	if (error <= 0)
		return (error);
	td->td_errno = error;
	td->td_pflags |= TDP_NERRNO;
	td->td_retval[0] = error;
	return (0);
}