1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1989, 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 37 */ 38 39 #include <sys/cdefs.h> 40 __FBSDID("$FreeBSD$"); 41 42 #include "opt_capsicum.h" 43 #include "opt_ddb.h" 44 #include "opt_ktrace.h" 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 49 #include <sys/capsicum.h> 50 #include <sys/conf.h> 51 #include <sys/fcntl.h> 52 #include <sys/file.h> 53 #include <sys/filedesc.h> 54 #include <sys/filio.h> 55 #include <sys/jail.h> 56 #include <sys/kernel.h> 57 #include <sys/limits.h> 58 #include <sys/lock.h> 59 #include <sys/malloc.h> 60 #include <sys/mount.h> 61 #include <sys/mutex.h> 62 #include <sys/namei.h> 63 #include <sys/selinfo.h> 64 #include <sys/poll.h> 65 #include <sys/priv.h> 66 #include <sys/proc.h> 67 #include <sys/protosw.h> 68 #include <sys/racct.h> 69 #include <sys/resourcevar.h> 70 #include <sys/sbuf.h> 71 #include <sys/signalvar.h> 72 #include <sys/kdb.h> 73 #include <sys/smr.h> 74 #include <sys/stat.h> 75 #include <sys/sx.h> 76 #include <sys/syscallsubr.h> 77 #include <sys/sysctl.h> 78 #include <sys/sysproto.h> 79 #include <sys/unistd.h> 80 #include <sys/user.h> 81 #include <sys/vnode.h> 82 #include <sys/ktrace.h> 83 84 #include <net/vnet.h> 85 86 #include <security/audit/audit.h> 87 88 #include <vm/uma.h> 89 #include <vm/vm.h> 90 91 #include <ddb/ddb.h> 92 93 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); 94 static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes"); 95 static 
MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors"); 96 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", 97 "file desc to leader structures"); 98 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); 99 MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); 100 101 MALLOC_DECLARE(M_FADVISE); 102 103 static __read_mostly uma_zone_t file_zone; 104 static __read_mostly uma_zone_t filedesc0_zone; 105 __read_mostly uma_zone_t pwd_zone; 106 VFS_SMR_DECLARE; 107 108 static int closefp(struct filedesc *fdp, int fd, struct file *fp, 109 struct thread *td, bool holdleaders, bool audit); 110 static void export_file_to_kinfo(struct file *fp, int fd, 111 cap_rights_t *rightsp, struct kinfo_file *kif, 112 struct filedesc *fdp, int flags); 113 static int fd_first_free(struct filedesc *fdp, int low, int size); 114 static void fdgrowtable(struct filedesc *fdp, int nfd); 115 static void fdgrowtable_exp(struct filedesc *fdp, int nfd); 116 static void fdunused(struct filedesc *fdp, int fd); 117 static void fdused(struct filedesc *fdp, int fd); 118 static int fget_unlocked_seq(struct thread *td, int fd, 119 cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp); 120 static int getmaxfd(struct thread *td); 121 static u_long *filecaps_copy_prep(const struct filecaps *src); 122 static void filecaps_copy_finish(const struct filecaps *src, 123 struct filecaps *dst, u_long *ioctls); 124 static u_long *filecaps_free_prep(struct filecaps *fcaps); 125 static void filecaps_free_finish(u_long *ioctls); 126 127 static struct pwd *pwd_alloc(void); 128 129 /* 130 * Each process has: 131 * 132 * - An array of open file descriptors (fd_ofiles) 133 * - An array of file flags (fd_ofileflags) 134 * - A bitmap recording which descriptors are in use (fd_map) 135 * 136 * A process starts out with NDFILE descriptors. 
The value of NDFILE has 137 * been selected based the historical limit of 20 open files, and an 138 * assumption that the majority of processes, especially short-lived 139 * processes like shells, will never need more. 140 * 141 * If this initial allocation is exhausted, a larger descriptor table and 142 * map are allocated dynamically, and the pointers in the process's struct 143 * filedesc are updated to point to those. This is repeated every time 144 * the process runs out of file descriptors (provided it hasn't hit its 145 * resource limit). 146 * 147 * Since threads may hold references to individual descriptor table 148 * entries, the tables are never freed. Instead, they are placed on a 149 * linked list and freed only when the struct filedesc is released. 150 */ 151 #define NDFILE 20 152 #define NDSLOTSIZE sizeof(NDSLOTTYPE) 153 #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) 154 #define NDSLOT(x) ((x) / NDENTRIES) 155 #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) 156 #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) 157 158 /* 159 * SLIST entry used to keep track of ofiles which must be reclaimed when 160 * the process exits. 161 */ 162 struct freetable { 163 struct fdescenttbl *ft_table; 164 SLIST_ENTRY(freetable) ft_next; 165 }; 166 167 /* 168 * Initial allocation: a filedesc structure + the head of SLIST used to 169 * keep track of old ofiles + enough space for NDFILE descriptors. 170 */ 171 172 struct fdescenttbl0 { 173 int fdt_nfiles; 174 struct filedescent fdt_ofiles[NDFILE]; 175 }; 176 177 struct filedesc0 { 178 struct filedesc fd_fd; 179 SLIST_HEAD(, freetable) fd_free; 180 struct fdescenttbl0 fd_dfiles; 181 NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; 182 }; 183 184 /* 185 * Descriptor management. 
186 */ 187 static int __exclusive_cache_line openfiles; /* actual number of open files */ 188 struct mtx sigio_lock; /* mtx to protect pointers to sigio */ 189 void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp); 190 191 /* 192 * If low >= size, just return low. Otherwise find the first zero bit in the 193 * given bitmap, starting at low and not exceeding size - 1. Return size if 194 * not found. 195 */ 196 static int 197 fd_first_free(struct filedesc *fdp, int low, int size) 198 { 199 NDSLOTTYPE *map = fdp->fd_map; 200 NDSLOTTYPE mask; 201 int off, maxoff; 202 203 if (low >= size) 204 return (low); 205 206 off = NDSLOT(low); 207 if (low % NDENTRIES) { 208 mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); 209 if ((mask &= ~map[off]) != 0UL) 210 return (off * NDENTRIES + ffsl(mask) - 1); 211 ++off; 212 } 213 for (maxoff = NDSLOTS(size); off < maxoff; ++off) 214 if (map[off] != ~0UL) 215 return (off * NDENTRIES + ffsl(~map[off]) - 1); 216 return (size); 217 } 218 219 /* 220 * Find the last used fd. 221 * 222 * Call this variant if fdp can't be modified by anyone else (e.g, during exec). 223 * Otherwise use fdlastfile. 224 */ 225 int 226 fdlastfile_single(struct filedesc *fdp) 227 { 228 NDSLOTTYPE *map = fdp->fd_map; 229 int off, minoff; 230 231 off = NDSLOT(fdp->fd_nfiles - 1); 232 for (minoff = NDSLOT(0); off >= minoff; --off) 233 if (map[off] != 0) 234 return (off * NDENTRIES + flsl(map[off]) - 1); 235 return (-1); 236 } 237 238 int 239 fdlastfile(struct filedesc *fdp) 240 { 241 242 FILEDESC_LOCK_ASSERT(fdp); 243 return (fdlastfile_single(fdp)); 244 } 245 246 static int 247 fdisused(struct filedesc *fdp, int fd) 248 { 249 250 KASSERT(fd >= 0 && fd < fdp->fd_nfiles, 251 ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); 252 253 return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); 254 } 255 256 /* 257 * Mark a file descriptor as used. 
258 */ 259 static void 260 fdused_init(struct filedesc *fdp, int fd) 261 { 262 263 KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); 264 265 fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); 266 } 267 268 static void 269 fdused(struct filedesc *fdp, int fd) 270 { 271 272 FILEDESC_XLOCK_ASSERT(fdp); 273 274 fdused_init(fdp, fd); 275 if (fd == fdp->fd_freefile) 276 fdp->fd_freefile++; 277 } 278 279 /* 280 * Mark a file descriptor as unused. 281 */ 282 static void 283 fdunused(struct filedesc *fdp, int fd) 284 { 285 286 FILEDESC_XLOCK_ASSERT(fdp); 287 288 KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); 289 KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, 290 ("fd=%d is still in use", fd)); 291 292 fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); 293 if (fd < fdp->fd_freefile) 294 fdp->fd_freefile = fd; 295 } 296 297 /* 298 * Free a file descriptor. 299 * 300 * Avoid some work if fdp is about to be destroyed. 301 */ 302 static inline void 303 fdefree_last(struct filedescent *fde) 304 { 305 306 filecaps_free(&fde->fde_caps); 307 } 308 309 static inline void 310 fdfree(struct filedesc *fdp, int fd) 311 { 312 struct filedescent *fde; 313 314 FILEDESC_XLOCK_ASSERT(fdp); 315 fde = &fdp->fd_ofiles[fd]; 316 #ifdef CAPABILITIES 317 seqc_write_begin(&fde->fde_seqc); 318 #endif 319 fde->fde_file = NULL; 320 #ifdef CAPABILITIES 321 seqc_write_end(&fde->fde_seqc); 322 #endif 323 fdefree_last(fde); 324 fdunused(fdp, fd); 325 } 326 327 /* 328 * System calls on descriptors. 
329 */ 330 #ifndef _SYS_SYSPROTO_H_ 331 struct getdtablesize_args { 332 int dummy; 333 }; 334 #endif 335 /* ARGSUSED */ 336 int 337 sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) 338 { 339 #ifdef RACCT 340 uint64_t lim; 341 #endif 342 343 td->td_retval[0] = getmaxfd(td); 344 #ifdef RACCT 345 PROC_LOCK(td->td_proc); 346 lim = racct_get_limit(td->td_proc, RACCT_NOFILE); 347 PROC_UNLOCK(td->td_proc); 348 if (lim < td->td_retval[0]) 349 td->td_retval[0] = lim; 350 #endif 351 return (0); 352 } 353 354 /* 355 * Duplicate a file descriptor to a particular value. 356 * 357 * Note: keep in mind that a potential race condition exists when closing 358 * descriptors from a shared descriptor table (via rfork). 359 */ 360 #ifndef _SYS_SYSPROTO_H_ 361 struct dup2_args { 362 u_int from; 363 u_int to; 364 }; 365 #endif 366 /* ARGSUSED */ 367 int 368 sys_dup2(struct thread *td, struct dup2_args *uap) 369 { 370 371 return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to)); 372 } 373 374 /* 375 * Duplicate a file descriptor. 376 */ 377 #ifndef _SYS_SYSPROTO_H_ 378 struct dup_args { 379 u_int fd; 380 }; 381 #endif 382 /* ARGSUSED */ 383 int 384 sys_dup(struct thread *td, struct dup_args *uap) 385 { 386 387 return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0)); 388 } 389 390 /* 391 * The file control system call. 392 */ 393 #ifndef _SYS_SYSPROTO_H_ 394 struct fcntl_args { 395 int fd; 396 int cmd; 397 long arg; 398 }; 399 #endif 400 /* ARGSUSED */ 401 int 402 sys_fcntl(struct thread *td, struct fcntl_args *uap) 403 { 404 405 return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); 406 } 407 408 int 409 kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) 410 { 411 struct flock fl; 412 struct __oflock ofl; 413 intptr_t arg1; 414 int error, newcmd; 415 416 error = 0; 417 newcmd = cmd; 418 switch (cmd) { 419 case F_OGETLK: 420 case F_OSETLK: 421 case F_OSETLKW: 422 /* 423 * Convert old flock structure to new. 
424 */ 425 error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); 426 fl.l_start = ofl.l_start; 427 fl.l_len = ofl.l_len; 428 fl.l_pid = ofl.l_pid; 429 fl.l_type = ofl.l_type; 430 fl.l_whence = ofl.l_whence; 431 fl.l_sysid = 0; 432 433 switch (cmd) { 434 case F_OGETLK: 435 newcmd = F_GETLK; 436 break; 437 case F_OSETLK: 438 newcmd = F_SETLK; 439 break; 440 case F_OSETLKW: 441 newcmd = F_SETLKW; 442 break; 443 } 444 arg1 = (intptr_t)&fl; 445 break; 446 case F_GETLK: 447 case F_SETLK: 448 case F_SETLKW: 449 case F_SETLK_REMOTE: 450 error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); 451 arg1 = (intptr_t)&fl; 452 break; 453 default: 454 arg1 = arg; 455 break; 456 } 457 if (error) 458 return (error); 459 error = kern_fcntl(td, fd, newcmd, arg1); 460 if (error) 461 return (error); 462 if (cmd == F_OGETLK) { 463 ofl.l_start = fl.l_start; 464 ofl.l_len = fl.l_len; 465 ofl.l_pid = fl.l_pid; 466 ofl.l_type = fl.l_type; 467 ofl.l_whence = fl.l_whence; 468 error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); 469 } else if (cmd == F_GETLK) { 470 error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); 471 } 472 return (error); 473 } 474 475 int 476 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) 477 { 478 struct filedesc *fdp; 479 struct flock *flp; 480 struct file *fp, *fp2; 481 struct filedescent *fde; 482 struct proc *p; 483 struct vnode *vp; 484 struct mount *mp; 485 struct kinfo_file *kif; 486 int error, flg, kif_sz, seals, tmp; 487 uint64_t bsize; 488 off_t foffset; 489 490 error = 0; 491 flg = F_POSIX; 492 p = td->td_proc; 493 fdp = p->p_fd; 494 495 AUDIT_ARG_FD(cmd); 496 AUDIT_ARG_CMD(cmd); 497 switch (cmd) { 498 case F_DUPFD: 499 tmp = arg; 500 error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp); 501 break; 502 503 case F_DUPFD_CLOEXEC: 504 tmp = arg; 505 error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); 506 break; 507 508 case F_DUP2FD: 509 tmp = arg; 510 error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); 511 break; 512 513 case 
F_DUP2FD_CLOEXEC: 514 tmp = arg; 515 error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp); 516 break; 517 518 case F_GETFD: 519 error = EBADF; 520 FILEDESC_SLOCK(fdp); 521 fde = fdeget_locked(fdp, fd); 522 if (fde != NULL) { 523 td->td_retval[0] = 524 (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; 525 error = 0; 526 } 527 FILEDESC_SUNLOCK(fdp); 528 break; 529 530 case F_SETFD: 531 error = EBADF; 532 FILEDESC_XLOCK(fdp); 533 fde = fdeget_locked(fdp, fd); 534 if (fde != NULL) { 535 fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | 536 (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); 537 error = 0; 538 } 539 FILEDESC_XUNLOCK(fdp); 540 break; 541 542 case F_GETFL: 543 error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp); 544 if (error != 0) 545 break; 546 td->td_retval[0] = OFLAGS(fp->f_flag); 547 fdrop(fp, td); 548 break; 549 550 case F_SETFL: 551 error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp); 552 if (error != 0) 553 break; 554 if (fp->f_ops == &path_fileops) { 555 fdrop(fp, td); 556 error = EBADF; 557 break; 558 } 559 do { 560 tmp = flg = fp->f_flag; 561 tmp &= ~FCNTLFLAGS; 562 tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; 563 } while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); 564 tmp = fp->f_flag & FNONBLOCK; 565 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 566 if (error != 0) { 567 fdrop(fp, td); 568 break; 569 } 570 tmp = fp->f_flag & FASYNC; 571 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); 572 if (error == 0) { 573 fdrop(fp, td); 574 break; 575 } 576 atomic_clear_int(&fp->f_flag, FNONBLOCK); 577 tmp = 0; 578 (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 579 fdrop(fp, td); 580 break; 581 582 case F_GETOWN: 583 error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp); 584 if (error != 0) 585 break; 586 error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); 587 if (error == 0) 588 td->td_retval[0] = tmp; 589 fdrop(fp, td); 590 break; 591 592 case F_SETOWN: 593 error = fget_fcntl(td, fd, &cap_fcntl_rights, 
F_SETOWN, &fp); 594 if (error != 0) 595 break; 596 tmp = arg; 597 error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); 598 fdrop(fp, td); 599 break; 600 601 case F_SETLK_REMOTE: 602 error = priv_check(td, PRIV_NFS_LOCKD); 603 if (error != 0) 604 return (error); 605 flg = F_REMOTE; 606 goto do_setlk; 607 608 case F_SETLKW: 609 flg |= F_WAIT; 610 /* FALLTHROUGH F_SETLK */ 611 612 case F_SETLK: 613 do_setlk: 614 flp = (struct flock *)arg; 615 if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) { 616 error = EINVAL; 617 break; 618 } 619 620 error = fget_unlocked(td, fd, &cap_flock_rights, &fp); 621 if (error != 0) 622 break; 623 if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { 624 error = EBADF; 625 fdrop(fp, td); 626 break; 627 } 628 629 if (flp->l_whence == SEEK_CUR) { 630 foffset = foffset_get(fp); 631 if (foffset < 0 || 632 (flp->l_start > 0 && 633 foffset > OFF_MAX - flp->l_start)) { 634 error = EOVERFLOW; 635 fdrop(fp, td); 636 break; 637 } 638 flp->l_start += foffset; 639 } 640 641 vp = fp->f_vnode; 642 switch (flp->l_type) { 643 case F_RDLCK: 644 if ((fp->f_flag & FREAD) == 0) { 645 error = EBADF; 646 break; 647 } 648 if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { 649 PROC_LOCK(p->p_leader); 650 p->p_leader->p_flag |= P_ADVLOCK; 651 PROC_UNLOCK(p->p_leader); 652 } 653 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, 654 flp, flg); 655 break; 656 case F_WRLCK: 657 if ((fp->f_flag & FWRITE) == 0) { 658 error = EBADF; 659 break; 660 } 661 if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { 662 PROC_LOCK(p->p_leader); 663 p->p_leader->p_flag |= P_ADVLOCK; 664 PROC_UNLOCK(p->p_leader); 665 } 666 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, 667 flp, flg); 668 break; 669 case F_UNLCK: 670 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, 671 flp, flg); 672 break; 673 case F_UNLCKSYS: 674 if (flg != F_REMOTE) { 675 error = EINVAL; 676 break; 677 } 678 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, 679 F_UNLCKSYS, flp, flg); 680 break; 681 
default: 682 error = EINVAL; 683 break; 684 } 685 if (error != 0 || flp->l_type == F_UNLCK || 686 flp->l_type == F_UNLCKSYS) { 687 fdrop(fp, td); 688 break; 689 } 690 691 /* 692 * Check for a race with close. 693 * 694 * The vnode is now advisory locked (or unlocked, but this case 695 * is not really important) as the caller requested. 696 * We had to drop the filedesc lock, so we need to recheck if 697 * the descriptor is still valid, because if it was closed 698 * in the meantime we need to remove advisory lock from the 699 * vnode - close on any descriptor leading to an advisory 700 * locked vnode, removes that lock. 701 * We will return 0 on purpose in that case, as the result of 702 * successful advisory lock might have been externally visible 703 * already. This is fine - effectively we pretend to the caller 704 * that the closing thread was a bit slower and that the 705 * advisory lock succeeded before the close. 706 */ 707 error = fget_unlocked(td, fd, &cap_no_rights, &fp2); 708 if (error != 0) { 709 fdrop(fp, td); 710 break; 711 } 712 if (fp != fp2) { 713 flp->l_whence = SEEK_SET; 714 flp->l_start = 0; 715 flp->l_len = 0; 716 flp->l_type = F_UNLCK; 717 (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, 718 F_UNLCK, flp, F_POSIX); 719 } 720 fdrop(fp, td); 721 fdrop(fp2, td); 722 break; 723 724 case F_GETLK: 725 error = fget_unlocked(td, fd, &cap_flock_rights, &fp); 726 if (error != 0) 727 break; 728 if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { 729 error = EBADF; 730 fdrop(fp, td); 731 break; 732 } 733 flp = (struct flock *)arg; 734 if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && 735 flp->l_type != F_UNLCK) { 736 error = EINVAL; 737 fdrop(fp, td); 738 break; 739 } 740 if (flp->l_whence == SEEK_CUR) { 741 foffset = foffset_get(fp); 742 if ((flp->l_start > 0 && 743 foffset > OFF_MAX - flp->l_start) || 744 (flp->l_start < 0 && 745 foffset < OFF_MIN - flp->l_start)) { 746 error = EOVERFLOW; 747 fdrop(fp, td); 748 break; 749 } 750 flp->l_start 
+= foffset; 751 } 752 vp = fp->f_vnode; 753 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, 754 F_POSIX); 755 fdrop(fp, td); 756 break; 757 758 case F_ADD_SEALS: 759 error = fget_unlocked(td, fd, &cap_no_rights, &fp); 760 if (error != 0) 761 break; 762 error = fo_add_seals(fp, arg); 763 fdrop(fp, td); 764 break; 765 766 case F_GET_SEALS: 767 error = fget_unlocked(td, fd, &cap_no_rights, &fp); 768 if (error != 0) 769 break; 770 if (fo_get_seals(fp, &seals) == 0) 771 td->td_retval[0] = seals; 772 else 773 error = EINVAL; 774 fdrop(fp, td); 775 break; 776 777 case F_RDAHEAD: 778 arg = arg ? 128 * 1024: 0; 779 /* FALLTHROUGH */ 780 case F_READAHEAD: 781 error = fget_unlocked(td, fd, &cap_no_rights, &fp); 782 if (error != 0) 783 break; 784 if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { 785 fdrop(fp, td); 786 error = EBADF; 787 break; 788 } 789 vp = fp->f_vnode; 790 if (vp->v_type != VREG) { 791 fdrop(fp, td); 792 error = ENOTTY; 793 break; 794 } 795 796 /* 797 * Exclusive lock synchronizes against f_seqcount reads and 798 * writes in sequential_heuristic(). 799 */ 800 error = vn_lock(vp, LK_EXCLUSIVE); 801 if (error != 0) { 802 fdrop(fp, td); 803 break; 804 } 805 if (arg >= 0) { 806 bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; 807 arg = MIN(arg, INT_MAX - bsize + 1); 808 fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX, 809 (arg + bsize - 1) / bsize); 810 atomic_set_int(&fp->f_flag, FRDAHEAD); 811 } else { 812 atomic_clear_int(&fp->f_flag, FRDAHEAD); 813 } 814 VOP_UNLOCK(vp); 815 fdrop(fp, td); 816 break; 817 818 case F_ISUNIONSTACK: 819 /* 820 * Check if the vnode is part of a union stack (either the 821 * "union" flag from mount(2) or unionfs). 822 * 823 * Prior to introduction of this op libc's readdir would call 824 * fstatfs(2), in effect unnecessarily copying kilobytes of 825 * data just to check fs name and a mount flag. 
826 * 827 * Fixing the code to handle everything in the kernel instead 828 * is a non-trivial endeavor and has low priority, thus this 829 * horrible kludge facilitates the current behavior in a much 830 * cheaper manner until someone(tm) sorts this out. 831 */ 832 error = fget_unlocked(td, fd, &cap_no_rights, &fp); 833 if (error != 0) 834 break; 835 if (fp->f_type != DTYPE_VNODE) { 836 fdrop(fp, td); 837 error = EBADF; 838 break; 839 } 840 vp = fp->f_vnode; 841 /* 842 * Since we don't prevent dooming the vnode even non-null mp 843 * found can become immediately stale. This is tolerable since 844 * mount points are type-stable (providing safe memory access) 845 * and any vfs op on this vnode going forward will return an 846 * error (meaning return value in this case is meaningless). 847 */ 848 mp = atomic_load_ptr(&vp->v_mount); 849 if (__predict_false(mp == NULL)) { 850 fdrop(fp, td); 851 error = EBADF; 852 break; 853 } 854 td->td_retval[0] = 0; 855 if (mp->mnt_kern_flag & MNTK_UNIONFS || 856 mp->mnt_flag & MNT_UNION) 857 td->td_retval[0] = 1; 858 fdrop(fp, td); 859 break; 860 861 case F_KINFO: 862 #ifdef CAPABILITY_MODE 863 if (IN_CAPABILITY_MODE(td)) { 864 error = ECAPMODE; 865 break; 866 } 867 #endif 868 error = copyin((void *)arg, &kif_sz, sizeof(kif_sz)); 869 if (error != 0) 870 break; 871 if (kif_sz != sizeof(*kif)) { 872 error = EINVAL; 873 break; 874 } 875 kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK | M_ZERO); 876 FILEDESC_SLOCK(fdp); 877 error = fget_cap_locked(fdp, fd, &cap_fcntl_rights, &fp, NULL); 878 if (error == 0 && fhold(fp)) { 879 export_file_to_kinfo(fp, fd, NULL, kif, fdp, 0); 880 FILEDESC_SUNLOCK(fdp); 881 fdrop(fp, td); 882 if ((kif->kf_status & KF_ATTR_VALID) != 0) { 883 kif->kf_structsize = sizeof(*kif); 884 error = copyout(kif, (void *)arg, sizeof(*kif)); 885 } else { 886 error = EBADF; 887 } 888 } else { 889 FILEDESC_SUNLOCK(fdp); 890 if (error == 0) 891 error = EBADF; 892 } 893 free(kif, M_TEMP); 894 break; 895 896 default: 897 error = 
EINVAL; 898 break; 899 } 900 return (error); 901 } 902 903 static int 904 getmaxfd(struct thread *td) 905 { 906 907 return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); 908 } 909 910 /* 911 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). 912 */ 913 int 914 kern_dup(struct thread *td, u_int mode, int flags, int old, int new) 915 { 916 struct filedesc *fdp; 917 struct filedescent *oldfde, *newfde; 918 struct proc *p; 919 struct file *delfp, *oldfp; 920 u_long *oioctls, *nioctls; 921 int error, maxfd; 922 923 p = td->td_proc; 924 fdp = p->p_fd; 925 oioctls = NULL; 926 927 MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); 928 MPASS(mode < FDDUP_LASTMODE); 929 930 AUDIT_ARG_FD(old); 931 /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */ 932 933 /* 934 * Verify we have a valid descriptor to dup from and possibly to 935 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should 936 * return EINVAL when the new descriptor is out of bounds. 937 */ 938 if (old < 0) 939 return (EBADF); 940 if (new < 0) 941 return (mode == FDDUP_FCNTL ? EINVAL : EBADF); 942 maxfd = getmaxfd(td); 943 if (new >= maxfd) 944 return (mode == FDDUP_FCNTL ? EINVAL : EBADF); 945 946 error = EBADF; 947 FILEDESC_XLOCK(fdp); 948 if (fget_locked(fdp, old) == NULL) 949 goto unlock; 950 if (mode == FDDUP_FIXED && old == new) { 951 td->td_retval[0] = new; 952 if (flags & FDDUP_FLAG_CLOEXEC) 953 fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; 954 error = 0; 955 goto unlock; 956 } 957 958 oldfde = &fdp->fd_ofiles[old]; 959 oldfp = oldfde->fde_file; 960 if (!fhold(oldfp)) 961 goto unlock; 962 963 /* 964 * If the caller specified a file descriptor, make sure the file 965 * table is large enough to hold it, and grab it. Otherwise, just 966 * allocate a new descriptor the usual way. 
967 */ 968 switch (mode) { 969 case FDDUP_NORMAL: 970 case FDDUP_FCNTL: 971 if ((error = fdalloc(td, new, &new)) != 0) { 972 fdrop(oldfp, td); 973 goto unlock; 974 } 975 break; 976 case FDDUP_FIXED: 977 if (new >= fdp->fd_nfiles) { 978 /* 979 * The resource limits are here instead of e.g. 980 * fdalloc(), because the file descriptor table may be 981 * shared between processes, so we can't really use 982 * racct_add()/racct_sub(). Instead of counting the 983 * number of actually allocated descriptors, just put 984 * the limit on the size of the file descriptor table. 985 */ 986 #ifdef RACCT 987 if (RACCT_ENABLED()) { 988 error = racct_set_unlocked(p, RACCT_NOFILE, new + 1); 989 if (error != 0) { 990 error = EMFILE; 991 fdrop(oldfp, td); 992 goto unlock; 993 } 994 } 995 #endif 996 fdgrowtable_exp(fdp, new + 1); 997 } 998 if (!fdisused(fdp, new)) 999 fdused(fdp, new); 1000 break; 1001 default: 1002 KASSERT(0, ("%s unsupported mode %d", __func__, mode)); 1003 } 1004 1005 KASSERT(old != new, ("new fd is same as old")); 1006 1007 /* Refetch oldfde because the table may have grown and old one freed. */ 1008 oldfde = &fdp->fd_ofiles[old]; 1009 KASSERT(oldfp == oldfde->fde_file, 1010 ("fdt_ofiles shift from growth observed at fd %d", 1011 old)); 1012 1013 newfde = &fdp->fd_ofiles[new]; 1014 delfp = newfde->fde_file; 1015 1016 nioctls = filecaps_copy_prep(&oldfde->fde_caps); 1017 1018 /* 1019 * Duplicate the source descriptor. 
1020 */ 1021 #ifdef CAPABILITIES 1022 seqc_write_begin(&newfde->fde_seqc); 1023 #endif 1024 oioctls = filecaps_free_prep(&newfde->fde_caps); 1025 fde_copy(oldfde, newfde); 1026 filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, 1027 nioctls); 1028 if ((flags & FDDUP_FLAG_CLOEXEC) != 0) 1029 newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; 1030 else 1031 newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; 1032 #ifdef CAPABILITIES 1033 seqc_write_end(&newfde->fde_seqc); 1034 #endif 1035 td->td_retval[0] = new; 1036 1037 error = 0; 1038 1039 if (delfp != NULL) { 1040 (void) closefp(fdp, new, delfp, td, true, false); 1041 FILEDESC_UNLOCK_ASSERT(fdp); 1042 } else { 1043 unlock: 1044 FILEDESC_XUNLOCK(fdp); 1045 } 1046 1047 filecaps_free_finish(oioctls); 1048 return (error); 1049 } 1050 1051 static void 1052 sigiofree(struct sigio *sigio) 1053 { 1054 crfree(sigio->sio_ucred); 1055 free(sigio, M_SIGIO); 1056 } 1057 1058 static struct sigio * 1059 funsetown_locked(struct sigio *sigio) 1060 { 1061 struct proc *p; 1062 struct pgrp *pg; 1063 1064 SIGIO_ASSERT_LOCKED(); 1065 1066 if (sigio == NULL) 1067 return (NULL); 1068 *sigio->sio_myref = NULL; 1069 if (sigio->sio_pgid < 0) { 1070 pg = sigio->sio_pgrp; 1071 PGRP_LOCK(pg); 1072 SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); 1073 PGRP_UNLOCK(pg); 1074 } else { 1075 p = sigio->sio_proc; 1076 PROC_LOCK(p); 1077 SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); 1078 PROC_UNLOCK(p); 1079 } 1080 return (sigio); 1081 } 1082 1083 /* 1084 * If sigio is on the list associated with a process or process group, 1085 * disable signalling from the device, remove sigio from the list and 1086 * free sigio. 1087 */ 1088 void 1089 funsetown(struct sigio **sigiop) 1090 { 1091 struct sigio *sigio; 1092 1093 /* Racy check, consumers must provide synchronization. 
*/ 1094 if (*sigiop == NULL) 1095 return; 1096 1097 SIGIO_LOCK(); 1098 sigio = funsetown_locked(*sigiop); 1099 SIGIO_UNLOCK(); 1100 if (sigio != NULL) 1101 sigiofree(sigio); 1102 } 1103 1104 /* 1105 * Free a list of sigio structures. The caller must ensure that new sigio 1106 * structures cannot be added after this point. For process groups this is 1107 * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves 1108 * as an interlock. 1109 */ 1110 void 1111 funsetownlst(struct sigiolst *sigiolst) 1112 { 1113 struct proc *p; 1114 struct pgrp *pg; 1115 struct sigio *sigio, *tmp; 1116 1117 /* Racy check. */ 1118 sigio = SLIST_FIRST(sigiolst); 1119 if (sigio == NULL) 1120 return; 1121 1122 p = NULL; 1123 pg = NULL; 1124 1125 SIGIO_LOCK(); 1126 sigio = SLIST_FIRST(sigiolst); 1127 if (sigio == NULL) { 1128 SIGIO_UNLOCK(); 1129 return; 1130 } 1131 1132 /* 1133 * Every entry of the list should belong to a single proc or pgrp. 1134 */ 1135 if (sigio->sio_pgid < 0) { 1136 pg = sigio->sio_pgrp; 1137 sx_assert(&proctree_lock, SX_XLOCKED); 1138 PGRP_LOCK(pg); 1139 } else /* if (sigio->sio_pgid > 0) */ { 1140 p = sigio->sio_proc; 1141 PROC_LOCK(p); 1142 KASSERT((p->p_flag & P_WEXIT) != 0, 1143 ("%s: process %p is not exiting", __func__, p)); 1144 } 1145 1146 SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) { 1147 *sigio->sio_myref = NULL; 1148 if (pg != NULL) { 1149 KASSERT(sigio->sio_pgid < 0, 1150 ("Proc sigio in pgrp sigio list")); 1151 KASSERT(sigio->sio_pgrp == pg, 1152 ("Bogus pgrp in sigio list")); 1153 } else /* if (p != NULL) */ { 1154 KASSERT(sigio->sio_pgid > 0, 1155 ("Pgrp sigio in proc sigio list")); 1156 KASSERT(sigio->sio_proc == p, 1157 ("Bogus proc in sigio list")); 1158 } 1159 } 1160 1161 if (pg != NULL) 1162 PGRP_UNLOCK(pg); 1163 else 1164 PROC_UNLOCK(p); 1165 SIGIO_UNLOCK(); 1166 1167 SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp) 1168 sigiofree(sigio); 1169 } 1170 1171 /* 1172 * This is common code for FIOSETOWN ioctl called by fcntl(fd, 
F_SETOWN, arg). 1173 * 1174 * After permission checking, add a sigio structure to the sigio list for 1175 * the process or process group. 1176 */ 1177 int 1178 fsetown(pid_t pgid, struct sigio **sigiop) 1179 { 1180 struct proc *proc; 1181 struct pgrp *pgrp; 1182 struct sigio *osigio, *sigio; 1183 int ret; 1184 1185 if (pgid == 0) { 1186 funsetown(sigiop); 1187 return (0); 1188 } 1189 1190 sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); 1191 sigio->sio_pgid = pgid; 1192 sigio->sio_ucred = crhold(curthread->td_ucred); 1193 sigio->sio_myref = sigiop; 1194 1195 ret = 0; 1196 if (pgid > 0) { 1197 ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc); 1198 SIGIO_LOCK(); 1199 osigio = funsetown_locked(*sigiop); 1200 if (ret == 0) { 1201 PROC_LOCK(proc); 1202 _PRELE(proc); 1203 if ((proc->p_flag & P_WEXIT) != 0) { 1204 ret = ESRCH; 1205 } else if (proc->p_session != 1206 curthread->td_proc->p_session) { 1207 /* 1208 * Policy - Don't allow a process to FSETOWN a 1209 * process in another session. 1210 * 1211 * Remove this test to allow maximum flexibility 1212 * or restrict FSETOWN to the current process or 1213 * process group for maximum safety. 1214 */ 1215 ret = EPERM; 1216 } else { 1217 sigio->sio_proc = proc; 1218 SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, 1219 sio_pgsigio); 1220 } 1221 PROC_UNLOCK(proc); 1222 } 1223 } else /* if (pgid < 0) */ { 1224 sx_slock(&proctree_lock); 1225 SIGIO_LOCK(); 1226 osigio = funsetown_locked(*sigiop); 1227 pgrp = pgfind(-pgid); 1228 if (pgrp == NULL) { 1229 ret = ESRCH; 1230 } else { 1231 if (pgrp->pg_session != curthread->td_proc->p_session) { 1232 /* 1233 * Policy - Don't allow a process to FSETOWN a 1234 * process in another session. 1235 * 1236 * Remove this test to allow maximum flexibility 1237 * or restrict FSETOWN to the current process or 1238 * process group for maximum safety. 
1239 */ 1240 ret = EPERM; 1241 } else { 1242 sigio->sio_pgrp = pgrp; 1243 SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, 1244 sio_pgsigio); 1245 } 1246 PGRP_UNLOCK(pgrp); 1247 } 1248 sx_sunlock(&proctree_lock); 1249 } 1250 if (ret == 0) 1251 *sigiop = sigio; 1252 SIGIO_UNLOCK(); 1253 if (osigio != NULL) 1254 sigiofree(osigio); 1255 return (ret); 1256 } 1257 1258 /* 1259 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). 1260 */ 1261 pid_t 1262 fgetown(struct sigio **sigiop) 1263 { 1264 pid_t pgid; 1265 1266 SIGIO_LOCK(); 1267 pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; 1268 SIGIO_UNLOCK(); 1269 return (pgid); 1270 } 1271 1272 static int 1273 closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, 1274 bool audit) 1275 { 1276 int error; 1277 1278 FILEDESC_XLOCK_ASSERT(fdp); 1279 1280 /* 1281 * We now hold the fp reference that used to be owned by the 1282 * descriptor array. We have to unlock the FILEDESC *AFTER* 1283 * knote_fdclose to prevent a race of the fd getting opened, a knote 1284 * added, and deleteing a knote for the new fd. 1285 */ 1286 if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist))) 1287 knote_fdclose(td, fd); 1288 1289 /* 1290 * We need to notify mqueue if the object is of type mqueue. 1291 */ 1292 if (__predict_false(fp->f_type == DTYPE_MQUEUE)) 1293 mq_fdclose(td, fd, fp); 1294 FILEDESC_XUNLOCK(fdp); 1295 1296 #ifdef AUDIT 1297 if (AUDITING_TD(td) && audit) 1298 audit_sysclose(td, fd, fp); 1299 #endif 1300 error = closef(fp, td); 1301 1302 /* 1303 * All paths leading up to closefp() will have already removed or 1304 * replaced the fd in the filedesc table, so a restart would not 1305 * operate on the same file. 
1306 */ 1307 if (error == ERESTART) 1308 error = EINTR; 1309 1310 return (error); 1311 } 1312 1313 static int 1314 closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, 1315 bool holdleaders, bool audit) 1316 { 1317 int error; 1318 1319 FILEDESC_XLOCK_ASSERT(fdp); 1320 1321 if (holdleaders) { 1322 if (td->td_proc->p_fdtol != NULL) { 1323 /* 1324 * Ask fdfree() to sleep to ensure that all relevant 1325 * process leaders can be traversed in closef(). 1326 */ 1327 fdp->fd_holdleaderscount++; 1328 } else { 1329 holdleaders = false; 1330 } 1331 } 1332 1333 error = closefp_impl(fdp, fd, fp, td, audit); 1334 if (holdleaders) { 1335 FILEDESC_XLOCK(fdp); 1336 fdp->fd_holdleaderscount--; 1337 if (fdp->fd_holdleaderscount == 0 && 1338 fdp->fd_holdleaderswakeup != 0) { 1339 fdp->fd_holdleaderswakeup = 0; 1340 wakeup(&fdp->fd_holdleaderscount); 1341 } 1342 FILEDESC_XUNLOCK(fdp); 1343 } 1344 return (error); 1345 } 1346 1347 static int 1348 closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, 1349 bool holdleaders, bool audit) 1350 { 1351 1352 FILEDESC_XLOCK_ASSERT(fdp); 1353 1354 if (__predict_false(td->td_proc->p_fdtol != NULL)) { 1355 return (closefp_hl(fdp, fd, fp, td, holdleaders, audit)); 1356 } else { 1357 return (closefp_impl(fdp, fd, fp, td, audit)); 1358 } 1359 } 1360 1361 /* 1362 * Close a file descriptor. 1363 */ 1364 #ifndef _SYS_SYSPROTO_H_ 1365 struct close_args { 1366 int fd; 1367 }; 1368 #endif 1369 /* ARGSUSED */ 1370 int 1371 sys_close(struct thread *td, struct close_args *uap) 1372 { 1373 1374 return (kern_close(td, uap->fd)); 1375 } 1376 1377 int 1378 kern_close(struct thread *td, int fd) 1379 { 1380 struct filedesc *fdp; 1381 struct file *fp; 1382 1383 fdp = td->td_proc->p_fd; 1384 1385 FILEDESC_XLOCK(fdp); 1386 if ((fp = fget_locked(fdp, fd)) == NULL) { 1387 FILEDESC_XUNLOCK(fdp); 1388 return (EBADF); 1389 } 1390 fdfree(fdp, fd); 1391 1392 /* closefp() drops the FILEDESC lock for us. 
*/ 1393 return (closefp(fdp, fd, fp, td, true, true)); 1394 } 1395 1396 int 1397 kern_close_range(struct thread *td, u_int lowfd, u_int highfd) 1398 { 1399 struct filedesc *fdp; 1400 const struct fdescenttbl *fdt; 1401 struct file *fp; 1402 int fd; 1403 1404 /* 1405 * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2 1406 * open should not be a usage error. From a close_range() perspective, 1407 * close_range(3, ~0U, 0) in the same scenario should also likely not 1408 * be a usage error as all fd above 3 are in-fact already closed. 1409 */ 1410 if (highfd < lowfd) { 1411 return (EINVAL); 1412 } 1413 1414 fdp = td->td_proc->p_fd; 1415 FILEDESC_XLOCK(fdp); 1416 fdt = atomic_load_ptr(&fdp->fd_files); 1417 highfd = MIN(highfd, fdt->fdt_nfiles - 1); 1418 fd = lowfd; 1419 if (__predict_false(fd > highfd)) { 1420 goto out_locked; 1421 } 1422 for (;;) { 1423 fp = fdt->fdt_ofiles[fd].fde_file; 1424 if (fp == NULL) { 1425 if (fd == highfd) 1426 goto out_locked; 1427 } else { 1428 fdfree(fdp, fd); 1429 (void) closefp(fdp, fd, fp, td, true, true); 1430 if (fd == highfd) 1431 goto out_unlocked; 1432 FILEDESC_XLOCK(fdp); 1433 fdt = atomic_load_ptr(&fdp->fd_files); 1434 } 1435 fd++; 1436 } 1437 out_locked: 1438 FILEDESC_XUNLOCK(fdp); 1439 out_unlocked: 1440 return (0); 1441 } 1442 1443 #ifndef _SYS_SYSPROTO_H_ 1444 struct close_range_args { 1445 u_int lowfd; 1446 u_int highfd; 1447 int flags; 1448 }; 1449 #endif 1450 int 1451 sys_close_range(struct thread *td, struct close_range_args *uap) 1452 { 1453 1454 AUDIT_ARG_FD(uap->lowfd); 1455 AUDIT_ARG_CMD(uap->highfd); 1456 AUDIT_ARG_FFLAGS(uap->flags); 1457 1458 /* No flags currently defined */ 1459 if (uap->flags != 0) 1460 return (EINVAL); 1461 return (kern_close_range(td, uap->lowfd, uap->highfd)); 1462 } 1463 1464 #ifdef COMPAT_FREEBSD12 1465 /* 1466 * Close open file descriptors. 
1467 */ 1468 #ifndef _SYS_SYSPROTO_H_ 1469 struct freebsd12_closefrom_args { 1470 int lowfd; 1471 }; 1472 #endif 1473 /* ARGSUSED */ 1474 int 1475 freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap) 1476 { 1477 u_int lowfd; 1478 1479 AUDIT_ARG_FD(uap->lowfd); 1480 1481 /* 1482 * Treat negative starting file descriptor values identical to 1483 * closefrom(0) which closes all files. 1484 */ 1485 lowfd = MAX(0, uap->lowfd); 1486 return (kern_close_range(td, lowfd, ~0U)); 1487 } 1488 #endif /* COMPAT_FREEBSD12 */ 1489 1490 #if defined(COMPAT_43) 1491 /* 1492 * Return status information about a file descriptor. 1493 */ 1494 #ifndef _SYS_SYSPROTO_H_ 1495 struct ofstat_args { 1496 int fd; 1497 struct ostat *sb; 1498 }; 1499 #endif 1500 /* ARGSUSED */ 1501 int 1502 ofstat(struct thread *td, struct ofstat_args *uap) 1503 { 1504 struct ostat oub; 1505 struct stat ub; 1506 int error; 1507 1508 error = kern_fstat(td, uap->fd, &ub); 1509 if (error == 0) { 1510 cvtstat(&ub, &oub); 1511 error = copyout(&oub, uap->sb, sizeof(oub)); 1512 } 1513 return (error); 1514 } 1515 #endif /* COMPAT_43 */ 1516 1517 #if defined(COMPAT_FREEBSD11) 1518 int 1519 freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap) 1520 { 1521 struct stat sb; 1522 struct freebsd11_stat osb; 1523 int error; 1524 1525 error = kern_fstat(td, uap->fd, &sb); 1526 if (error != 0) 1527 return (error); 1528 error = freebsd11_cvtstat(&sb, &osb); 1529 if (error == 0) 1530 error = copyout(&osb, uap->sb, sizeof(osb)); 1531 return (error); 1532 } 1533 #endif /* COMPAT_FREEBSD11 */ 1534 1535 /* 1536 * Return status information about a file descriptor. 
1537 */ 1538 #ifndef _SYS_SYSPROTO_H_ 1539 struct fstat_args { 1540 int fd; 1541 struct stat *sb; 1542 }; 1543 #endif 1544 /* ARGSUSED */ 1545 int 1546 sys_fstat(struct thread *td, struct fstat_args *uap) 1547 { 1548 struct stat ub; 1549 int error; 1550 1551 error = kern_fstat(td, uap->fd, &ub); 1552 if (error == 0) 1553 error = copyout(&ub, uap->sb, sizeof(ub)); 1554 return (error); 1555 } 1556 1557 int 1558 kern_fstat(struct thread *td, int fd, struct stat *sbp) 1559 { 1560 struct file *fp; 1561 int error; 1562 1563 AUDIT_ARG_FD(fd); 1564 1565 error = fget(td, fd, &cap_fstat_rights, &fp); 1566 if (__predict_false(error != 0)) 1567 return (error); 1568 1569 AUDIT_ARG_FILE(td->td_proc, fp); 1570 1571 error = fo_stat(fp, sbp, td->td_ucred); 1572 fdrop(fp, td); 1573 #ifdef __STAT_TIME_T_EXT 1574 sbp->st_atim_ext = 0; 1575 sbp->st_mtim_ext = 0; 1576 sbp->st_ctim_ext = 0; 1577 sbp->st_btim_ext = 0; 1578 #endif 1579 #ifdef KTRACE 1580 if (KTRPOINT(td, KTR_STRUCT)) 1581 ktrstat_error(sbp, error); 1582 #endif 1583 return (error); 1584 } 1585 1586 #if defined(COMPAT_FREEBSD11) 1587 /* 1588 * Return status information about a file descriptor. 1589 */ 1590 #ifndef _SYS_SYSPROTO_H_ 1591 struct freebsd11_nfstat_args { 1592 int fd; 1593 struct nstat *sb; 1594 }; 1595 #endif 1596 /* ARGSUSED */ 1597 int 1598 freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap) 1599 { 1600 struct nstat nub; 1601 struct stat ub; 1602 int error; 1603 1604 error = kern_fstat(td, uap->fd, &ub); 1605 if (error != 0) 1606 return (error); 1607 error = freebsd11_cvtnstat(&ub, &nub); 1608 if (error != 0) 1609 error = copyout(&nub, uap->sb, sizeof(nub)); 1610 return (error); 1611 } 1612 #endif /* COMPAT_FREEBSD11 */ 1613 1614 /* 1615 * Return pathconf information about a file descriptor. 
1616 */ 1617 #ifndef _SYS_SYSPROTO_H_ 1618 struct fpathconf_args { 1619 int fd; 1620 int name; 1621 }; 1622 #endif 1623 /* ARGSUSED */ 1624 int 1625 sys_fpathconf(struct thread *td, struct fpathconf_args *uap) 1626 { 1627 long value; 1628 int error; 1629 1630 error = kern_fpathconf(td, uap->fd, uap->name, &value); 1631 if (error == 0) 1632 td->td_retval[0] = value; 1633 return (error); 1634 } 1635 1636 int 1637 kern_fpathconf(struct thread *td, int fd, int name, long *valuep) 1638 { 1639 struct file *fp; 1640 struct vnode *vp; 1641 int error; 1642 1643 error = fget(td, fd, &cap_fpathconf_rights, &fp); 1644 if (error != 0) 1645 return (error); 1646 1647 if (name == _PC_ASYNC_IO) { 1648 *valuep = _POSIX_ASYNCHRONOUS_IO; 1649 goto out; 1650 } 1651 vp = fp->f_vnode; 1652 if (vp != NULL) { 1653 vn_lock(vp, LK_SHARED | LK_RETRY); 1654 error = VOP_PATHCONF(vp, name, valuep); 1655 VOP_UNLOCK(vp); 1656 } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { 1657 if (name != _PC_PIPE_BUF) { 1658 error = EINVAL; 1659 } else { 1660 *valuep = PIPE_BUF; 1661 error = 0; 1662 } 1663 } else { 1664 error = EOPNOTSUPP; 1665 } 1666 out: 1667 fdrop(fp, td); 1668 return (error); 1669 } 1670 1671 /* 1672 * Copy filecaps structure allocating memory for ioctls array if needed. 1673 * 1674 * The last parameter indicates whether the fdtable is locked. If it is not and 1675 * ioctls are encountered, copying fails and the caller must lock the table. 1676 * 1677 * Note that if the table was not locked, the caller has to check the relevant 1678 * sequence counter to determine whether the operation was successful. 
1679 */ 1680 bool 1681 filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked) 1682 { 1683 size_t size; 1684 1685 if (src->fc_ioctls != NULL && !locked) 1686 return (false); 1687 memcpy(dst, src, sizeof(*src)); 1688 if (src->fc_ioctls == NULL) 1689 return (true); 1690 1691 KASSERT(src->fc_nioctls > 0, 1692 ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); 1693 1694 size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; 1695 dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); 1696 memcpy(dst->fc_ioctls, src->fc_ioctls, size); 1697 return (true); 1698 } 1699 1700 static u_long * 1701 filecaps_copy_prep(const struct filecaps *src) 1702 { 1703 u_long *ioctls; 1704 size_t size; 1705 1706 if (__predict_true(src->fc_ioctls == NULL)) 1707 return (NULL); 1708 1709 KASSERT(src->fc_nioctls > 0, 1710 ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); 1711 1712 size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; 1713 ioctls = malloc(size, M_FILECAPS, M_WAITOK); 1714 return (ioctls); 1715 } 1716 1717 static void 1718 filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, 1719 u_long *ioctls) 1720 { 1721 size_t size; 1722 1723 *dst = *src; 1724 if (__predict_true(src->fc_ioctls == NULL)) { 1725 MPASS(ioctls == NULL); 1726 return; 1727 } 1728 1729 size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; 1730 dst->fc_ioctls = ioctls; 1731 bcopy(src->fc_ioctls, dst->fc_ioctls, size); 1732 } 1733 1734 /* 1735 * Move filecaps structure to the new place and clear the old place. 1736 */ 1737 void 1738 filecaps_move(struct filecaps *src, struct filecaps *dst) 1739 { 1740 1741 *dst = *src; 1742 bzero(src, sizeof(*src)); 1743 } 1744 1745 /* 1746 * Fill the given filecaps structure with full rights. 
1747 */ 1748 static void 1749 filecaps_fill(struct filecaps *fcaps) 1750 { 1751 1752 CAP_ALL(&fcaps->fc_rights); 1753 fcaps->fc_ioctls = NULL; 1754 fcaps->fc_nioctls = -1; 1755 fcaps->fc_fcntls = CAP_FCNTL_ALL; 1756 } 1757 1758 /* 1759 * Free memory allocated within filecaps structure. 1760 */ 1761 void 1762 filecaps_free(struct filecaps *fcaps) 1763 { 1764 1765 free(fcaps->fc_ioctls, M_FILECAPS); 1766 bzero(fcaps, sizeof(*fcaps)); 1767 } 1768 1769 static u_long * 1770 filecaps_free_prep(struct filecaps *fcaps) 1771 { 1772 u_long *ioctls; 1773 1774 ioctls = fcaps->fc_ioctls; 1775 bzero(fcaps, sizeof(*fcaps)); 1776 return (ioctls); 1777 } 1778 1779 static void 1780 filecaps_free_finish(u_long *ioctls) 1781 { 1782 1783 free(ioctls, M_FILECAPS); 1784 } 1785 1786 /* 1787 * Validate the given filecaps structure. 1788 */ 1789 static void 1790 filecaps_validate(const struct filecaps *fcaps, const char *func) 1791 { 1792 1793 KASSERT(cap_rights_is_valid(&fcaps->fc_rights), 1794 ("%s: invalid rights", func)); 1795 KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, 1796 ("%s: invalid fcntls", func)); 1797 KASSERT(fcaps->fc_fcntls == 0 || 1798 cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), 1799 ("%s: fcntls without CAP_FCNTL", func)); 1800 KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 : 1801 (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), 1802 ("%s: invalid ioctls", func)); 1803 KASSERT(fcaps->fc_nioctls == 0 || 1804 cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), 1805 ("%s: ioctls without CAP_IOCTL", func)); 1806 } 1807 1808 static void 1809 fdgrowtable_exp(struct filedesc *fdp, int nfd) 1810 { 1811 int nfd1; 1812 1813 FILEDESC_XLOCK_ASSERT(fdp); 1814 1815 nfd1 = fdp->fd_nfiles * 2; 1816 if (nfd1 < nfd) 1817 nfd1 = nfd; 1818 fdgrowtable(fdp, nfd1); 1819 } 1820 1821 /* 1822 * Grow the file table to accommodate (at least) nfd descriptors. 
1823 */ 1824 static void 1825 fdgrowtable(struct filedesc *fdp, int nfd) 1826 { 1827 struct filedesc0 *fdp0; 1828 struct freetable *ft; 1829 struct fdescenttbl *ntable; 1830 struct fdescenttbl *otable; 1831 int nnfiles, onfiles; 1832 NDSLOTTYPE *nmap, *omap; 1833 1834 KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); 1835 1836 /* save old values */ 1837 onfiles = fdp->fd_nfiles; 1838 otable = fdp->fd_files; 1839 omap = fdp->fd_map; 1840 1841 /* compute the size of the new table */ 1842 nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ 1843 if (nnfiles <= onfiles) 1844 /* the table is already large enough */ 1845 return; 1846 1847 /* 1848 * Allocate a new table. We need enough space for the number of 1849 * entries, file entries themselves and the struct freetable we will use 1850 * when we decommission the table and place it on the freelist. 1851 * We place the struct freetable in the middle so we don't have 1852 * to worry about padding. 1853 */ 1854 ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + 1855 nnfiles * sizeof(ntable->fdt_ofiles[0]) + 1856 sizeof(struct freetable), 1857 M_FILEDESC, M_ZERO | M_WAITOK); 1858 /* copy the old data */ 1859 ntable->fdt_nfiles = nnfiles; 1860 memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, 1861 onfiles * sizeof(ntable->fdt_ofiles[0])); 1862 1863 /* 1864 * Allocate a new map only if the old is not large enough. It will 1865 * grow at a slower rate than the table as it can map more 1866 * entries than the table can hold. 1867 */ 1868 if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { 1869 nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, 1870 M_ZERO | M_WAITOK); 1871 /* copy over the old data and update the pointer */ 1872 memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); 1873 fdp->fd_map = nmap; 1874 } 1875 1876 /* 1877 * Make sure that ntable is correctly initialized before we replace 1878 * fd_files poiner. Otherwise fget_unlocked() may see inconsistent 1879 * data. 
1880 */ 1881 atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); 1882 1883 /* 1884 * Free the old file table when not shared by other threads or processes. 1885 * The old file table is considered to be shared when either are true: 1886 * - The process has more than one thread. 1887 * - The file descriptor table has been shared via fdshare(). 1888 * 1889 * When shared, the old file table will be placed on a freelist 1890 * which will be processed when the struct filedesc is released. 1891 * 1892 * Note that if onfiles == NDFILE, we're dealing with the original 1893 * static allocation contained within (struct filedesc0 *)fdp, 1894 * which must not be freed. 1895 */ 1896 if (onfiles > NDFILE) { 1897 /* 1898 * Note we may be called here from fdinit while allocating a 1899 * table for a new process in which case ->p_fd points 1900 * elsewhere. 1901 */ 1902 if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) { 1903 free(otable, M_FILEDESC); 1904 } else { 1905 ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; 1906 fdp0 = (struct filedesc0 *)fdp; 1907 ft->ft_table = otable; 1908 SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); 1909 } 1910 } 1911 /* 1912 * The map does not have the same possibility of threads still 1913 * holding references to it. So always free it as long as it 1914 * does not reference the original static allocation. 1915 */ 1916 if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) 1917 free(omap, M_FILEDESC); 1918 } 1919 1920 /* 1921 * Allocate a file descriptor for the process. 1922 */ 1923 int 1924 fdalloc(struct thread *td, int minfd, int *result) 1925 { 1926 struct proc *p = td->td_proc; 1927 struct filedesc *fdp = p->p_fd; 1928 int fd, maxfd, allocfd; 1929 #ifdef RACCT 1930 int error; 1931 #endif 1932 1933 FILEDESC_XLOCK_ASSERT(fdp); 1934 1935 if (fdp->fd_freefile > minfd) 1936 minfd = fdp->fd_freefile; 1937 1938 maxfd = getmaxfd(td); 1939 1940 /* 1941 * Search the bitmap for a free descriptor starting at minfd. 
1942 * If none is found, grow the file table. 1943 */ 1944 fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); 1945 if (__predict_false(fd >= maxfd)) 1946 return (EMFILE); 1947 if (__predict_false(fd >= fdp->fd_nfiles)) { 1948 allocfd = min(fd * 2, maxfd); 1949 #ifdef RACCT 1950 if (RACCT_ENABLED()) { 1951 error = racct_set_unlocked(p, RACCT_NOFILE, allocfd); 1952 if (error != 0) 1953 return (EMFILE); 1954 } 1955 #endif 1956 /* 1957 * fd is already equal to first free descriptor >= minfd, so 1958 * we only need to grow the table and we are done. 1959 */ 1960 fdgrowtable_exp(fdp, allocfd); 1961 } 1962 1963 /* 1964 * Perform some sanity checks, then mark the file descriptor as 1965 * used and return it to the caller. 1966 */ 1967 KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), 1968 ("invalid descriptor %d", fd)); 1969 KASSERT(!fdisused(fdp, fd), 1970 ("fd_first_free() returned non-free descriptor")); 1971 KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, 1972 ("file descriptor isn't free")); 1973 fdused(fdp, fd); 1974 *result = fd; 1975 return (0); 1976 } 1977 1978 /* 1979 * Allocate n file descriptors for the process. 1980 */ 1981 int 1982 fdallocn(struct thread *td, int minfd, int *fds, int n) 1983 { 1984 struct proc *p = td->td_proc; 1985 struct filedesc *fdp = p->p_fd; 1986 int i; 1987 1988 FILEDESC_XLOCK_ASSERT(fdp); 1989 1990 for (i = 0; i < n; i++) 1991 if (fdalloc(td, 0, &fds[i]) != 0) 1992 break; 1993 1994 if (i < n) { 1995 for (i--; i >= 0; i--) 1996 fdunused(fdp, fds[i]); 1997 return (EMFILE); 1998 } 1999 2000 return (0); 2001 } 2002 2003 /* 2004 * Create a new open file structure and allocate a file descriptor for the 2005 * process that refers to it. We add one reference to the file for the 2006 * descriptor table and one reference for resultfp. This is to prevent us 2007 * being preempted and the entry in the descriptor table closed after we 2008 * release the FILEDESC lock. 
2009 */ 2010 int 2011 falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, 2012 struct filecaps *fcaps) 2013 { 2014 struct file *fp; 2015 int error, fd; 2016 2017 MPASS(resultfp != NULL); 2018 MPASS(resultfd != NULL); 2019 2020 error = _falloc_noinstall(td, &fp, 2); 2021 if (__predict_false(error != 0)) { 2022 return (error); 2023 } 2024 2025 error = finstall_refed(td, fp, &fd, flags, fcaps); 2026 if (__predict_false(error != 0)) { 2027 falloc_abort(td, fp); 2028 return (error); 2029 } 2030 2031 *resultfp = fp; 2032 *resultfd = fd; 2033 2034 return (0); 2035 } 2036 2037 /* 2038 * Create a new open file structure without allocating a file descriptor. 2039 */ 2040 int 2041 _falloc_noinstall(struct thread *td, struct file **resultfp, u_int n) 2042 { 2043 struct file *fp; 2044 int maxuserfiles = maxfiles - (maxfiles / 20); 2045 int openfiles_new; 2046 static struct timeval lastfail; 2047 static int curfail; 2048 2049 KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); 2050 MPASS(n > 0); 2051 2052 openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; 2053 if ((openfiles_new >= maxuserfiles && 2054 priv_check(td, PRIV_MAXFILES) != 0) || 2055 openfiles_new >= maxfiles) { 2056 atomic_subtract_int(&openfiles, 1); 2057 if (ppsratecheck(&lastfail, &curfail, 1)) { 2058 printf("kern.maxfiles limit exceeded by uid %i, (%s) " 2059 "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); 2060 } 2061 return (ENFILE); 2062 } 2063 fp = uma_zalloc(file_zone, M_WAITOK); 2064 bzero(fp, sizeof(*fp)); 2065 refcount_init(&fp->f_count, n); 2066 fp->f_cred = crhold(td->td_ucred); 2067 fp->f_ops = &badfileops; 2068 *resultfp = fp; 2069 return (0); 2070 } 2071 2072 void 2073 falloc_abort(struct thread *td, struct file *fp) 2074 { 2075 2076 /* 2077 * For assertion purposes. 2078 */ 2079 refcount_init(&fp->f_count, 0); 2080 _fdrop(fp, td); 2081 } 2082 2083 /* 2084 * Install a file in a file descriptor table. 
2085 */ 2086 void 2087 _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, 2088 struct filecaps *fcaps) 2089 { 2090 struct filedescent *fde; 2091 2092 MPASS(fp != NULL); 2093 if (fcaps != NULL) 2094 filecaps_validate(fcaps, __func__); 2095 FILEDESC_XLOCK_ASSERT(fdp); 2096 2097 fde = &fdp->fd_ofiles[fd]; 2098 #ifdef CAPABILITIES 2099 seqc_write_begin(&fde->fde_seqc); 2100 #endif 2101 fde->fde_file = fp; 2102 fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; 2103 if (fcaps != NULL) 2104 filecaps_move(fcaps, &fde->fde_caps); 2105 else 2106 filecaps_fill(&fde->fde_caps); 2107 #ifdef CAPABILITIES 2108 seqc_write_end(&fde->fde_seqc); 2109 #endif 2110 } 2111 2112 int 2113 finstall_refed(struct thread *td, struct file *fp, int *fd, int flags, 2114 struct filecaps *fcaps) 2115 { 2116 struct filedesc *fdp = td->td_proc->p_fd; 2117 int error; 2118 2119 MPASS(fd != NULL); 2120 2121 FILEDESC_XLOCK(fdp); 2122 error = fdalloc(td, 0, fd); 2123 if (__predict_true(error == 0)) { 2124 _finstall(fdp, fp, *fd, flags, fcaps); 2125 } 2126 FILEDESC_XUNLOCK(fdp); 2127 return (error); 2128 } 2129 2130 int 2131 finstall(struct thread *td, struct file *fp, int *fd, int flags, 2132 struct filecaps *fcaps) 2133 { 2134 int error; 2135 2136 MPASS(fd != NULL); 2137 2138 if (!fhold(fp)) 2139 return (EBADF); 2140 error = finstall_refed(td, fp, fd, flags, fcaps); 2141 if (__predict_false(error != 0)) { 2142 fdrop(fp, td); 2143 } 2144 return (error); 2145 } 2146 2147 /* 2148 * Build a new filedesc structure from another. 2149 * 2150 * If fdp is not NULL, return with it shared locked. 2151 */ 2152 struct filedesc * 2153 fdinit(void) 2154 { 2155 struct filedesc0 *newfdp0; 2156 struct filedesc *newfdp; 2157 2158 newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); 2159 newfdp = &newfdp0->fd_fd; 2160 2161 /* Create the file descriptor table. 
*/ 2162 FILEDESC_LOCK_INIT(newfdp); 2163 refcount_init(&newfdp->fd_refcnt, 1); 2164 refcount_init(&newfdp->fd_holdcnt, 1); 2165 newfdp->fd_map = newfdp0->fd_dmap; 2166 newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; 2167 newfdp->fd_files->fdt_nfiles = NDFILE; 2168 2169 return (newfdp); 2170 } 2171 2172 /* 2173 * Build a pwddesc structure from another. 2174 * Copy the current, root, and jail root vnode references. 2175 * 2176 * If pdp is not NULL, return with it shared locked. 2177 */ 2178 struct pwddesc * 2179 pdinit(struct pwddesc *pdp, bool keeplock) 2180 { 2181 struct pwddesc *newpdp; 2182 struct pwd *newpwd; 2183 2184 newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO); 2185 2186 PWDDESC_LOCK_INIT(newpdp); 2187 refcount_init(&newpdp->pd_refcount, 1); 2188 newpdp->pd_cmask = CMASK; 2189 2190 if (pdp == NULL) { 2191 newpwd = pwd_alloc(); 2192 smr_serialized_store(&newpdp->pd_pwd, newpwd, true); 2193 return (newpdp); 2194 } 2195 2196 PWDDESC_XLOCK(pdp); 2197 newpwd = pwd_hold_pwddesc(pdp); 2198 smr_serialized_store(&newpdp->pd_pwd, newpwd, true); 2199 if (!keeplock) 2200 PWDDESC_XUNLOCK(pdp); 2201 return (newpdp); 2202 } 2203 2204 /* 2205 * Hold either filedesc or pwddesc of the passed process. 2206 * 2207 * The process lock is used to synchronize against the target exiting and 2208 * freeing the data. 2209 * 2210 * Clearing can be ilustrated in 3 steps: 2211 * 1. set the pointer to NULL. Either routine can race against it, hence 2212 * atomic_load_ptr. 2213 * 2. observe the process lock as not taken. Until then fdhold/pdhold can 2214 * race to either still see the pointer or find NULL. It is still safe to 2215 * grab a reference as clearing is stalled. 2216 * 3. 
after the lock is observed as not taken, any fdhold/pdhold calls are 2217 * guaranteed to see NULL, making it safe to finish clearing 2218 */ 2219 static struct filedesc * 2220 fdhold(struct proc *p) 2221 { 2222 struct filedesc *fdp; 2223 2224 PROC_LOCK_ASSERT(p, MA_OWNED); 2225 fdp = atomic_load_ptr(&p->p_fd); 2226 if (fdp != NULL) 2227 refcount_acquire(&fdp->fd_holdcnt); 2228 return (fdp); 2229 } 2230 2231 static struct pwddesc * 2232 pdhold(struct proc *p) 2233 { 2234 struct pwddesc *pdp; 2235 2236 PROC_LOCK_ASSERT(p, MA_OWNED); 2237 pdp = atomic_load_ptr(&p->p_pd); 2238 if (pdp != NULL) 2239 refcount_acquire(&pdp->pd_refcount); 2240 return (pdp); 2241 } 2242 2243 static void 2244 fddrop(struct filedesc *fdp) 2245 { 2246 2247 if (refcount_load(&fdp->fd_holdcnt) > 1) { 2248 if (refcount_release(&fdp->fd_holdcnt) == 0) 2249 return; 2250 } 2251 2252 FILEDESC_LOCK_DESTROY(fdp); 2253 uma_zfree(filedesc0_zone, fdp); 2254 } 2255 2256 static void 2257 pddrop(struct pwddesc *pdp) 2258 { 2259 struct pwd *pwd; 2260 2261 if (refcount_release_if_not_last(&pdp->pd_refcount)) 2262 return; 2263 2264 PWDDESC_XLOCK(pdp); 2265 if (refcount_release(&pdp->pd_refcount) == 0) { 2266 PWDDESC_XUNLOCK(pdp); 2267 return; 2268 } 2269 pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 2270 pwd_set(pdp, NULL); 2271 PWDDESC_XUNLOCK(pdp); 2272 pwd_drop(pwd); 2273 2274 PWDDESC_LOCK_DESTROY(pdp); 2275 free(pdp, M_PWDDESC); 2276 } 2277 2278 /* 2279 * Share a filedesc structure. 2280 */ 2281 struct filedesc * 2282 fdshare(struct filedesc *fdp) 2283 { 2284 2285 refcount_acquire(&fdp->fd_refcnt); 2286 return (fdp); 2287 } 2288 2289 /* 2290 * Share a pwddesc structure. 
2291 */ 2292 struct pwddesc * 2293 pdshare(struct pwddesc *pdp) 2294 { 2295 refcount_acquire(&pdp->pd_refcount); 2296 return (pdp); 2297 } 2298 2299 /* 2300 * Unshare a filedesc structure, if necessary by making a copy 2301 */ 2302 void 2303 fdunshare(struct thread *td) 2304 { 2305 struct filedesc *tmp; 2306 struct proc *p = td->td_proc; 2307 2308 if (refcount_load(&p->p_fd->fd_refcnt) == 1) 2309 return; 2310 2311 tmp = fdcopy(p->p_fd); 2312 fdescfree(td); 2313 p->p_fd = tmp; 2314 } 2315 2316 /* 2317 * Unshare a pwddesc structure. 2318 */ 2319 void 2320 pdunshare(struct thread *td) 2321 { 2322 struct pwddesc *pdp; 2323 struct proc *p; 2324 2325 p = td->td_proc; 2326 /* Not shared. */ 2327 if (refcount_load(&p->p_pd->pd_refcount) == 1) 2328 return; 2329 2330 pdp = pdcopy(p->p_pd); 2331 pdescfree(td); 2332 p->p_pd = pdp; 2333 } 2334 2335 /* 2336 * Copy a filedesc structure. A NULL pointer in returns a NULL reference, 2337 * this is to ease callers, not catch errors. 2338 */ 2339 struct filedesc * 2340 fdcopy(struct filedesc *fdp) 2341 { 2342 struct filedesc *newfdp; 2343 struct filedescent *nfde, *ofde; 2344 int i, lastfile; 2345 2346 MPASS(fdp != NULL); 2347 2348 newfdp = fdinit(); 2349 FILEDESC_SLOCK(fdp); 2350 for (;;) { 2351 lastfile = fdlastfile(fdp); 2352 if (lastfile < newfdp->fd_nfiles) 2353 break; 2354 FILEDESC_SUNLOCK(fdp); 2355 fdgrowtable(newfdp, lastfile + 1); 2356 FILEDESC_SLOCK(fdp); 2357 } 2358 /* copy all passable descriptors (i.e. 
not kqueue) */ 2359 newfdp->fd_freefile = fdp->fd_freefile; 2360 FILEDESC_FOREACH_FDE(fdp, i, ofde) { 2361 if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || 2362 !fhold(ofde->fde_file)) { 2363 if (newfdp->fd_freefile == fdp->fd_freefile) 2364 newfdp->fd_freefile = i; 2365 continue; 2366 } 2367 nfde = &newfdp->fd_ofiles[i]; 2368 *nfde = *ofde; 2369 filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); 2370 fdused_init(newfdp, i); 2371 } 2372 MPASS(newfdp->fd_freefile != -1); 2373 FILEDESC_SUNLOCK(fdp); 2374 return (newfdp); 2375 } 2376 2377 /* 2378 * Copy a pwddesc structure. 2379 */ 2380 struct pwddesc * 2381 pdcopy(struct pwddesc *pdp) 2382 { 2383 struct pwddesc *newpdp; 2384 2385 MPASS(pdp != NULL); 2386 2387 newpdp = pdinit(pdp, true); 2388 newpdp->pd_cmask = pdp->pd_cmask; 2389 PWDDESC_XUNLOCK(pdp); 2390 return (newpdp); 2391 } 2392 2393 /* 2394 * Clear POSIX style locks. This is only used when fdp looses a reference (i.e. 2395 * one of processes using it exits) and the table used to be shared. 
2396 */ 2397 static void 2398 fdclearlocks(struct thread *td) 2399 { 2400 struct filedesc *fdp; 2401 struct filedesc_to_leader *fdtol; 2402 struct flock lf; 2403 struct file *fp; 2404 struct proc *p; 2405 struct vnode *vp; 2406 int i; 2407 2408 p = td->td_proc; 2409 fdp = p->p_fd; 2410 fdtol = p->p_fdtol; 2411 MPASS(fdtol != NULL); 2412 2413 FILEDESC_XLOCK(fdp); 2414 KASSERT(fdtol->fdl_refcount > 0, 2415 ("filedesc_to_refcount botch: fdl_refcount=%d", 2416 fdtol->fdl_refcount)); 2417 if (fdtol->fdl_refcount == 1 && 2418 (p->p_leader->p_flag & P_ADVLOCK) != 0) { 2419 FILEDESC_FOREACH_FP(fdp, i, fp) { 2420 if (fp->f_type != DTYPE_VNODE || 2421 !fhold(fp)) 2422 continue; 2423 FILEDESC_XUNLOCK(fdp); 2424 lf.l_whence = SEEK_SET; 2425 lf.l_start = 0; 2426 lf.l_len = 0; 2427 lf.l_type = F_UNLCK; 2428 vp = fp->f_vnode; 2429 (void) VOP_ADVLOCK(vp, 2430 (caddr_t)p->p_leader, F_UNLCK, 2431 &lf, F_POSIX); 2432 FILEDESC_XLOCK(fdp); 2433 fdrop(fp, td); 2434 } 2435 } 2436 retry: 2437 if (fdtol->fdl_refcount == 1) { 2438 if (fdp->fd_holdleaderscount > 0 && 2439 (p->p_leader->p_flag & P_ADVLOCK) != 0) { 2440 /* 2441 * close() or kern_dup() has cleared a reference 2442 * in a shared file descriptor table. 2443 */ 2444 fdp->fd_holdleaderswakeup = 1; 2445 sx_sleep(&fdp->fd_holdleaderscount, 2446 FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); 2447 goto retry; 2448 } 2449 if (fdtol->fdl_holdcount > 0) { 2450 /* 2451 * Ensure that fdtol->fdl_leader remains 2452 * valid in closef(). 
2453 */ 2454 fdtol->fdl_wakeup = 1; 2455 sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, 2456 "fdlhold", 0); 2457 goto retry; 2458 } 2459 } 2460 fdtol->fdl_refcount--; 2461 if (fdtol->fdl_refcount == 0 && 2462 fdtol->fdl_holdcount == 0) { 2463 fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; 2464 fdtol->fdl_prev->fdl_next = fdtol->fdl_next; 2465 } else 2466 fdtol = NULL; 2467 p->p_fdtol = NULL; 2468 FILEDESC_XUNLOCK(fdp); 2469 if (fdtol != NULL) 2470 free(fdtol, M_FILEDESC_TO_LEADER); 2471 } 2472 2473 /* 2474 * Release a filedesc structure. 2475 */ 2476 static void 2477 fdescfree_fds(struct thread *td, struct filedesc *fdp) 2478 { 2479 struct filedesc0 *fdp0; 2480 struct freetable *ft, *tft; 2481 struct filedescent *fde; 2482 struct file *fp; 2483 int i; 2484 2485 KASSERT(refcount_load(&fdp->fd_refcnt) == 0, 2486 ("%s: fd table %p carries references", __func__, fdp)); 2487 2488 /* 2489 * Serialize with threads iterating over the table, if any. 2490 */ 2491 if (refcount_load(&fdp->fd_holdcnt) > 1) { 2492 FILEDESC_XLOCK(fdp); 2493 FILEDESC_XUNLOCK(fdp); 2494 } 2495 2496 FILEDESC_FOREACH_FDE(fdp, i, fde) { 2497 fp = fde->fde_file; 2498 fdefree_last(fde); 2499 (void) closef(fp, td); 2500 } 2501 2502 if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) 2503 free(fdp->fd_map, M_FILEDESC); 2504 if (fdp->fd_nfiles > NDFILE) 2505 free(fdp->fd_files, M_FILEDESC); 2506 2507 fdp0 = (struct filedesc0 *)fdp; 2508 SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) 2509 free(ft->ft_table, M_FILEDESC); 2510 2511 fddrop(fdp); 2512 } 2513 2514 void 2515 fdescfree(struct thread *td) 2516 { 2517 struct proc *p; 2518 struct filedesc *fdp; 2519 2520 p = td->td_proc; 2521 fdp = p->p_fd; 2522 MPASS(fdp != NULL); 2523 2524 #ifdef RACCT 2525 if (RACCT_ENABLED()) 2526 racct_set_unlocked(p, RACCT_NOFILE, 0); 2527 #endif 2528 2529 if (p->p_fdtol != NULL) 2530 fdclearlocks(td); 2531 2532 /* 2533 * Check fdhold for an explanation. 
2534 */ 2535 atomic_store_ptr(&p->p_fd, NULL); 2536 atomic_thread_fence_seq_cst(); 2537 PROC_WAIT_UNLOCKED(p); 2538 2539 if (refcount_release(&fdp->fd_refcnt) == 0) 2540 return; 2541 2542 fdescfree_fds(td, fdp); 2543 } 2544 2545 void 2546 pdescfree(struct thread *td) 2547 { 2548 struct proc *p; 2549 struct pwddesc *pdp; 2550 2551 p = td->td_proc; 2552 pdp = p->p_pd; 2553 MPASS(pdp != NULL); 2554 2555 /* 2556 * Check pdhold for an explanation. 2557 */ 2558 atomic_store_ptr(&p->p_pd, NULL); 2559 atomic_thread_fence_seq_cst(); 2560 PROC_WAIT_UNLOCKED(p); 2561 2562 pddrop(pdp); 2563 } 2564 2565 /* 2566 * For setugid programs, we don't want to people to use that setugidness 2567 * to generate error messages which write to a file which otherwise would 2568 * otherwise be off-limits to the process. We check for filesystems where 2569 * the vnode can change out from under us after execve (like [lin]procfs). 2570 * 2571 * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is 2572 * sufficient. We also don't check for setugidness since we know we are. 2573 */ 2574 static bool 2575 is_unsafe(struct file *fp) 2576 { 2577 struct vnode *vp; 2578 2579 if (fp->f_type != DTYPE_VNODE) 2580 return (false); 2581 2582 vp = fp->f_vnode; 2583 return ((vp->v_vflag & VV_PROCDEP) != 0); 2584 } 2585 2586 /* 2587 * Make this setguid thing safe, if at all possible. 2588 */ 2589 void 2590 fdsetugidsafety(struct thread *td) 2591 { 2592 struct filedesc *fdp; 2593 struct file *fp; 2594 int i; 2595 2596 fdp = td->td_proc->p_fd; 2597 KASSERT(refcount_load(&fdp->fd_refcnt) == 1, 2598 ("the fdtable should not be shared")); 2599 MPASS(fdp->fd_nfiles >= 3); 2600 for (i = 0; i <= 2; i++) { 2601 fp = fdp->fd_ofiles[i].fde_file; 2602 if (fp != NULL && is_unsafe(fp)) { 2603 FILEDESC_XLOCK(fdp); 2604 knote_fdclose(td, i); 2605 /* 2606 * NULL-out descriptor prior to close to avoid 2607 * a race while close blocks. 
2608 */ 2609 fdfree(fdp, i); 2610 FILEDESC_XUNLOCK(fdp); 2611 (void) closef(fp, td); 2612 } 2613 } 2614 } 2615 2616 /* 2617 * If a specific file object occupies a specific file descriptor, close the 2618 * file descriptor entry and drop a reference on the file object. This is a 2619 * convenience function to handle a subsequent error in a function that calls 2620 * falloc() that handles the race that another thread might have closed the 2621 * file descriptor out from under the thread creating the file object. 2622 */ 2623 void 2624 fdclose(struct thread *td, struct file *fp, int idx) 2625 { 2626 struct filedesc *fdp = td->td_proc->p_fd; 2627 2628 FILEDESC_XLOCK(fdp); 2629 if (fdp->fd_ofiles[idx].fde_file == fp) { 2630 fdfree(fdp, idx); 2631 FILEDESC_XUNLOCK(fdp); 2632 fdrop(fp, td); 2633 } else 2634 FILEDESC_XUNLOCK(fdp); 2635 } 2636 2637 /* 2638 * Close any files on exec? 2639 */ 2640 void 2641 fdcloseexec(struct thread *td) 2642 { 2643 struct filedesc *fdp; 2644 struct filedescent *fde; 2645 struct file *fp; 2646 int i; 2647 2648 fdp = td->td_proc->p_fd; 2649 KASSERT(refcount_load(&fdp->fd_refcnt) == 1, 2650 ("the fdtable should not be shared")); 2651 FILEDESC_FOREACH_FDE(fdp, i, fde) { 2652 fp = fde->fde_file; 2653 if (fp->f_type == DTYPE_MQUEUE || 2654 (fde->fde_flags & UF_EXCLOSE)) { 2655 FILEDESC_XLOCK(fdp); 2656 fdfree(fdp, i); 2657 (void) closefp(fdp, i, fp, td, false, false); 2658 FILEDESC_UNLOCK_ASSERT(fdp); 2659 } 2660 } 2661 } 2662 2663 /* 2664 * It is unsafe for set[ug]id processes to be started with file 2665 * descriptors 0..2 closed, as these descriptors are given implicit 2666 * significance in the Standard C library. fdcheckstd() will create a 2667 * descriptor referencing /dev/null for each of stdin, stdout, and 2668 * stderr that is not already open. 
 */
int
fdcheckstd(struct thread *td)
{
	struct filedesc *fdp;
	register_t save;
	int i, error, devnull;

	fdp = td->td_proc->p_fd;
	KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
	    ("the fdtable should not be shared"));
	MPASS(fdp->fd_nfiles >= 3);
	devnull = -1;
	for (i = 0; i <= 2; i++) {
		if (fdp->fd_ofiles[i].fde_file != NULL)
			continue;

		/*
		 * kern_openat()/kern_dup() store the new fd in
		 * td_retval[0]; preserve the caller's value.
		 */
		save = td->td_retval[0];
		if (devnull != -1) {
			error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
		} else {
			error = kern_openat(td, AT_FDCWD, "/dev/null",
			    UIO_SYSSPACE, O_RDWR, 0);
			if (error == 0) {
				devnull = td->td_retval[0];
				KASSERT(devnull == i, ("we didn't get our fd"));
			}
		}
		td->td_retval[0] = save;
		if (error != 0)
			return (error);
	}
	return (0);
}

/*
 * Internal form of close.  Decrement reference count on file structure.
 * Note: td may be NULL when closing a file that was being passed in a
 * message.
 *
 * NOTE(review): the MPASS below enforces td != NULL here; the descriptor
 * passing case now goes through closef_nothread() instead — the comment
 * above appears stale.  Confirm before relying on it.
 */
int
closef(struct file *fp, struct thread *td)
{
	struct vnode *vp;
	struct flock lf;
	struct filedesc_to_leader *fdtol;
	struct filedesc *fdp;

	MPASS(td != NULL);

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor, and the thread pointer
	 * will be NULL.  Callers should be careful only to pass a
	 * NULL thread pointer when there really is no owning
	 * context that might have locks, or the locks will be
	 * leaked.
	 */
	if (fp->f_type == DTYPE_VNODE) {
		vp = fp->f_vnode;
		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
			lf.l_whence = SEEK_SET;
			lf.l_start = 0;
			lf.l_len = 0;
			lf.l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
			    F_UNLCK, &lf, F_POSIX);
		}
		fdtol = td->td_proc->p_fdtol;
		if (fdtol != NULL) {
			/*
			 * Handle special case where file descriptor table is
			 * shared between multiple process leaders.
			 */
			fdp = td->td_proc->p_fd;
			FILEDESC_XLOCK(fdp);
			for (fdtol = fdtol->fdl_next;
			    fdtol != td->td_proc->p_fdtol;
			    fdtol = fdtol->fdl_next) {
				if ((fdtol->fdl_leader->p_flag &
				    P_ADVLOCK) == 0)
					continue;
				/*
				 * fdl_holdcount pins this entry while the
				 * table lock is dropped for VOP_ADVLOCK();
				 * fdclearlocks() sleeps on it.
				 */
				fdtol->fdl_holdcount++;
				FILEDESC_XUNLOCK(fdp);
				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				vp = fp->f_vnode;
				(void) VOP_ADVLOCK(vp,
				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
				    F_POSIX);
				FILEDESC_XLOCK(fdp);
				fdtol->fdl_holdcount--;
				if (fdtol->fdl_holdcount == 0 &&
				    fdtol->fdl_wakeup != 0) {
					fdtol->fdl_wakeup = 0;
					wakeup(fdtol);
				}
			}
			FILEDESC_XUNLOCK(fdp);
		}
	}
	return (fdrop_close(fp, td));
}

/*
 * Hack for file descriptor passing code.
 */
void
closef_nothread(struct file *fp)
{

	fdrop(fp, NULL);
}

/*
 * Initialize the file pointer with the specified properties.
 *
 * The ops are set with release semantics to be certain that the flags, type,
 * and data are visible when ops is.  This is to prevent ops methods from being
 * called with bad data.
 */
void
finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
{
	fp->f_data = data;
	fp->f_flag = flag;
	fp->f_type = type;
	/* Publish ops last, with release semantics; see comment above. */
	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
}

/*
 * finit() variant for vnode-backed files: seeds the sequential-access
 * heuristic counters and preserves FHASLOCK across the FMASK filter.
 */
void
finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops)
{
	fp->f_seqcount[UIO_READ] = 1;
	fp->f_seqcount[UIO_WRITE] = 1;
	finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE,
	    data, ops);
}

/*
 * Look up fd in the locked table, verify capability rights and optionally
 * copy out the entry's capabilities.  On success *fpp points at the file
 * but NO reference is acquired; the pointer is only valid while the table
 * lock is held.
 */
int
fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
    struct file **fpp, struct filecaps *havecapsp)
{
	struct filedescent *fde;
	int error;

	FILEDESC_LOCK_ASSERT(fdp);

	*fpp = NULL;
	fde = fdeget_locked(fdp, fd);
	if (fde == NULL) {
		error = EBADF;
		goto out;
	}

#ifdef CAPABILITIES
	error = cap_check(cap_rights_fde_inline(fde), needrightsp);
	if (error != 0)
		goto out;
#endif

	if (havecapsp != NULL)
		filecaps_copy(&fde->fde_caps, havecapsp, true);

	*fpp = fde->fde_file;

	error = 0;
out:
	return (error);
}

#ifdef CAPABILITIES
/*
 * Lockless fget() variant that additionally returns a copy of the
 * descriptor's capabilities; falls back to the locked path when the caps
 * cannot be copied without sleeping.
 */
int
fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
    struct file **fpp, struct filecaps *havecapsp)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	int error;
	struct file *fp;
	seqc_t seq;

	*fpp = NULL;
	for (;;) {
		error = fget_unlocked_seq(td, fd, needrightsp, &fp, &seq);
		if (error != 0)
			return (error);

		if (havecapsp != NULL) {
			if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps,
			    havecapsp, false)) {
				fdrop(fp, td);
				goto get_locked;
			}
		}

		/* Retry if the entry changed while we copied the caps. */
		if (!fd_modified(fdp, fd, seq))
			break;
		fdrop(fp, td);
	}

	*fpp = fp;
	return (0);

get_locked:
	FILEDESC_SLOCK(fdp);
	error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp);
	if (error == 0 && !fhold(*fpp))
		error = EBADF;
	FILEDESC_SUNLOCK(fdp);
	return (error);
}
#else
int
fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
    struct file **fpp, struct filecaps *havecapsp)
{
	int error;
	error = fget_unlocked(td, fd, needrightsp, fpp);
	if (havecapsp != NULL && error == 0)
		filecaps_fill(havecapsp);

	return (error);
}
#endif

#ifdef CAPABILITIES
/*
 * SMR-protected, lockless fd -> vnode translation for the fast lookup
 * path.  Any inconsistency is reported as EAGAIN and the caller falls
 * back to the locked lookup.  Must be called within a vfs_smr section.
 */
int
fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
{
	const struct filedescent *fde;
	const struct fdescenttbl *fdt;
	struct filedesc *fdp;
	struct file *fp;
	struct vnode *vp;
	const cap_rights_t *haverights;
	cap_rights_t rights;
	seqc_t seq;

	VFS_SMR_ASSERT_ENTERED();

	rights = *ndp->ni_rightsneeded;
	cap_rights_set_one(&rights, CAP_LOOKUP);

	fdp = curproc->p_fd;
	fdt = fdp->fd_files;
	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
		return (EBADF);
	seq = seqc_read_notmodify(fd_seqc(fdt, fd));
	fde = &fdt->fdt_ofiles[fd];
	haverights = cap_rights_fde_inline(fde);
	fp = fde->fde_file;
	if (__predict_false(fp == NULL))
		return (EAGAIN);
	if (__predict_false(cap_check_inline_transient(haverights, &rights)))
		return (EAGAIN);
	*fsearch = ((fp->f_flag & FSEARCH) != 0);
	vp = fp->f_vnode;
	if (__predict_false(vp == NULL)) {
		return (EAGAIN);
	}
	if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) {
		return (EAGAIN);
	}
	/*
	 * Use an acquire barrier to force re-reading of fdt so it is
	 * refreshed for verification.
	 */
	atomic_thread_fence_acq();
	fdt = fdp->fd_files;
	if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)))
		return (EAGAIN);
	/*
	 * If file descriptor doesn't have all rights,
	 * all lookups relative to it must also be
	 * strictly relative.
	 *
	 * Not yet supported by fast path.
	 */
	CAP_ALL(&rights);
	if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) ||
	    ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
	    ndp->ni_filecaps.fc_nioctls != -1) {
#ifdef notyet
		ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
#else
		return (EAGAIN);
#endif
	}
	*vpp = vp;
	return (0);
}
#else
/*
 * Non-CAPABILITIES variant of the above.
 *
 * NOTE(review): this variant additionally rejects non-VDIR vnodes, while
 * the CAPABILITIES variant only checks vp != NULL — confirm the asymmetry
 * is intentional.
 */
int
fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
{
	const struct fdescenttbl *fdt;
	struct filedesc *fdp;
	struct file *fp;
	struct vnode *vp;

	VFS_SMR_ASSERT_ENTERED();

	fdp = curproc->p_fd;
	fdt = fdp->fd_files;
	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
		return (EBADF);
	fp = fdt->fdt_ofiles[fd].fde_file;
	if (__predict_false(fp == NULL))
		return (EAGAIN);
	*fsearch = ((fp->f_flag & FSEARCH) != 0);
	vp = fp->f_vnode;
	if (__predict_false(vp == NULL || vp->v_type != VDIR)) {
		return (EAGAIN);
	}
	/*
	 * Use an acquire barrier to force re-reading of fdt so it is
	 * refreshed for verification.
	 */
	atomic_thread_fence_acq();
	fdt = fdp->fd_files;
	if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
		return (EAGAIN);
	filecaps_fill(&ndp->ni_filecaps);
	*vpp = vp;
	return (0);
}
#endif

/*
 * Fetch the descriptor locklessly.
 *
 * We avoid fdrop() races by never raising a refcount above 0.  To accomplish
 * this we have to use a cmpset loop rather than an atomic_add.
 * The descriptor
 * must be re-verified once we acquire a reference to be certain that the
 * identity is still correct and we did not lose a race due to preemption.
 *
 * Force a reload of fdt when looping.  Another thread could reallocate
 * the table before this fd was closed, so it is possible that there is
 * a stale fp pointer in cached version.
 */
#ifdef CAPABILITIES
static int
fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp,
    struct file **fpp, seqc_t *seqp)
{
	struct filedesc *fdp;
	const struct filedescent *fde;
	const struct fdescenttbl *fdt;
	struct file *fp;
	seqc_t seq;
	cap_rights_t haverights;
	int error;

	fdp = td->td_proc->p_fd;
	fdt = fdp->fd_files;
	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
		return (EBADF);

	for (;;) {
		seq = seqc_read_notmodify(fd_seqc(fdt, fd));
		fde = &fdt->fdt_ofiles[fd];
		haverights = *cap_rights_fde_inline(fde);
		fp = fde->fde_file;
		if (__predict_false(fp == NULL)) {
			/*
			 * Only report EBADF if the snapshot was stable;
			 * otherwise reload the table and retry.
			 */
			if (seqc_consistent(fd_seqc(fdt, fd), seq))
				return (EBADF);
			fdt = atomic_load_ptr(&fdp->fd_files);
			continue;
		}
		error = cap_check_inline(&haverights, needrightsp);
		if (__predict_false(error != 0)) {
			/* Same rule: a failed check only counts if stable. */
			if (seqc_consistent(fd_seqc(fdt, fd), seq))
				return (error);
			fdt = atomic_load_ptr(&fdp->fd_files);
			continue;
		}
		if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) {
			fdt = atomic_load_ptr(&fdp->fd_files);
			continue;
		}
		/*
		 * Use an acquire barrier to force re-reading of fdt so it is
		 * refreshed for verification.
		 */
		atomic_thread_fence_acq();
		fdt = fdp->fd_files;
		if (seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))
			break;
		fdrop(fp, td);
	}
	*fpp = fp;
	if (seqp != NULL) {
		*seqp = seq;
	}
	return (0);
}
#else
static int
fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp,
    struct file **fpp, seqc_t *seqp __unused)
{
	struct filedesc *fdp;
	const struct fdescenttbl *fdt;
	struct file *fp;

	fdp = td->td_proc->p_fd;
	fdt = fdp->fd_files;
	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
		return (EBADF);

	for (;;) {
		fp = fdt->fdt_ofiles[fd].fde_file;
		if (__predict_false(fp == NULL))
			return (EBADF);
		if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) {
			fdt = atomic_load_ptr(&fdp->fd_files);
			continue;
		}
		/*
		 * Use an acquire barrier to force re-reading of fdt so it is
		 * refreshed for verification.
		 */
		atomic_thread_fence_acq();
		fdt = fdp->fd_files;
		if (__predict_true(fp == fdt->fdt_ofiles[fd].fde_file))
			break;
		fdrop(fp, td);
	}
	*fpp = fp;
	return (0);
}
#endif

/*
 * See the comments in fget_unlocked_seq for an explanation of how this works.
 *
 * This is a simplified variant which bails out to the aforementioned routine
 * if anything goes wrong.  In practice this only happens when userspace is
 * racing with itself.
 */
int
fget_unlocked(struct thread *td, int fd, cap_rights_t *needrightsp,
    struct file **fpp)
{
	struct filedesc *fdp;
#ifdef CAPABILITIES
	const struct filedescent *fde;
#endif
	const struct fdescenttbl *fdt;
	struct file *fp;
#ifdef CAPABILITIES
	seqc_t seq;
	const cap_rights_t *haverights;
#endif

	fdp = td->td_proc->p_fd;
	fdt = fdp->fd_files;
	if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) {
		*fpp = NULL;
		return (EBADF);
	}
#ifdef CAPABILITIES
	seq = seqc_read_notmodify(fd_seqc(fdt, fd));
	fde = &fdt->fdt_ofiles[fd];
	haverights = cap_rights_fde_inline(fde);
	fp = fde->fde_file;
#else
	fp = fdt->fdt_ofiles[fd].fde_file;
#endif
	if (__predict_false(fp == NULL))
		goto out_fallback;
#ifdef CAPABILITIES
	if (__predict_false(cap_check_inline_transient(haverights, needrightsp)))
		goto out_fallback;
#endif
	if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count)))
		goto out_fallback;

	/*
	 * Use an acquire barrier to force re-reading of fdt so it is
	 * refreshed for verification.
	 */
	atomic_thread_fence_acq();
	fdt = fdp->fd_files;
#ifdef CAPABILITIES
	if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)))
#else
	if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
#endif
		goto out_fdrop;
	*fpp = fp;
	return (0);
out_fdrop:
	fdrop(fp, td);
out_fallback:
	*fpp = NULL;
	return (fget_unlocked_seq(td, fd, needrightsp, fpp, NULL));
}

/*
 * Translate fd -> file when the caller guarantees the file descriptor table
 * can't be changed by others.
 *
 * Note this does not mean the file object itself is only visible to the caller,
 * merely that it wont disappear without having to be referenced.
 *
 * Must be paired with fput_only_user.
 */
#ifdef CAPABILITIES
int
fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
    struct file **fpp)
{
	const struct filedescent *fde;
	const struct fdescenttbl *fdt;
	const cap_rights_t *haverights;
	struct file *fp;
	int error;

	MPASS(FILEDESC_IS_ONLY_USER(fdp));

	*fpp = NULL;
	if (__predict_false(fd >= fdp->fd_nfiles))
		return (EBADF);

	fdt = fdp->fd_files;
	fde = &fdt->fdt_ofiles[fd];
	fp = fde->fde_file;
	if (__predict_false(fp == NULL))
		return (EBADF);
	/* No new reference is taken; the table can't change under us. */
	MPASS(refcount_load(&fp->f_count) > 0);
	haverights = cap_rights_fde_inline(fde);
	error = cap_check_inline(haverights, needrightsp);
	if (__predict_false(error != 0))
		return (error);
	*fpp = fp;
	return (0);
}
#else
int
fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
    struct file **fpp)
{
	struct file *fp;

	MPASS(FILEDESC_IS_ONLY_USER(fdp));

	*fpp = NULL;
	if (__predict_false(fd >= fdp->fd_nfiles))
		return (EBADF);

	fp = fdp->fd_ofiles[fd].fde_file;
	if (__predict_false(fp == NULL))
		return (EBADF);

	MPASS(refcount_load(&fp->f_count) > 0);
	*fpp = fp;
	return (0);
}
#endif

/*
 * Extract the file pointer associated with the specified descriptor for the
 * current user process.
 *
 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
 * returned.
 *
 * File's rights will be checked against the capability rights mask.
 *
 * If an error occurred the non-zero error is returned and *fpp is set to
 * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
 * responsible for fdrop().
 */
static __inline int
_fget(struct thread *td, int fd, struct file **fpp, int flags,
    cap_rights_t *needrightsp)
{
	struct file *fp;
	int error;

	*fpp = NULL;
	error = fget_unlocked(td, fd, needrightsp, &fp);
	if (__predict_false(error != 0))
		return (error);
	if (__predict_false(fp->f_ops == &badfileops)) {
		fdrop(fp, td);
		return (EBADF);
	}

	/*
	 * FREAD and FWRITE failure return EBADF as per POSIX.
	 */
	error = 0;
	switch (flags) {
	case FREAD:
	case FWRITE:
		if ((fp->f_flag & flags) == 0)
			error = EBADF;
		break;
	case FEXEC:
		/* Executable: readable-or-FEXEC and not writable. */
		if (fp->f_ops != &path_fileops &&
		    ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
		    (fp->f_flag & FWRITE) != 0))
			error = EBADF;
		break;
	case 0:
		break;
	default:
		KASSERT(0, ("wrong flags"));
	}

	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}

	*fpp = fp;
	return (0);
}

int
fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
{

	return (_fget(td, fd, fpp, 0, rightsp));
}

/*
 * fget() variant for mmap(2): optionally converts the descriptor's
 * capability rights into the maximum VM protection for the mapping.
 */
int
fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp,
    struct file **fpp)
{
	int error;
#ifndef CAPABILITIES
	error = _fget(td, fd, fpp, 0, rightsp);
	if (maxprotp != NULL)
		*maxprotp = VM_PROT_ALL;
	return (error);
#else
	cap_rights_t fdrights;
	struct filedesc *fdp;
	struct file *fp;
	seqc_t seq;

	*fpp = NULL;
	fdp = td->td_proc->p_fd;
	MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
	for (;;) {
		error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq);
		if (__predict_false(error != 0))
			return (error);
		if (__predict_false(fp->f_ops == &badfileops)) {
			fdrop(fp, td);
			return (EBADF);
		}
		if (maxprotp != NULL)
			fdrights = *cap_rights(fdp, fd);
		/* Retry if the entry changed while the rights were read. */
		if (!fd_modified(fdp, fd, seq))
			break;
		fdrop(fp, td);
	}

	/*
	 * If requested, convert capability rights to access flags.
	 */
	if (maxprotp != NULL)
		*maxprotp = cap_rights_to_vmprot(&fdrights);
	*fpp = fp;
	return (0);
#endif
}

int
fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
{

	return (_fget(td, fd, fpp, FREAD, rightsp));
}

int
fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
{

	return (_fget(td, fd, fpp, FWRITE, rightsp));
}

/*
 * fget() variant for fcntl(2): also validates the requested fcntl command
 * against the descriptor's capability fcntl mask.
 */
int
fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl,
    struct file **fpp)
{
#ifndef CAPABILITIES
	return (fget_unlocked(td, fd, rightsp, fpp));
#else
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp;
	int error;
	seqc_t seq;

	*fpp = NULL;
	MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
	for (;;) {
		error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq);
		if (error != 0)
			return (error);
		error = cap_fcntl_check(fdp, fd, needfcntl);
		if (!fd_modified(fdp, fd, seq))
			break;
		fdrop(fp, td);
	}
	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}
	*fpp = fp;
	return (0);
#endif
}

/*
 * Like fget() but loads the underlying vnode, or returns an error if the
 * descriptor does not represent a vnode.  Note that pipes use vnodes but
 * never have VM objects.  The returned vnode will be vref()'d.
 *
 * XXX: what about the unused flags ?
 */
static __inline int
_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
    struct vnode **vpp)
{
	struct file *fp;
	int error;

	*vpp = NULL;
	error = _fget(td, fd, &fp, flags, needrightsp);
	if (error != 0)
		return (error);
	if (fp->f_vnode == NULL) {
		error = EINVAL;
	} else {
		*vpp = fp->f_vnode;
		/* The vref holds the vnode after the file is dropped. */
		vref(*vpp);
	}
	fdrop(fp, td);

	return (error);
}

int
fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
{

	return (_fgetvp(td, fd, 0, rightsp, vpp));
}

/*
 * Like fgetvp() but also returns a copy of the descriptor's capabilities
 * in *havecaps; the caller owns them and must filecaps_free() eventually.
 */
int
fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
    struct filecaps *havecaps, struct vnode **vpp)
{
	struct filecaps caps;
	struct file *fp;
	int error;

	error = fget_cap(td, fd, needrightsp, &fp, &caps);
	if (error != 0)
		return (error);
	if (fp->f_ops == &badfileops) {
		error = EBADF;
		goto out;
	}
	if (fp->f_vnode == NULL) {
		error = EINVAL;
		goto out;
	}

	*havecaps = caps;
	*vpp = fp->f_vnode;
	vref(*vpp);
	fdrop(fp, td);

	return (0);
out:
	filecaps_free(&caps);
	fdrop(fp, td);
	return (error);
}

int
fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
{

	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
}

int
fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
{

	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
}

#ifdef notyet
int
fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
    struct vnode **vpp)
{

	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
}
#endif

/*
 * Handle the last reference to a file being closed.
 *
 * Without the noinline attribute clang keeps inlining the func thorough this
 * file when fdrop is used.
 */
int __noinline
_fdrop(struct file *fp, struct thread *td)
{
	int error;
#ifdef INVARIANTS
	int count;

	count = refcount_load(&fp->f_count);
	if (count != 0)
		panic("fdrop: fp %p count %d", fp, count);
#endif
	error = fo_close(fp, td);
	atomic_subtract_int(&openfiles, 1);
	crfree(fp->f_cred);
	free(fp->f_advice, M_FADVISE);
	uma_zfree(file_zone, fp);

	return (error);
}

/*
 * Apply an advisory lock on a file descriptor.
 *
 * Just attempt to get a record lock of the requested type on the entire file
 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
 */
#ifndef _SYS_SYSPROTO_H_
struct flock_args {
	int	fd;
	int	how;
};
#endif
/* ARGSUSED */
int
sys_flock(struct thread *td, struct flock_args *uap)
{
	struct file *fp;
	struct vnode *vp;
	struct flock lf;
	int error;

	error = fget(td, uap->fd, &cap_flock_rights, &fp);
	if (error != 0)
		return (error);
	error = EOPNOTSUPP;
	if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
		goto done;
	}
	if (fp->f_ops == &path_fileops) {
		goto done;
	}

	error = 0;
	vp = fp->f_vnode;
	lf.l_whence = SEEK_SET;
	lf.l_start = 0;
	lf.l_len = 0;
	if (uap->how & LOCK_UN) {
		lf.l_type = F_UNLCK;
		atomic_clear_int(&fp->f_flag, FHASLOCK);
		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
		goto done;
	}
	if (uap->how & LOCK_EX)
		lf.l_type = F_WRLCK;
	else if (uap->how & LOCK_SH)
		lf.l_type = F_RDLCK;
	else {
		error = EBADF;
		goto done;
	}
	atomic_set_int(&fp->f_flag, FHASLOCK);
	/* LOCK_NB selects a non-blocking lock attempt. */
	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
done:
	fdrop(fp, td);
	return (error);
}
/*
 * Duplicate the specified descriptor to a free descriptor.
 */
int
dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
    int openerror, int *indxp)
{
	struct filedescent *newfde, *oldfde;
	struct file *fp;
	u_long *ioctls;
	int error, indx;

	KASSERT(openerror == ENODEV || openerror == ENXIO,
	    ("unexpected error %d in %s", openerror, __func__));

	/*
	 * If the to-be-dup'd fd number is greater than the allowed number
	 * of file descriptors, or the fd to be dup'd has already been
	 * closed, then reject.
	 */
	FILEDESC_XLOCK(fdp);
	if ((fp = fget_locked(fdp, dfd)) == NULL) {
		FILEDESC_XUNLOCK(fdp);
		return (EBADF);
	}

	error = fdalloc(td, 0, &indx);
	if (error != 0) {
		FILEDESC_XUNLOCK(fdp);
		return (error);
	}

	/*
	 * There are two cases of interest here.
	 *
	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
	 *
	 * For ENXIO steal away the file structure from (dfd) and store it in
	 * (indx).  (dfd) is effectively closed by this operation.
	 */
	switch (openerror) {
	case ENODEV:
		/*
		 * Check that the mode the file is being opened for is a
		 * subset of the mode of the existing descriptor.
		 */
		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
			fdunused(fdp, indx);
			FILEDESC_XUNLOCK(fdp);
			return (EACCES);
		}
		if (!fhold(fp)) {
			fdunused(fdp, indx);
			FILEDESC_XUNLOCK(fdp);
			return (EBADF);
		}
		newfde = &fdp->fd_ofiles[indx];
		oldfde = &fdp->fd_ofiles[dfd];
		ioctls = filecaps_copy_prep(&oldfde->fde_caps);
#ifdef CAPABILITIES
		seqc_write_begin(&newfde->fde_seqc);
#endif
		fde_copy(oldfde, newfde);
		filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
		    ioctls);
#ifdef CAPABILITIES
		seqc_write_end(&newfde->fde_seqc);
#endif
		break;
	case ENXIO:
		/*
		 * Steal away the file pointer from dfd and stuff it into indx.
		 */
		newfde = &fdp->fd_ofiles[indx];
		oldfde = &fdp->fd_ofiles[dfd];
#ifdef CAPABILITIES
		seqc_write_begin(&oldfde->fde_seqc);
		seqc_write_begin(&newfde->fde_seqc);
#endif
		fde_copy(oldfde, newfde);
		oldfde->fde_file = NULL;
		fdunused(fdp, dfd);
#ifdef CAPABILITIES
		seqc_write_end(&newfde->fde_seqc);
		seqc_write_end(&oldfde->fde_seqc);
#endif
		break;
	}
	FILEDESC_XUNLOCK(fdp);
	*indxp = indx;
	return (0);
}

/*
 * This sysctl determines if we will allow a process to chroot(2) if it
 * has a directory open:
 *	0: disallowed for all processes.
 *	1: allowed for processes that were not already chroot(2)'ed.
 *	2: allowed for all processes.
 */

static int chroot_allow_open_directories = 1;

SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
    &chroot_allow_open_directories, 0,
    "Allow a process to chroot(2) if it has a directory open");

/*
 * Helper function for raised chroot(2) security function:  Refuse if
 * any filedescriptors are open directories.
 */
static int
chroot_refuse_vdir_fds(struct filedesc *fdp)
{
	struct vnode *vp;
	struct file *fp;
	int i;

	FILEDESC_LOCK_ASSERT(fdp);

	FILEDESC_FOREACH_FP(fdp, i, fp) {
		if (fp->f_type == DTYPE_VNODE) {
			vp = fp->f_vnode;
			if (vp->v_type == VDIR)
				return (EPERM);
		}
	}
	return (0);
}

/*
 * Copy any directory references (cwd, root, jail root) that are still
 * unset in newpwd from oldpwd, taking a vnode reference for each.
 */
static void
pwd_fill(struct pwd *oldpwd, struct pwd *newpwd)
{

	if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) {
		vrefact(oldpwd->pwd_cdir);
		newpwd->pwd_cdir = oldpwd->pwd_cdir;
	}

	if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) {
		vrefact(oldpwd->pwd_rdir);
		newpwd->pwd_rdir = oldpwd->pwd_rdir;
	}

	if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) {
		vrefact(oldpwd->pwd_jdir);
		newpwd->pwd_jdir = oldpwd->pwd_jdir;
	}
}

/*
 * Acquire a reference on the pwd of a locked pwddesc; may return NULL.
 */
struct pwd *
pwd_hold_pwddesc(struct pwddesc *pdp)
{
	struct pwd *pwd;

	PWDDESC_ASSERT_XLOCKED(pdp);
	pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
	if (pwd != NULL)
		refcount_acquire(&pwd->pwd_refcount);
	return (pwd);
}

/*
 * Try to acquire a reference on an SMR-published pwd; fails if the
 * refcount already dropped to zero.
 */
bool
pwd_hold_smr(struct pwd *pwd)
{

	MPASS(pwd != NULL);
	if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) {
		return (true);
	}
	return (false);
}

/*
 * Acquire a reference on the current thread's pwd, trying the lockless
 * SMR path first and falling back to the pwddesc lock.
 */
struct pwd *
pwd_hold(struct thread *td)
{
	struct pwddesc *pdp;
	struct pwd *pwd;

	pdp = td->td_proc->p_pd;

	vfs_smr_enter();
	pwd = vfs_smr_entered_load(&pdp->pd_pwd);
	if (pwd_hold_smr(pwd)) {
		vfs_smr_exit();
		return (pwd);
	}
	vfs_smr_exit();
	PWDDESC_XLOCK(pdp);
	pwd = pwd_hold_pwddesc(pdp);
	MPASS(pwd != NULL);
	PWDDESC_XUNLOCK(pdp);
	return (pwd);
}

/*
 * Acquire a reference on the pwd of another (held) process.
 */
struct pwd *
pwd_hold_proc(struct proc *p)
{
	struct pwddesc *pdp;
	struct pwd *pwd;

	PROC_ASSERT_HELD(p);
	PROC_LOCK(p);
	pdp = pdhold(p);
	MPASS(pdp != NULL);
	PROC_UNLOCK(p);

	PWDDESC_XLOCK(pdp);
	pwd = pwd_hold_pwddesc(pdp);
	MPASS(pwd != NULL);
	PWDDESC_XUNLOCK(pdp);
	pddrop(pdp);
	return (pwd);
}

/*
 * Allocate a zeroed pwd with a single reference.
 */
static struct pwd *
pwd_alloc(void)
{
	struct pwd *pwd;

	pwd = uma_zalloc_smr(pwd_zone, M_WAITOK);
	bzero(pwd, sizeof(*pwd));
	refcount_init(&pwd->pwd_refcount, 1);
	return (pwd);
}

/*
 * Drop a reference; on the last one release the directory vnodes and
 * free the structure via SMR.
 */
void
pwd_drop(struct pwd *pwd)
{

	if (!refcount_release(&pwd->pwd_refcount))
		return;

	if (pwd->pwd_cdir != NULL)
		vrele(pwd->pwd_cdir);
	if (pwd->pwd_rdir != NULL)
		vrele(pwd->pwd_rdir);
	if (pwd->pwd_jdir != NULL)
		vrele(pwd->pwd_jdir);
	uma_zfree_smr(pwd_zone, pwd);
}

/*
 * The caller is responsible for invoking priv_check() and
 * mac_vnode_check_chroot() to authorize this operation.
 */
int
pwd_chroot(struct thread *td, struct vnode *vp)
{
	struct pwddesc *pdp;
	struct filedesc *fdp;
	struct pwd *newpwd, *oldpwd;
	int error;

	fdp = td->td_proc->p_fd;
	pdp = td->td_proc->p_pd;
	newpwd = pwd_alloc();
	FILEDESC_SLOCK(fdp);
	PWDDESC_XLOCK(pdp);
	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
	if (chroot_allow_open_directories == 0 ||
	    (chroot_allow_open_directories == 1 &&
	    oldpwd->pwd_rdir != rootvnode)) {
		error = chroot_refuse_vdir_fds(fdp);
		FILEDESC_SUNLOCK(fdp);
		if (error != 0) {
			PWDDESC_XUNLOCK(pdp);
			pwd_drop(newpwd);
			return (error);
		}
	} else {
		FILEDESC_SUNLOCK(fdp);
	}

	vrefact(vp);
	newpwd->pwd_rdir = vp;
	if (oldpwd->pwd_jdir == NULL) {
		vrefact(vp);
		newpwd->pwd_jdir = vp;
	}
	pwd_fill(oldpwd, newpwd);
	pwd_set(pdp, newpwd);
	PWDDESC_XUNLOCK(pdp);
	pwd_drop(oldpwd);
	return (0);
}

/*
 * Change the current working directory; consumes the caller's reference
 * on vp.
 */
void
pwd_chdir(struct thread *td, struct vnode *vp)
{
	struct pwddesc *pdp;
	struct pwd *newpwd, *oldpwd;

	VNPASS(vp->v_usecount > 0, vp);

	newpwd = pwd_alloc();
	pdp = td->td_proc->p_pd;
	PWDDESC_XLOCK(pdp);
	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
	newpwd->pwd_cdir = vp;
	pwd_fill(oldpwd, newpwd);
	pwd_set(pdp, newpwd);
	PWDDESC_XUNLOCK(pdp);
	pwd_drop(oldpwd);
}

/*
 * jail_attach(2) changes both root and working directories.
 */
int
pwd_chroot_chdir(struct thread *td, struct vnode *vp)
{
	struct pwddesc *pdp;
	struct filedesc *fdp;
	struct pwd *newpwd, *oldpwd;
	int error;

	fdp = td->td_proc->p_fd;
	pdp = td->td_proc->p_pd;
	newpwd = pwd_alloc();
	FILEDESC_SLOCK(fdp);
	PWDDESC_XLOCK(pdp);
	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
	error = chroot_refuse_vdir_fds(fdp);
	FILEDESC_SUNLOCK(fdp);
	if (error != 0) {
		PWDDESC_XUNLOCK(pdp);
		pwd_drop(newpwd);
		return (error);
	}

	vrefact(vp);
	newpwd->pwd_rdir = vp;
	vrefact(vp);
	newpwd->pwd_cdir = vp;
	if (oldpwd->pwd_jdir == NULL) {
		vrefact(vp);
		newpwd->pwd_jdir = vp;
	}
	pwd_fill(oldpwd, newpwd);
	pwd_set(pdp, newpwd);
	PWDDESC_XUNLOCK(pdp);
	pwd_drop(oldpwd);
	return (0);
}

/*
 * Ensure the current process has a cwd and a root directory, defaulting
 * any missing one to rootvnode.
 */
void
pwd_ensure_dirs(void)
{
	struct pwddesc *pdp;
	struct pwd *oldpwd, *newpwd;

	pdp = curproc->p_pd;
	PWDDESC_XLOCK(pdp);
	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
	if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) {
		PWDDESC_XUNLOCK(pdp);
		return;
	}
	PWDDESC_XUNLOCK(pdp);

	/* pwd_alloc() may sleep, so redo the check with a fresh pwd. */
	newpwd = pwd_alloc();
	PWDDESC_XLOCK(pdp);
	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
	pwd_fill(oldpwd, newpwd);
	if (newpwd->pwd_cdir == NULL) {
		vrefact(rootvnode);
		newpwd->pwd_cdir = rootvnode;
	}
	if (newpwd->pwd_rdir == NULL) {
		vrefact(rootvnode);
		newpwd->pwd_rdir = rootvnode;
	}
	pwd_set(pdp, newpwd);
	PWDDESC_XUNLOCK(pdp);
	pwd_drop(oldpwd);
}

/*
 * Point the current process' cwd and root at rootvnode.
 */
void
pwd_set_rootvnode(void)
{
	struct pwddesc *pdp;
	struct pwd *oldpwd, *newpwd;

	pdp = curproc->p_pd;

	newpwd = pwd_alloc();
	PWDDESC_XLOCK(pdp);
	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
	vrefact(rootvnode);
	newpwd->pwd_cdir = rootvnode;
	vrefact(rootvnode);
	newpwd->pwd_rdir = rootvnode;
	pwd_fill(oldpwd, newpwd);
	pwd_set(pdp, newpwd);
	PWDDESC_XUNLOCK(pdp);
	pwd_drop(oldpwd);
}

/*
 * Scan all active processes and prisons to see if any of them have a current
 * or root directory of `olddp'.  If so, replace them with the new mount point.
 */
void
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
{
	struct pwddesc *pdp;
	struct pwd *newpwd, *oldpwd;
	struct prison *pr;
	struct proc *p;
	int nrele;

	if (vrefcnt(olddp) == 1)
		return;
	nrele = 0;
	newpwd = pwd_alloc();
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		pdp = pdhold(p);
		PROC_UNLOCK(p);
		if (pdp == NULL)
			continue;
		PWDDESC_XLOCK(pdp);
		oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
		if (oldpwd == NULL ||
		    (oldpwd->pwd_cdir != olddp &&
		    oldpwd->pwd_rdir != olddp &&
		    oldpwd->pwd_jdir != olddp)) {
			PWDDESC_XUNLOCK(pdp);
			pddrop(pdp);
			continue;
		}
		if (oldpwd->pwd_cdir == olddp) {
			vrefact(newdp);
			newpwd->pwd_cdir = newdp;
		}
		if (oldpwd->pwd_rdir == olddp) {
			vrefact(newdp);
			newpwd->pwd_rdir = newdp;
		}
		if (oldpwd->pwd_jdir == olddp) {
			vrefact(newdp);
			newpwd->pwd_jdir = newdp;
		}
		pwd_fill(oldpwd, newpwd);
		pwd_set(pdp, newpwd);
		PWDDESC_XUNLOCK(pdp);
		pwd_drop(oldpwd);
		pddrop(pdp);
		newpwd = pwd_alloc();
	}
	sx_sunlock(&allproc_lock);
pwd_drop(newpwd); 4023 if (rootvnode == olddp) { 4024 vrefact(newdp); 4025 rootvnode = newdp; 4026 nrele++; 4027 } 4028 mtx_lock(&prison0.pr_mtx); 4029 if (prison0.pr_root == olddp) { 4030 vrefact(newdp); 4031 prison0.pr_root = newdp; 4032 nrele++; 4033 } 4034 mtx_unlock(&prison0.pr_mtx); 4035 sx_slock(&allprison_lock); 4036 TAILQ_FOREACH(pr, &allprison, pr_list) { 4037 mtx_lock(&pr->pr_mtx); 4038 if (pr->pr_root == olddp) { 4039 vrefact(newdp); 4040 pr->pr_root = newdp; 4041 nrele++; 4042 } 4043 mtx_unlock(&pr->pr_mtx); 4044 } 4045 sx_sunlock(&allprison_lock); 4046 while (nrele--) 4047 vrele(olddp); 4048 } 4049 4050 struct filedesc_to_leader * 4051 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) 4052 { 4053 struct filedesc_to_leader *fdtol; 4054 4055 fdtol = malloc(sizeof(struct filedesc_to_leader), 4056 M_FILEDESC_TO_LEADER, M_WAITOK); 4057 fdtol->fdl_refcount = 1; 4058 fdtol->fdl_holdcount = 0; 4059 fdtol->fdl_wakeup = 0; 4060 fdtol->fdl_leader = leader; 4061 if (old != NULL) { 4062 FILEDESC_XLOCK(fdp); 4063 fdtol->fdl_next = old->fdl_next; 4064 fdtol->fdl_prev = old; 4065 old->fdl_next = fdtol; 4066 fdtol->fdl_next->fdl_prev = fdtol; 4067 FILEDESC_XUNLOCK(fdp); 4068 } else { 4069 fdtol->fdl_next = fdtol; 4070 fdtol->fdl_prev = fdtol; 4071 } 4072 return (fdtol); 4073 } 4074 4075 static int 4076 sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) 4077 { 4078 NDSLOTTYPE *map; 4079 struct filedesc *fdp; 4080 u_int namelen; 4081 int count, off, minoff; 4082 4083 namelen = arg2; 4084 if (namelen != 1) 4085 return (EINVAL); 4086 4087 if (*(int *)arg1 != 0) 4088 return (EINVAL); 4089 4090 fdp = curproc->p_fd; 4091 count = 0; 4092 FILEDESC_SLOCK(fdp); 4093 map = fdp->fd_map; 4094 off = NDSLOT(fdp->fd_nfiles - 1); 4095 for (minoff = NDSLOT(0); off >= minoff; --off) 4096 count += bitcountl(map[off]); 4097 FILEDESC_SUNLOCK(fdp); 4098 4099 return (SYSCTL_OUT(req, &count, sizeof(count))); 4100 } 4101 4102 static 
SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
    CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
    "Number of open file descriptors");

/*
 * Get file structures globally.
 */
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
	struct xfile xf;
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int error, n;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	/* Size probe: report an upper bound on the required buffer. */
	if (req->oldptr == NULL) {
		n = 0;
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state == PRS_NEW) {
				PROC_UNLOCK(p);
				continue;
			}
			fdp = fdhold(p);
			PROC_UNLOCK(p);
			if (fdp == NULL)
				continue;
			/* overestimates sparse tables. */
			n += fdp->fd_nfiles;
			fddrop(fdp);
		}
		sx_sunlock(&allproc_lock);
		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
	}
	error = 0;
	bzero(&xf, sizeof(xf));
	xf.xf_size = sizeof(xf);
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		if (p->p_state == PRS_NEW) {
			PROC_UNLOCK(p);
			continue;
		}
		/* Skip processes the requester may not inspect. */
		if (p_cansee(req->td, p) != 0) {
			PROC_UNLOCK(p);
			continue;
		}
		xf.xf_pid = p->p_pid;
		xf.xf_uid = p->p_ucred->cr_uid;
		fdp = fdhold(p);
		PROC_UNLOCK(p);
		if (fdp == NULL)
			continue;
		FILEDESC_SLOCK(fdp);
		FILEDESC_FOREACH_FP(fdp, n, fp) {
			/* Table owner exited under us; stop scanning it. */
			if (refcount_load(&fdp->fd_refcnt) == 0)
				break;
			xf.xf_fd = n;
			xf.xf_file = (uintptr_t)fp;
			xf.xf_data = (uintptr_t)fp->f_data;
			xf.xf_vnode = (uintptr_t)fp->f_vnode;
			xf.xf_type = (uintptr_t)fp->f_type;
			xf.xf_count = refcount_load(&fp->f_count);
			xf.xf_msgcount = 0;
			xf.xf_offset = foffset_get(fp);
			xf.xf_flag = fp->f_flag;
			error = SYSCTL_OUT(req, &xf, sizeof(xf));
			if (error)
				break;
		}
		FILEDESC_SUNLOCK(fdp);
		fddrop(fdp);
		if (error)
			break;
	}
	sx_sunlock(&allproc_lock);
	return (error);
}

SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");

#ifdef KINFO_FILE_SIZE
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif

/*
 * Translate fcntl-style f_flag bits into the exported KF_FLAG_* set.
 */
static int
xlate_fflags(int fflags)
{
	static const struct {
		int fflag;
		int kf_fflag;
	} fflags_table[] = {
		{ FAPPEND, KF_FLAG_APPEND },
		{ FASYNC, KF_FLAG_ASYNC },
		{ FFSYNC, KF_FLAG_FSYNC },
		{ FHASLOCK, KF_FLAG_HASLOCK },
		{ FNONBLOCK, KF_FLAG_NONBLOCK },
		{ FREAD, KF_FLAG_READ },
		{ FWRITE, KF_FLAG_WRITE },
		{ O_CREAT, KF_FLAG_CREAT },
		{ O_DIRECT, KF_FLAG_DIRECT },
		{ O_EXCL, KF_FLAG_EXCL },
		{ O_EXEC, KF_FLAG_EXEC },
		{ O_EXLOCK, KF_FLAG_EXLOCK },
		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
		{ O_SHLOCK, KF_FLAG_SHLOCK },
		{ O_TRUNC, KF_FLAG_TRUNC }
	};
	unsigned int i;
	int kflags;

	kflags = 0;
	for (i = 0; i < nitems(fflags_table); i++)
		if (fflags & fflags_table[i].fflag)
			kflags |= fflags_table[i].kf_fflag;
	return (kflags);
}

/* Trim unused data from kf_path by truncating the structure size. */
void
pack_kinfo(struct kinfo_file *kif)
{

	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
	    strlen(kif->kf_path) + 1;
	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
}

/*
 * Fill a kinfo_file from an open file.  May drop the filedesc lock via
 * fo_fill_kinfo(); see the warning inside.
 */
static void
export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
    struct kinfo_file *kif, struct filedesc *fdp, int flags)
{
	int error;

	bzero(kif, sizeof(*kif));

	/* Set a default type to allow for empty fill_kinfo() methods. */
	kif->kf_type = KF_TYPE_UNKNOWN;
	kif->kf_flags = xlate_fflags(fp->f_flag);
	if (rightsp != NULL)
		kif->kf_cap_rights = *rightsp;
	else
		cap_rights_init_zero(&kif->kf_cap_rights);
	kif->kf_fd = fd;
	kif->kf_ref_count = refcount_load(&fp->f_count);
	kif->kf_offset = foffset_get(fp);

	/*
	 * This may drop the filedesc lock, so the 'fp' cannot be
	 * accessed after this call.
	 */
	error = fo_fill_kinfo(fp, kif, fdp);
	if (error == 0)
		kif->kf_status |= KF_ATTR_VALID;
	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
		pack_kinfo(kif);
	else
		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
}

/*
 * Fill a kinfo_file directly from a vnode (cwd/root/jail/text/tty/trace
 * entries).  Consumes the caller's reference on 'vp' via vrele().
 */
static void
export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
    struct kinfo_file *kif, int flags)
{
	int error;

	bzero(kif, sizeof(*kif));

	kif->kf_type = KF_TYPE_VNODE;
	error = vn_fill_kinfo_vnode(vp, kif);
	if (error == 0)
		kif->kf_status |= KF_ATTR_VALID;
	kif->kf_flags = xlate_fflags(fflags);
	cap_rights_init_zero(&kif->kf_cap_rights);
	kif->kf_fd = fd;
	kif->kf_ref_count = -1;
	kif->kf_offset = -1;
	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
		pack_kinfo(kif);
	else
		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
	vrele(vp);
}

/* State shared by the kinfo export helpers below. */
struct export_fd_buf {
	struct filedesc *fdp;
	struct pwddesc *pdp;
	struct sbuf *sb;
	ssize_t remainder;	/* bytes left in user buffer; -1 = unlimited */
	struct kinfo_file kif;
	int flags;
};

/*
 * Append efbuf->kif to the sbuf, charging it against 'remainder'.
 */
static int
export_kinfo_to_sb(struct export_fd_buf *efbuf)
{
	struct kinfo_file *kif;

	kif = &efbuf->kif;
	if (efbuf->remainder != -1) {
		if (efbuf->remainder < kif->kf_structsize)
			return (ENOMEM);
		efbuf->remainder -= kif->kf_structsize;
	}
	if (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) != 0)
		return (sbuf_error(efbuf->sb));
	return (0);
}

static int
export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, 4320 struct export_fd_buf *efbuf) 4321 { 4322 int error; 4323 4324 if (efbuf->remainder == 0) 4325 return (ENOMEM); 4326 export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, 4327 efbuf->flags); 4328 FILEDESC_SUNLOCK(efbuf->fdp); 4329 error = export_kinfo_to_sb(efbuf); 4330 FILEDESC_SLOCK(efbuf->fdp); 4331 return (error); 4332 } 4333 4334 static int 4335 export_vnode_to_sb(struct vnode *vp, int fd, int fflags, 4336 struct export_fd_buf *efbuf) 4337 { 4338 int error; 4339 4340 if (efbuf->remainder == 0) 4341 return (ENOMEM); 4342 if (efbuf->pdp != NULL) 4343 PWDDESC_XUNLOCK(efbuf->pdp); 4344 export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); 4345 error = export_kinfo_to_sb(efbuf); 4346 if (efbuf->pdp != NULL) 4347 PWDDESC_XLOCK(efbuf->pdp); 4348 return (error); 4349 } 4350 4351 /* 4352 * Store a process file descriptor information to sbuf. 4353 * 4354 * Takes a locked proc as argument, and returns with the proc unlocked. 4355 */ 4356 int 4357 kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, 4358 int flags) 4359 { 4360 struct file *fp; 4361 struct filedesc *fdp; 4362 struct pwddesc *pdp; 4363 struct export_fd_buf *efbuf; 4364 struct vnode *cttyvp, *textvp, *tracevp; 4365 struct pwd *pwd; 4366 int error, i; 4367 cap_rights_t rights; 4368 4369 PROC_LOCK_ASSERT(p, MA_OWNED); 4370 4371 /* ktrace vnode */ 4372 tracevp = ktr_get_tracevp(p, true); 4373 /* text vnode */ 4374 textvp = p->p_textvp; 4375 if (textvp != NULL) 4376 vrefact(textvp); 4377 /* Controlling tty. 
*/ 4378 cttyvp = NULL; 4379 if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { 4380 cttyvp = p->p_pgrp->pg_session->s_ttyvp; 4381 if (cttyvp != NULL) 4382 vrefact(cttyvp); 4383 } 4384 fdp = fdhold(p); 4385 pdp = pdhold(p); 4386 PROC_UNLOCK(p); 4387 4388 efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); 4389 efbuf->fdp = NULL; 4390 efbuf->pdp = NULL; 4391 efbuf->sb = sb; 4392 efbuf->remainder = maxlen; 4393 efbuf->flags = flags; 4394 4395 error = 0; 4396 if (tracevp != NULL) 4397 error = export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, 4398 FREAD | FWRITE, efbuf); 4399 if (error == 0 && textvp != NULL) 4400 error = export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, 4401 efbuf); 4402 if (error == 0 && cttyvp != NULL) 4403 error = export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, 4404 FREAD | FWRITE, efbuf); 4405 if (error != 0 || pdp == NULL || fdp == NULL) 4406 goto fail; 4407 efbuf->fdp = fdp; 4408 efbuf->pdp = pdp; 4409 PWDDESC_XLOCK(pdp); 4410 pwd = pwd_hold_pwddesc(pdp); 4411 if (pwd != NULL) { 4412 /* working directory */ 4413 if (pwd->pwd_cdir != NULL) { 4414 vrefact(pwd->pwd_cdir); 4415 error = export_vnode_to_sb(pwd->pwd_cdir, 4416 KF_FD_TYPE_CWD, FREAD, efbuf); 4417 } 4418 /* root directory */ 4419 if (error == 0 && pwd->pwd_rdir != NULL) { 4420 vrefact(pwd->pwd_rdir); 4421 error = export_vnode_to_sb(pwd->pwd_rdir, 4422 KF_FD_TYPE_ROOT, FREAD, efbuf); 4423 } 4424 /* jail directory */ 4425 if (error == 0 && pwd->pwd_jdir != NULL) { 4426 vrefact(pwd->pwd_jdir); 4427 error = export_vnode_to_sb(pwd->pwd_jdir, 4428 KF_FD_TYPE_JAIL, FREAD, efbuf); 4429 } 4430 } 4431 PWDDESC_XUNLOCK(pdp); 4432 if (error != 0) 4433 goto fail; 4434 if (pwd != NULL) 4435 pwd_drop(pwd); 4436 FILEDESC_SLOCK(fdp); 4437 FILEDESC_FOREACH_FP(fdp, i, fp) { 4438 if (refcount_load(&fdp->fd_refcnt) == 0) 4439 break; 4440 #ifdef CAPABILITIES 4441 rights = *cap_rights(fdp, i); 4442 #else /* !CAPABILITIES */ 4443 rights = cap_no_rights; 4444 #endif 4445 /* 4446 * Create sysctl entry. 
It is OK to drop the filedesc 4447 * lock inside of export_file_to_sb() as we will 4448 * re-validate and re-evaluate its properties when the 4449 * loop continues. 4450 */ 4451 error = export_file_to_sb(fp, i, &rights, efbuf); 4452 if (error != 0) 4453 break; 4454 } 4455 FILEDESC_SUNLOCK(fdp); 4456 fail: 4457 if (fdp != NULL) 4458 fddrop(fdp); 4459 if (pdp != NULL) 4460 pddrop(pdp); 4461 free(efbuf, M_TEMP); 4462 return (error); 4463 } 4464 4465 #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) 4466 4467 /* 4468 * Get per-process file descriptors for use by procstat(1), et al. 4469 */ 4470 static int 4471 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) 4472 { 4473 struct sbuf sb; 4474 struct proc *p; 4475 ssize_t maxlen; 4476 u_int namelen; 4477 int error, error2, *name; 4478 4479 namelen = arg2; 4480 if (namelen != 1) 4481 return (EINVAL); 4482 4483 name = (int *)arg1; 4484 4485 sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); 4486 sbuf_clear_flags(&sb, SBUF_INCLUDENUL); 4487 error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); 4488 if (error != 0) { 4489 sbuf_delete(&sb); 4490 return (error); 4491 } 4492 maxlen = req->oldptr != NULL ? req->oldlen : -1; 4493 error = kern_proc_filedesc_out(p, &sb, maxlen, 4494 KERN_FILEDESC_PACK_KINFO); 4495 error2 = sbuf_finish(&sb); 4496 sbuf_delete(&sb); 4497 return (error != 0 ? 
	    error : error2);
}

#ifdef COMPAT_FREEBSD7
#ifdef KINFO_OFILE_SIZE
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
#endif

/*
 * Convert a kinfo_file into the pre-FreeBSD 8 kinfo_ofile layout.
 */
static void
kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
{

	okif->kf_structsize = sizeof(*okif);
	okif->kf_type = kif->kf_type;
	okif->kf_fd = kif->kf_fd;
	okif->kf_ref_count = kif->kf_ref_count;
	/* Only the flag bits the old ABI knew about. */
	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
	okif->kf_offset = kif->kf_offset;
	if (kif->kf_type == KF_TYPE_VNODE)
		okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type;
	else
		okif->kf_vnode_type = KF_VTYPE_VNON;
	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
	if (kif->kf_type == KF_TYPE_SOCKET) {
		okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0;
		okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0;
		okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0;
		okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
		okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
	} else {
		okif->kf_sa_local.ss_family = AF_UNSPEC;
		okif->kf_sa_peer.ss_family = AF_UNSPEC;
	}
}

/*
 * Export one directory vnode in the old layout.  Takes its own reference
 * on 'vp' and temporarily drops the pwddesc lock around the copyout.
 */
static int
export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
    struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req)
{
	int error;

	vrefact(vp);
	PWDDESC_XUNLOCK(pdp);
	export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
	kinfo_to_okinfo(kif, okif);
	error = SYSCTL_OUT(req, okif, sizeof(*okif));
	PWDDESC_XLOCK(pdp);
	return (error);
}

/*
 * Get per-process file descriptors for use by procstat(1), et al.
 */
static int
sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
{
	struct kinfo_ofile *okif;
	struct kinfo_file *kif;
	struct filedesc *fdp;
	struct pwddesc *pdp;
	struct pwd *pwd;
	u_int namelen;
	int error, i, *name;
	struct file *fp;
	struct proc *p;

	namelen = arg2;
	if (namelen != 1)
		return (EINVAL);

	name = (int *)arg1;
	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
	if (error != 0)
		return (error);
	fdp = fdhold(p);
	if (fdp != NULL)
		pdp = pdhold(p);
	PROC_UNLOCK(p);
	/* Note: pdp is only examined when fdp != NULL (short-circuit). */
	if (fdp == NULL || pdp == NULL) {
		if (fdp != NULL)
			fddrop(fdp);
		return (ENOENT);
	}
	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
	PWDDESC_XLOCK(pdp);
	pwd = pwd_hold_pwddesc(pdp);
	if (pwd != NULL) {
		if (pwd->pwd_cdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif,
			    okif, pdp, req);
		if (pwd->pwd_rdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif,
			    okif, pdp, req);
		if (pwd->pwd_jdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif,
			    okif, pdp, req);
	}
	PWDDESC_XUNLOCK(pdp);
	if (pwd != NULL)
		pwd_drop(pwd);
	FILEDESC_SLOCK(fdp);
	FILEDESC_FOREACH_FP(fdp, i, fp) {
		if (refcount_load(&fdp->fd_refcnt) == 0)
			break;
		export_file_to_kinfo(fp, i, NULL, kif, fdp,
		    KERN_FILEDESC_PACK_KINFO);
		/* Drop the lock around the (possibly sleeping) copyout. */
		FILEDESC_SUNLOCK(fdp);
		kinfo_to_okinfo(kif, okif);
		error = SYSCTL_OUT(req, okif, sizeof(*okif));
		FILEDESC_SLOCK(fdp);
		if (error)
			break;
	}
	FILEDESC_SUNLOCK(fdp);
	fddrop(fdp);
	pddrop(pdp);
	free(kif, M_TEMP);
	free(okif, M_TEMP);
	/* Best-effort compat interface: partial output is not an error. */
	return (0);
}

static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
    "Process ofiledesc entries");
#endif	/* COMPAT_FREEBSD7 */

/*
 * Map a vnode VTYPE to the exported KF_VTYPE_* constant.
 */
int
vntype_to_kinfo(int vtype)
{
	struct {
		int vtype;
		int kf_vtype;
	} vtypes_table[] = {
		{ VBAD, KF_VTYPE_VBAD },
		{ VBLK, KF_VTYPE_VBLK },
		{ VCHR, KF_VTYPE_VCHR },
		{ VDIR, KF_VTYPE_VDIR },
		{ VFIFO, KF_VTYPE_VFIFO },
		{ VLNK, KF_VTYPE_VLNK },
		{ VNON, KF_VTYPE_VNON },
		{ VREG, KF_VTYPE_VREG },
		{ VSOCK, KF_VTYPE_VSOCK }
	};
	unsigned int i;

	/*
	 * Perform vtype translation.
	 */
	for (i = 0; i < nitems(vtypes_table); i++)
		if (vtypes_table[i].vtype == vtype)
			return (vtypes_table[i].kf_vtype);

	return (KF_VTYPE_UNKNOWN);
}

static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
    "Process filedesc entries");

/*
 * Store a process current working directory information to sbuf.
 *
 * Takes a locked proc as argument, and returns with the proc unlocked.
 */
int
kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen)
{
	struct pwddesc *pdp;
	struct pwd *pwd;
	struct export_fd_buf *efbuf;
	struct vnode *cdir;
	int error;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	pdp = pdhold(p);
	PROC_UNLOCK(p);
	if (pdp == NULL)
		return (EINVAL);

	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
	efbuf->fdp = NULL;
	efbuf->pdp = pdp;
	efbuf->sb = sb;
	efbuf->remainder = maxlen;
	efbuf->flags = 0;

	PWDDESC_XLOCK(pdp);
	pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
	cdir = pwd->pwd_cdir;
	if (cdir == NULL) {
		error = EINVAL;
	} else {
		vrefact(cdir);
		/* export_vnode_to_sb() consumes the cdir reference. */
		error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
	}
	PWDDESC_XUNLOCK(pdp);
	pddrop(pdp);
	free(efbuf, M_TEMP);
	return (error);
}

/*
 * Get per-process current working directory.
 */
static int
sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sb;
	struct proc *p;
	ssize_t maxlen;
	u_int namelen;
	int error, error2, *name;

	namelen = arg2;
	if (namelen != 1)
		return (EINVAL);

	name = (int *)arg1;

	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
	if (error != 0) {
		sbuf_delete(&sb);
		return (error);
	}
	maxlen = req->oldptr != NULL ? req->oldlen : -1;
	error = kern_proc_cwd_out(p, &sb, maxlen);
	error2 = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error != 0 ? error : error2);
}

static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
    sysctl_kern_proc_cwd, "Process current working directory");

#ifdef DDB
/*
 * For the purposes of debugging, generate a human-readable string for the
 * file type.
 */
static const char *
file_type_to_name(short type)
{

	switch (type) {
	case 0:
		return ("zero");
	case DTYPE_VNODE:
		return ("vnode");
	case DTYPE_SOCKET:
		return ("socket");
	case DTYPE_PIPE:
		return ("pipe");
	case DTYPE_FIFO:
		return ("fifo");
	case DTYPE_KQUEUE:
		return ("kqueue");
	case DTYPE_CRYPTO:
		return ("crypto");
	case DTYPE_MQUEUE:
		return ("mqueue");
	case DTYPE_SHM:
		return ("shm");
	case DTYPE_SEM:
		return ("ksem");
	case DTYPE_PTS:
		return ("pts");
	case DTYPE_DEV:
		return ("dev");
	case DTYPE_PROCDESC:
		return ("proc");
	case DTYPE_EVENTFD:
		return ("eventfd");
	case DTYPE_LINUXTFD:
		return ("ltimer");
	default:
		return ("unkn");
	}
}

/*
 * For the purposes of debugging, identify a process (if any, perhaps one of
 * many) that references the passed file in its file descriptor array. Return
 * NULL if none.
 */
static struct proc *
file_to_first_proc(struct file *fp)
{
	struct filedesc *fdp;
	struct proc *p;
	int n;

	/* Debugger context: runs lockless by design. */
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_state == PRS_NEW)
			continue;
		fdp = p->p_fd;
		if (fdp == NULL)
			continue;
		for (n = 0; n < fdp->fd_nfiles; n++) {
			if (fp == fdp->fd_ofiles[n].fde_file)
				return (p);
		}
	}
	return (NULL);
}

/*
 * Print one file's state for the DDB "show file(s)" commands; 'header'
 * requests the column header line first.
 */
static void
db_print_file(struct file *fp, int header)
{
#define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4))
	struct proc *p;

	if (header)
		db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n",
		    XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag",
		    "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID",
		    "FCmd");
	p = file_to_first_proc(fp);
	db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH,
	    fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data,
	    fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode,
	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");

#undef XPTRWIDTH
}

DB_SHOW_COMMAND(file, db_show_file)
{
	struct file *fp;

	if (!have_addr) {
		db_printf("usage: show file <addr>\n");
		return;
	}
	fp = (struct file *)addr;
	db_print_file(fp, 1);
}

DB_SHOW_COMMAND(files, db_show_files)
{
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int header;
	int n;

	header = 1;
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_state == PRS_NEW)
			continue;
		if ((fdp = p->p_fd) == NULL)
			continue;
		for (n = 0; n < fdp->fd_nfiles; ++n) {
			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
				continue;
			db_print_file(fp, header);
			header = 0;
		}
	}
}
#endif

SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
    &maxfilesperproc, 0, "Maximum files allowed open per process");

SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
    &maxfiles, 0, "Maximum number of files");

SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
    &openfiles, 0, "System-wide number of open files");

/* ARGSUSED*/
/*
 * Boot-time initialization of the file, filedesc0 and pwd UMA zones and
 * the sigio lock.
 */
static void
filelistinit(void *dummy)
{

	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
	/*
	 * XXXMJG this is a temporary hack due to boot ordering issues against
	 * the vnode zone.
	 */
	vfs_smr = uma_zone_get_smr(pwd_zone);
	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);

/*-------------------------------------------------------------------*/

/*
 * fileops stubs for descriptors that back no real object: every
 * operation fails with a fixed error (or succeeds as a no-op).
 */
static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EBADF);
}

static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

static int
badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

/* Poll reports no events on a bad descriptor. */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}

static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EBADF);
}

static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{

	return (EBADF);
}

/* Closing a bad descriptor always succeeds. */
static int
badfo_close(struct file *fp, struct thread *td)
{

	return (0);
}

static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

static int
badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    struct thread *td)
{

	return (EBADF);
}

static int
badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	return (0);
}

struct fileops badfileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_truncate = badfo_truncate,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = badfo_poll,
	.fo_kqfilter = badfo_kqfilter,
	.fo_stat = badfo_stat,
	.fo_close = badfo_close,
	.fo_chmod = badfo_chmod,
	.fo_chown = badfo_chown,
	.fo_sendfile = badfo_sendfile,
	.fo_fill_kinfo = badfo_fill_kinfo,
};

/* O_PATH descriptors never report poll readiness. */
static int
path_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	return (POLLNVAL);
}

/* Close an O_PATH descriptor: drop the vnode hold and neuter the fileops. */
static int
path_close(struct file *fp, struct thread *td)
{
	MPASS(fp->f_type == DTYPE_VNODE);
	fp->f_ops = &badfileops;
	vdrop(fp->f_vnode);
	return (0);
}

/* fileops for O_PATH descriptors: stat/kqueue/kinfo work, I/O does not. */
struct fileops path_fileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_truncate = badfo_truncate,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = path_poll,
	.fo_kqfilter = vn_kqfilter_opath,
	.fo_stat = vn_statfile,
	.fo_close = path_close,
	.fo_chmod = badfo_chmod,
	.fo_chown = badfo_chown,
	.fo_sendfile = badfo_sendfile,
	.fo_fill_kinfo = vn_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE,
};

/*
 * Shared "invalid operation" fileops methods, for file types that
 * legitimately exist but do not support a given operation.
 */
int
invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}

int
invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

int
invfo_ioctl(struct file *fp, u_long com, void *data,
    struct ucred *active_cred, struct thread *td)
{

	return (ENOTTY);
}

int
invfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (poll_no_poll(events));
}

int
invfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EINVAL);
}

int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

int
invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    struct thread *td)
{

	return (EINVAL);
}

/*-------------------------------------------------------------------*/

/*
 * File Descriptor pseudo-device driver (/dev/fd/).
 *
 * Opening minor device N dup()s the file (if any) connected to file
 * descriptor N belonging to the calling process. Note that this driver
 * consists of only the ``open()'' routine, because all subsequent
 * references to this file will be direct to the other driver.
 *
 * XXX: we could give this one a cloning event handler if necessary.
 */

/* ARGSUSED */
static int
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
{

	/*
	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
	 * the file descriptor being sought for duplication. The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open. Open will detect this special error and take the
	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	td->td_dupfd = dev2unit(dev);
	return (ENODEV);
}

static struct cdevsw fildesc_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	fdopen,
	.d_name =	"FD",
};

/*
 * Create /dev/fd/{0,1,2} and the stdin/stdout/stderr aliases at boot.
 */
static void
fildesc_drvinit(void *unused)
{
	struct cdev *dev;

	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
	make_dev_alias(dev, "stdin");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
	make_dev_alias(dev, "stdout");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
	make_dev_alias(dev, "stderr");
}

SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);