/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/selinfo.h>
#include <sys/poll.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/signalvar.h>
#include <sys/kdb.h>
#include <sys/smr.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/ktrace.h>

#include <net/vnet.h>

#include <security/audit/audit.h>

#include <vm/uma.h>
#include <vm/vm.h>

#include <ddb/ddb.h>

static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes");
static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
    "file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");

MALLOC_DECLARE(M_FADVISE);

static __read_mostly uma_zone_t file_zone;
static __read_mostly uma_zone_t filedesc0_zone;
__read_mostly uma_zone_t pwd_zone;
VFS_SMR_DECLARE;

static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
		    struct thread *td, bool holdleaders, bool audit);
static void	export_file_to_kinfo(struct file *fp, int fd,
		    cap_rights_t *rightsp, struct kinfo_file *kif,
		    struct filedesc *fdp, int flags);
static int	fd_first_free(struct filedesc *fdp, int low, int size);
static void	fdgrowtable(struct filedesc *fdp, int nfd);
static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
static void	fdunused(struct filedesc *fdp, int fd);
static void	fdused(struct filedesc *fdp, int fd);
static int	fget_unlocked_seq(struct thread *td, int fd,
		    cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp);
static int	getmaxfd(struct thread *td);
static u_long	*filecaps_copy_prep(const struct filecaps *src);
static void	filecaps_copy_finish(const struct filecaps *src,
		    struct filecaps *dst, u_long *ioctls);
static u_long	*filecaps_free_prep(struct filecaps *fcaps);
static void	filecaps_free_finish(u_long *ioctls);

static struct pwd *pwd_alloc(void);

/*
 * Each process has:
 *
 * - An array of open file descriptors (fd_ofiles)
 * - An array of file flags (fd_ofileflags)
 * - A bitmap recording which descriptors are in use (fd_map)
 *
 * A process starts out with NDFILE descriptors.  The value of NDFILE has
 * been selected based on the historical limit of 20 open files, and an
 * assumption that the majority of processes, especially short-lived
 * processes like shells, will never need more.
 *
 * If this initial allocation is exhausted, a larger descriptor table and
 * map are allocated dynamically, and the pointers in the process's struct
 * filedesc are updated to point to those.  This is repeated every time
 * the process runs out of file descriptors (provided it hasn't hit its
 * resource limit).
 *
 * Since threads may hold references to individual descriptor table
 * entries, the tables are never freed.  Instead, they are placed on a
 * linked list and freed only when the struct filedesc is released.
 */
#define	NDFILE		20
#define	NDSLOTSIZE	sizeof(NDSLOTTYPE)
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
#define	NDSLOT(x)	((x) / NDENTRIES)
#define	NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)

/*
 * SLIST entry used to keep track of ofiles which must be reclaimed when
 * the process exits.
 */
struct freetable {
	struct fdescenttbl *ft_table;
	SLIST_ENTRY(freetable) ft_next;
};

/*
 * Initial allocation: a filedesc structure + the head of SLIST used to
 * keep track of old ofiles + enough space for NDFILE descriptors.
 */

struct fdescenttbl0 {
	int	fdt_nfiles;
	struct	filedescent fdt_ofiles[NDFILE];
};

struct filedesc0 {
	struct filedesc fd_fd;
	SLIST_HEAD(, freetable) fd_free;
	struct	fdescenttbl0 fd_dfiles;
	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
};

/*
 * Descriptor management.
 */
static int __exclusive_cache_line openfiles; /* actual number of open files */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);

/*
 * If low >= size, just return low.  Otherwise find the first zero bit in the
 * given bitmap, starting at low and not exceeding size - 1.  Return size if
 * not found.
 */
static int
fd_first_free(struct filedesc *fdp, int low, int size)
{
	NDSLOTTYPE *map = fdp->fd_map;
	NDSLOTTYPE mask;
	int off, maxoff;

	if (low >= size)
		return (low);

	off = NDSLOT(low);
	if (low % NDENTRIES) {
		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
		if ((mask &= ~map[off]) != 0UL)
			return (off * NDENTRIES + ffsl(mask) - 1);
		++off;
	}
	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
		if (map[off] != ~0UL)
			return (off * NDENTRIES + ffsl(~map[off]) - 1);
	return (size);
}

/*
 * Find the last used fd.
 *
 * Call this variant if fdp can't be modified by anyone else (e.g., during
 * exec).  Otherwise use fdlastfile.
 */
int
fdlastfile_single(struct filedesc *fdp)
{
	NDSLOTTYPE *map = fdp->fd_map;
	int off, minoff;

	off = NDSLOT(fdp->fd_nfiles - 1);
	for (minoff = NDSLOT(0); off >= minoff; --off)
		if (map[off] != 0)
			return (off * NDENTRIES + flsl(map[off]) - 1);
	return (-1);
}

int
fdlastfile(struct filedesc *fdp)
{

	FILEDESC_LOCK_ASSERT(fdp);
	return (fdlastfile_single(fdp));
}

static int
fdisused(struct filedesc *fdp, int fd)
{

	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));

	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
}
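
/*
 * Illustrative example (an editor's sketch, assuming NDSLOTTYPE is u_long
 * and thus NDENTRIES == 64 on LP64 machines): descriptor 70 is tracked by
 * bit NDBIT(70) == ((NDSLOTTYPE)1 << 6) of map word fd_map[NDSLOT(70)],
 * i.e. fd_map[1], and a table of 70 descriptors (0..69) needs
 * NDSLOTS(70) == 2 map words.
 */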

/*
 * Mark a file descriptor as used.
 */
static void
fdused_init(struct filedesc *fdp, int fd)
{

	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));

	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
}

static void
fdused(struct filedesc *fdp, int fd)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	fdused_init(fdp, fd);
	if (fd == fdp->fd_freefile)
		fdp->fd_freefile++;
}

/*
 * Mark a file descriptor as unused.
 */
static void
fdunused(struct filedesc *fdp, int fd)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
	    ("fd=%d is still in use", fd));

	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
}

/*
 * Free a file descriptor.
 *
 * Avoid some work if fdp is about to be destroyed.
 */
static inline void
fdefree_last(struct filedescent *fde)
{

	filecaps_free(&fde->fde_caps);
}

static inline void
fdfree(struct filedesc *fdp, int fd)
{
	struct filedescent *fde;

	FILEDESC_XLOCK_ASSERT(fdp);
	fde = &fdp->fd_ofiles[fd];
#ifdef CAPABILITIES
	seqc_write_begin(&fde->fde_seqc);
#endif
	fde->fde_file = NULL;
#ifdef CAPABILITIES
	seqc_write_end(&fde->fde_seqc);
#endif
	fdefree_last(fde);
	fdunused(fdp, fd);
}

/*
 * System calls on descriptors.
 */
#ifndef _SYS_SYSPROTO_H_
struct getdtablesize_args {
	int	dummy;
};
#endif
/* ARGSUSED */
int
sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
{
#ifdef RACCT
	uint64_t lim;
#endif

	td->td_retval[0] = getmaxfd(td);
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
	PROC_UNLOCK(td->td_proc);
	if (lim < td->td_retval[0])
		td->td_retval[0] = lim;
#endif
	return (0);
}

/*
 * Duplicate a file descriptor to a particular value.
 *
 * Note: keep in mind that a potential race condition exists when closing
 * descriptors from a shared descriptor table (via rfork).
 */
#ifndef _SYS_SYSPROTO_H_
struct dup2_args {
	u_int	from;
	u_int	to;
};
#endif
/* ARGSUSED */
int
sys_dup2(struct thread *td, struct dup2_args *uap)
{

	return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
}

/*
 * Duplicate a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct dup_args {
	u_int	fd;
};
#endif
/* ARGSUSED */
int
sys_dup(struct thread *td, struct dup_args *uap)
{

	return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
}

/*
 * The file control system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
	int	fd;
	int	cmd;
	long	arg;
};
#endif
/* ARGSUSED */
int
sys_fcntl(struct thread *td, struct fcntl_args *uap)
{

	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
}

int
kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
{
	struct flock fl;
	struct __oflock ofl;
	intptr_t arg1;
	int error, newcmd;

	error = 0;
	newcmd = cmd;
	switch (cmd) {
	case F_OGETLK:
	case F_OSETLK:
	case F_OSETLKW:
		/*
		 * Convert old flock structure to new.
		 */
		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
		fl.l_start = ofl.l_start;
		fl.l_len = ofl.l_len;
		fl.l_pid = ofl.l_pid;
		fl.l_type = ofl.l_type;
		fl.l_whence = ofl.l_whence;
		fl.l_sysid = 0;

		switch (cmd) {
		case F_OGETLK:
			newcmd = F_GETLK;
			break;
		case F_OSETLK:
			newcmd = F_SETLK;
			break;
		case F_OSETLKW:
			newcmd = F_SETLKW;
			break;
		}
		arg1 = (intptr_t)&fl;
		break;
	case F_GETLK:
	case F_SETLK:
	case F_SETLKW:
	case F_SETLK_REMOTE:
		error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
		arg1 = (intptr_t)&fl;
		break;
	default:
		arg1 = arg;
		break;
	}
	if (error)
		return (error);
	error = kern_fcntl(td, fd, newcmd, arg1);
	if (error)
		return (error);
	if (cmd == F_OGETLK) {
		ofl.l_start = fl.l_start;
		ofl.l_len = fl.l_len;
		ofl.l_pid = fl.l_pid;
		ofl.l_type = fl.l_type;
		ofl.l_whence = fl.l_whence;
		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
	} else if (cmd == F_GETLK) {
		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
	}
	return (error);
}

int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct filedesc *fdp;
	struct flock *flp;
	struct file *fp, *fp2;
	struct filedescent *fde;
	struct proc *p;
	struct vnode *vp;
	struct mount *mp;
	struct kinfo_file *kif;
	int error, flg, kif_sz, seals, tmp;
	uint64_t bsize;
	off_t foffset;

	error = 0;
	flg = F_POSIX;
	p = td->td_proc;
	fdp = p->p_fd;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(cmd);
	switch (cmd) {
	case F_DUPFD:
		tmp = arg;
		error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
		break;

	case F_DUPFD_CLOEXEC:
		tmp = arg;
		error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
		break;

	case F_DUP2FD:
		tmp = arg;
		error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
		break;

	case F_DUP2FD_CLOEXEC:
		tmp = arg;
		error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
		break;

	case F_GETFD:
		error = EBADF;
		FILEDESC_SLOCK(fdp);
		fde = fdeget_noref(fdp, fd);
		if (fde != NULL) {
			td->td_retval[0] =
			    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
			error = 0;
		}
		FILEDESC_SUNLOCK(fdp);
		break;

	case F_SETFD:
		error = EBADF;
		FILEDESC_XLOCK(fdp);
		fde = fdeget_noref(fdp, fd);
		if (fde != NULL) {
			fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
			    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
			error = 0;
		}
		FILEDESC_XUNLOCK(fdp);
		break;

	case F_GETFL:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
		if (error != 0)
			break;
		td->td_retval[0] = OFLAGS(fp->f_flag);
		fdrop(fp, td);
		break;

	case F_SETFL:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
		if (error != 0)
			break;
		if (fp->f_ops == &path_fileops) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		do {
			tmp = flg = fp->f_flag;
			tmp &= ~FCNTLFLAGS;
			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
		} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		if (error != 0) {
			fdrop(fp, td);
			break;
		}
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		if (error == 0) {
			fdrop(fp, td);
			break;
		}
		atomic_clear_int(&fp->f_flag, FNONBLOCK);
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_GETOWN:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
		if (error != 0)
			break;
		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
		fdrop(fp, td);
		break;

	case F_SETOWN:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
		if (error != 0)
			break;
		tmp = arg;
		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_SETLK_REMOTE:
		error = priv_check(td, PRIV_NFS_LOCKD);
		if (error != 0)
			return (error);
		flg = F_REMOTE;
		goto do_setlk;

	case F_SETLKW:
		flg |= F_WAIT;
		/* FALLTHROUGH F_SETLK */

	case F_SETLK:
	do_setlk:
		flp = (struct flock *)arg;
		if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
			error = EINVAL;
			break;
		}

		error = fget_unlocked(td, fd, &cap_flock_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			error = EBADF;
			fdrop(fp, td);
			break;
		}

		if (flp->l_whence == SEEK_CUR) {
			foffset = foffset_get(fp);
			if (foffset < 0 ||
			    (flp->l_start > 0 &&
			     foffset > OFF_MAX - flp->l_start)) {
				error = EOVERFLOW;
				fdrop(fp, td);
				break;
			}
			flp->l_start += foffset;
		}

		vp = fp->f_vnode;
		switch (flp->l_type) {
		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
				PROC_LOCK(p->p_leader);
				p->p_leader->p_flag |= P_ADVLOCK;
				PROC_UNLOCK(p->p_leader);
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
				PROC_LOCK(p->p_leader);
				p->p_leader->p_flag |= P_ADVLOCK;
				PROC_UNLOCK(p->p_leader);
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
			    flp, flg);
			break;
		case F_UNLCKSYS:
			if (flg != F_REMOTE) {
				error = EINVAL;
				break;
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
			    F_UNLCKSYS, flp, flg);
			break;
		default:
			error = EINVAL;
			break;
		}
		if (error != 0 || flp->l_type == F_UNLCK ||
		    flp->l_type == F_UNLCKSYS) {
			fdrop(fp, td);
			break;
		}

		/*
		 * Check for a race with close.
		 *
		 * The vnode is now advisory locked (or unlocked, but this case
		 * is not really important) as the caller requested.
		 * We had to drop the filedesc lock, so we need to recheck if
		 * the descriptor is still valid, because if it was closed
		 * in the meantime we need to remove advisory lock from the
		 * vnode - a close on any descriptor leading to an advisory
		 * locked vnode removes that lock.
		 * We will return 0 on purpose in that case, as the result of
		 * successful advisory lock might have been externally visible
		 * already.  This is fine - effectively we pretend to the caller
		 * that the closing thread was a bit slower and that the
		 * advisory lock succeeded before the close.
		 */
		error = fget_unlocked(td, fd, &cap_no_rights, &fp2);
		if (error != 0) {
			fdrop(fp, td);
			break;
		}
		if (fp != fp2) {
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
			    F_UNLCK, flp, F_POSIX);
		}
		fdrop(fp, td);
		fdrop(fp2, td);
		break;

	case F_GETLK:
		error = fget_unlocked(td, fd, &cap_flock_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			error = EBADF;
			fdrop(fp, td);
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
		    flp->l_type != F_UNLCK) {
			error = EINVAL;
			fdrop(fp, td);
			break;
		}
		if (flp->l_whence == SEEK_CUR) {
			foffset = foffset_get(fp);
			if ((flp->l_start > 0 &&
			     foffset > OFF_MAX - flp->l_start) ||
			    (flp->l_start < 0 &&
			     foffset < OFF_MIN - flp->l_start)) {
				error = EOVERFLOW;
				fdrop(fp, td);
				break;
			}
			flp->l_start += foffset;
		}
		vp = fp->f_vnode;
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
		    F_POSIX);
		fdrop(fp, td);
		break;

	case F_ADD_SEALS:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		error = fo_add_seals(fp, arg);
		fdrop(fp, td);
		break;

	case F_GET_SEALS:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fo_get_seals(fp, &seals) == 0)
			td->td_retval[0] = seals;
		else
			error = EINVAL;
		fdrop(fp, td);
		break;
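
		/*
		 * Editor's usage sketch for the seal ops above (userspace
		 * view, assuming a shared-memory object from shm_open(2)):
		 *
		 *	fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK);
		 *	seals = fcntl(fd, F_GET_SEALS);
		 *
		 * File types whose fileops do not implement seals report
		 * EINVAL for F_GET_SEALS here.
		 */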

	case F_RDAHEAD:
		arg = arg ? 128 * 1024 : 0;
		/* FALLTHROUGH */
	case F_READAHEAD:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		vp = fp->f_vnode;
		if (vp->v_type != VREG) {
			fdrop(fp, td);
			error = ENOTTY;
			break;
		}

		/*
		 * Exclusive lock synchronizes against f_seqcount reads and
		 * writes in sequential_heuristic().
		 */
		error = vn_lock(vp, LK_EXCLUSIVE);
		if (error != 0) {
			fdrop(fp, td);
			break;
		}
		if (arg >= 0) {
			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
			arg = MIN(arg, INT_MAX - bsize + 1);
			fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX,
			    (arg + bsize - 1) / bsize);
			atomic_set_int(&fp->f_flag, FRDAHEAD);
		} else {
			atomic_clear_int(&fp->f_flag, FRDAHEAD);
		}
		VOP_UNLOCK(vp);
		fdrop(fp, td);
		break;

	case F_ISUNIONSTACK:
		/*
		 * Check if the vnode is part of a union stack (either the
		 * "union" flag from mount(2) or unionfs).
		 *
		 * Prior to introduction of this op libc's readdir would call
		 * fstatfs(2), in effect unnecessarily copying kilobytes of
		 * data just to check fs name and a mount flag.
		 *
		 * Fixing the code to handle everything in the kernel instead
		 * is a non-trivial endeavor and has low priority, thus this
		 * horrible kludge facilitates the current behavior in a much
		 * cheaper manner until someone(tm) sorts this out.
		 */
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		vp = fp->f_vnode;
		/*
		 * Since we don't prevent dooming the vnode, even a non-NULL
		 * mp found here can become immediately stale.  This is
		 * tolerable since mount points are type-stable (providing
		 * safe memory access) and any vfs op on this vnode going
		 * forward will return an error (meaning the return value in
		 * this case is meaningless).
		 */
		mp = atomic_load_ptr(&vp->v_mount);
		if (__predict_false(mp == NULL)) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		td->td_retval[0] = 0;
		if (mp->mnt_kern_flag & MNTK_UNIONFS ||
		    mp->mnt_flag & MNT_UNION)
			td->td_retval[0] = 1;
		fdrop(fp, td);
		break;

	case F_KINFO:
#ifdef CAPABILITY_MODE
		if (IN_CAPABILITY_MODE(td)) {
			error = ECAPMODE;
			break;
		}
#endif
		error = copyin((void *)arg, &kif_sz, sizeof(kif_sz));
		if (error != 0)
			break;
		if (kif_sz != sizeof(*kif)) {
			error = EINVAL;
			break;
		}
		kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK | M_ZERO);
		FILEDESC_SLOCK(fdp);
		error = fget_cap_noref(fdp, fd, &cap_fcntl_rights, &fp, NULL);
		if (error == 0 && fhold(fp)) {
			export_file_to_kinfo(fp, fd, NULL, kif, fdp, 0);
			FILEDESC_SUNLOCK(fdp);
			fdrop(fp, td);
			if ((kif->kf_status & KF_ATTR_VALID) != 0) {
				kif->kf_structsize = sizeof(*kif);
				error = copyout(kif, (void *)arg, sizeof(*kif));
			} else {
				error = EBADF;
			}
		} else {
			FILEDESC_SUNLOCK(fdp);
			if (error == 0)
				error = EBADF;
		}
		free(kif, M_TEMP);
		break;

	default:
		error = EINVAL;
		break;
	}
	return (error);
}

static int
getmaxfd(struct thread *td)
{

	return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
}
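
/*
 * Illustrative example (editor's note, values hypothetical): with
 * RLIMIT_NOFILE set to 1048576 but maxfilesperproc at 65536, getmaxfd()
 * returns 65536; the per-process rlimit only wins when it is the smaller
 * of the two.
 */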

/*
 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
 */
int
kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
{
	struct filedesc *fdp;
	struct filedescent *oldfde, *newfde;
	struct proc *p;
	struct file *delfp, *oldfp;
	u_long *oioctls, *nioctls;
	int error, maxfd;

	p = td->td_proc;
	fdp = p->p_fd;
	oioctls = NULL;

	MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
	MPASS(mode < FDDUP_LASTMODE);

	AUDIT_ARG_FD(old);
	/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to.  Unlike dup() and dup2(), fcntl()'s F_DUPFD should
	 * return EINVAL when the new descriptor is out of bounds.
	 */
	if (old < 0)
		return (EBADF);
	if (new < 0)
		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
	maxfd = getmaxfd(td);
	if (new >= maxfd)
		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);

	error = EBADF;
	FILEDESC_XLOCK(fdp);
	if (fget_noref(fdp, old) == NULL)
		goto unlock;
	if (mode == FDDUP_FIXED && old == new) {
		td->td_retval[0] = new;
		if (flags & FDDUP_FLAG_CLOEXEC)
			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
		error = 0;
		goto unlock;
	}

	oldfde = &fdp->fd_ofiles[old];
	oldfp = oldfde->fde_file;
	if (!fhold(oldfp))
		goto unlock;

	/*
	 * If the caller specified a file descriptor, make sure the file
	 * table is large enough to hold it, and grab it.  Otherwise, just
	 * allocate a new descriptor the usual way.
	 */
	switch (mode) {
	case FDDUP_NORMAL:
	case FDDUP_FCNTL:
		if ((error = fdalloc(td, new, &new)) != 0) {
			fdrop(oldfp, td);
			goto unlock;
		}
		break;
	case FDDUP_FIXED:
		if (new >= fdp->fd_nfiles) {
			/*
			 * The resource limits are here instead of e.g.
			 * fdalloc(), because the file descriptor table may be
			 * shared between processes, so we can't really use
			 * racct_add()/racct_sub().  Instead of counting the
			 * number of actually allocated descriptors, just put
			 * the limit on the size of the file descriptor table.
			 */
#ifdef RACCT
			if (RACCT_ENABLED()) {
				error = racct_set_unlocked(p, RACCT_NOFILE,
				    new + 1);
				if (error != 0) {
					error = EMFILE;
					fdrop(oldfp, td);
					goto unlock;
				}
			}
#endif
			fdgrowtable_exp(fdp, new + 1);
		}
		if (!fdisused(fdp, new))
			fdused(fdp, new);
		break;
	default:
		KASSERT(0, ("%s unsupported mode %d", __func__, mode));
	}

	KASSERT(old != new, ("new fd is same as old"));

	/* Refetch oldfde because the table may have grown and old one freed. */
	oldfde = &fdp->fd_ofiles[old];
	KASSERT(oldfp == oldfde->fde_file,
	    ("fdt_ofiles shift from growth observed at fd %d", old));

	newfde = &fdp->fd_ofiles[new];
	delfp = newfde->fde_file;

	nioctls = filecaps_copy_prep(&oldfde->fde_caps);

	/*
	 * Duplicate the source descriptor.
	 */
#ifdef CAPABILITIES
	seqc_write_begin(&newfde->fde_seqc);
#endif
	oioctls = filecaps_free_prep(&newfde->fde_caps);
	fde_copy(oldfde, newfde);
	filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
	    nioctls);
	if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
	else
		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
#ifdef CAPABILITIES
	seqc_write_end(&newfde->fde_seqc);
#endif
	td->td_retval[0] = new;

	error = 0;

	if (delfp != NULL) {
		(void) closefp(fdp, new, delfp, td, true, false);
		FILEDESC_UNLOCK_ASSERT(fdp);
	} else {
unlock:
		FILEDESC_XUNLOCK(fdp);
	}

	filecaps_free_finish(oioctls);
	return (error);
}

static void
sigiofree(struct sigio *sigio)
{
	crfree(sigio->sio_ucred);
	free(sigio, M_SIGIO);
}

static struct sigio *
funsetown_locked(struct sigio *sigio)
{
	struct proc *p;
	struct pgrp *pg;

	SIGIO_ASSERT_LOCKED();

	if (sigio == NULL)
		return (NULL);
	*sigio->sio_myref = NULL;
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		PGRP_LOCK(pg);
		SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio);
		PGRP_UNLOCK(pg);
	} else {
		p = sigio->sio_proc;
		PROC_LOCK(p);
		SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio);
		PROC_UNLOCK(p);
	}
	return (sigio);
}

/*
 * If sigio is on the list associated with a process or process group,
 * disable signalling from the device, remove sigio from the list and
 * free sigio.
 */
void
funsetown(struct sigio **sigiop)
{
	struct sigio *sigio;

	/* Racy check, consumers must provide synchronization. */
	if (*sigiop == NULL)
		return;

	SIGIO_LOCK();
	sigio = funsetown_locked(*sigiop);
	SIGIO_UNLOCK();
	if (sigio != NULL)
		sigiofree(sigio);
}

/*
 * Free a list of sigio structures.  The caller must ensure that new sigio
 * structures cannot be added after this point.  For process groups this is
 * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves
 * as an interlock.
 */
void
funsetownlst(struct sigiolst *sigiolst)
{
	struct proc *p;
	struct pgrp *pg;
	struct sigio *sigio, *tmp;

	/* Racy check. */
	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL)
		return;

	p = NULL;
	pg = NULL;

	SIGIO_LOCK();
	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL) {
		SIGIO_UNLOCK();
		return;
	}

	/*
	 * Every entry of the list should belong to a single proc or pgrp.
	 */
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		sx_assert(&proctree_lock, SX_XLOCKED);
		PGRP_LOCK(pg);
	} else /* if (sigio->sio_pgid > 0) */ {
		p = sigio->sio_proc;
		PROC_LOCK(p);
		KASSERT((p->p_flag & P_WEXIT) != 0,
		    ("%s: process %p is not exiting", __func__, p));
	}

	SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) {
		*sigio->sio_myref = NULL;
		if (pg != NULL) {
			KASSERT(sigio->sio_pgid < 0,
			    ("Proc sigio in pgrp sigio list"));
			KASSERT(sigio->sio_pgrp == pg,
			    ("Bogus pgrp in sigio list"));
		} else /* if (p != NULL) */ {
			KASSERT(sigio->sio_pgid > 0,
			    ("Pgrp sigio in proc sigio list"));
			KASSERT(sigio->sio_proc == p,
			    ("Bogus proc in sigio list"));
		}
	}

	if (pg != NULL)
		PGRP_UNLOCK(pg);
	else
		PROC_UNLOCK(p);
	SIGIO_UNLOCK();

	SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp)
		sigiofree(sigio);
}

/*
 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 *
 * After permission checking, add a sigio structure to the sigio list for
 * the process or process group.
 */
int
fsetown(pid_t pgid, struct sigio **sigiop)
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *osigio, *sigio;
	int ret;

	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	ret = 0;
	if (pgid > 0) {
		ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc);
		SIGIO_LOCK();
		osigio = funsetown_locked(*sigiop);
		if (ret == 0) {
			PROC_LOCK(proc);
			_PRELE(proc);
			if ((proc->p_flag & P_WEXIT) != 0) {
				ret = ESRCH;
			} else if (proc->p_session !=
			    curthread->td_proc->p_session) {
				/*
				 * Policy - Don't allow a process to FSETOWN a
				 * process in another session.
				 *
				 * Remove this test to allow maximum flexibility
				 * or restrict FSETOWN to the current process or
				 * process group for maximum safety.
				 */
				ret = EPERM;
			} else {
				sigio->sio_proc = proc;
				SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio,
				    sio_pgsigio);
			}
			PROC_UNLOCK(proc);
		}
	} else /* if (pgid < 0) */ {
		sx_slock(&proctree_lock);
		SIGIO_LOCK();
		osigio = funsetown_locked(*sigiop);
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
		} else {
			if (pgrp->pg_session != curthread->td_proc->p_session) {
				/*
				 * Policy - Don't allow a process to FSETOWN a
				 * process in another session.
				 *
				 * Remove this test to allow maximum flexibility
				 * or restrict FSETOWN to the current process or
				 * process group for maximum safety.
				 */
				ret = EPERM;
			} else {
				sigio->sio_pgrp = pgrp;
				SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio,
				    sio_pgsigio);
			}
			PGRP_UNLOCK(pgrp);
		}
		sx_sunlock(&proctree_lock);
	}
	if (ret == 0)
		*sigiop = sigio;
	SIGIO_UNLOCK();
	if (osigio != NULL)
		sigiofree(osigio);
	/* The new sigio was not installed on failure; release it. */
	if (ret != 0)
		sigiofree(sigio);
	return (ret);
}

/*
 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
 */
pid_t
fgetown(struct sigio **sigiop)
{
	pid_t pgid;

	SIGIO_LOCK();
	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
	SIGIO_UNLOCK();
	return (pgid);
}
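
/*
 * Editor's usage sketch for the SIGIO ownership routines above (userspace
 * view):
 *
 *	fcntl(fd, F_SETOWN, getpid());	// deliver SIGIO to this process
 *	fcntl(fd, F_SETOWN, -pgid);	// or to process group pgid
 *	owner = fcntl(fd, F_GETOWN);
 *
 * Both paths funnel through fsetown()/fgetown() via the FIOSETOWN and
 * FIOGETOWN ioctls handled in kern_fcntl().
 */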

static int
closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool audit)
{
	int error;

	FILEDESC_XLOCK_ASSERT(fdp);

	/*
	 * We now hold the fp reference that used to be owned by the
	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
	 * knote_fdclose to prevent a race of the fd getting opened, a knote
	 * added, and deleting a knote for the new fd.
	 */
	if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist)))
		knote_fdclose(td, fd);

	/*
	 * We need to notify mqueue if the object is of type mqueue.
	 */
	if (__predict_false(fp->f_type == DTYPE_MQUEUE))
		mq_fdclose(td, fd, fp);
	FILEDESC_XUNLOCK(fdp);

#ifdef AUDIT
	if (AUDITING_TD(td) && audit)
		audit_sysclose(td, fd, fp);
#endif
	error = closef(fp, td);

	/*
	 * All paths leading up to closefp() will have already removed or
	 * replaced the fd in the filedesc table, so a restart would not
	 * operate on the same file.
	 */
	if (error == ERESTART)
		error = EINTR;

	return (error);
}

static int
closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool holdleaders, bool audit)
{
	int error;

	FILEDESC_XLOCK_ASSERT(fdp);

	if (holdleaders) {
		if (td->td_proc->p_fdtol != NULL) {
			/*
			 * Ask fdfree() to sleep to ensure that all relevant
			 * process leaders can be traversed in closef().
			 */
			fdp->fd_holdleaderscount++;
		} else {
			holdleaders = false;
		}
	}

	error = closefp_impl(fdp, fd, fp, td, audit);
	if (holdleaders) {
		FILEDESC_XLOCK(fdp);
		fdp->fd_holdleaderscount--;
		if (fdp->fd_holdleaderscount == 0 &&
		    fdp->fd_holdleaderswakeup != 0) {
			fdp->fd_holdleaderswakeup = 0;
			wakeup(&fdp->fd_holdleaderscount);
		}
		FILEDESC_XUNLOCK(fdp);
	}
	return (error);
}

static int
closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool holdleaders, bool audit)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	if (__predict_false(td->td_proc->p_fdtol != NULL)) {
		return (closefp_hl(fdp, fd, fp, td, holdleaders, audit));
	} else {
		return (closefp_impl(fdp, fd, fp, td, audit));
	}
}

/*
 * Close a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int	fd;
};
#endif
/* ARGSUSED */
int
sys_close(struct thread *td, struct close_args *uap)
{

	return (kern_close(td, uap->fd));
}

int
kern_close(struct thread *td, int fd)
{
	struct filedesc *fdp;
	struct file *fp;

	fdp = td->td_proc->p_fd;

	FILEDESC_XLOCK(fdp);
	if ((fp = fget_noref(fdp, fd)) == NULL) {
		FILEDESC_XUNLOCK(fdp);
		return (EBADF);
	}
	fdfree(fdp, fd);

	/* closefp() drops the FILEDESC lock for us. */
	return (closefp(fdp, fd, fp, td, true, true));
}

static int
close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd)
{
	struct filedesc *fdp;
	struct fdescenttbl *fdt;
	struct filedescent *fde;
	int fd;

	fdp = td->td_proc->p_fd;
	FILEDESC_XLOCK(fdp);
	fdt = atomic_load_ptr(&fdp->fd_files);
	highfd = MIN(highfd, fdt->fdt_nfiles - 1);
	fd = lowfd;
	if (__predict_false(fd > highfd)) {
		goto out_locked;
	}
	for (; fd <= highfd; fd++) {
		fde = &fdt->fdt_ofiles[fd];
		if (fde->fde_file != NULL)
			fde->fde_flags |= UF_EXCLOSE;
	}
out_locked:
	FILEDESC_XUNLOCK(fdp);
	return (0);
}

static int
close_range_impl(struct thread *td, u_int lowfd, u_int highfd)
{
	struct filedesc *fdp;
	const struct fdescenttbl *fdt;
	struct file *fp;
	int fd;

	fdp = td->td_proc->p_fd;
	FILEDESC_XLOCK(fdp);
	fdt = atomic_load_ptr(&fdp->fd_files);
	highfd = MIN(highfd, fdt->fdt_nfiles - 1);
	fd = lowfd;
	if (__predict_false(fd > highfd)) {
		goto out_locked;
	}
	for (;;) {
		fp = fdt->fdt_ofiles[fd].fde_file;
		if (fp == NULL) {
			if (fd == highfd)
				goto out_locked;
		} else {
			fdfree(fdp, fd);
			(void) closefp(fdp, fd, fp, td, true, true);
			if (fd == highfd)
				goto out_unlocked;
			FILEDESC_XLOCK(fdp);
			fdt = atomic_load_ptr(&fdp->fd_files);
		}
		fd++;
	}
out_locked:
	FILEDESC_XUNLOCK(fdp);
out_unlocked:
	return (0);
}

int
kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd)
{

	/*
	 * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2
	 * open should not be a usage error.  From a close_range() perspective,
	 * close_range(3, ~0U, 0) in the same scenario should also likely not
	 * be a usage error as all fd above 3 are in fact already closed.
	 */
	if (highfd < lowfd) {
		return (EINVAL);
	}

	if ((flags & CLOSE_RANGE_CLOEXEC) != 0)
		return (close_range_cloexec(td, lowfd, highfd));

	return (close_range_impl(td, lowfd, highfd));
}

#ifndef _SYS_SYSPROTO_H_
struct close_range_args {
	u_int	lowfd;
	u_int	highfd;
	int	flags;
};
#endif
int
sys_close_range(struct thread *td, struct close_range_args *uap)
{

	AUDIT_ARG_FD(uap->lowfd);
	AUDIT_ARG_CMD(uap->highfd);
	AUDIT_ARG_FFLAGS(uap->flags);

	if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC)) != 0)
		return (EINVAL);
	return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd));
}

#ifdef COMPAT_FREEBSD12
/*
 * Close open file descriptors.
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd12_closefrom_args {
	int	lowfd;
};
#endif
/* ARGSUSED */
int
freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap)
{
	u_int lowfd;

	AUDIT_ARG_FD(uap->lowfd);

	/*
	 * Treat negative starting file descriptor values identical to
	 * closefrom(0) which closes all files.
	 */
	lowfd = MAX(0, uap->lowfd);
	return (kern_close_range(td, 0, lowfd, ~0U));
}
#endif	/* COMPAT_FREEBSD12 */
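
/*
 * Editor's usage sketch for the range operations above (userspace view,
 * close_range(2) takes lowfd, highfd, flags):
 *
 *	close_range(3, ~0U, 0);			// close every fd >= 3
 *	close_range(3, ~0U, CLOSE_RANGE_CLOEXEC); // mark them close-on-exec
 *
 * The second form only sets UF_EXCLOSE on populated slots and never drops
 * the descriptors itself.
 */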

#if defined(COMPAT_43)
/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct ostat oub;
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error == 0) {
		cvtstat(&ub, &oub);
		error = copyout(&oub, uap->sb, sizeof(oub));
	}
	return (error);
}
#endif /* COMPAT_43 */

#if defined(COMPAT_FREEBSD11)
int
freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap)
{
	struct stat sb;
	struct freebsd11_stat osb;
	int error;

	error = kern_fstat(td, uap->fd, &sb);
	if (error != 0)
		return (error);
	error = freebsd11_cvtstat(&sb, &osb);
	if (error == 0)
		error = copyout(&osb, uap->sb, sizeof(osb));
	return (error);
}
#endif	/* COMPAT_FREEBSD11 */

/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct fstat_args {
	int	fd;
	struct	stat *sb;
};
#endif
/* ARGSUSED */
int
sys_fstat(struct thread *td, struct fstat_args *uap)
{
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error == 0)
		error = copyout(&ub, uap->sb, sizeof(ub));
	return (error);
}

int
kern_fstat(struct thread *td, int fd, struct stat *sbp)
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);

	error = fget(td, fd, &cap_fstat_rights, &fp);
	if (__predict_false(error != 0))
		return (error);

	AUDIT_ARG_FILE(td->td_proc, fp);

	error = fo_stat(fp, sbp, td->td_ucred);
	fdrop(fp, td);
#ifdef __STAT_TIME_T_EXT
	sbp->st_atim_ext = 0;
	sbp->st_mtim_ext = 0;
	sbp->st_ctim_ext = 0;
	sbp->st_btim_ext = 0;
#endif
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrstat_error(sbp, error);
#endif
	return (error);
}

#if defined(COMPAT_FREEBSD11)
/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd11_nfstat_args {
	int	fd;
	struct	nstat *sb;
};
#endif
/* ARGSUSED */
int
freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap)
{
	struct nstat nub;
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error != 0)
		return (error);
	error = freebsd11_cvtnstat(&ub, &nub);
	if (error == 0)
		error = copyout(&nub, uap->sb, sizeof(nub));
	return (error);
}
#endif /* COMPAT_FREEBSD11 */

/*
 * Return pathconf information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct fpathconf_args {
	int	fd;
	int	name;
};
#endif
/* ARGSUSED */
int
sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
{
	long value;
	int error;

	error = kern_fpathconf(td, uap->fd, uap->name, &value);
	if (error == 0)
		td->td_retval[0] = value;
	return (error);
}

int
kern_fpathconf(struct thread *td, int fd, int name, long *valuep)
{
	struct file *fp;
	struct vnode *vp;
	int error;

	error = fget(td, fd, &cap_fpathconf_rights, &fp);
	if (error != 0)
		return (error);

	if (name == _PC_ASYNC_IO) {
		*valuep = _POSIX_ASYNCHRONOUS_IO;
		goto out;
	}
	vp = fp->f_vnode;
	if (vp != NULL) {
		vn_lock(vp, LK_SHARED | LK_RETRY);
		error = VOP_PATHCONF(vp, name, valuep);
		VOP_UNLOCK(vp);
	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
		if (name != _PC_PIPE_BUF) {
			error = EINVAL;
		} else {
			*valuep = PIPE_BUF;
			error = 0;
		}
	} else {
		error = EOPNOTSUPP;
	}
out:
	fdrop(fp, td);
	return (error);
}

/*
 * Copy filecaps structure allocating memory for ioctls array if needed.
 *
 * The last parameter indicates whether the fdtable is locked.  If it is not
 * and ioctls are encountered, copying fails and the caller must lock the
 * table.
 *
 * Note that if the table was not locked, the caller has to check the relevant
 * sequence counter to determine whether the operation was successful.
 */
bool
filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
{
	size_t size;

	if (src->fc_ioctls != NULL && !locked)
		return (false);
	memcpy(dst, src, sizeof(*src));
	if (src->fc_ioctls == NULL)
		return (true);

	KASSERT(src->fc_nioctls > 0,
	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
	memcpy(dst->fc_ioctls, src->fc_ioctls, size);
	return (true);
}

static u_long *
filecaps_copy_prep(const struct filecaps *src)
{
	u_long *ioctls;
	size_t size;

	if (__predict_true(src->fc_ioctls == NULL))
		return (NULL);

	KASSERT(src->fc_nioctls > 0,
	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	ioctls = malloc(size, M_FILECAPS, M_WAITOK);
	return (ioctls);
}

static void
filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst,
    u_long *ioctls)
{
	size_t size;

	*dst = *src;
	if (__predict_true(src->fc_ioctls == NULL)) {
		MPASS(ioctls == NULL);
		return;
	}

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	dst->fc_ioctls = ioctls;
	bcopy(src->fc_ioctls, dst->fc_ioctls, size);
}

/*
 * Move filecaps structure to the new place and clear the old place.
 */
void
filecaps_move(struct filecaps *src, struct filecaps *dst)
{

	*dst = *src;
	bzero(src, sizeof(*src));
}
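
/*
 * Editor's note on the prep/finish split above: the M_WAITOK allocation
 * happens in the prep step so the finish step does not have to sleep.
 * This matters in kern_dup(), which calls filecaps_copy_finish() between
 * seqc_write_begin() and seqc_write_end(), a write section that should
 * stay short and not sleep:
 *
 *	nioctls = filecaps_copy_prep(&oldfde->fde_caps);  // may sleep
 *	seqc_write_begin(&newfde->fde_seqc);
 *	...
 *	filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, nioctls);
 *	seqc_write_end(&newfde->fde_seqc);
 */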

/*
 * Fill the given filecaps structure with full rights.
 */
static void
filecaps_fill(struct filecaps *fcaps)
{

	CAP_ALL(&fcaps->fc_rights);
	fcaps->fc_ioctls = NULL;
	fcaps->fc_nioctls = -1;
	fcaps->fc_fcntls = CAP_FCNTL_ALL;
}

/*
 * Free memory allocated within filecaps structure.
 */
void
filecaps_free(struct filecaps *fcaps)
{

	free(fcaps->fc_ioctls, M_FILECAPS);
	bzero(fcaps, sizeof(*fcaps));
}

static u_long *
filecaps_free_prep(struct filecaps *fcaps)
{
	u_long *ioctls;

	ioctls = fcaps->fc_ioctls;
	bzero(fcaps, sizeof(*fcaps));
	return (ioctls);
}

static void
filecaps_free_finish(u_long *ioctls)
{

	free(ioctls, M_FILECAPS);
}

/*
 * Validate the given filecaps structure.
 */
static void
filecaps_validate(const struct filecaps *fcaps, const char *func)
{

	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
	    ("%s: invalid rights", func));
	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
	    ("%s: invalid fcntls", func));
	KASSERT(fcaps->fc_fcntls == 0 ||
	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
	    ("%s: fcntls without CAP_FCNTL", func));
	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
	    ("%s: invalid ioctls", func));
	KASSERT(fcaps->fc_nioctls == 0 ||
	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
	    ("%s: ioctls without CAP_IOCTL", func));
}

static void
fdgrowtable_exp(struct filedesc *fdp, int nfd)
{
	int nfd1;

	FILEDESC_XLOCK_ASSERT(fdp);

	nfd1 = fdp->fd_nfiles * 2;
	if (nfd1 < nfd)
		nfd1 = nfd;
	fdgrowtable(fdp, nfd1);
}

/*
 * Grow the file table to accommodate (at least) nfd descriptors.
 */
static void
fdgrowtable(struct filedesc *fdp, int nfd)
{
	struct filedesc0 *fdp0;
	struct freetable *ft;
	struct fdescenttbl *ntable;
	struct fdescenttbl *otable;
	int nnfiles, onfiles;
	NDSLOTTYPE *nmap, *omap;

	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));

	/* save old values */
	onfiles = fdp->fd_nfiles;
	otable = fdp->fd_files;
	omap = fdp->fd_map;

	/* compute the size of the new table */
	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
	if (nnfiles <= onfiles)
		/* the table is already large enough */
		return;

	/*
	 * Allocate a new table.  We need enough space for the number of
	 * entries, file entries themselves and the struct freetable we will
	 * use when we decommission the table and place it on the freelist.
	 * We place the struct freetable in the middle so we don't have
	 * to worry about padding.
	 */
	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
	    sizeof(struct freetable),
	    M_FILEDESC, M_ZERO | M_WAITOK);
	/* copy the old data */
	ntable->fdt_nfiles = nnfiles;
	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
	    onfiles * sizeof(ntable->fdt_ofiles[0]));

	/*
	 * Allocate a new map only if the old is not large enough.  It will
	 * grow at a slower rate than the table as it can map more
	 * entries than the table can hold.
	 */
	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
		    M_ZERO | M_WAITOK);
		/* copy over the old data and update the pointer */
		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
		fdp->fd_map = nmap;
	}

	/*
	 * Make sure that ntable is correctly initialized before we replace
	 * the fd_files pointer.  Otherwise fget_unlocked() may see
	 * inconsistent data.
	 */
	atomic_store_rel_ptr((volatile void *)&fdp->fd_files,
	    (uintptr_t)ntable);

	/*
	 * Free the old file table when not shared by other threads or
	 * processes.  The old file table is considered to be shared when
	 * either of the following is true:
	 * - The process has more than one thread.
	 * - The file descriptor table has been shared via fdshare().
	 *
	 * When shared, the old file table will be placed on a freelist
	 * which will be processed when the struct filedesc is released.
	 *
	 * Note that if onfiles == NDFILE, we're dealing with the original
	 * static allocation contained within (struct filedesc0 *)fdp,
	 * which must not be freed.
	 */
	if (onfiles > NDFILE) {
		/*
		 * Note we may be called here from fdinit while allocating a
		 * table for a new process in which case ->p_fd points
		 * elsewhere.
		 */
		if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) {
			free(otable, M_FILEDESC);
		} else {
			ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
			fdp0 = (struct filedesc0 *)fdp;
			ft->ft_table = otable;
			SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
		}
	}
	/*
	 * The map does not have the same possibility of threads still
	 * holding references to it.  So always free it as long as it
	 * does not reference the original static allocation.
	 */
	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
		free(omap, M_FILEDESC);
}

/*
 * Allocate a file descriptor for the process.
 */
int
fdalloc(struct thread *td, int minfd, int *result)
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	int fd, maxfd, allocfd;
#ifdef RACCT
	int error;
#endif

	FILEDESC_XLOCK_ASSERT(fdp);

	if (fdp->fd_freefile > minfd)
		minfd = fdp->fd_freefile;

	maxfd = getmaxfd(td);

	/*
	 * Search the bitmap for a free descriptor starting at minfd.
	 * If none is found, grow the file table.
	 */
	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
	if (__predict_false(fd >= maxfd))
		return (EMFILE);
	if (__predict_false(fd >= fdp->fd_nfiles)) {
		allocfd = min(fd * 2, maxfd);
#ifdef RACCT
		if (RACCT_ENABLED()) {
			error = racct_set_unlocked(p, RACCT_NOFILE, allocfd);
			if (error != 0)
				return (EMFILE);
		}
#endif
		/*
		 * fd is already equal to first free descriptor >= minfd, so
		 * we only need to grow the table and we are done.
		 */
		fdgrowtable_exp(fdp, allocfd);
	}

	/*
	 * Perform some sanity checks, then mark the file descriptor as
	 * used and return it to the caller.
	 */
	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
	    ("invalid descriptor %d", fd));
	KASSERT(!fdisused(fdp, fd),
	    ("fd_first_free() returned non-free descriptor"));
	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
	    ("file descriptor isn't free"));
	fdused(fdp, fd);
	*result = fd;
	return (0);
}
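
/*
 * Illustrative sizing example (editor's note, assuming NDENTRIES == 64):
 * a process with the initial NDFILE == 20 slots that asks for one more
 * descriptor has fdgrowtable_exp() request 2 * 20 == 40 slots, which
 * fdgrowtable() rounds up to NDSLOTS(40) * NDENTRIES == 64 entries.
 */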

/*
 * Allocate n file descriptors for the process.
 */
int
fdallocn(struct thread *td, int minfd, int *fds, int n)
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	int i;

	FILEDESC_XLOCK_ASSERT(fdp);

	for (i = 0; i < n; i++)
		if (fdalloc(td, 0, &fds[i]) != 0)
			break;

	if (i < n) {
		for (i--; i >= 0; i--)
			fdunused(fdp, fds[i]);
		return (EMFILE);
	}

	return (0);
}

/*
 * Create a new open file structure and allocate a file descriptor for the
 * process that refers to it.  We add one reference to the file for the
 * descriptor table and one reference for resultfp.  This is to prevent us
 * being preempted and the entry in the descriptor table closed after we
 * release the FILEDESC lock.
 */
int
falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
    struct filecaps *fcaps)
{
	struct file *fp;
	int error, fd;

	MPASS(resultfp != NULL);
	MPASS(resultfd != NULL);

	error = _falloc_noinstall(td, &fp, 2);
	if (__predict_false(error != 0)) {
		return (error);
	}

	error = finstall_refed(td, fp, &fd, flags, fcaps);
	if (__predict_false(error != 0)) {
		falloc_abort(td, fp);
		return (error);
	}

	*resultfp = fp;
	*resultfd = fd;

	return (0);
}

/*
 * Create a new open file structure without allocating a file descriptor.
 */
int
_falloc_noinstall(struct thread *td, struct file **resultfp, u_int n)
{
	struct file *fp;
	int maxuserfiles = maxfiles - (maxfiles / 20);
	int openfiles_new;
	static struct timeval lastfail;
	static int curfail;

	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
	MPASS(n > 0);

	openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
	if ((openfiles_new >= maxuserfiles &&
	    priv_check(td, PRIV_MAXFILES) != 0) ||
	    openfiles_new >= maxfiles) {
		atomic_subtract_int(&openfiles, 1);
		if (ppsratecheck(&lastfail, &curfail, 1)) {
			printf("kern.maxfiles limit exceeded by uid %i, "
			    "(%s) please see tuning(7).\n",
			    td->td_ucred->cr_ruid, td->td_proc->p_comm);
		}
		return (ENFILE);
	}
	fp = uma_zalloc(file_zone, M_WAITOK);
	bzero(fp, sizeof(*fp));
	refcount_init(&fp->f_count, n);
	fp->f_cred = crhold(td->td_ucred);
	fp->f_ops = &badfileops;
	*resultfp = fp;
	return (0);
}

void
falloc_abort(struct thread *td, struct file *fp)
{

	/*
	 * For assertion purposes.
	 */
	refcount_init(&fp->f_count, 0);
	_fdrop(fp, td);
}
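
/*
 * Editor's usage sketch for falloc_caps() (a typical open-style consumer;
 * error handling elided):
 *
 *	error = falloc_caps(td, &fp, &fd, 0, NULL);
 *	...
 *	finit(fp, FREAD | FWRITE, DTYPE_PIPE, data, &ops);
 *	fdrop(fp, td);		// release the extra resultfp reference
 *	td->td_retval[0] = fd;	// the descriptor table keeps its own ref
 */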

/*
 * Install a file in a file descriptor table.
 */
void
_finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
    struct filecaps *fcaps)
{
	struct filedescent *fde;

	MPASS(fp != NULL);
	if (fcaps != NULL)
		filecaps_validate(fcaps, __func__);
	FILEDESC_XLOCK_ASSERT(fdp);

	fde = &fdp->fd_ofiles[fd];
#ifdef CAPABILITIES
	seqc_write_begin(&fde->fde_seqc);
#endif
	fde->fde_file = fp;
	fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0;
	if (fcaps != NULL)
		filecaps_move(fcaps, &fde->fde_caps);
	else
		filecaps_fill(&fde->fde_caps);
#ifdef CAPABILITIES
	seqc_write_end(&fde->fde_seqc);
#endif
}

int
finstall_refed(struct thread *td, struct file *fp, int *fd, int flags,
    struct filecaps *fcaps)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	int error;

	MPASS(fd != NULL);

	FILEDESC_XLOCK(fdp);
	error = fdalloc(td, 0, fd);
	if (__predict_true(error == 0)) {
		_finstall(fdp, fp, *fd, flags, fcaps);
	}
	FILEDESC_XUNLOCK(fdp);
	return (error);
}

int
finstall(struct thread *td, struct file *fp, int *fd, int flags,
    struct filecaps *fcaps)
{
	int error;

	MPASS(fd != NULL);

	if (!fhold(fp))
		return (EBADF);
	error = finstall_refed(td, fp, fd, flags, fcaps);
	if (__predict_false(error != 0)) {
		fdrop(fp, td);
	}
	return (error);
}

/*
 * Build a new filedesc structure.
 */
struct filedesc *
fdinit(void)
{
	struct filedesc0 *newfdp0;
	struct filedesc *newfdp;

	newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
	newfdp = &newfdp0->fd_fd;

	/* Create the file descriptor table. */
	FILEDESC_LOCK_INIT(newfdp);
	refcount_init(&newfdp->fd_refcnt, 1);
	refcount_init(&newfdp->fd_holdcnt, 1);
	newfdp->fd_map = newfdp0->fd_dmap;
	newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
	newfdp->fd_files->fdt_nfiles = NDFILE;

	return (newfdp);
}

/*
 * Build a pwddesc structure from another.
 * Copy the current, root, and jail root vnode references.
 *
 * If pdp is not NULL and keeplock is true, return with pdp exclusively
 * locked.
 */
struct pwddesc *
pdinit(struct pwddesc *pdp, bool keeplock)
{
	struct pwddesc *newpdp;
	struct pwd *newpwd;

	newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO);

	PWDDESC_LOCK_INIT(newpdp);
	refcount_init(&newpdp->pd_refcount, 1);
	newpdp->pd_cmask = CMASK;

	if (pdp == NULL) {
		newpwd = pwd_alloc();
		smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
		return (newpdp);
	}

	PWDDESC_XLOCK(pdp);
	newpwd = pwd_hold_pwddesc(pdp);
	smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
	if (!keeplock)
		PWDDESC_XUNLOCK(pdp);
	return (newpdp);
}

/*
 * Hold either filedesc or pwddesc of the passed process.
 *
 * The process lock is used to synchronize against the target exiting and
 * freeing the data.
 *
 * Clearing can be illustrated in 3 steps:
 * 1. set the pointer to NULL.  Either routine can race against it, hence
 *    atomic_load_ptr.
 * 2. observe the process lock as not taken.  Until then fdhold()/pdhold()
 *    can race to either still see the pointer or find NULL.  It is still
 *    safe to grab a reference as clearing is stalled.
 * 3. after the lock is observed as not taken, any fdhold()/pdhold() calls
 *    are guaranteed to see NULL, making it safe to finish clearing.
 */
static struct filedesc *
fdhold(struct proc *p)
{
	struct filedesc *fdp;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	fdp = atomic_load_ptr(&p->p_fd);
	if (fdp != NULL)
		refcount_acquire(&fdp->fd_holdcnt);
	return (fdp);
}

static struct pwddesc *
pdhold(struct proc *p)
{
	struct pwddesc *pdp;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	pdp = atomic_load_ptr(&p->p_pd);
	if (pdp != NULL)
		refcount_acquire(&pdp->pd_refcount);
	return (pdp);
}
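
/*
 * Editor's sketch of the clearing side described above (modeled on what
 * fdescfree()/pdescfree() do; not a verbatim excerpt):
 *
 *	PROC_LOCK(p);
 *	p->p_fd = NULL;		// step 1: hide the pointer
 *	PROC_UNLOCK(p);		// step 2: the lock handoff orders the store
 *	fddrop(fdp);		// step 3: new holders now observe NULL
 */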
It is still safe to 2250 * grab a reference as clearing is stalled. 2251 * 3. after the lock is observed as not taken, any fdhold/pdhold calls are 2252 * guaranteed to see NULL, making it safe to finish clearing 2253 */ 2254 static struct filedesc * 2255 fdhold(struct proc *p) 2256 { 2257 struct filedesc *fdp; 2258 2259 PROC_LOCK_ASSERT(p, MA_OWNED); 2260 fdp = atomic_load_ptr(&p->p_fd); 2261 if (fdp != NULL) 2262 refcount_acquire(&fdp->fd_holdcnt); 2263 return (fdp); 2264 } 2265 2266 static struct pwddesc * 2267 pdhold(struct proc *p) 2268 { 2269 struct pwddesc *pdp; 2270 2271 PROC_LOCK_ASSERT(p, MA_OWNED); 2272 pdp = atomic_load_ptr(&p->p_pd); 2273 if (pdp != NULL) 2274 refcount_acquire(&pdp->pd_refcount); 2275 return (pdp); 2276 } 2277 2278 static void 2279 fddrop(struct filedesc *fdp) 2280 { 2281 2282 if (refcount_load(&fdp->fd_holdcnt) > 1) { 2283 if (refcount_release(&fdp->fd_holdcnt) == 0) 2284 return; 2285 } 2286 2287 FILEDESC_LOCK_DESTROY(fdp); 2288 uma_zfree(filedesc0_zone, fdp); 2289 } 2290 2291 static void 2292 pddrop(struct pwddesc *pdp) 2293 { 2294 struct pwd *pwd; 2295 2296 if (refcount_release_if_not_last(&pdp->pd_refcount)) 2297 return; 2298 2299 PWDDESC_XLOCK(pdp); 2300 if (refcount_release(&pdp->pd_refcount) == 0) { 2301 PWDDESC_XUNLOCK(pdp); 2302 return; 2303 } 2304 pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 2305 pwd_set(pdp, NULL); 2306 PWDDESC_XUNLOCK(pdp); 2307 pwd_drop(pwd); 2308 2309 PWDDESC_LOCK_DESTROY(pdp); 2310 free(pdp, M_PWDDESC); 2311 } 2312 2313 /* 2314 * Share a filedesc structure. 2315 */ 2316 struct filedesc * 2317 fdshare(struct filedesc *fdp) 2318 { 2319 2320 refcount_acquire(&fdp->fd_refcnt); 2321 return (fdp); 2322 } 2323 2324 /* 2325 * Share a pwddesc structure. 2326 */ 2327 struct pwddesc * 2328 pdshare(struct pwddesc *pdp) 2329 { 2330 refcount_acquire(&pdp->pd_refcount); 2331 return (pdp); 2332 } 2333 2334 /* 2335 * Unshare a filedesc structure, if necessary by making a copy 2336 */ 2337 void 2338 fdunshare(struct thread *td) 2339 { 2340 struct filedesc *tmp; 2341 struct proc *p = td->td_proc; 2342 2343 if (refcount_load(&p->p_fd->fd_refcnt) == 1) 2344 return; 2345 2346 tmp = fdcopy(p->p_fd); 2347 fdescfree(td); 2348 p->p_fd = tmp; 2349 } 2350 2351 /* 2352 * Unshare a pwddesc structure. 2353 */ 2354 void 2355 pdunshare(struct thread *td) 2356 { 2357 struct pwddesc *pdp; 2358 struct proc *p; 2359 2360 p = td->td_proc; 2361 /* Not shared. */ 2362 if (refcount_load(&p->p_pd->pd_refcount) == 1) 2363 return; 2364 2365 pdp = pdcopy(p->p_pd); 2366 pdescfree(td); 2367 p->p_pd = pdp; 2368 } 2369 2370 /* 2371 * Copy a filedesc structure. A NULL pointer in returns a NULL reference, 2372 * this is to ease callers, not catch errors. 2373 */ 2374 struct filedesc * 2375 fdcopy(struct filedesc *fdp) 2376 { 2377 struct filedesc *newfdp; 2378 struct filedescent *nfde, *ofde; 2379 int i, lastfile; 2380 2381 MPASS(fdp != NULL); 2382 2383 newfdp = fdinit(); 2384 FILEDESC_SLOCK(fdp); 2385 for (;;) { 2386 lastfile = fdlastfile(fdp); 2387 if (lastfile < newfdp->fd_nfiles) 2388 break; 2389 FILEDESC_SUNLOCK(fdp); 2390 fdgrowtable(newfdp, lastfile + 1); 2391 FILEDESC_SLOCK(fdp); 2392 } 2393 /* copy all passable descriptors (i.e. 
not kqueue) */
	newfdp->fd_freefile = fdp->fd_freefile;
	FILEDESC_FOREACH_FDE(fdp, i, ofde) {
		if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 ||
		    !fhold(ofde->fde_file)) {
			if (newfdp->fd_freefile == fdp->fd_freefile)
				newfdp->fd_freefile = i;
			continue;
		}
		nfde = &newfdp->fd_ofiles[i];
		*nfde = *ofde;
		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
		fdused_init(newfdp, i);
	}
	MPASS(newfdp->fd_freefile != -1);
	FILEDESC_SUNLOCK(fdp);
	return (newfdp);
}

/*
 * Copy a pwddesc structure.
 */
struct pwddesc *
pdcopy(struct pwddesc *pdp)
{
	struct pwddesc *newpdp;

	MPASS(pdp != NULL);

	newpdp = pdinit(pdp, true);
	newpdp->pd_cmask = pdp->pd_cmask;
	PWDDESC_XUNLOCK(pdp);
	return (newpdp);
}

/*
 * Clear POSIX-style locks. This is only used when fdp loses a reference
 * (i.e. one of the processes using it exits) and the table used to be
 * shared.
 */
static void
fdclearlocks(struct thread *td)
{
	struct filedesc *fdp;
	struct filedesc_to_leader *fdtol;
	struct flock lf;
	struct file *fp;
	struct proc *p;
	struct vnode *vp;
	int i;

	p = td->td_proc;
	fdp = p->p_fd;
	fdtol = p->p_fdtol;
	MPASS(fdtol != NULL);

	FILEDESC_XLOCK(fdp);
	KASSERT(fdtol->fdl_refcount > 0,
	    ("filedesc_to_refcount botch: fdl_refcount=%d",
	    fdtol->fdl_refcount));
	if (fdtol->fdl_refcount == 1 &&
	    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
		FILEDESC_FOREACH_FP(fdp, i, fp) {
			if (fp->f_type != DTYPE_VNODE ||
			    !fhold(fp))
				continue;
			FILEDESC_XUNLOCK(fdp);
			lf.l_whence = SEEK_SET;
			lf.l_start = 0;
			lf.l_len = 0;
			lf.l_type = F_UNLCK;
			vp = fp->f_vnode;
			(void) VOP_ADVLOCK(vp,
			    (caddr_t)p->p_leader, F_UNLCK,
			    &lf, F_POSIX);
			FILEDESC_XLOCK(fdp);
			fdrop(fp, td);
		}
	}
retry:
	if (fdtol->fdl_refcount == 1) {
		if (fdp->fd_holdleaderscount > 0 &&
		    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
			/*
			 * close() or kern_dup() has cleared a reference
			 * in a shared file descriptor table.
			 */
			fdp->fd_holdleaderswakeup = 1;
			sx_sleep(&fdp->fd_holdleaderscount,
			    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
			goto retry;
		}
		if (fdtol->fdl_holdcount > 0) {
			/*
			 * Ensure that fdtol->fdl_leader remains
			 * valid in closef().
			 */
			fdtol->fdl_wakeup = 1;
			sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
			    "fdlhold", 0);
			goto retry;
		}
	}
	fdtol->fdl_refcount--;
	if (fdtol->fdl_refcount == 0 &&
	    fdtol->fdl_holdcount == 0) {
		fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
		fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
	} else
		fdtol = NULL;
	p->p_fdtol = NULL;
	FILEDESC_XUNLOCK(fdp);
	if (fdtol != NULL)
		free(fdtol, M_FILEDESC_TO_LEADER);
}

/*
 * Release a filedesc structure.
 */
static void
fdescfree_fds(struct thread *td, struct filedesc *fdp)
{
	struct filedesc0 *fdp0;
	struct freetable *ft, *tft;
	struct filedescent *fde;
	struct file *fp;
	int i;

	KASSERT(refcount_load(&fdp->fd_refcnt) == 0,
	    ("%s: fd table %p carries references", __func__, fdp));

	/*
	 * Serialize with threads iterating over the table, if any.
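	 *
	 * Taking and dropping the exclusive lock acts as a barrier:
	 * iterators which entered under FILEDESC_SLOCK() (see e.g.
	 * sysctl_kern_file()) are guaranteed to have dropped the lock
	 * before the entries are freed; later comers observe
	 * fd_refcnt == 0 and back off.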
	 */
	if (refcount_load(&fdp->fd_holdcnt) > 1) {
		FILEDESC_XLOCK(fdp);
		FILEDESC_XUNLOCK(fdp);
	}

	FILEDESC_FOREACH_FDE(fdp, i, fde) {
		fp = fde->fde_file;
		fdefree_last(fde);
		(void) closef(fp, td);
	}

	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
		free(fdp->fd_map, M_FILEDESC);
	if (fdp->fd_nfiles > NDFILE)
		free(fdp->fd_files, M_FILEDESC);

	fdp0 = (struct filedesc0 *)fdp;
	SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
		free(ft->ft_table, M_FILEDESC);

	fddrop(fdp);
}

void
fdescfree(struct thread *td)
{
	struct proc *p;
	struct filedesc *fdp;

	p = td->td_proc;
	fdp = p->p_fd;
	MPASS(fdp != NULL);

#ifdef RACCT
	if (RACCT_ENABLED())
		racct_set_unlocked(p, RACCT_NOFILE, 0);
#endif

	if (p->p_fdtol != NULL)
		fdclearlocks(td);

	/*
	 * Check fdhold for an explanation.
	 */
	atomic_store_ptr(&p->p_fd, NULL);
	atomic_thread_fence_seq_cst();
	PROC_WAIT_UNLOCKED(p);

	if (refcount_release(&fdp->fd_refcnt) == 0)
		return;

	fdescfree_fds(td, fdp);
}

void
pdescfree(struct thread *td)
{
	struct proc *p;
	struct pwddesc *pdp;

	p = td->td_proc;
	pdp = p->p_pd;
	MPASS(pdp != NULL);

	/*
	 * Check pdhold for an explanation.
	 */
	atomic_store_ptr(&p->p_pd, NULL);
	atomic_thread_fence_seq_cst();
	PROC_WAIT_UNLOCKED(p);

	pddrop(pdp);
}

/*
 * For setugid programs, we don't want people to use that setugidness to
 * generate error messages which write to a file which would otherwise be
 * off-limits to the process. We check for filesystems where the vnode can
 * change out from under us after execve (like [lin]procfs).
 *
 * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
 * sufficient. We also don't check for setugidness since we know we are.
 */
static bool
is_unsafe(struct file *fp)
{
	struct vnode *vp;

	if (fp->f_type != DTYPE_VNODE)
		return (false);

	vp = fp->f_vnode;
	return ((vp->v_vflag & VV_PROCDEP) != 0);
}

/*
 * Make this setugid thing safe, if at all possible.
 */
void
fdsetugidsafety(struct thread *td)
{
	struct filedesc *fdp;
	struct file *fp;
	int i;

	fdp = td->td_proc->p_fd;
	KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
	    ("the fdtable should not be shared"));
	MPASS(fdp->fd_nfiles >= 3);
	for (i = 0; i <= 2; i++) {
		fp = fdp->fd_ofiles[i].fde_file;
		if (fp != NULL && is_unsafe(fp)) {
			FILEDESC_XLOCK(fdp);
			knote_fdclose(td, i);
			/*
			 * NULL-out descriptor prior to close to avoid
			 * a race while close blocks.
			 */
			fdfree(fdp, i);
			FILEDESC_XUNLOCK(fdp);
			(void) closef(fp, td);
		}
	}
}

/*
 * If a specific file object occupies a specific file descriptor, close the
 * file descriptor entry and drop a reference on the file object. This is a
 * convenience function for handling a subsequent error in a function that
 * calls falloc(); it copes with the race where another thread may have
 * closed the file descriptor out from under the thread creating the file
 * object.
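 *
 * A typical error path looks like this (illustrative sketch; the
 * example_setup() step is hypothetical):
 *
 *	error = falloc_caps(td, &fp, &fd, 0, NULL);
 *	if (error != 0)
 *		return (error);
 *	error = example_setup(fp);
 *	if (error != 0) {
 *		fdclose(td, fp, fd);	release the table's reference
 *		fdrop(fp, td);		drop the caller's reference
 *		return (error);
 *	}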
 */
void
fdclose(struct thread *td, struct file *fp, int idx)
{
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_XLOCK(fdp);
	if (fdp->fd_ofiles[idx].fde_file == fp) {
		fdfree(fdp, idx);
		FILEDESC_XUNLOCK(fdp);
		fdrop(fp, td);
	} else
		FILEDESC_XUNLOCK(fdp);
}

/*
 * Close any files that must not survive an exec: descriptors marked
 * close-on-exec and mqueue descriptors, which are always closed on exec.
 */
void
fdcloseexec(struct thread *td)
{
	struct filedesc *fdp;
	struct filedescent *fde;
	struct file *fp;
	int i;

	fdp = td->td_proc->p_fd;
	KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
	    ("the fdtable should not be shared"));
	FILEDESC_FOREACH_FDE(fdp, i, fde) {
		fp = fde->fde_file;
		if (fp->f_type == DTYPE_MQUEUE ||
		    (fde->fde_flags & UF_EXCLOSE)) {
			FILEDESC_XLOCK(fdp);
			fdfree(fdp, i);
			(void) closefp(fdp, i, fp, td, false, false);
			FILEDESC_UNLOCK_ASSERT(fdp);
		}
	}
}

/*
 * It is unsafe for set[ug]id processes to be started with file
 * descriptors 0..2 closed, as these descriptors are given implicit
 * significance in the Standard C library. fdcheckstd() will create a
 * descriptor referencing /dev/null for each of stdin, stdout, and
 * stderr that is not already open.
 */
int
fdcheckstd(struct thread *td)
{
	struct filedesc *fdp;
	register_t save;
	int i, error, devnull;

	fdp = td->td_proc->p_fd;
	KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
	    ("the fdtable should not be shared"));
	MPASS(fdp->fd_nfiles >= 3);
	devnull = -1;
	for (i = 0; i <= 2; i++) {
		if (fdp->fd_ofiles[i].fde_file != NULL)
			continue;

		save = td->td_retval[0];
		if (devnull != -1) {
			error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
		} else {
			error = kern_openat(td, AT_FDCWD, "/dev/null",
			    UIO_SYSSPACE, O_RDWR, 0);
			if (error == 0) {
				devnull = td->td_retval[0];
				KASSERT(devnull == i, ("we didn't get our fd"));
			}
		}
		td->td_retval[0] = save;
		if (error != 0)
			return (error);
	}
	return (0);
}

/*
 * Internal form of close. Decrement reference count on file structure.
 * A valid thread pointer is required; files with no owning thread (e.g.
 * ones being passed in a message) are closed via closef_nothread()
 * instead.
 */
int
closef(struct file *fp, struct thread *td)
{
	struct vnode *vp;
	struct flock lf;
	struct filedesc_to_leader *fdtol;
	struct filedesc *fdp;

	MPASS(td != NULL);

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process. This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor; such files have no
	 * owning context that might hold locks and go through
	 * closef_nothread(), so there is nothing to release there.
2765 */ 2766 if (fp->f_type == DTYPE_VNODE) { 2767 vp = fp->f_vnode; 2768 if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { 2769 lf.l_whence = SEEK_SET; 2770 lf.l_start = 0; 2771 lf.l_len = 0; 2772 lf.l_type = F_UNLCK; 2773 (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, 2774 F_UNLCK, &lf, F_POSIX); 2775 } 2776 fdtol = td->td_proc->p_fdtol; 2777 if (fdtol != NULL) { 2778 /* 2779 * Handle special case where file descriptor table is 2780 * shared between multiple process leaders. 2781 */ 2782 fdp = td->td_proc->p_fd; 2783 FILEDESC_XLOCK(fdp); 2784 for (fdtol = fdtol->fdl_next; 2785 fdtol != td->td_proc->p_fdtol; 2786 fdtol = fdtol->fdl_next) { 2787 if ((fdtol->fdl_leader->p_flag & 2788 P_ADVLOCK) == 0) 2789 continue; 2790 fdtol->fdl_holdcount++; 2791 FILEDESC_XUNLOCK(fdp); 2792 lf.l_whence = SEEK_SET; 2793 lf.l_start = 0; 2794 lf.l_len = 0; 2795 lf.l_type = F_UNLCK; 2796 vp = fp->f_vnode; 2797 (void) VOP_ADVLOCK(vp, 2798 (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, 2799 F_POSIX); 2800 FILEDESC_XLOCK(fdp); 2801 fdtol->fdl_holdcount--; 2802 if (fdtol->fdl_holdcount == 0 && 2803 fdtol->fdl_wakeup != 0) { 2804 fdtol->fdl_wakeup = 0; 2805 wakeup(fdtol); 2806 } 2807 } 2808 FILEDESC_XUNLOCK(fdp); 2809 } 2810 } 2811 return (fdrop_close(fp, td)); 2812 } 2813 2814 /* 2815 * Hack for file descriptor passing code. 2816 */ 2817 void 2818 closef_nothread(struct file *fp) 2819 { 2820 2821 fdrop(fp, NULL); 2822 } 2823 2824 /* 2825 * Initialize the file pointer with the specified properties. 2826 * 2827 * The ops are set with release semantics to be certain that the flags, type, 2828 * and data are visible when ops is. This is to prevent ops methods from being 2829 * called with bad data. 2830 */ 2831 void 2832 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) 2833 { 2834 fp->f_data = data; 2835 fp->f_flag = flag; 2836 fp->f_type = type; 2837 atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); 2838 } 2839 2840 void 2841 finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops) 2842 { 2843 fp->f_seqcount[UIO_READ] = 1; 2844 fp->f_seqcount[UIO_WRITE] = 1; 2845 finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, 2846 data, ops); 2847 } 2848 2849 int 2850 fget_cap_noref(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, 2851 struct file **fpp, struct filecaps *havecapsp) 2852 { 2853 struct filedescent *fde; 2854 int error; 2855 2856 FILEDESC_LOCK_ASSERT(fdp); 2857 2858 *fpp = NULL; 2859 fde = fdeget_noref(fdp, fd); 2860 if (fde == NULL) { 2861 error = EBADF; 2862 goto out; 2863 } 2864 2865 #ifdef CAPABILITIES 2866 error = cap_check(cap_rights_fde_inline(fde), needrightsp); 2867 if (error != 0) 2868 goto out; 2869 #endif 2870 2871 if (havecapsp != NULL) 2872 filecaps_copy(&fde->fde_caps, havecapsp, true); 2873 2874 *fpp = fde->fde_file; 2875 2876 error = 0; 2877 out: 2878 return (error); 2879 } 2880 2881 #ifdef CAPABILITIES 2882 int 2883 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, 2884 struct file **fpp, struct filecaps *havecapsp) 2885 { 2886 struct filedesc *fdp = td->td_proc->p_fd; 2887 int error; 2888 struct file *fp; 2889 seqc_t seq; 2890 2891 *fpp = NULL; 2892 for (;;) { 2893 error = fget_unlocked_seq(td, fd, needrightsp, &fp, &seq); 2894 if (error != 0) 2895 return (error); 2896 2897 if (havecapsp != NULL) { 2898 if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, 2899 havecapsp, false)) { 2900 fdrop(fp, td); 2901 goto get_locked; 2902 } 2903 } 2904 2905 if (!fd_modified(fdp, fd, seq)) 2906 break; 
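		/*
		 * The entry changed while we were copying its
		 * capabilities; drop the reference and retry.
		 */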
2907 fdrop(fp, td); 2908 } 2909 2910 *fpp = fp; 2911 return (0); 2912 2913 get_locked: 2914 FILEDESC_SLOCK(fdp); 2915 error = fget_cap_noref(fdp, fd, needrightsp, fpp, havecapsp); 2916 if (error == 0 && !fhold(*fpp)) 2917 error = EBADF; 2918 FILEDESC_SUNLOCK(fdp); 2919 return (error); 2920 } 2921 #else 2922 int 2923 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, 2924 struct file **fpp, struct filecaps *havecapsp) 2925 { 2926 int error; 2927 error = fget_unlocked(td, fd, needrightsp, fpp); 2928 if (havecapsp != NULL && error == 0) 2929 filecaps_fill(havecapsp); 2930 2931 return (error); 2932 } 2933 #endif 2934 2935 #ifdef CAPABILITIES 2936 int 2937 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) 2938 { 2939 const struct filedescent *fde; 2940 const struct fdescenttbl *fdt; 2941 struct filedesc *fdp; 2942 struct file *fp; 2943 struct vnode *vp; 2944 const cap_rights_t *haverights; 2945 cap_rights_t rights; 2946 seqc_t seq; 2947 2948 VFS_SMR_ASSERT_ENTERED(); 2949 2950 rights = *ndp->ni_rightsneeded; 2951 cap_rights_set_one(&rights, CAP_LOOKUP); 2952 2953 fdp = curproc->p_fd; 2954 fdt = fdp->fd_files; 2955 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 2956 return (EBADF); 2957 seq = seqc_read_notmodify(fd_seqc(fdt, fd)); 2958 fde = &fdt->fdt_ofiles[fd]; 2959 haverights = cap_rights_fde_inline(fde); 2960 fp = fde->fde_file; 2961 if (__predict_false(fp == NULL)) 2962 return (EAGAIN); 2963 if (__predict_false(cap_check_inline_transient(haverights, &rights))) 2964 return (EAGAIN); 2965 *fsearch = ((fp->f_flag & FSEARCH) != 0); 2966 vp = fp->f_vnode; 2967 if (__predict_false(vp == NULL)) { 2968 return (EAGAIN); 2969 } 2970 if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) { 2971 return (EAGAIN); 2972 } 2973 /* 2974 * Use an acquire barrier to force re-reading of fdt so it is 2975 * refreshed for verification. 2976 */ 2977 atomic_thread_fence_acq(); 2978 fdt = fdp->fd_files; 2979 if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) 2980 return (EAGAIN); 2981 /* 2982 * If file descriptor doesn't have all rights, 2983 * all lookups relative to it must also be 2984 * strictly relative. 2985 * 2986 * Not yet supported by fast path. 2987 */ 2988 CAP_ALL(&rights); 2989 if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || 2990 ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || 2991 ndp->ni_filecaps.fc_nioctls != -1) { 2992 #ifdef notyet 2993 ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; 2994 #else 2995 return (EAGAIN); 2996 #endif 2997 } 2998 *vpp = vp; 2999 return (0); 3000 } 3001 #else 3002 int 3003 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) 3004 { 3005 const struct fdescenttbl *fdt; 3006 struct filedesc *fdp; 3007 struct file *fp; 3008 struct vnode *vp; 3009 3010 VFS_SMR_ASSERT_ENTERED(); 3011 3012 fdp = curproc->p_fd; 3013 fdt = fdp->fd_files; 3014 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3015 return (EBADF); 3016 fp = fdt->fdt_ofiles[fd].fde_file; 3017 if (__predict_false(fp == NULL)) 3018 return (EAGAIN); 3019 *fsearch = ((fp->f_flag & FSEARCH) != 0); 3020 vp = fp->f_vnode; 3021 if (__predict_false(vp == NULL || vp->v_type != VDIR)) { 3022 return (EAGAIN); 3023 } 3024 /* 3025 * Use an acquire barrier to force re-reading of fdt so it is 3026 * refreshed for verification. 
3027 */ 3028 atomic_thread_fence_acq(); 3029 fdt = fdp->fd_files; 3030 if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) 3031 return (EAGAIN); 3032 filecaps_fill(&ndp->ni_filecaps); 3033 *vpp = vp; 3034 return (0); 3035 } 3036 #endif 3037 3038 /* 3039 * Fetch the descriptor locklessly. 3040 * 3041 * We avoid fdrop() races by never raising a refcount above 0. To accomplish 3042 * this we have to use a cmpset loop rather than an atomic_add. The descriptor 3043 * must be re-verified once we acquire a reference to be certain that the 3044 * identity is still correct and we did not lose a race due to preemption. 3045 * 3046 * Force a reload of fdt when looping. Another thread could reallocate 3047 * the table before this fd was closed, so it is possible that there is 3048 * a stale fp pointer in cached version. 3049 */ 3050 #ifdef CAPABILITIES 3051 static int 3052 fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, 3053 struct file **fpp, seqc_t *seqp) 3054 { 3055 struct filedesc *fdp; 3056 const struct filedescent *fde; 3057 const struct fdescenttbl *fdt; 3058 struct file *fp; 3059 seqc_t seq; 3060 cap_rights_t haverights; 3061 int error; 3062 3063 fdp = td->td_proc->p_fd; 3064 fdt = fdp->fd_files; 3065 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3066 return (EBADF); 3067 3068 for (;;) { 3069 seq = seqc_read_notmodify(fd_seqc(fdt, fd)); 3070 fde = &fdt->fdt_ofiles[fd]; 3071 haverights = *cap_rights_fde_inline(fde); 3072 fp = fde->fde_file; 3073 if (__predict_false(fp == NULL)) { 3074 if (seqc_consistent(fd_seqc(fdt, fd), seq)) 3075 return (EBADF); 3076 fdt = atomic_load_ptr(&fdp->fd_files); 3077 continue; 3078 } 3079 error = cap_check_inline(&haverights, needrightsp); 3080 if (__predict_false(error != 0)) { 3081 if (seqc_consistent(fd_seqc(fdt, fd), seq)) 3082 return (error); 3083 fdt = atomic_load_ptr(&fdp->fd_files); 3084 continue; 3085 } 3086 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { 3087 fdt = atomic_load_ptr(&fdp->fd_files); 3088 continue; 3089 } 3090 /* 3091 * Use an acquire barrier to force re-reading of fdt so it is 3092 * refreshed for verification. 3093 */ 3094 atomic_thread_fence_acq(); 3095 fdt = fdp->fd_files; 3096 if (seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)) 3097 break; 3098 fdrop(fp, td); 3099 } 3100 *fpp = fp; 3101 if (seqp != NULL) { 3102 *seqp = seq; 3103 } 3104 return (0); 3105 } 3106 #else 3107 static int 3108 fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, 3109 struct file **fpp, seqc_t *seqp __unused) 3110 { 3111 struct filedesc *fdp; 3112 const struct fdescenttbl *fdt; 3113 struct file *fp; 3114 3115 fdp = td->td_proc->p_fd; 3116 fdt = fdp->fd_files; 3117 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3118 return (EBADF); 3119 3120 for (;;) { 3121 fp = fdt->fdt_ofiles[fd].fde_file; 3122 if (__predict_false(fp == NULL)) 3123 return (EBADF); 3124 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { 3125 fdt = atomic_load_ptr(&fdp->fd_files); 3126 continue; 3127 } 3128 /* 3129 * Use an acquire barrier to force re-reading of fdt so it is 3130 * refreshed for verification. 3131 */ 3132 atomic_thread_fence_acq(); 3133 fdt = fdp->fd_files; 3134 if (__predict_true(fp == fdt->fdt_ofiles[fd].fde_file)) 3135 break; 3136 fdrop(fp, td); 3137 } 3138 *fpp = fp; 3139 return (0); 3140 } 3141 #endif 3142 3143 /* 3144 * See the comments in fget_unlocked_seq for an explanation of how this works. 
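 *
 * In outline, the fast path is (illustrative):
 *
 *	fp = fdt->fdt_ofiles[fd].fde_file;	speculative read
 *	check capability rights against needrightsp
 *	refcount_acquire_if_not_zero(&fp->f_count)
 *	re-check the slot (seqc on CAPABILITIES kernels, a pointer
 *	comparison otherwise); on mismatch fdrop() and fall back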
 *
 * This is a simplified variant which bails out to the aforementioned routine
 * if anything goes wrong. In practice this only happens when userspace is
 * racing with itself.
 */
int
fget_unlocked(struct thread *td, int fd, cap_rights_t *needrightsp,
    struct file **fpp)
{
	struct filedesc *fdp;
#ifdef CAPABILITIES
	const struct filedescent *fde;
#endif
	const struct fdescenttbl *fdt;
	struct file *fp;
#ifdef CAPABILITIES
	seqc_t seq;
	const cap_rights_t *haverights;
#endif

	fdp = td->td_proc->p_fd;
	fdt = fdp->fd_files;
	if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) {
		*fpp = NULL;
		return (EBADF);
	}
#ifdef CAPABILITIES
	seq = seqc_read_notmodify(fd_seqc(fdt, fd));
	fde = &fdt->fdt_ofiles[fd];
	haverights = cap_rights_fde_inline(fde);
	fp = fde->fde_file;
#else
	fp = fdt->fdt_ofiles[fd].fde_file;
#endif
	if (__predict_false(fp == NULL))
		goto out_fallback;
#ifdef CAPABILITIES
	if (__predict_false(cap_check_inline_transient(haverights, needrightsp)))
		goto out_fallback;
#endif
	if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count)))
		goto out_fallback;

	/*
	 * Use an acquire barrier to force re-reading of fdt so it is
	 * refreshed for verification.
	 */
	atomic_thread_fence_acq();
	fdt = fdp->fd_files;
#ifdef CAPABILITIES
	if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)))
#else
	if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
#endif
		goto out_fdrop;
	*fpp = fp;
	return (0);
out_fdrop:
	fdrop(fp, td);
out_fallback:
	*fpp = NULL;
	return (fget_unlocked_seq(td, fd, needrightsp, fpp, NULL));
}

/*
 * Translate fd -> file when the caller guarantees the file descriptor table
 * can't be changed by others.
 *
 * Note this does not mean the file object itself is only visible to the
 * caller, merely that it won't disappear and therefore does not need to
 * be referenced.
 *
 * Must be paired with fput_only_user.
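 *
 * Sketch of the intended pairing (illustrative only):
 *
 *	fdp = td->td_proc->p_fd;
 *	error = fget_only_user(fdp, fd, &cap_read_rights, &fp);
 *	if (error == 0) {
 *		... use fp ...
 *		fput_only_user(fdp, fp);
 *	}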
3217 */ 3218 #ifdef CAPABILITIES 3219 int 3220 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, 3221 struct file **fpp) 3222 { 3223 const struct filedescent *fde; 3224 const struct fdescenttbl *fdt; 3225 const cap_rights_t *haverights; 3226 struct file *fp; 3227 int error; 3228 3229 MPASS(FILEDESC_IS_ONLY_USER(fdp)); 3230 3231 *fpp = NULL; 3232 if (__predict_false(fd >= fdp->fd_nfiles)) 3233 return (EBADF); 3234 3235 fdt = fdp->fd_files; 3236 fde = &fdt->fdt_ofiles[fd]; 3237 fp = fde->fde_file; 3238 if (__predict_false(fp == NULL)) 3239 return (EBADF); 3240 MPASS(refcount_load(&fp->f_count) > 0); 3241 haverights = cap_rights_fde_inline(fde); 3242 error = cap_check_inline(haverights, needrightsp); 3243 if (__predict_false(error != 0)) 3244 return (error); 3245 *fpp = fp; 3246 return (0); 3247 } 3248 #else 3249 int 3250 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, 3251 struct file **fpp) 3252 { 3253 struct file *fp; 3254 3255 MPASS(FILEDESC_IS_ONLY_USER(fdp)); 3256 3257 *fpp = NULL; 3258 if (__predict_false(fd >= fdp->fd_nfiles)) 3259 return (EBADF); 3260 3261 fp = fdp->fd_ofiles[fd].fde_file; 3262 if (__predict_false(fp == NULL)) 3263 return (EBADF); 3264 3265 MPASS(refcount_load(&fp->f_count) > 0); 3266 *fpp = fp; 3267 return (0); 3268 } 3269 #endif 3270 3271 /* 3272 * Extract the file pointer associated with the specified descriptor for the 3273 * current user process. 3274 * 3275 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is 3276 * returned. 3277 * 3278 * File's rights will be checked against the capability rights mask. 3279 * 3280 * If an error occurred the non-zero error is returned and *fpp is set to 3281 * NULL. Otherwise *fpp is held and set and zero is returned. Caller is 3282 * responsible for fdrop(). 3283 */ 3284 static __inline int 3285 _fget(struct thread *td, int fd, struct file **fpp, int flags, 3286 cap_rights_t *needrightsp) 3287 { 3288 struct file *fp; 3289 int error; 3290 3291 *fpp = NULL; 3292 error = fget_unlocked(td, fd, needrightsp, &fp); 3293 if (__predict_false(error != 0)) 3294 return (error); 3295 if (__predict_false(fp->f_ops == &badfileops)) { 3296 fdrop(fp, td); 3297 return (EBADF); 3298 } 3299 3300 /* 3301 * FREAD and FWRITE failure return EBADF as per POSIX. 
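 * For example, read(2) on a descriptor opened O_WRONLY fails with EBADF
 * rather than EINVAL.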
3302 */ 3303 error = 0; 3304 switch (flags) { 3305 case FREAD: 3306 case FWRITE: 3307 if ((fp->f_flag & flags) == 0) 3308 error = EBADF; 3309 break; 3310 case FEXEC: 3311 if (fp->f_ops != &path_fileops && 3312 ((fp->f_flag & (FREAD | FEXEC)) == 0 || 3313 (fp->f_flag & FWRITE) != 0)) 3314 error = EBADF; 3315 break; 3316 case 0: 3317 break; 3318 default: 3319 KASSERT(0, ("wrong flags")); 3320 } 3321 3322 if (error != 0) { 3323 fdrop(fp, td); 3324 return (error); 3325 } 3326 3327 *fpp = fp; 3328 return (0); 3329 } 3330 3331 int 3332 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 3333 { 3334 3335 return (_fget(td, fd, fpp, 0, rightsp)); 3336 } 3337 3338 int 3339 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, 3340 struct file **fpp) 3341 { 3342 int error; 3343 #ifndef CAPABILITIES 3344 error = _fget(td, fd, fpp, 0, rightsp); 3345 if (maxprotp != NULL) 3346 *maxprotp = VM_PROT_ALL; 3347 return (error); 3348 #else 3349 cap_rights_t fdrights; 3350 struct filedesc *fdp; 3351 struct file *fp; 3352 seqc_t seq; 3353 3354 *fpp = NULL; 3355 fdp = td->td_proc->p_fd; 3356 MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); 3357 for (;;) { 3358 error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); 3359 if (__predict_false(error != 0)) 3360 return (error); 3361 if (__predict_false(fp->f_ops == &badfileops)) { 3362 fdrop(fp, td); 3363 return (EBADF); 3364 } 3365 if (maxprotp != NULL) 3366 fdrights = *cap_rights(fdp, fd); 3367 if (!fd_modified(fdp, fd, seq)) 3368 break; 3369 fdrop(fp, td); 3370 } 3371 3372 /* 3373 * If requested, convert capability rights to access flags. 3374 */ 3375 if (maxprotp != NULL) 3376 *maxprotp = cap_rights_to_vmprot(&fdrights); 3377 *fpp = fp; 3378 return (0); 3379 #endif 3380 } 3381 3382 int 3383 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 3384 { 3385 3386 return (_fget(td, fd, fpp, FREAD, rightsp)); 3387 } 3388 3389 int 3390 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 3391 { 3392 3393 return (_fget(td, fd, fpp, FWRITE, rightsp)); 3394 } 3395 3396 int 3397 fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, 3398 struct file **fpp) 3399 { 3400 #ifndef CAPABILITIES 3401 return (fget_unlocked(td, fd, rightsp, fpp)); 3402 #else 3403 struct filedesc *fdp = td->td_proc->p_fd; 3404 struct file *fp; 3405 int error; 3406 seqc_t seq; 3407 3408 *fpp = NULL; 3409 MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); 3410 for (;;) { 3411 error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); 3412 if (error != 0) 3413 return (error); 3414 error = cap_fcntl_check(fdp, fd, needfcntl); 3415 if (!fd_modified(fdp, fd, seq)) 3416 break; 3417 fdrop(fp, td); 3418 } 3419 if (error != 0) { 3420 fdrop(fp, td); 3421 return (error); 3422 } 3423 *fpp = fp; 3424 return (0); 3425 #endif 3426 } 3427 3428 /* 3429 * Like fget() but loads the underlying vnode, or returns an error if the 3430 * descriptor does not represent a vnode. Note that pipes use vnodes but 3431 * never have VM objects. The returned vnode will be vref()'d. 3432 * 3433 * XXX: what about the unused flags ? 
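 *
 * Typical use (illustrative):
 *
 *	error = fgetvp(td, fd, &cap_fstat_rights, &vp);
 *	if (error == 0) {
 *		... operate on vp ...
 *		vrele(vp);
 *	}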
 */
static __inline int
_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
    struct vnode **vpp)
{
	struct file *fp;
	int error;

	*vpp = NULL;
	error = _fget(td, fd, &fp, flags, needrightsp);
	if (error != 0)
		return (error);
	if (fp->f_vnode == NULL) {
		error = EINVAL;
	} else {
		*vpp = fp->f_vnode;
		vref(*vpp);
	}
	fdrop(fp, td);

	return (error);
}

int
fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
{

	return (_fgetvp(td, fd, 0, rightsp, vpp));
}

int
fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
    struct filecaps *havecaps, struct vnode **vpp)
{
	struct filecaps caps;
	struct file *fp;
	int error;

	error = fget_cap(td, fd, needrightsp, &fp, &caps);
	if (error != 0)
		return (error);
	if (fp->f_ops == &badfileops) {
		error = EBADF;
		goto out;
	}
	if (fp->f_vnode == NULL) {
		error = EINVAL;
		goto out;
	}

	*havecaps = caps;
	*vpp = fp->f_vnode;
	vref(*vpp);
	fdrop(fp, td);

	return (0);
out:
	filecaps_free(&caps);
	fdrop(fp, td);
	return (error);
}

int
fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
{

	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
}

int
fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
{

	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
}

#ifdef notyet
int
fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
    struct vnode **vpp)
{

	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
}
#endif

/*
 * Handle the last reference to a file being closed.
 *
 * Without the noinline attribute clang keeps inlining the function
 * throughout this file when fdrop is used.
 */
int __noinline
_fdrop(struct file *fp, struct thread *td)
{
	int error;
#ifdef INVARIANTS
	int count;

	count = refcount_load(&fp->f_count);
	if (count != 0)
		panic("fdrop: fp %p count %d", fp, count);
#endif
	error = fo_close(fp, td);
	atomic_subtract_int(&openfiles, 1);
	crfree(fp->f_cred);
	free(fp->f_advice, M_FADVISE);
	uma_zfree(file_zone, fp);

	return (error);
}

/*
 * Apply an advisory lock on a file descriptor.
 *
 * Just attempt to get a record lock of the requested type on the entire file
 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
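 *
 * From userspace this corresponds to flock(2) (illustrative):
 *
 *	if (flock(fd, LOCK_EX | LOCK_NB) == -1 && errno == EWOULDBLOCK)
 *		the lock is held by someone else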
3551 */ 3552 #ifndef _SYS_SYSPROTO_H_ 3553 struct flock_args { 3554 int fd; 3555 int how; 3556 }; 3557 #endif 3558 /* ARGSUSED */ 3559 int 3560 sys_flock(struct thread *td, struct flock_args *uap) 3561 { 3562 struct file *fp; 3563 struct vnode *vp; 3564 struct flock lf; 3565 int error; 3566 3567 error = fget(td, uap->fd, &cap_flock_rights, &fp); 3568 if (error != 0) 3569 return (error); 3570 error = EOPNOTSUPP; 3571 if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { 3572 goto done; 3573 } 3574 if (fp->f_ops == &path_fileops) { 3575 goto done; 3576 } 3577 3578 error = 0; 3579 vp = fp->f_vnode; 3580 lf.l_whence = SEEK_SET; 3581 lf.l_start = 0; 3582 lf.l_len = 0; 3583 if (uap->how & LOCK_UN) { 3584 lf.l_type = F_UNLCK; 3585 atomic_clear_int(&fp->f_flag, FHASLOCK); 3586 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); 3587 goto done; 3588 } 3589 if (uap->how & LOCK_EX) 3590 lf.l_type = F_WRLCK; 3591 else if (uap->how & LOCK_SH) 3592 lf.l_type = F_RDLCK; 3593 else { 3594 error = EBADF; 3595 goto done; 3596 } 3597 atomic_set_int(&fp->f_flag, FHASLOCK); 3598 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 3599 (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); 3600 done: 3601 fdrop(fp, td); 3602 return (error); 3603 } 3604 /* 3605 * Duplicate the specified descriptor to a free descriptor. 3606 */ 3607 int 3608 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, 3609 int openerror, int *indxp) 3610 { 3611 struct filedescent *newfde, *oldfde; 3612 struct file *fp; 3613 u_long *ioctls; 3614 int error, indx; 3615 3616 KASSERT(openerror == ENODEV || openerror == ENXIO, 3617 ("unexpected error %d in %s", openerror, __func__)); 3618 3619 /* 3620 * If the to-be-dup'd fd number is greater than the allowed number 3621 * of file descriptors, or the fd to be dup'd has already been 3622 * closed, then reject. 3623 */ 3624 FILEDESC_XLOCK(fdp); 3625 if ((fp = fget_noref(fdp, dfd)) == NULL) { 3626 FILEDESC_XUNLOCK(fdp); 3627 return (EBADF); 3628 } 3629 3630 error = fdalloc(td, 0, &indx); 3631 if (error != 0) { 3632 FILEDESC_XUNLOCK(fdp); 3633 return (error); 3634 } 3635 3636 /* 3637 * There are two cases of interest here. 3638 * 3639 * For ENODEV simply dup (dfd) to file descriptor (indx) and return. 3640 * 3641 * For ENXIO steal away the file structure from (dfd) and store it in 3642 * (indx). (dfd) is effectively closed by this operation. 3643 */ 3644 switch (openerror) { 3645 case ENODEV: 3646 /* 3647 * Check that the mode the file is being opened for is a 3648 * subset of the mode of the existing descriptor. 3649 */ 3650 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { 3651 fdunused(fdp, indx); 3652 FILEDESC_XUNLOCK(fdp); 3653 return (EACCES); 3654 } 3655 if (!fhold(fp)) { 3656 fdunused(fdp, indx); 3657 FILEDESC_XUNLOCK(fdp); 3658 return (EBADF); 3659 } 3660 newfde = &fdp->fd_ofiles[indx]; 3661 oldfde = &fdp->fd_ofiles[dfd]; 3662 ioctls = filecaps_copy_prep(&oldfde->fde_caps); 3663 #ifdef CAPABILITIES 3664 seqc_write_begin(&newfde->fde_seqc); 3665 #endif 3666 fde_copy(oldfde, newfde); 3667 filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, 3668 ioctls); 3669 #ifdef CAPABILITIES 3670 seqc_write_end(&newfde->fde_seqc); 3671 #endif 3672 break; 3673 case ENXIO: 3674 /* 3675 * Steal away the file pointer from dfd and stuff it into indx. 
3676 */ 3677 newfde = &fdp->fd_ofiles[indx]; 3678 oldfde = &fdp->fd_ofiles[dfd]; 3679 #ifdef CAPABILITIES 3680 seqc_write_begin(&oldfde->fde_seqc); 3681 seqc_write_begin(&newfde->fde_seqc); 3682 #endif 3683 fde_copy(oldfde, newfde); 3684 oldfde->fde_file = NULL; 3685 fdunused(fdp, dfd); 3686 #ifdef CAPABILITIES 3687 seqc_write_end(&newfde->fde_seqc); 3688 seqc_write_end(&oldfde->fde_seqc); 3689 #endif 3690 break; 3691 } 3692 FILEDESC_XUNLOCK(fdp); 3693 *indxp = indx; 3694 return (0); 3695 } 3696 3697 /* 3698 * This sysctl determines if we will allow a process to chroot(2) if it 3699 * has a directory open: 3700 * 0: disallowed for all processes. 3701 * 1: allowed for processes that were not already chroot(2)'ed. 3702 * 2: allowed for all processes. 3703 */ 3704 3705 static int chroot_allow_open_directories = 1; 3706 3707 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, 3708 &chroot_allow_open_directories, 0, 3709 "Allow a process to chroot(2) if it has a directory open"); 3710 3711 /* 3712 * Helper function for raised chroot(2) security function: Refuse if 3713 * any filedescriptors are open directories. 3714 */ 3715 static int 3716 chroot_refuse_vdir_fds(struct filedesc *fdp) 3717 { 3718 struct vnode *vp; 3719 struct file *fp; 3720 int i; 3721 3722 FILEDESC_LOCK_ASSERT(fdp); 3723 3724 FILEDESC_FOREACH_FP(fdp, i, fp) { 3725 if (fp->f_type == DTYPE_VNODE) { 3726 vp = fp->f_vnode; 3727 if (vp->v_type == VDIR) 3728 return (EPERM); 3729 } 3730 } 3731 return (0); 3732 } 3733 3734 static void 3735 pwd_fill(struct pwd *oldpwd, struct pwd *newpwd) 3736 { 3737 3738 if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) { 3739 vrefact(oldpwd->pwd_cdir); 3740 newpwd->pwd_cdir = oldpwd->pwd_cdir; 3741 } 3742 3743 if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) { 3744 vrefact(oldpwd->pwd_rdir); 3745 newpwd->pwd_rdir = oldpwd->pwd_rdir; 3746 } 3747 3748 if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) { 3749 vrefact(oldpwd->pwd_jdir); 3750 newpwd->pwd_jdir = oldpwd->pwd_jdir; 3751 } 3752 } 3753 3754 struct pwd * 3755 pwd_hold_pwddesc(struct pwddesc *pdp) 3756 { 3757 struct pwd *pwd; 3758 3759 PWDDESC_ASSERT_XLOCKED(pdp); 3760 pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 3761 if (pwd != NULL) 3762 refcount_acquire(&pwd->pwd_refcount); 3763 return (pwd); 3764 } 3765 3766 bool 3767 pwd_hold_smr(struct pwd *pwd) 3768 { 3769 3770 MPASS(pwd != NULL); 3771 if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) { 3772 return (true); 3773 } 3774 return (false); 3775 } 3776 3777 struct pwd * 3778 pwd_hold(struct thread *td) 3779 { 3780 struct pwddesc *pdp; 3781 struct pwd *pwd; 3782 3783 pdp = td->td_proc->p_pd; 3784 3785 vfs_smr_enter(); 3786 pwd = vfs_smr_entered_load(&pdp->pd_pwd); 3787 if (pwd_hold_smr(pwd)) { 3788 vfs_smr_exit(); 3789 return (pwd); 3790 } 3791 vfs_smr_exit(); 3792 PWDDESC_XLOCK(pdp); 3793 pwd = pwd_hold_pwddesc(pdp); 3794 MPASS(pwd != NULL); 3795 PWDDESC_XUNLOCK(pdp); 3796 return (pwd); 3797 } 3798 3799 struct pwd * 3800 pwd_hold_proc(struct proc *p) 3801 { 3802 struct pwddesc *pdp; 3803 struct pwd *pwd; 3804 3805 PROC_ASSERT_HELD(p); 3806 PROC_LOCK(p); 3807 pdp = pdhold(p); 3808 MPASS(pdp != NULL); 3809 PROC_UNLOCK(p); 3810 3811 PWDDESC_XLOCK(pdp); 3812 pwd = pwd_hold_pwddesc(pdp); 3813 MPASS(pwd != NULL); 3814 PWDDESC_XUNLOCK(pdp); 3815 pddrop(pdp); 3816 return (pwd); 3817 } 3818 3819 static struct pwd * 3820 pwd_alloc(void) 3821 { 3822 struct pwd *pwd; 3823 3824 pwd = uma_zalloc_smr(pwd_zone, M_WAITOK); 3825 bzero(pwd, sizeof(*pwd)); 3826 
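	/*
	 * Directory vnodes start out NULL; callers set the ones they
	 * care about and typically inherit the rest via pwd_fill().
	 */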
refcount_init(&pwd->pwd_refcount, 1); 3827 return (pwd); 3828 } 3829 3830 void 3831 pwd_drop(struct pwd *pwd) 3832 { 3833 3834 if (!refcount_release(&pwd->pwd_refcount)) 3835 return; 3836 3837 if (pwd->pwd_cdir != NULL) 3838 vrele(pwd->pwd_cdir); 3839 if (pwd->pwd_rdir != NULL) 3840 vrele(pwd->pwd_rdir); 3841 if (pwd->pwd_jdir != NULL) 3842 vrele(pwd->pwd_jdir); 3843 uma_zfree_smr(pwd_zone, pwd); 3844 } 3845 3846 /* 3847 * The caller is responsible for invoking priv_check() and 3848 * mac_vnode_check_chroot() to authorize this operation. 3849 */ 3850 int 3851 pwd_chroot(struct thread *td, struct vnode *vp) 3852 { 3853 struct pwddesc *pdp; 3854 struct filedesc *fdp; 3855 struct pwd *newpwd, *oldpwd; 3856 int error; 3857 3858 fdp = td->td_proc->p_fd; 3859 pdp = td->td_proc->p_pd; 3860 newpwd = pwd_alloc(); 3861 FILEDESC_SLOCK(fdp); 3862 PWDDESC_XLOCK(pdp); 3863 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 3864 if (chroot_allow_open_directories == 0 || 3865 (chroot_allow_open_directories == 1 && 3866 oldpwd->pwd_rdir != rootvnode)) { 3867 error = chroot_refuse_vdir_fds(fdp); 3868 FILEDESC_SUNLOCK(fdp); 3869 if (error != 0) { 3870 PWDDESC_XUNLOCK(pdp); 3871 pwd_drop(newpwd); 3872 return (error); 3873 } 3874 } else { 3875 FILEDESC_SUNLOCK(fdp); 3876 } 3877 3878 vrefact(vp); 3879 newpwd->pwd_rdir = vp; 3880 if (oldpwd->pwd_jdir == NULL) { 3881 vrefact(vp); 3882 newpwd->pwd_jdir = vp; 3883 } 3884 pwd_fill(oldpwd, newpwd); 3885 pwd_set(pdp, newpwd); 3886 PWDDESC_XUNLOCK(pdp); 3887 pwd_drop(oldpwd); 3888 return (0); 3889 } 3890 3891 void 3892 pwd_chdir(struct thread *td, struct vnode *vp) 3893 { 3894 struct pwddesc *pdp; 3895 struct pwd *newpwd, *oldpwd; 3896 3897 VNPASS(vp->v_usecount > 0, vp); 3898 3899 newpwd = pwd_alloc(); 3900 pdp = td->td_proc->p_pd; 3901 PWDDESC_XLOCK(pdp); 3902 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 3903 newpwd->pwd_cdir = vp; 3904 pwd_fill(oldpwd, newpwd); 3905 pwd_set(pdp, newpwd); 3906 PWDDESC_XUNLOCK(pdp); 3907 pwd_drop(oldpwd); 3908 } 3909 3910 /* 3911 * jail_attach(2) changes both root and working directories. 
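 *
 * This is pwd_chroot() and pwd_chdir() rolled into one, performed under a
 * single pwddesc lock so no thread observes the intermediate state.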
3912 */ 3913 int 3914 pwd_chroot_chdir(struct thread *td, struct vnode *vp) 3915 { 3916 struct pwddesc *pdp; 3917 struct filedesc *fdp; 3918 struct pwd *newpwd, *oldpwd; 3919 int error; 3920 3921 fdp = td->td_proc->p_fd; 3922 pdp = td->td_proc->p_pd; 3923 newpwd = pwd_alloc(); 3924 FILEDESC_SLOCK(fdp); 3925 PWDDESC_XLOCK(pdp); 3926 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 3927 error = chroot_refuse_vdir_fds(fdp); 3928 FILEDESC_SUNLOCK(fdp); 3929 if (error != 0) { 3930 PWDDESC_XUNLOCK(pdp); 3931 pwd_drop(newpwd); 3932 return (error); 3933 } 3934 3935 vrefact(vp); 3936 newpwd->pwd_rdir = vp; 3937 vrefact(vp); 3938 newpwd->pwd_cdir = vp; 3939 if (oldpwd->pwd_jdir == NULL) { 3940 vrefact(vp); 3941 newpwd->pwd_jdir = vp; 3942 } 3943 pwd_fill(oldpwd, newpwd); 3944 pwd_set(pdp, newpwd); 3945 PWDDESC_XUNLOCK(pdp); 3946 pwd_drop(oldpwd); 3947 return (0); 3948 } 3949 3950 void 3951 pwd_ensure_dirs(void) 3952 { 3953 struct pwddesc *pdp; 3954 struct pwd *oldpwd, *newpwd; 3955 3956 pdp = curproc->p_pd; 3957 PWDDESC_XLOCK(pdp); 3958 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 3959 if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) { 3960 PWDDESC_XUNLOCK(pdp); 3961 return; 3962 } 3963 PWDDESC_XUNLOCK(pdp); 3964 3965 newpwd = pwd_alloc(); 3966 PWDDESC_XLOCK(pdp); 3967 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 3968 pwd_fill(oldpwd, newpwd); 3969 if (newpwd->pwd_cdir == NULL) { 3970 vrefact(rootvnode); 3971 newpwd->pwd_cdir = rootvnode; 3972 } 3973 if (newpwd->pwd_rdir == NULL) { 3974 vrefact(rootvnode); 3975 newpwd->pwd_rdir = rootvnode; 3976 } 3977 pwd_set(pdp, newpwd); 3978 PWDDESC_XUNLOCK(pdp); 3979 pwd_drop(oldpwd); 3980 } 3981 3982 void 3983 pwd_set_rootvnode(void) 3984 { 3985 struct pwddesc *pdp; 3986 struct pwd *oldpwd, *newpwd; 3987 3988 pdp = curproc->p_pd; 3989 3990 newpwd = pwd_alloc(); 3991 PWDDESC_XLOCK(pdp); 3992 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 3993 vrefact(rootvnode); 3994 newpwd->pwd_cdir = rootvnode; 3995 vrefact(rootvnode); 3996 newpwd->pwd_rdir = rootvnode; 3997 pwd_fill(oldpwd, newpwd); 3998 pwd_set(pdp, newpwd); 3999 PWDDESC_XUNLOCK(pdp); 4000 pwd_drop(oldpwd); 4001 } 4002 4003 /* 4004 * Scan all active processes and prisons to see if any of them have a current 4005 * or root directory of `olddp'. If so, replace them with the new mount point. 
4006 */ 4007 void 4008 mountcheckdirs(struct vnode *olddp, struct vnode *newdp) 4009 { 4010 struct pwddesc *pdp; 4011 struct pwd *newpwd, *oldpwd; 4012 struct prison *pr; 4013 struct proc *p; 4014 int nrele; 4015 4016 if (vrefcnt(olddp) == 1) 4017 return; 4018 nrele = 0; 4019 newpwd = pwd_alloc(); 4020 sx_slock(&allproc_lock); 4021 FOREACH_PROC_IN_SYSTEM(p) { 4022 PROC_LOCK(p); 4023 pdp = pdhold(p); 4024 PROC_UNLOCK(p); 4025 if (pdp == NULL) 4026 continue; 4027 PWDDESC_XLOCK(pdp); 4028 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4029 if (oldpwd == NULL || 4030 (oldpwd->pwd_cdir != olddp && 4031 oldpwd->pwd_rdir != olddp && 4032 oldpwd->pwd_jdir != olddp)) { 4033 PWDDESC_XUNLOCK(pdp); 4034 pddrop(pdp); 4035 continue; 4036 } 4037 if (oldpwd->pwd_cdir == olddp) { 4038 vrefact(newdp); 4039 newpwd->pwd_cdir = newdp; 4040 } 4041 if (oldpwd->pwd_rdir == olddp) { 4042 vrefact(newdp); 4043 newpwd->pwd_rdir = newdp; 4044 } 4045 if (oldpwd->pwd_jdir == olddp) { 4046 vrefact(newdp); 4047 newpwd->pwd_jdir = newdp; 4048 } 4049 pwd_fill(oldpwd, newpwd); 4050 pwd_set(pdp, newpwd); 4051 PWDDESC_XUNLOCK(pdp); 4052 pwd_drop(oldpwd); 4053 pddrop(pdp); 4054 newpwd = pwd_alloc(); 4055 } 4056 sx_sunlock(&allproc_lock); 4057 pwd_drop(newpwd); 4058 if (rootvnode == olddp) { 4059 vrefact(newdp); 4060 rootvnode = newdp; 4061 nrele++; 4062 } 4063 mtx_lock(&prison0.pr_mtx); 4064 if (prison0.pr_root == olddp) { 4065 vrefact(newdp); 4066 prison0.pr_root = newdp; 4067 nrele++; 4068 } 4069 mtx_unlock(&prison0.pr_mtx); 4070 sx_slock(&allprison_lock); 4071 TAILQ_FOREACH(pr, &allprison, pr_list) { 4072 mtx_lock(&pr->pr_mtx); 4073 if (pr->pr_root == olddp) { 4074 vrefact(newdp); 4075 pr->pr_root = newdp; 4076 nrele++; 4077 } 4078 mtx_unlock(&pr->pr_mtx); 4079 } 4080 sx_sunlock(&allprison_lock); 4081 while (nrele--) 4082 vrele(olddp); 4083 } 4084 4085 struct filedesc_to_leader * 4086 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) 4087 { 4088 struct filedesc_to_leader *fdtol; 4089 4090 fdtol = malloc(sizeof(struct filedesc_to_leader), 4091 M_FILEDESC_TO_LEADER, M_WAITOK); 4092 fdtol->fdl_refcount = 1; 4093 fdtol->fdl_holdcount = 0; 4094 fdtol->fdl_wakeup = 0; 4095 fdtol->fdl_leader = leader; 4096 if (old != NULL) { 4097 FILEDESC_XLOCK(fdp); 4098 fdtol->fdl_next = old->fdl_next; 4099 fdtol->fdl_prev = old; 4100 old->fdl_next = fdtol; 4101 fdtol->fdl_next->fdl_prev = fdtol; 4102 FILEDESC_XUNLOCK(fdp); 4103 } else { 4104 fdtol->fdl_next = fdtol; 4105 fdtol->fdl_prev = fdtol; 4106 } 4107 return (fdtol); 4108 } 4109 4110 static int 4111 sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) 4112 { 4113 NDSLOTTYPE *map; 4114 struct filedesc *fdp; 4115 u_int namelen; 4116 int count, off, minoff; 4117 4118 namelen = arg2; 4119 if (namelen != 1) 4120 return (EINVAL); 4121 4122 if (*(int *)arg1 != 0) 4123 return (EINVAL); 4124 4125 fdp = curproc->p_fd; 4126 count = 0; 4127 FILEDESC_SLOCK(fdp); 4128 map = fdp->fd_map; 4129 off = NDSLOT(fdp->fd_nfiles - 1); 4130 for (minoff = NDSLOT(0); off >= minoff; --off) 4131 count += bitcountl(map[off]); 4132 FILEDESC_SUNLOCK(fdp); 4133 4134 return (SYSCTL_OUT(req, &count, sizeof(count))); 4135 } 4136 4137 static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, 4138 CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, 4139 "Number of open file descriptors"); 4140 4141 /* 4142 * Get file structures globally. 
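 *
 * Userspace reads this via the kern.file sysctl (illustrative):
 *
 *	size_t len;
 *	if (sysctlbyname("kern.file", NULL, &len, NULL, 0) == 0) {
 *		struct xfile *buf = malloc(len);
 *		sysctlbyname("kern.file", buf, &len, NULL, 0);
 *	}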
4143 */ 4144 static int 4145 sysctl_kern_file(SYSCTL_HANDLER_ARGS) 4146 { 4147 struct xfile xf; 4148 struct filedesc *fdp; 4149 struct file *fp; 4150 struct proc *p; 4151 int error, n; 4152 4153 error = sysctl_wire_old_buffer(req, 0); 4154 if (error != 0) 4155 return (error); 4156 if (req->oldptr == NULL) { 4157 n = 0; 4158 sx_slock(&allproc_lock); 4159 FOREACH_PROC_IN_SYSTEM(p) { 4160 PROC_LOCK(p); 4161 if (p->p_state == PRS_NEW) { 4162 PROC_UNLOCK(p); 4163 continue; 4164 } 4165 fdp = fdhold(p); 4166 PROC_UNLOCK(p); 4167 if (fdp == NULL) 4168 continue; 4169 /* overestimates sparse tables. */ 4170 n += fdp->fd_nfiles; 4171 fddrop(fdp); 4172 } 4173 sx_sunlock(&allproc_lock); 4174 return (SYSCTL_OUT(req, 0, n * sizeof(xf))); 4175 } 4176 error = 0; 4177 bzero(&xf, sizeof(xf)); 4178 xf.xf_size = sizeof(xf); 4179 sx_slock(&allproc_lock); 4180 FOREACH_PROC_IN_SYSTEM(p) { 4181 PROC_LOCK(p); 4182 if (p->p_state == PRS_NEW) { 4183 PROC_UNLOCK(p); 4184 continue; 4185 } 4186 if (p_cansee(req->td, p) != 0) { 4187 PROC_UNLOCK(p); 4188 continue; 4189 } 4190 xf.xf_pid = p->p_pid; 4191 xf.xf_uid = p->p_ucred->cr_uid; 4192 fdp = fdhold(p); 4193 PROC_UNLOCK(p); 4194 if (fdp == NULL) 4195 continue; 4196 FILEDESC_SLOCK(fdp); 4197 FILEDESC_FOREACH_FP(fdp, n, fp) { 4198 if (refcount_load(&fdp->fd_refcnt) == 0) 4199 break; 4200 xf.xf_fd = n; 4201 xf.xf_file = (uintptr_t)fp; 4202 xf.xf_data = (uintptr_t)fp->f_data; 4203 xf.xf_vnode = (uintptr_t)fp->f_vnode; 4204 xf.xf_type = (uintptr_t)fp->f_type; 4205 xf.xf_count = refcount_load(&fp->f_count); 4206 xf.xf_msgcount = 0; 4207 xf.xf_offset = foffset_get(fp); 4208 xf.xf_flag = fp->f_flag; 4209 error = SYSCTL_OUT(req, &xf, sizeof(xf)); 4210 if (error) 4211 break; 4212 } 4213 FILEDESC_SUNLOCK(fdp); 4214 fddrop(fdp); 4215 if (error) 4216 break; 4217 } 4218 sx_sunlock(&allproc_lock); 4219 return (error); 4220 } 4221 4222 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 4223 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); 4224 4225 #ifdef KINFO_FILE_SIZE 4226 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); 4227 #endif 4228 4229 static int 4230 xlate_fflags(int fflags) 4231 { 4232 static const struct { 4233 int fflag; 4234 int kf_fflag; 4235 } fflags_table[] = { 4236 { FAPPEND, KF_FLAG_APPEND }, 4237 { FASYNC, KF_FLAG_ASYNC }, 4238 { FFSYNC, KF_FLAG_FSYNC }, 4239 { FHASLOCK, KF_FLAG_HASLOCK }, 4240 { FNONBLOCK, KF_FLAG_NONBLOCK }, 4241 { FREAD, KF_FLAG_READ }, 4242 { FWRITE, KF_FLAG_WRITE }, 4243 { O_CREAT, KF_FLAG_CREAT }, 4244 { O_DIRECT, KF_FLAG_DIRECT }, 4245 { O_EXCL, KF_FLAG_EXCL }, 4246 { O_EXEC, KF_FLAG_EXEC }, 4247 { O_EXLOCK, KF_FLAG_EXLOCK }, 4248 { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, 4249 { O_SHLOCK, KF_FLAG_SHLOCK }, 4250 { O_TRUNC, KF_FLAG_TRUNC } 4251 }; 4252 unsigned int i; 4253 int kflags; 4254 4255 kflags = 0; 4256 for (i = 0; i < nitems(fflags_table); i++) 4257 if (fflags & fflags_table[i].fflag) 4258 kflags |= fflags_table[i].kf_fflag; 4259 return (kflags); 4260 } 4261 4262 /* Trim unused data from kf_path by truncating the structure size. 
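 * E.g. a 6-character kf_path costs offsetof(struct kinfo_file, kf_path)
 * plus 7 bytes, rounded up to a multiple of 8, instead of the full
 * sizeof(struct kinfo_file).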
*/ 4263 void 4264 pack_kinfo(struct kinfo_file *kif) 4265 { 4266 4267 kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + 4268 strlen(kif->kf_path) + 1; 4269 kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); 4270 } 4271 4272 static void 4273 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, 4274 struct kinfo_file *kif, struct filedesc *fdp, int flags) 4275 { 4276 int error; 4277 4278 bzero(kif, sizeof(*kif)); 4279 4280 /* Set a default type to allow for empty fill_kinfo() methods. */ 4281 kif->kf_type = KF_TYPE_UNKNOWN; 4282 kif->kf_flags = xlate_fflags(fp->f_flag); 4283 if (rightsp != NULL) 4284 kif->kf_cap_rights = *rightsp; 4285 else 4286 cap_rights_init_zero(&kif->kf_cap_rights); 4287 kif->kf_fd = fd; 4288 kif->kf_ref_count = refcount_load(&fp->f_count); 4289 kif->kf_offset = foffset_get(fp); 4290 4291 /* 4292 * This may drop the filedesc lock, so the 'fp' cannot be 4293 * accessed after this call. 4294 */ 4295 error = fo_fill_kinfo(fp, kif, fdp); 4296 if (error == 0) 4297 kif->kf_status |= KF_ATTR_VALID; 4298 if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) 4299 pack_kinfo(kif); 4300 else 4301 kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); 4302 } 4303 4304 static void 4305 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, 4306 struct kinfo_file *kif, int flags) 4307 { 4308 int error; 4309 4310 bzero(kif, sizeof(*kif)); 4311 4312 kif->kf_type = KF_TYPE_VNODE; 4313 error = vn_fill_kinfo_vnode(vp, kif); 4314 if (error == 0) 4315 kif->kf_status |= KF_ATTR_VALID; 4316 kif->kf_flags = xlate_fflags(fflags); 4317 cap_rights_init_zero(&kif->kf_cap_rights); 4318 kif->kf_fd = fd; 4319 kif->kf_ref_count = -1; 4320 kif->kf_offset = -1; 4321 if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) 4322 pack_kinfo(kif); 4323 else 4324 kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); 4325 vrele(vp); 4326 } 4327 4328 struct export_fd_buf { 4329 struct filedesc *fdp; 4330 struct pwddesc *pdp; 4331 struct sbuf *sb; 4332 ssize_t remainder; 4333 struct kinfo_file kif; 4334 int flags; 4335 }; 4336 4337 static int 4338 export_kinfo_to_sb(struct export_fd_buf *efbuf) 4339 { 4340 struct kinfo_file *kif; 4341 4342 kif = &efbuf->kif; 4343 if (efbuf->remainder != -1) { 4344 if (efbuf->remainder < kif->kf_structsize) 4345 return (ENOMEM); 4346 efbuf->remainder -= kif->kf_structsize; 4347 } 4348 if (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) != 0) 4349 return (sbuf_error(efbuf->sb)); 4350 return (0); 4351 } 4352 4353 static int 4354 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, 4355 struct export_fd_buf *efbuf) 4356 { 4357 int error; 4358 4359 if (efbuf->remainder == 0) 4360 return (ENOMEM); 4361 export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, 4362 efbuf->flags); 4363 FILEDESC_SUNLOCK(efbuf->fdp); 4364 error = export_kinfo_to_sb(efbuf); 4365 FILEDESC_SLOCK(efbuf->fdp); 4366 return (error); 4367 } 4368 4369 static int 4370 export_vnode_to_sb(struct vnode *vp, int fd, int fflags, 4371 struct export_fd_buf *efbuf) 4372 { 4373 int error; 4374 4375 if (efbuf->remainder == 0) 4376 return (ENOMEM); 4377 if (efbuf->pdp != NULL) 4378 PWDDESC_XUNLOCK(efbuf->pdp); 4379 export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); 4380 error = export_kinfo_to_sb(efbuf); 4381 if (efbuf->pdp != NULL) 4382 PWDDESC_XLOCK(efbuf->pdp); 4383 return (error); 4384 } 4385 4386 /* 4387 * Store a process file descriptor information to sbuf. 4388 * 4389 * Takes a locked proc as argument, and returns with the proc unlocked. 
4390 */ 4391 int 4392 kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, 4393 int flags) 4394 { 4395 struct file *fp; 4396 struct filedesc *fdp; 4397 struct pwddesc *pdp; 4398 struct export_fd_buf *efbuf; 4399 struct vnode *cttyvp, *textvp, *tracevp; 4400 struct pwd *pwd; 4401 int error, i; 4402 cap_rights_t rights; 4403 4404 PROC_LOCK_ASSERT(p, MA_OWNED); 4405 4406 /* ktrace vnode */ 4407 tracevp = ktr_get_tracevp(p, true); 4408 /* text vnode */ 4409 textvp = p->p_textvp; 4410 if (textvp != NULL) 4411 vrefact(textvp); 4412 /* Controlling tty. */ 4413 cttyvp = NULL; 4414 if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { 4415 cttyvp = p->p_pgrp->pg_session->s_ttyvp; 4416 if (cttyvp != NULL) 4417 vrefact(cttyvp); 4418 } 4419 fdp = fdhold(p); 4420 pdp = pdhold(p); 4421 PROC_UNLOCK(p); 4422 4423 efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); 4424 efbuf->fdp = NULL; 4425 efbuf->pdp = NULL; 4426 efbuf->sb = sb; 4427 efbuf->remainder = maxlen; 4428 efbuf->flags = flags; 4429 4430 error = 0; 4431 if (tracevp != NULL) 4432 error = export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, 4433 FREAD | FWRITE, efbuf); 4434 if (error == 0 && textvp != NULL) 4435 error = export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, 4436 efbuf); 4437 if (error == 0 && cttyvp != NULL) 4438 error = export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, 4439 FREAD | FWRITE, efbuf); 4440 if (error != 0 || pdp == NULL || fdp == NULL) 4441 goto fail; 4442 efbuf->fdp = fdp; 4443 efbuf->pdp = pdp; 4444 PWDDESC_XLOCK(pdp); 4445 pwd = pwd_hold_pwddesc(pdp); 4446 if (pwd != NULL) { 4447 /* working directory */ 4448 if (pwd->pwd_cdir != NULL) { 4449 vrefact(pwd->pwd_cdir); 4450 error = export_vnode_to_sb(pwd->pwd_cdir, 4451 KF_FD_TYPE_CWD, FREAD, efbuf); 4452 } 4453 /* root directory */ 4454 if (error == 0 && pwd->pwd_rdir != NULL) { 4455 vrefact(pwd->pwd_rdir); 4456 error = export_vnode_to_sb(pwd->pwd_rdir, 4457 KF_FD_TYPE_ROOT, FREAD, efbuf); 4458 } 4459 /* jail directory */ 4460 if (error == 0 && pwd->pwd_jdir != NULL) { 4461 vrefact(pwd->pwd_jdir); 4462 error = export_vnode_to_sb(pwd->pwd_jdir, 4463 KF_FD_TYPE_JAIL, FREAD, efbuf); 4464 } 4465 } 4466 PWDDESC_XUNLOCK(pdp); 4467 if (error != 0) 4468 goto fail; 4469 if (pwd != NULL) 4470 pwd_drop(pwd); 4471 FILEDESC_SLOCK(fdp); 4472 FILEDESC_FOREACH_FP(fdp, i, fp) { 4473 if (refcount_load(&fdp->fd_refcnt) == 0) 4474 break; 4475 #ifdef CAPABILITIES 4476 rights = *cap_rights(fdp, i); 4477 #else /* !CAPABILITIES */ 4478 rights = cap_no_rights; 4479 #endif 4480 /* 4481 * Create sysctl entry. It is OK to drop the filedesc 4482 * lock inside of export_file_to_sb() as we will 4483 * re-validate and re-evaluate its properties when the 4484 * loop continues. 4485 */ 4486 error = export_file_to_sb(fp, i, &rights, efbuf); 4487 if (error != 0) 4488 break; 4489 } 4490 FILEDESC_SUNLOCK(fdp); 4491 fail: 4492 if (fdp != NULL) 4493 fddrop(fdp); 4494 if (pdp != NULL) 4495 pddrop(pdp); 4496 free(efbuf, M_TEMP); 4497 return (error); 4498 } 4499 4500 #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) 4501 4502 /* 4503 * Get per-process file descriptors for use by procstat(1), et al. 
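 *
 * Userspace normally reaches this through kinfo_getfile(3)
 * (illustrative):
 *
 *	int cnt;
 *	struct kinfo_file *kf = kinfo_getfile(pid, &cnt);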
/*
 * Get per-process file descriptors for use by procstat(1), et al.
 */
static int
sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sb;
	struct proc *p;
	ssize_t maxlen;
	u_int namelen;
	int error, error2, *name;

	namelen = arg2;
	if (namelen != 1)
		return (EINVAL);

	name = (int *)arg1;

	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
	if (error != 0) {
		sbuf_delete(&sb);
		return (error);
	}
	maxlen = req->oldptr != NULL ? req->oldlen : -1;
	error = kern_proc_filedesc_out(p, &sb, maxlen,
	    KERN_FILEDESC_PACK_KINFO);
	error2 = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error != 0 ? error : error2);
}

#ifdef COMPAT_FREEBSD7
#ifdef KINFO_OFILE_SIZE
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
#endif

static void
kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
{

	okif->kf_structsize = sizeof(*okif);
	okif->kf_type = kif->kf_type;
	okif->kf_fd = kif->kf_fd;
	okif->kf_ref_count = kif->kf_ref_count;
	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
	okif->kf_offset = kif->kf_offset;
	if (kif->kf_type == KF_TYPE_VNODE)
		okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type;
	else
		okif->kf_vnode_type = KF_VTYPE_VNON;
	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
	if (kif->kf_type == KF_TYPE_SOCKET) {
		okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0;
		okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0;
		okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0;
		okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
		okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
	} else {
		okif->kf_sa_local.ss_family = AF_UNSPEC;
		okif->kf_sa_peer.ss_family = AF_UNSPEC;
	}
}

static int
export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
    struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req)
{
	int error;

	vrefact(vp);
	PWDDESC_XUNLOCK(pdp);
	export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
	kinfo_to_okinfo(kif, okif);
	error = SYSCTL_OUT(req, okif, sizeof(*okif));
	PWDDESC_XLOCK(pdp);
	return (error);
}
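/*
 * Unlike the packed kinfo_file records above, this compat path always
 * emits fixed-size struct kinfo_ofile records (kf_structsize is set to
 * sizeof(*okif)), so pre-FreeBSD 8 consumers can walk the buffer with a
 * constant stride, e.g. (illustrative sketch):
 *
 *	struct kinfo_ofile *okf = (struct kinfo_ofile *)buf;
 *	for (size_t i = 0; i < len / sizeof(*okf); i++)
 *		printf("fd %d\n", okf[i].kf_fd);
 */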
/*
 * Get per-process file descriptors for use by procstat(1), et al., in
 * the old (COMPAT_FREEBSD7) record format.
 */
static int
sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
{
	struct kinfo_ofile *okif;
	struct kinfo_file *kif;
	struct filedesc *fdp;
	struct pwddesc *pdp;
	struct pwd *pwd;
	u_int namelen;
	int error, i, *name;
	struct file *fp;
	struct proc *p;

	namelen = arg2;
	if (namelen != 1)
		return (EINVAL);

	name = (int *)arg1;
	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
	if (error != 0)
		return (error);
	fdp = fdhold(p);
	if (fdp != NULL)
		pdp = pdhold(p);
	PROC_UNLOCK(p);
	if (fdp == NULL || pdp == NULL) {
		if (fdp != NULL)
			fddrop(fdp);
		return (ENOENT);
	}
	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
	PWDDESC_XLOCK(pdp);
	pwd = pwd_hold_pwddesc(pdp);
	if (pwd != NULL) {
		if (pwd->pwd_cdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_cdir,
			    KF_FD_TYPE_CWD, kif, okif, pdp, req);
		if (pwd->pwd_rdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_rdir,
			    KF_FD_TYPE_ROOT, kif, okif, pdp, req);
		if (pwd->pwd_jdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_jdir,
			    KF_FD_TYPE_JAIL, kif, okif, pdp, req);
	}
	PWDDESC_XUNLOCK(pdp);
	if (pwd != NULL)
		pwd_drop(pwd);
	FILEDESC_SLOCK(fdp);
	FILEDESC_FOREACH_FP(fdp, i, fp) {
		if (refcount_load(&fdp->fd_refcnt) == 0)
			break;
		export_file_to_kinfo(fp, i, NULL, kif, fdp,
		    KERN_FILEDESC_PACK_KINFO);
		FILEDESC_SUNLOCK(fdp);
		kinfo_to_okinfo(kif, okif);
		error = SYSCTL_OUT(req, okif, sizeof(*okif));
		FILEDESC_SLOCK(fdp);
		if (error)
			break;
	}
	FILEDESC_SUNLOCK(fdp);
	fddrop(fdp);
	pddrop(pdp);
	free(kif, M_TEMP);
	free(okif, M_TEMP);
	return (0);
}

static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
    "Process ofiledesc entries");
#endif	/* COMPAT_FREEBSD7 */

int
vntype_to_kinfo(int vtype)
{
	struct {
		int vtype;
		int kf_vtype;
	} vtypes_table[] = {
		{ VBAD, KF_VTYPE_VBAD },
		{ VBLK, KF_VTYPE_VBLK },
		{ VCHR, KF_VTYPE_VCHR },
		{ VDIR, KF_VTYPE_VDIR },
		{ VFIFO, KF_VTYPE_VFIFO },
		{ VLNK, KF_VTYPE_VLNK },
		{ VNON, KF_VTYPE_VNON },
		{ VREG, KF_VTYPE_VREG },
		{ VSOCK, KF_VTYPE_VSOCK }
	};
	unsigned int i;

	/*
	 * Perform vtype translation.
	 */
	for (i = 0; i < nitems(vtypes_table); i++)
		if (vtypes_table[i].vtype == vtype)
			return (vtypes_table[i].kf_vtype);

	return (KF_VTYPE_UNKNOWN);
}

static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
    "Process filedesc entries");
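/*
 * For reference, vntype_to_kinfo() above maps each vnode type to its
 * kinfo counterpart by linear search of the nine-entry table, e.g.
 * vntype_to_kinfo(VREG) == KF_VTYPE_VREG, and any type missing from the
 * table falls back to KF_VTYPE_UNKNOWN.
 */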
/*
 * Store a process's current working directory information in an sbuf.
 *
 * Takes a locked proc as argument, and returns with the proc unlocked.
 */
int
kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen)
{
	struct pwddesc *pdp;
	struct pwd *pwd;
	struct export_fd_buf *efbuf;
	struct vnode *cdir;
	int error;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	pdp = pdhold(p);
	PROC_UNLOCK(p);
	if (pdp == NULL)
		return (EINVAL);

	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
	efbuf->fdp = NULL;
	efbuf->pdp = pdp;
	efbuf->sb = sb;
	efbuf->remainder = maxlen;
	efbuf->flags = 0;

	PWDDESC_XLOCK(pdp);
	pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
	cdir = pwd->pwd_cdir;
	if (cdir == NULL) {
		error = EINVAL;
	} else {
		vrefact(cdir);
		error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
	}
	PWDDESC_XUNLOCK(pdp);
	pddrop(pdp);
	free(efbuf, M_TEMP);
	return (error);
}

/*
 * Get per-process current working directory.
 */
static int
sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sb;
	struct proc *p;
	ssize_t maxlen;
	u_int namelen;
	int error, error2, *name;

	namelen = arg2;
	if (namelen != 1)
		return (EINVAL);

	name = (int *)arg1;

	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
	if (error != 0) {
		sbuf_delete(&sb);
		return (error);
	}
	maxlen = req->oldptr != NULL ? req->oldlen : -1;
	error = kern_proc_cwd_out(p, &sb, maxlen);
	error2 = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error != 0 ? error : error2);
}

static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
    sysctl_kern_proc_cwd, "Process current working directory");

#ifdef DDB
/*
 * For the purposes of debugging, generate a human-readable string for the
 * file type.
 */
static const char *
file_type_to_name(short type)
{

	switch (type) {
	case 0:
		return ("zero");
	case DTYPE_VNODE:
		return ("vnode");
	case DTYPE_SOCKET:
		return ("socket");
	case DTYPE_PIPE:
		return ("pipe");
	case DTYPE_FIFO:
		return ("fifo");
	case DTYPE_KQUEUE:
		return ("kqueue");
	case DTYPE_CRYPTO:
		return ("crypto");
	case DTYPE_MQUEUE:
		return ("mqueue");
	case DTYPE_SHM:
		return ("shm");
	case DTYPE_SEM:
		return ("ksem");
	case DTYPE_PTS:
		return ("pts");
	case DTYPE_DEV:
		return ("dev");
	case DTYPE_PROCDESC:
		return ("proc");
	case DTYPE_EVENTFD:
		return ("eventfd");
	case DTYPE_LINUXTFD:
		return ("ltimer");
	default:
		return ("unkn");
	}
}
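/*
 * Illustrative ddb(4) usage of the commands defined below (the address
 * is hypothetical):
 *
 *	db> show file 0xfffff80012345678
 *	db> show files
 *
 * "show file" prints one struct file given its kernel address; "show
 * files" walks every process's descriptor table and prints each open
 * file it finds.
 */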
/*
 * For the purposes of debugging, identify a process (if any, perhaps one
 * of many) that references the passed file in its file descriptor array.
 * Return NULL if none.
 */
static struct proc *
file_to_first_proc(struct file *fp)
{
	struct filedesc *fdp;
	struct proc *p;
	int n;

	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_state == PRS_NEW)
			continue;
		fdp = p->p_fd;
		if (fdp == NULL)
			continue;
		for (n = 0; n < fdp->fd_nfiles; n++) {
			if (fp == fdp->fd_ofiles[n].fde_file)
				return (p);
		}
	}
	return (NULL);
}

static void
db_print_file(struct file *fp, int header)
{
#define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4))
	struct proc *p;

	if (header)
		db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n",
		    XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag",
		    "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID",
		    "FCmd");
	p = file_to_first_proc(fp);
	db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH,
	    fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data,
	    fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH,
	    fp->f_vnode, p != NULL ? p->p_pid : -1,
	    p != NULL ? p->p_comm : "-");

#undef XPTRWIDTH
}

DB_SHOW_COMMAND(file, db_show_file)
{
	struct file *fp;

	if (!have_addr) {
		db_printf("usage: show file <addr>\n");
		return;
	}
	fp = (struct file *)addr;
	db_print_file(fp, 1);
}

DB_SHOW_COMMAND(files, db_show_files)
{
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int header;
	int n;

	header = 1;
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_state == PRS_NEW)
			continue;
		if ((fdp = p->p_fd) == NULL)
			continue;
		for (n = 0; n < fdp->fd_nfiles; ++n) {
			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
				continue;
			db_print_file(fp, header);
			header = 0;
		}
	}
}
#endif	/* DDB */

SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
    &maxfilesperproc, 0, "Maximum files allowed open per process");

SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
    &maxfiles, 0, "Maximum number of files");

SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
    &openfiles, 0, "System-wide number of open files");
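/*
 * The knobs above are reachable from userland via sysctl(8), e.g.
 * (illustrative):
 *
 *	# sysctl kern.openfiles
 *	# sysctl kern.maxfilesperproc=65536
 *
 * kern.maxfiles and kern.maxfilesperproc are read-write (CTLFLAG_RW);
 * kern.openfiles is a read-only counter.
 */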
/* ARGSUSED */
static void
filelistinit(void *dummy)
{

	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
	/*
	 * XXXMJG this is a temporary hack due to boot ordering issues against
	 * the vnode zone.
	 */
	vfs_smr = uma_zone_get_smr(pwd_zone);
	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);

/*-------------------------------------------------------------------*/

static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EBADF);
}

static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

static int
badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}

static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EBADF);
}

static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{

	return (EBADF);
}

static int
badfo_close(struct file *fp, struct thread *td)
{

	return (0);
}

static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

static int
badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    struct thread *td)
{

	return (EBADF);
}

static int
badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	return (0);
}

struct fileops badfileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_truncate = badfo_truncate,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = badfo_poll,
	.fo_kqfilter = badfo_kqfilter,
	.fo_stat = badfo_stat,
	.fo_close = badfo_close,
	.fo_chmod = badfo_chmod,
	.fo_chown = badfo_chown,
	.fo_sendfile = badfo_sendfile,
	.fo_fill_kinfo = badfo_fill_kinfo,
};

static int
path_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	return (POLLNVAL);
}

static int
path_close(struct file *fp, struct thread *td)
{
	MPASS(fp->f_type == DTYPE_VNODE);
	fp->f_ops = &badfileops;
	vdrop(fp->f_vnode);
	return (0);
}

struct fileops path_fileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_truncate = badfo_truncate,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = path_poll,
	.fo_kqfilter = vn_kqfilter_opath,
	.fo_stat = vn_statfile,
	.fo_close = path_close,
	.fo_chmod = badfo_chmod,
	.fo_chown = badfo_chown,
	.fo_sendfile = badfo_sendfile,
	.fo_fill_kinfo = vn_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE,
};
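/*
 * Illustrative userland consequence of path_fileops (a sketch, assuming
 * standard O_PATH semantics): descriptors opened with O_PATH use these
 * fileops, so data access fails while metadata access still works:
 *
 *	int fd = open("/etc/hosts", O_PATH);
 *	char c;
 *	struct stat st;
 *	read(fd, &c, 1);	fails: fo_read is badfo_readwrite (EBADF)
 *	fstat(fd, &st);		succeeds: fo_stat is vn_statfile()
 */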
int
invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}

int
invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

int
invfo_ioctl(struct file *fp, u_long com, void *data,
    struct ucred *active_cred, struct thread *td)
{

	return (ENOTTY);
}

int
invfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (poll_no_poll(events));
}

int
invfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EINVAL);
}

int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

int
invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    struct thread *td)
{

	return (EINVAL);
}

/*-------------------------------------------------------------------*/

/*
 * File Descriptor pseudo-device driver (/dev/fd/).
 *
 * Opening minor device N dup()s the file (if any) connected to file
 * descriptor N belonging to the calling process.  Note that this driver
 * consists of only the ``open()'' routine, because all subsequent
 * references to this file will go directly to the other driver.
 *
 * XXX: we could give this one a cloning event handler if necessary.
 */

/* ARGSUSED */
static int
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
{

	/*
	 * XXX Kludge: set curthread->td_dupfd to contain the value of
	 * the file descriptor being sought for duplication.  The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open.  Open will detect this special error and take the
	 * actions in dupfdopen below.  Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	td->td_dupfd = dev2unit(dev);
	return (ENODEV);
}

static struct cdevsw fildesc_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	fdopen,
	.d_name =	"FD",
};

static void
fildesc_drvinit(void *unused)
{
	struct cdev *dev;

	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
	make_dev_alias(dev, "stdin");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
	make_dev_alias(dev, "stdout");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
	make_dev_alias(dev, "stderr");
}

SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
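/*
 * Illustrative userland view of the kludge above (a sketch): opening
 * /dev/fd/N behaves like dup(N), e.g.
 *
 *	int fd = open("/dev/fd/0", O_RDONLY);
 *
 * yields a new descriptor referencing the same open file as descriptor
 * 0, courtesy of fdopen() stashing the unit number in td_dupfd and the
 * open path converting the special ENODEV return into the dupfdopen()
 * actions mentioned in the comment above.
 */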