/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
 */

#include <sys/cdefs.h>
#include "opt_capsicum.h"
#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/selinfo.h>
#include <sys/poll.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/signalvar.h>
#include <sys/kdb.h>
#include <sys/smr.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/ktrace.h>

#include <net/vnet.h>

#include <security/audit/audit.h>

#include <vm/uma.h>
#include <vm/vm.h>

#include <ddb/ddb.h>

static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes");
static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
    "file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");

MALLOC_DECLARE(M_FADVISE);

static __read_mostly uma_zone_t file_zone;
static __read_mostly uma_zone_t filedesc0_zone;
__read_mostly uma_zone_t pwd_zone;
VFS_SMR_DECLARE;

static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
		    struct thread *td, bool holdleaders, bool audit);
static void	export_file_to_kinfo(struct file *fp, int fd,
		    cap_rights_t *rightsp, struct kinfo_file *kif,
		    struct filedesc *fdp, int flags);
static int	fd_first_free(struct filedesc *fdp, int low, int size);
static void	fdgrowtable(struct filedesc *fdp, int nfd);
static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
static void	fdunused(struct filedesc *fdp, int fd);
static void	fdused(struct filedesc *fdp, int fd);
static int	fget_unlocked_seq(struct thread *td, int fd,
		    cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp);
static int	getmaxfd(struct thread *td);
static u_long	*filecaps_copy_prep(const struct filecaps *src);
static void	filecaps_copy_finish(const struct filecaps *src,
		    struct filecaps *dst, u_long *ioctls);
static u_long	*filecaps_free_prep(struct filecaps *fcaps);
static void	filecaps_free_finish(u_long *ioctls);

static struct pwd *pwd_alloc(void);

/*
 * Each process has:
 *
 * - An array of open file descriptors (fd_ofiles)
 * - An array of file flags (fd_ofileflags)
 * - A bitmap recording which descriptors are in use (fd_map)
 *
 * A process starts out with NDFILE descriptors.  The value of NDFILE has
 * been selected based on the historical limit of 20 open files, and an
 * assumption that the majority of processes, especially short-lived
 * processes like shells, will never need more.
 *
 * If this initial allocation is exhausted, a larger descriptor table and
 * map are allocated dynamically, and the pointers in the process's struct
 * filedesc are updated to point to those.  This is repeated every time
 * the process runs out of file descriptors (provided it hasn't hit its
 * resource limit).
 *
 * Since threads may hold references to individual descriptor table
 * entries, the tables are never freed.  Instead, they are placed on a
 * linked list and freed only when the struct filedesc is released.
 */
#define	NDFILE		20
#define	NDSLOTSIZE	sizeof(NDSLOTTYPE)
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
#define	NDSLOT(x)	((x) / NDENTRIES)
#define	NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
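
/*
 * A worked example of the bitmap macros, assuming an LP64 platform where
 * NDSLOTTYPE is u_long and thus NDENTRIES == 64: descriptor 67 lives in
 * map word NDSLOT(67) == 1, is represented there by NDBIT(67) ==
 * (NDSLOTTYPE)1 << 3, and NDSLOTS(67) == 2 words suffice to track
 * descriptors 0..66.
 */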

#define	FILEDESC_FOREACH_FDE(fdp, _iterator, _fde)			\
	struct filedesc *_fdp = (fdp);					\
	int _lastfile = fdlastfile_single(_fdp);			\
	for (_iterator = 0; _iterator <= _lastfile; _iterator++)	\
		if ((_fde = &_fdp->fd_ofiles[_iterator])->fde_file != NULL)

#define	FILEDESC_FOREACH_FP(fdp, _iterator, _fp)			\
	struct filedesc *_fdp = (fdp);					\
	int _lastfile = fdlastfile_single(_fdp);			\
	for (_iterator = 0; _iterator <= _lastfile; _iterator++)	\
		if ((_fp = _fdp->fd_ofiles[_iterator].fde_file) != NULL)

/*
 * SLIST entry used to keep track of ofiles which must be reclaimed when
 * the process exits.
 */
struct freetable {
	struct fdescenttbl *ft_table;
	SLIST_ENTRY(freetable) ft_next;
};

/*
 * Initial allocation: a filedesc structure + the head of SLIST used to
 * keep track of old ofiles + enough space for NDFILE descriptors.
 */

struct fdescenttbl0 {
	int	fdt_nfiles;
	struct	filedescent fdt_ofiles[NDFILE];
};

struct filedesc0 {
	struct filedesc fd_fd;
	SLIST_HEAD(, freetable) fd_free;
	struct	fdescenttbl0 fd_dfiles;
	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
};

/*
 * Descriptor management.
 */
static int __exclusive_cache_line openfiles; /* actual number of open files */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);

/*
 * If low >= size, just return low. Otherwise find the first zero bit in the
 * given bitmap, starting at low and not exceeding size - 1. Return size if
 * not found.
 */
static int
fd_first_free(struct filedesc *fdp, int low, int size)
{
	NDSLOTTYPE *map = fdp->fd_map;
	NDSLOTTYPE mask;
	int off, maxoff;

	if (low >= size)
		return (low);

	off = NDSLOT(low);
	if (low % NDENTRIES) {
		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
		if ((mask &= ~map[off]) != 0UL)
			return (off * NDENTRIES + ffsl(mask) - 1);
		++off;
	}
	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
		if (map[off] != ~0UL)
			return (off * NDENTRIES + ffsl(~map[off]) - 1);
	return (size);
}
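
/*
 * Illustrative behavior, again assuming NDENTRIES == 64: if map word 0 has
 * every bit set except bit 5, fd_first_free(fdp, 3, 128) masks off bits
 * 0..2, finds bit 5 clear via ffsl() and returns 5.  When low >= size, low
 * is returned without inspecting the map at all.
 */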

/*
 * Find the last used fd.
 *
 * Call this variant if fdp can't be modified by anyone else (e.g., during
 * exec).  Otherwise use fdlastfile.
 */
int
fdlastfile_single(struct filedesc *fdp)
{
	NDSLOTTYPE *map = fdp->fd_map;
	int off, minoff;

	off = NDSLOT(fdp->fd_nfiles - 1);
	for (minoff = NDSLOT(0); off >= minoff; --off)
		if (map[off] != 0)
			return (off * NDENTRIES + flsl(map[off]) - 1);
	return (-1);
}

int
fdlastfile(struct filedesc *fdp)
{

	FILEDESC_LOCK_ASSERT(fdp);
	return (fdlastfile_single(fdp));
}

static int
fdisused(struct filedesc *fdp, int fd)
{

	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));

	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
}

/*
 * Mark a file descriptor as used.
 */
static void
fdused_init(struct filedesc *fdp, int fd)
{

	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));

	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
}

static void
fdused(struct filedesc *fdp, int fd)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	fdused_init(fdp, fd);
	if (fd == fdp->fd_freefile)
		fdp->fd_freefile++;
}

/*
 * Mark a file descriptor as unused.
 */
static void
fdunused(struct filedesc *fdp, int fd)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
	    ("fd=%d is still in use", fd));

	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
}

/*
 * Free a file descriptor.
 *
 * Avoid some work if fdp is about to be destroyed.
 */
static inline void
fdefree_last(struct filedescent *fde)
{

	filecaps_free(&fde->fde_caps);
}

static inline void
fdfree(struct filedesc *fdp, int fd)
{
	struct filedescent *fde;

	FILEDESC_XLOCK_ASSERT(fdp);
	fde = &fdp->fd_ofiles[fd];
#ifdef CAPABILITIES
	seqc_write_begin(&fde->fde_seqc);
#endif
	fde->fde_file = NULL;
#ifdef CAPABILITIES
	seqc_write_end(&fde->fde_seqc);
#endif
	fdefree_last(fde);
	fdunused(fdp, fd);
}

/*
 * System calls on descriptors.
 */
#ifndef _SYS_SYSPROTO_H_
struct getdtablesize_args {
	int	dummy;
};
#endif
/* ARGSUSED */
int
sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
{
#ifdef RACCT
	uint64_t lim;
#endif

	td->td_retval[0] = getmaxfd(td);
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
	PROC_UNLOCK(td->td_proc);
	if (lim < td->td_retval[0])
		td->td_retval[0] = lim;
#endif
	return (0);
}

/*
 * Duplicate a file descriptor to a particular value.
 *
 * Note: keep in mind that a potential race condition exists when closing
 * descriptors from a shared descriptor table (via rfork).
 */
#ifndef _SYS_SYSPROTO_H_
struct dup2_args {
	u_int	from;
	u_int	to;
};
#endif
/* ARGSUSED */
int
sys_dup2(struct thread *td, struct dup2_args *uap)
{

	return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
}
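
/*
 * For reference: dup(2) maps to FDDUP_NORMAL, dup2(2) to FDDUP_FIXED and
 * fcntl(F_DUPFD) to FDDUP_FCNTL.  A self-dup such as dup2(5, 5) with
 * descriptor 5 open is handled by the old == new shortcut in kern_dup()
 * and simply returns 5.
 */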

/*
 * Duplicate a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct dup_args {
	u_int	fd;
};
#endif
/* ARGSUSED */
int
sys_dup(struct thread *td, struct dup_args *uap)
{

	return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
}

/*
 * The file control system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
	int	fd;
	int	cmd;
	long	arg;
};
#endif
/* ARGSUSED */
int
sys_fcntl(struct thread *td, struct fcntl_args *uap)
{

	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
}

int
kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
{
	struct flock fl;
	struct __oflock ofl;
	intptr_t arg1;
	int error, newcmd;

	error = 0;
	newcmd = cmd;
	switch (cmd) {
	case F_OGETLK:
	case F_OSETLK:
	case F_OSETLKW:
		/*
		 * Convert old flock structure to new.
		 */
		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
		fl.l_start = ofl.l_start;
		fl.l_len = ofl.l_len;
		fl.l_pid = ofl.l_pid;
		fl.l_type = ofl.l_type;
		fl.l_whence = ofl.l_whence;
		fl.l_sysid = 0;

		switch (cmd) {
		case F_OGETLK:
			newcmd = F_GETLK;
			break;
		case F_OSETLK:
			newcmd = F_SETLK;
			break;
		case F_OSETLKW:
			newcmd = F_SETLKW;
			break;
		}
		arg1 = (intptr_t)&fl;
		break;
	case F_GETLK:
	case F_SETLK:
	case F_SETLKW:
	case F_SETLK_REMOTE:
		error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
		arg1 = (intptr_t)&fl;
		break;
	default:
		arg1 = arg;
		break;
	}
	if (error)
		return (error);
	error = kern_fcntl(td, fd, newcmd, arg1);
	if (error)
		return (error);
	if (cmd == F_OGETLK) {
		ofl.l_start = fl.l_start;
		ofl.l_len = fl.l_len;
		ofl.l_pid = fl.l_pid;
		ofl.l_type = fl.l_type;
		ofl.l_whence = fl.l_whence;
		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
	} else if (cmd == F_GETLK) {
		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
	}
	return (error);
}

int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct filedesc *fdp;
	struct flock *flp;
	struct file *fp, *fp2;
	struct filedescent *fde;
	struct proc *p;
	struct vnode *vp;
	struct mount *mp;
	struct kinfo_file *kif;
	int error, flg, kif_sz, seals, tmp, got_set, got_cleared;
	uint64_t bsize;
	off_t foffset;

	error = 0;
	flg = F_POSIX;
	p = td->td_proc;
	fdp = p->p_fd;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(cmd);
	switch (cmd) {
	case F_DUPFD:
		tmp = arg;
		error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
		break;

	case F_DUPFD_CLOEXEC:
		tmp = arg;
		error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
		break;

	case F_DUP2FD:
		tmp = arg;
		error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
		break;

	case F_DUP2FD_CLOEXEC:
		tmp = arg;
		error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
		break;

	case F_GETFD:
		error = EBADF;
		FILEDESC_SLOCK(fdp);
		fde = fdeget_noref(fdp, fd);
		if (fde != NULL) {
			td->td_retval[0] =
			    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
			error = 0;
		}
		FILEDESC_SUNLOCK(fdp);
		break;

	case F_SETFD:
		error = EBADF;
		FILEDESC_XLOCK(fdp);
		fde = fdeget_noref(fdp, fd);
		if (fde != NULL) {
			fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
			    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
			error = 0;
		}
		FILEDESC_XUNLOCK(fdp);
		break;
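
	/*
	 * Note that F_GETFD and F_SETFD operate on the descriptor table
	 * entry rather than on the file itself: fcntl(fd, F_SETFD,
	 * FD_CLOEXEC) sets UF_EXCLOSE for this descriptor only, so another
	 * descriptor referencing the same open file keeps its own
	 * close-on-exec setting.
	 */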

	case F_GETFL:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
		if (error != 0)
			break;
		td->td_retval[0] = OFLAGS(fp->f_flag);
		fdrop(fp, td);
		break;

	case F_SETFL:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
		if (error != 0)
			break;
		if (fp->f_ops == &path_fileops) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		do {
			tmp = flg = fp->f_flag;
			tmp &= ~FCNTLFLAGS;
			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
		} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
		got_set = tmp & ~flg;
		got_cleared = flg & ~tmp;
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		if (error != 0)
			goto revert_f_setfl;
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		if (error == 0) {
			fdrop(fp, td);
			break;
		}
		atomic_clear_int(&fp->f_flag, FNONBLOCK);
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
revert_f_setfl:
		do {
			tmp = flg = fp->f_flag;
			tmp &= ~FCNTLFLAGS;
			tmp |= got_cleared;
			tmp &= ~got_set;
		} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
		fdrop(fp, td);
		break;

	case F_GETOWN:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
		if (error != 0)
			break;
		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
		fdrop(fp, td);
		break;

	case F_SETOWN:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
		if (error != 0)
			break;
		tmp = arg;
		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_SETLK_REMOTE:
		error = priv_check(td, PRIV_NFS_LOCKD);
		if (error != 0)
			return (error);
		flg = F_REMOTE;
		goto do_setlk;

	case F_SETLKW:
		flg |= F_WAIT;
		/* FALLTHROUGH F_SETLK */

	case F_SETLK:
	do_setlk:
		flp = (struct flock *)arg;
		if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
			error = EINVAL;
			break;
		}

		error = fget_unlocked(td, fd, &cap_flock_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			error = EBADF;
			fdrop(fp, td);
			break;
		}

		if (flp->l_whence == SEEK_CUR) {
			foffset = foffset_get(fp);
			if (foffset < 0 ||
			    (flp->l_start > 0 &&
			    foffset > OFF_MAX - flp->l_start)) {
				error = EOVERFLOW;
				fdrop(fp, td);
				break;
			}
			flp->l_start += foffset;
		}

		vp = fp->f_vnode;
		switch (flp->l_type) {
		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
				PROC_LOCK(p->p_leader);
				p->p_leader->p_flag |= P_ADVLOCK;
				PROC_UNLOCK(p->p_leader);
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
				PROC_LOCK(p->p_leader);
				p->p_leader->p_flag |= P_ADVLOCK;
				PROC_UNLOCK(p->p_leader);
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
			    flp, flg);
			break;
		case F_UNLCKSYS:
			if (flg != F_REMOTE) {
				error = EINVAL;
				break;
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
			    F_UNLCKSYS, flp, flg);
			break;
		default:
			error = EINVAL;
			break;
		}
		if (error != 0 || flp->l_type == F_UNLCK ||
		    flp->l_type == F_UNLCKSYS) {
			fdrop(fp, td);
			break;
		}

		/*
		 * Check for a race with close.
		 *
		 * The vnode is now advisory locked (or unlocked, but this case
		 * is not really important) as the caller requested.
		 * We had to drop the filedesc lock, so we need to recheck if
		 * the descriptor is still valid, because if it was closed
		 * in the meantime we need to remove advisory lock from the
		 * vnode - close on any descriptor leading to an advisory-locked
		 * vnode removes that lock.
		 * We will return 0 on purpose in that case, as the result of
		 * successful advisory lock might have been externally visible
		 * already. This is fine - effectively we pretend to the caller
		 * that the closing thread was a bit slower and that the
		 * advisory lock succeeded before the close.
		 */
		error = fget_unlocked(td, fd, &cap_no_rights, &fp2);
		if (error != 0) {
			fdrop(fp, td);
			break;
		}
		if (fp != fp2) {
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
			    F_UNLCK, flp, F_POSIX);
		}
		fdrop(fp, td);
		fdrop(fp2, td);
		break;

	case F_GETLK:
		error = fget_unlocked(td, fd, &cap_flock_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			error = EBADF;
			fdrop(fp, td);
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
		    flp->l_type != F_UNLCK) {
			error = EINVAL;
			fdrop(fp, td);
			break;
		}
		if (flp->l_whence == SEEK_CUR) {
			foffset = foffset_get(fp);
			if ((flp->l_start > 0 &&
			    foffset > OFF_MAX - flp->l_start) ||
			    (flp->l_start < 0 &&
			    foffset < OFF_MIN - flp->l_start)) {
				error = EOVERFLOW;
				fdrop(fp, td);
				break;
			}
			flp->l_start += foffset;
		}
		vp = fp->f_vnode;
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
		    F_POSIX);
		fdrop(fp, td);
		break;

	case F_ADD_SEALS:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		error = fo_add_seals(fp, arg);
		fdrop(fp, td);
		break;

	case F_GET_SEALS:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fo_get_seals(fp, &seals) == 0)
			td->td_retval[0] = seals;
		else
			error = EINVAL;
		fdrop(fp, td);
		break;

	case F_RDAHEAD:
		arg = arg ? 128 * 1024 : 0;
		/* FALLTHROUGH */
	case F_READAHEAD:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		vp = fp->f_vnode;
		if (vp->v_type != VREG) {
			fdrop(fp, td);
			error = ENOTTY;
			break;
		}

		/*
		 * Exclusive lock synchronizes against f_seqcount reads and
		 * writes in sequential_heuristic().
		 */
		error = vn_lock(vp, LK_EXCLUSIVE);
		if (error != 0) {
			fdrop(fp, td);
			break;
		}
		if (arg >= 0) {
			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
			arg = MIN(arg, INT_MAX - bsize + 1);
			fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX,
			    (arg + bsize - 1) / bsize);
			atomic_set_int(&fp->f_flag, FRDAHEAD);
		} else {
			atomic_clear_int(&fp->f_flag, FRDAHEAD);
		}
		VOP_UNLOCK(vp);
		fdrop(fp, td);
		break;
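
	/*
	 * Worked example for the readahead conversion above, assuming a
	 * filesystem with f_iosize == 32768: F_RDAHEAD with a non-zero
	 * argument requests 128 KiB, so f_seqcount[UIO_READ] becomes
	 * MIN(IO_SEQMAX, (131072 + 32767) / 32768) == 4 blocks.
	 */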

	case F_ISUNIONSTACK:
		/*
		 * Check if the vnode is part of a union stack (either the
		 * "union" flag from mount(2) or unionfs).
		 *
		 * Prior to introduction of this op libc's readdir would call
		 * fstatfs(2), in effect unnecessarily copying kilobytes of
		 * data just to check fs name and a mount flag.
		 *
		 * Fixing the code to handle everything in the kernel instead
		 * is a non-trivial endeavor and has low priority, thus this
		 * horrible kludge facilitates the current behavior in a much
		 * cheaper manner until someone(tm) sorts this out.
		 */
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		vp = fp->f_vnode;
		/*
		 * Since we don't prevent dooming the vnode, even a non-null
		 * mp found here can become immediately stale. This is
		 * tolerable since mount points are type-stable (providing
		 * safe memory access) and any vfs op on this vnode going
		 * forward will return an error (meaning return value in this
		 * case is meaningless).
		 */
		mp = atomic_load_ptr(&vp->v_mount);
		if (__predict_false(mp == NULL)) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		td->td_retval[0] = 0;
		if (mp->mnt_kern_flag & MNTK_UNIONFS ||
		    mp->mnt_flag & MNT_UNION)
			td->td_retval[0] = 1;
		fdrop(fp, td);
		break;

	case F_KINFO:
#ifdef CAPABILITY_MODE
		if (IN_CAPABILITY_MODE(td)) {
			error = ECAPMODE;
			break;
		}
#endif
		error = copyin((void *)arg, &kif_sz, sizeof(kif_sz));
		if (error != 0)
			break;
		if (kif_sz != sizeof(*kif)) {
			error = EINVAL;
			break;
		}
		kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK | M_ZERO);
		FILEDESC_SLOCK(fdp);
		error = fget_cap_noref(fdp, fd, &cap_fcntl_rights, &fp, NULL);
		if (error == 0 && fhold(fp)) {
			export_file_to_kinfo(fp, fd, NULL, kif, fdp, 0);
			FILEDESC_SUNLOCK(fdp);
			fdrop(fp, td);
			if ((kif->kf_status & KF_ATTR_VALID) != 0) {
				kif->kf_structsize = sizeof(*kif);
				error = copyout(kif, (void *)arg, sizeof(*kif));
			} else {
				error = EBADF;
			}
		} else {
			FILEDESC_SUNLOCK(fdp);
			if (error == 0)
				error = EBADF;
		}
		free(kif, M_TEMP);
		break;

	default:
		error = EINVAL;
		break;
	}
	return (error);
}

static int
getmaxfd(struct thread *td)
{

	return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
}
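
/*
 * One consequence of the mode distinction documented below: with the
 * descriptor limit at maxfd, fcntl(fd, F_DUPFD, maxfd) fails with EINVAL
 * while dup2(fd, maxfd) fails with EBADF, although both reject the same
 * out-of-range destination.
 */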

/*
 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
 */
int
kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
{
	struct filedesc *fdp;
	struct filedescent *oldfde, *newfde;
	struct proc *p;
	struct file *delfp, *oldfp;
	u_long *oioctls, *nioctls;
	int error, maxfd;

	p = td->td_proc;
	fdp = p->p_fd;
	oioctls = NULL;

	MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
	MPASS(mode < FDDUP_LASTMODE);

	AUDIT_ARG_FD(old);
	/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
	 * return EINVAL when the new descriptor is out of bounds.
	 */
	if (old < 0)
		return (EBADF);
	if (new < 0)
		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
	maxfd = getmaxfd(td);
	if (new >= maxfd)
		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);

	error = EBADF;
	FILEDESC_XLOCK(fdp);
	if (fget_noref(fdp, old) == NULL)
		goto unlock;
	if (mode == FDDUP_FIXED && old == new) {
		td->td_retval[0] = new;
		if (flags & FDDUP_FLAG_CLOEXEC)
			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
		error = 0;
		goto unlock;
	}

	oldfde = &fdp->fd_ofiles[old];
	oldfp = oldfde->fde_file;
	if (!fhold(oldfp))
		goto unlock;

	/*
	 * If the caller specified a file descriptor, make sure the file
	 * table is large enough to hold it, and grab it.  Otherwise, just
	 * allocate a new descriptor the usual way.
	 */
	switch (mode) {
	case FDDUP_NORMAL:
	case FDDUP_FCNTL:
		if ((error = fdalloc(td, new, &new)) != 0) {
			fdrop(oldfp, td);
			goto unlock;
		}
		break;
	case FDDUP_FIXED:
		if (new >= fdp->fd_nfiles) {
			/*
			 * The resource limits are here instead of e.g.
			 * fdalloc(), because the file descriptor table may be
			 * shared between processes, so we can't really use
			 * racct_add()/racct_sub(). Instead of counting the
			 * number of actually allocated descriptors, just put
			 * the limit on the size of the file descriptor table.
			 */
#ifdef RACCT
			if (RACCT_ENABLED()) {
				error = racct_set_unlocked(p, RACCT_NOFILE,
				    new + 1);
				if (error != 0) {
					error = EMFILE;
					fdrop(oldfp, td);
					goto unlock;
				}
			}
#endif
			fdgrowtable_exp(fdp, new + 1);
		}
		if (!fdisused(fdp, new))
			fdused(fdp, new);
		break;
	default:
		KASSERT(0, ("%s unsupported mode %d", __func__, mode));
	}

	KASSERT(old != new, ("new fd is same as old"));

	/* Refetch oldfde because the table may have grown and old one freed. */
	oldfde = &fdp->fd_ofiles[old];
	KASSERT(oldfp == oldfde->fde_file,
	    ("fdt_ofiles shift from growth observed at fd %d",
	    old));

	newfde = &fdp->fd_ofiles[new];
	delfp = newfde->fde_file;

	nioctls = filecaps_copy_prep(&oldfde->fde_caps);

	/*
	 * Duplicate the source descriptor.
	 */
#ifdef CAPABILITIES
	seqc_write_begin(&newfde->fde_seqc);
#endif
	oioctls = filecaps_free_prep(&newfde->fde_caps);
	fde_copy(oldfde, newfde);
	filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
	    nioctls);
	if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
	else
		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
#ifdef CAPABILITIES
	seqc_write_end(&newfde->fde_seqc);
#endif
	td->td_retval[0] = new;

	error = 0;

	if (delfp != NULL) {
		(void) closefp(fdp, new, delfp, td, true, false);
		FILEDESC_UNLOCK_ASSERT(fdp);
	} else {
unlock:
		FILEDESC_XUNLOCK(fdp);
	}

	filecaps_free_finish(oioctls);
	return (error);
}

static void
sigiofree(struct sigio *sigio)
{
	crfree(sigio->sio_ucred);
	free(sigio, M_SIGIO);
}

static struct sigio *
funsetown_locked(struct sigio *sigio)
{
	struct proc *p;
	struct pgrp *pg;

	SIGIO_ASSERT_LOCKED();

	if (sigio == NULL)
		return (NULL);
	*sigio->sio_myref = NULL;
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		PGRP_LOCK(pg);
		SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio);
		PGRP_UNLOCK(pg);
	} else {
		p = sigio->sio_proc;
		PROC_LOCK(p);
		SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio);
		PROC_UNLOCK(p);
	}
	return (sigio);
}

/*
 * If sigio is on the list associated with a process or process group,
 * disable signalling from the device, remove sigio from the list and
 * free sigio.
 */
void
funsetown(struct sigio **sigiop)
{
	struct sigio *sigio;

	/* Racy check, consumers must provide synchronization. */
	if (*sigiop == NULL)
		return;

	SIGIO_LOCK();
	sigio = funsetown_locked(*sigiop);
	SIGIO_UNLOCK();
	if (sigio != NULL)
		sigiofree(sigio);
}
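
/*
 * The sio_pgid field doubles as a type tag: a positive value names a
 * process (sio_proc is valid) while a negative value names the process
 * group -sio_pgid (sio_pgrp is valid), mirroring the sign convention of
 * the F_SETOWN argument handled by fsetown() below.
 */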

/*
 * Free a list of sigio structures. The caller must ensure that new sigio
 * structures cannot be added after this point. For process groups this is
 * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves
 * as an interlock.
 */
void
funsetownlst(struct sigiolst *sigiolst)
{
	struct proc *p;
	struct pgrp *pg;
	struct sigio *sigio, *tmp;

	/* Racy check. */
	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL)
		return;

	p = NULL;
	pg = NULL;

	SIGIO_LOCK();
	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL) {
		SIGIO_UNLOCK();
		return;
	}

	/*
	 * Every entry of the list should belong to a single proc or pgrp.
	 */
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		sx_assert(&proctree_lock, SX_XLOCKED);
		PGRP_LOCK(pg);
	} else /* if (sigio->sio_pgid > 0) */ {
		p = sigio->sio_proc;
		PROC_LOCK(p);
		KASSERT((p->p_flag & P_WEXIT) != 0,
		    ("%s: process %p is not exiting", __func__, p));
	}

	SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) {
		*sigio->sio_myref = NULL;
		if (pg != NULL) {
			KASSERT(sigio->sio_pgid < 0,
			    ("Proc sigio in pgrp sigio list"));
			KASSERT(sigio->sio_pgrp == pg,
			    ("Bogus pgrp in sigio list"));
		} else /* if (p != NULL) */ {
			KASSERT(sigio->sio_pgid > 0,
			    ("Pgrp sigio in proc sigio list"));
			KASSERT(sigio->sio_proc == p,
			    ("Bogus proc in sigio list"));
		}
	}

	if (pg != NULL)
		PGRP_UNLOCK(pg);
	else
		PROC_UNLOCK(p);
	SIGIO_UNLOCK();

	SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp)
		sigiofree(sigio);
}

/*
 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 *
 * After permission checking, add a sigio structure to the sigio list for
 * the process or process group.
 */
int
fsetown(pid_t pgid, struct sigio **sigiop)
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *osigio, *sigio;
	int ret;

	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	ret = 0;
	if (pgid > 0) {
		ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc);
		SIGIO_LOCK();
		osigio = funsetown_locked(*sigiop);
		if (ret == 0) {
			PROC_LOCK(proc);
			_PRELE(proc);
			if ((proc->p_flag & P_WEXIT) != 0) {
				ret = ESRCH;
			} else if (proc->p_session !=
			    curthread->td_proc->p_session) {
				/*
				 * Policy - Don't allow a process to FSETOWN a
				 * process in another session.
				 *
				 * Remove this test to allow maximum flexibility
				 * or restrict FSETOWN to the current process or
				 * process group for maximum safety.
				 */
				ret = EPERM;
			} else {
				sigio->sio_proc = proc;
				SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio,
				    sio_pgsigio);
			}
			PROC_UNLOCK(proc);
		}
	} else /* if (pgid < 0) */ {
		sx_slock(&proctree_lock);
		SIGIO_LOCK();
		osigio = funsetown_locked(*sigiop);
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
		} else {
			if (pgrp->pg_session != curthread->td_proc->p_session) {
				/*
				 * Policy - Don't allow a process to FSETOWN a
				 * process in another session.
				 *
				 * Remove this test to allow maximum flexibility
				 * or restrict FSETOWN to the current process or
				 * process group for maximum safety.
				 */
				ret = EPERM;
			} else {
				sigio->sio_pgrp = pgrp;
				SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio,
				    sio_pgsigio);
			}
			PGRP_UNLOCK(pgrp);
		}
		sx_sunlock(&proctree_lock);
	}
	if (ret == 0)
		*sigiop = sigio;
	SIGIO_UNLOCK();
	if (osigio != NULL)
		sigiofree(osigio);
	return (ret);
}

/*
 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
 */
pid_t
fgetown(struct sigio **sigiop)
{
	pid_t pgid;

	SIGIO_LOCK();
	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
	SIGIO_UNLOCK();
	return (pgid);
}

static int
closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool audit)
{
	int error;

	FILEDESC_XLOCK_ASSERT(fdp);

	/*
	 * We now hold the fp reference that used to be owned by the
	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
	 * knote_fdclose to prevent a race of the fd getting opened, a knote
	 * added, and deleting a knote for the new fd.
	 */
	if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist)))
		knote_fdclose(td, fd);

	/*
	 * We need to notify mqueue if the object is of type mqueue.
	 */
	if (__predict_false(fp->f_type == DTYPE_MQUEUE))
		mq_fdclose(td, fd, fp);
	FILEDESC_XUNLOCK(fdp);

#ifdef AUDIT
	if (AUDITING_TD(td) && audit)
		audit_sysclose(td, fd, fp);
#endif
	error = closef(fp, td);

	/*
	 * All paths leading up to closefp() will have already removed or
	 * replaced the fd in the filedesc table, so a restart would not
	 * operate on the same file.
	 */
	if (error == ERESTART)
		error = EINTR;

	return (error);
}

static int
closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool holdleaders, bool audit)
{
	int error;

	FILEDESC_XLOCK_ASSERT(fdp);

	if (holdleaders) {
		if (td->td_proc->p_fdtol != NULL) {
			/*
			 * Ask fdfree() to sleep to ensure that all relevant
			 * process leaders can be traversed in closef().
			 */
			fdp->fd_holdleaderscount++;
		} else {
			holdleaders = false;
		}
	}

	error = closefp_impl(fdp, fd, fp, td, audit);
	if (holdleaders) {
		FILEDESC_XLOCK(fdp);
		fdp->fd_holdleaderscount--;
		if (fdp->fd_holdleaderscount == 0 &&
		    fdp->fd_holdleaderswakeup != 0) {
			fdp->fd_holdleaderswakeup = 0;
			wakeup(&fdp->fd_holdleaderscount);
		}
		FILEDESC_XUNLOCK(fdp);
	}
	return (error);
}

static int
closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool holdleaders, bool audit)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	if (__predict_false(td->td_proc->p_fdtol != NULL)) {
		return (closefp_hl(fdp, fd, fp, td, holdleaders, audit));
	} else {
		return (closefp_impl(fdp, fd, fp, td, audit));
	}
}

/*
 * Close a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int	fd;
};
#endif
/* ARGSUSED */
int
sys_close(struct thread *td, struct close_args *uap)
{

	return (kern_close(td, uap->fd));
}

int
kern_close(struct thread *td, int fd)
{
	struct filedesc *fdp;
	struct file *fp;

	fdp = td->td_proc->p_fd;

	FILEDESC_XLOCK(fdp);
	if ((fp = fget_noref(fdp, fd)) == NULL) {
		FILEDESC_XUNLOCK(fdp);
		return (EBADF);
	}
	fdfree(fdp, fd);

	/* closefp() drops the FILEDESC lock for us. */
	return (closefp(fdp, fd, fp, td, true, true));
}
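
/*
 * To summarize the two flavors below: close_range(3, ~0U, 0) closes every
 * open descriptor numbered 3 or above, while close_range(3, ~0U,
 * CLOSE_RANGE_CLOEXEC) leaves the descriptors open and merely marks each
 * one UF_EXCLOSE so that it is closed on the next exec.
 */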

static int
close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd)
{
	struct filedesc *fdp;
	struct fdescenttbl *fdt;
	struct filedescent *fde;
	int fd;

	fdp = td->td_proc->p_fd;
	FILEDESC_XLOCK(fdp);
	fdt = atomic_load_ptr(&fdp->fd_files);
	highfd = MIN(highfd, fdt->fdt_nfiles - 1);
	fd = lowfd;
	if (__predict_false(fd > highfd)) {
		goto out_locked;
	}
	for (; fd <= highfd; fd++) {
		fde = &fdt->fdt_ofiles[fd];
		if (fde->fde_file != NULL)
			fde->fde_flags |= UF_EXCLOSE;
	}
out_locked:
	FILEDESC_XUNLOCK(fdp);
	return (0);
}

static int
close_range_impl(struct thread *td, u_int lowfd, u_int highfd)
{
	struct filedesc *fdp;
	const struct fdescenttbl *fdt;
	struct file *fp;
	int fd;

	fdp = td->td_proc->p_fd;
	FILEDESC_XLOCK(fdp);
	fdt = atomic_load_ptr(&fdp->fd_files);
	highfd = MIN(highfd, fdt->fdt_nfiles - 1);
	fd = lowfd;
	if (__predict_false(fd > highfd)) {
		goto out_locked;
	}
	for (;;) {
		fp = fdt->fdt_ofiles[fd].fde_file;
		if (fp == NULL) {
			if (fd == highfd)
				goto out_locked;
		} else {
			fdfree(fdp, fd);
			(void) closefp(fdp, fd, fp, td, true, true);
			if (fd == highfd)
				goto out_unlocked;
			FILEDESC_XLOCK(fdp);
			fdt = atomic_load_ptr(&fdp->fd_files);
		}
		fd++;
	}
out_locked:
	FILEDESC_XUNLOCK(fdp);
out_unlocked:
	return (0);
}

int
kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd)
{

	/*
	 * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2
	 * open should not be a usage error.  From a close_range() perspective,
	 * close_range(3, ~0U, 0) in the same scenario should also likely not
	 * be a usage error as all fd above 3 are in fact already closed.
	 */
	if (highfd < lowfd) {
		return (EINVAL);
	}

	if ((flags & CLOSE_RANGE_CLOEXEC) != 0)
		return (close_range_cloexec(td, lowfd, highfd));

	return (close_range_impl(td, lowfd, highfd));
}

#ifndef _SYS_SYSPROTO_H_
struct close_range_args {
	u_int	lowfd;
	u_int	highfd;
	int	flags;
};
#endif
int
sys_close_range(struct thread *td, struct close_range_args *uap)
{

	AUDIT_ARG_FD(uap->lowfd);
	AUDIT_ARG_CMD(uap->highfd);
	AUDIT_ARG_FFLAGS(uap->flags);

	if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC)) != 0)
		return (EINVAL);
	return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd));
}

#ifdef COMPAT_FREEBSD12
/*
 * Close open file descriptors.
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd12_closefrom_args {
	int	lowfd;
};
#endif
/* ARGSUSED */
int
freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap)
{
	u_int lowfd;

	AUDIT_ARG_FD(uap->lowfd);

	/*
	 * Treat negative starting file descriptor values identical to
	 * closefrom(0) which closes all files.
	 */
	lowfd = MAX(0, uap->lowfd);
	return (kern_close_range(td, 0, lowfd, ~0U));
}
#endif	/* COMPAT_FREEBSD12 */
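
/*
 * The compat shim above also documents the idiomatic replacement: the
 * traditional closefrom(lowfd) is equivalent to close_range(lowfd, ~0U, 0),
 * with a negative lowfd clamped to 0 first.
 */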

#if defined(COMPAT_43)
/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct ostat oub;
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error == 0) {
		cvtstat(&ub, &oub);
		error = copyout(&oub, uap->sb, sizeof(oub));
	}
	return (error);
}
#endif /* COMPAT_43 */

#if defined(COMPAT_FREEBSD11)
int
freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap)
{
	struct stat sb;
	struct freebsd11_stat osb;
	int error;

	error = kern_fstat(td, uap->fd, &sb);
	if (error != 0)
		return (error);
	error = freebsd11_cvtstat(&sb, &osb);
	if (error == 0)
		error = copyout(&osb, uap->sb, sizeof(osb));
	return (error);
}
#endif	/* COMPAT_FREEBSD11 */

/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct fstat_args {
	int	fd;
	struct	stat *sb;
};
#endif
/* ARGSUSED */
int
sys_fstat(struct thread *td, struct fstat_args *uap)
{
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error == 0)
		error = copyout(&ub, uap->sb, sizeof(ub));
	return (error);
}

int
kern_fstat(struct thread *td, int fd, struct stat *sbp)
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);

	error = fget(td, fd, &cap_fstat_rights, &fp);
	if (__predict_false(error != 0))
		return (error);

	AUDIT_ARG_FILE(td->td_proc, fp);

	error = fo_stat(fp, sbp, td->td_ucred);
	fdrop(fp, td);
#ifdef __STAT_TIME_T_EXT
	sbp->st_atim_ext = 0;
	sbp->st_mtim_ext = 0;
	sbp->st_ctim_ext = 0;
	sbp->st_btim_ext = 0;
#endif
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrstat_error(sbp, error);
#endif
	return (error);
}

#if defined(COMPAT_FREEBSD11)
/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd11_nfstat_args {
	int	fd;
	struct	nstat *sb;
};
#endif
/* ARGSUSED */
int
freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap)
{
	struct nstat nub;
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error != 0)
		return (error);
	error = freebsd11_cvtnstat(&ub, &nub);
	if (error == 0)
		error = copyout(&nub, uap->sb, sizeof(nub));
	return (error);
}
#endif /* COMPAT_FREEBSD11 */
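
/*
 * A note on the vnode-less branch in kern_fpathconf() below: for pipes and
 * sockets only _PC_PIPE_BUF is meaningful, so fpathconf(fd, _PC_PIPE_BUF)
 * on a pipe returns PIPE_BUF while any other name yields EINVAL, and
 * _PC_ASYNC_IO is answered as _POSIX_ASYNCHRONOUS_IO without consulting
 * the file at all.
 */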

/*
 * Return pathconf information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct fpathconf_args {
	int	fd;
	int	name;
};
#endif
/* ARGSUSED */
int
sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
{
	long value;
	int error;

	error = kern_fpathconf(td, uap->fd, uap->name, &value);
	if (error == 0)
		td->td_retval[0] = value;
	return (error);
}

int
kern_fpathconf(struct thread *td, int fd, int name, long *valuep)
{
	struct file *fp;
	struct vnode *vp;
	int error;

	error = fget(td, fd, &cap_fpathconf_rights, &fp);
	if (error != 0)
		return (error);

	if (name == _PC_ASYNC_IO) {
		*valuep = _POSIX_ASYNCHRONOUS_IO;
		goto out;
	}
	vp = fp->f_vnode;
	if (vp != NULL) {
		vn_lock(vp, LK_SHARED | LK_RETRY);
		error = VOP_PATHCONF(vp, name, valuep);
		VOP_UNLOCK(vp);
	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
		if (name != _PC_PIPE_BUF) {
			error = EINVAL;
		} else {
			*valuep = PIPE_BUF;
			error = 0;
		}
	} else {
		error = EOPNOTSUPP;
	}
out:
	fdrop(fp, td);
	return (error);
}

/*
 * Copy filecaps structure allocating memory for ioctls array if needed.
 *
 * The last parameter indicates whether the fdtable is locked. If it is not and
 * ioctls are encountered, copying fails and the caller must lock the table.
 *
 * Note that if the table was not locked, the caller has to check the relevant
 * sequence counter to determine whether the operation was successful.
 */
bool
filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
{
	size_t size;

	if (src->fc_ioctls != NULL && !locked)
		return (false);
	memcpy(dst, src, sizeof(*src));
	if (src->fc_ioctls == NULL)
		return (true);

	KASSERT(src->fc_nioctls > 0,
	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
	memcpy(dst->fc_ioctls, src->fc_ioctls, size);
	return (true);
}

static u_long *
filecaps_copy_prep(const struct filecaps *src)
{
	u_long *ioctls;
	size_t size;

	if (__predict_true(src->fc_ioctls == NULL))
		return (NULL);

	KASSERT(src->fc_nioctls > 0,
	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	ioctls = malloc(size, M_FILECAPS, M_WAITOK);
	return (ioctls);
}

static void
filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst,
    u_long *ioctls)
{
	size_t size;

	*dst = *src;
	if (__predict_true(src->fc_ioctls == NULL)) {
		MPASS(ioctls == NULL);
		return;
	}

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	dst->fc_ioctls = ioctls;
	bcopy(src->fc_ioctls, dst->fc_ioctls, size);
}

/*
 * Move filecaps structure to the new place and clear the old place.
 */
void
filecaps_move(struct filecaps *src, struct filecaps *dst)
{

	*dst = *src;
	bzero(src, sizeof(*src));
}
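
/*
 * The prep/finish pairs above split a copy into a sleepable allocation and
 * a non-sleeping commit.  A minimal usage sketch, modeled on kern_dup():
 *
 *	nioctls = filecaps_copy_prep(&oldfde->fde_caps);
 *	seqc_write_begin(&newfde->fde_seqc);
 *	...
 *	filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, nioctls);
 *	seqc_write_end(&newfde->fde_seqc);
 *
 * The malloc(M_WAITOK) happens in the prep step, before the seqc write
 * section is entered, since sleeping is not permitted inside it.
 */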

/*
 * Fill the given filecaps structure with full rights.
 */
static void
filecaps_fill(struct filecaps *fcaps)
{

	CAP_ALL(&fcaps->fc_rights);
	fcaps->fc_ioctls = NULL;
	fcaps->fc_nioctls = -1;
	fcaps->fc_fcntls = CAP_FCNTL_ALL;
}

/*
 * Free memory allocated within filecaps structure.
 */
static void
filecaps_free_ioctl(struct filecaps *fcaps)
{

	free(fcaps->fc_ioctls, M_FILECAPS);
	fcaps->fc_ioctls = NULL;
}

void
filecaps_free(struct filecaps *fcaps)
{

	filecaps_free_ioctl(fcaps);
	bzero(fcaps, sizeof(*fcaps));
}

static u_long *
filecaps_free_prep(struct filecaps *fcaps)
{
	u_long *ioctls;

	ioctls = fcaps->fc_ioctls;
	bzero(fcaps, sizeof(*fcaps));
	return (ioctls);
}

static void
filecaps_free_finish(u_long *ioctls)
{

	free(ioctls, M_FILECAPS);
}

/*
 * Validate the given filecaps structure.
 */
static void
filecaps_validate(const struct filecaps *fcaps, const char *func)
{

	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
	    ("%s: invalid rights", func));
	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
	    ("%s: invalid fcntls", func));
	KASSERT(fcaps->fc_fcntls == 0 ||
	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
	    ("%s: fcntls without CAP_FCNTL", func));
	/*
	 * open calls without WANTIOCTLCAPS free caps but leave the counter.
	 */
#if 0
	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
	    ("%s: invalid ioctls", func));
#endif
	KASSERT(fcaps->fc_nioctls == 0 ||
	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
	    ("%s: ioctls without CAP_IOCTL", func));
}

static void
fdgrowtable_exp(struct filedesc *fdp, int nfd)
{
	int nfd1;

	FILEDESC_XLOCK_ASSERT(fdp);

	nfd1 = fdp->fd_nfiles * 2;
	if (nfd1 < nfd)
		nfd1 = nfd;
	fdgrowtable(fdp, nfd1);
}
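
/*
 * Worked example of the growth policy: dup2()ing onto descriptor 25 with
 * the default NDFILE == 20 slot table calls fdgrowtable_exp(fdp, 26); the
 * doubling above yields 40, and fdgrowtable() below rounds that up to
 * NDSLOTS(40) * NDENTRIES == 64 entries (assuming NDENTRIES == 64).
 */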

/*
 * Grow the file table to accommodate (at least) nfd descriptors.
 */
static void
fdgrowtable(struct filedesc *fdp, int nfd)
{
	struct filedesc0 *fdp0;
	struct freetable *ft;
	struct fdescenttbl *ntable;
	struct fdescenttbl *otable;
	int nnfiles, onfiles;
	NDSLOTTYPE *nmap, *omap;

	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));

	/* save old values */
	onfiles = fdp->fd_nfiles;
	otable = fdp->fd_files;
	omap = fdp->fd_map;

	/* compute the size of the new table */
	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
	if (nnfiles <= onfiles)
		/* the table is already large enough */
		return;

	/*
	 * Allocate a new table.  We need enough space for the number of
	 * entries, the file entries themselves and the struct freetable we
	 * will use when we decommission the table and place it on the
	 * freelist.  We place the struct freetable in the middle so we don't
	 * have to worry about padding.
	 */
	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
	    sizeof(struct freetable),
	    M_FILEDESC, M_ZERO | M_WAITOK);
	/* copy the old data */
	ntable->fdt_nfiles = nnfiles;
	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
	    onfiles * sizeof(ntable->fdt_ofiles[0]));

	/*
	 * Allocate a new map only if the old is not large enough.  It will
	 * grow at a slower rate than the table as it can map more
	 * entries than the table can hold.
	 */
	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
		    M_ZERO | M_WAITOK);
		/* copy over the old data and update the pointer */
		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
		fdp->fd_map = nmap;
	}

	/*
	 * Make sure that ntable is correctly initialized before we replace
	 * the fd_files pointer.  Otherwise fget_unlocked() may see
	 * inconsistent data.
	 */
	atomic_store_rel_ptr((volatile void *)&fdp->fd_files,
	    (uintptr_t)ntable);

	/*
	 * Free the old file table when not shared by other threads or
	 * processes.  The old file table is considered to be shared when
	 * either of the following is true:
	 * - The process has more than one thread.
	 * - The file descriptor table has been shared via fdshare().
	 *
	 * When shared, the old file table will be placed on a freelist
	 * which will be processed when the struct filedesc is released.
	 *
	 * Note that if onfiles == NDFILE, we're dealing with the original
	 * static allocation contained within (struct filedesc0 *)fdp,
	 * which must not be freed.
	 */
	if (onfiles > NDFILE) {
		/*
		 * Note we may be called here from fdinit while allocating a
		 * table for a new process in which case ->p_fd points
		 * elsewhere.
		 */
		if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) {
			free(otable, M_FILEDESC);
		} else {
			ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
			fdp0 = (struct filedesc0 *)fdp;
			ft->ft_table = otable;
			SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
		}
	}
	/*
	 * The map does not have the same possibility of threads still
	 * holding references to it.  So always free it as long as it
	 * does not reference the original static allocation.
	 */
	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
		free(omap, M_FILEDESC);
}

/*
 * Allocate a file descriptor for the process.
 */
int
fdalloc(struct thread *td, int minfd, int *result)
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	int fd, maxfd, allocfd;
#ifdef RACCT
	int error;
#endif

	FILEDESC_XLOCK_ASSERT(fdp);

	if (fdp->fd_freefile > minfd)
		minfd = fdp->fd_freefile;

	maxfd = getmaxfd(td);

	/*
	 * Search the bitmap for a free descriptor starting at minfd.
	 * If none is found, grow the file table.
	 */
	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
	if (__predict_false(fd >= maxfd))
		return (EMFILE);
	if (__predict_false(fd >= fdp->fd_nfiles)) {
		allocfd = min(fd * 2, maxfd);
#ifdef RACCT
		if (RACCT_ENABLED()) {
			error = racct_set_unlocked(p, RACCT_NOFILE, allocfd);
			if (error != 0)
				return (EMFILE);
		}
#endif
		/*
		 * fd is already equal to first free descriptor >= minfd, so
		 * we only need to grow the table and we are done.
		 */
		fdgrowtable_exp(fdp, allocfd);
	}

	/*
	 * Perform some sanity checks, then mark the file descriptor as
	 * used and return it to the caller.
	 */
	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
	    ("invalid descriptor %d", fd));
	KASSERT(!fdisused(fdp, fd),
	    ("fd_first_free() returned non-free descriptor"));
	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
	    ("file descriptor isn't free"));
	fdused(fdp, fd);
	*result = fd;
	return (0);
}

/*
 * Allocate n file descriptors for the process.
 */
int
fdallocn(struct thread *td, int minfd, int *fds, int n)
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	int i;

	FILEDESC_XLOCK_ASSERT(fdp);

	for (i = 0; i < n; i++)
		if (fdalloc(td, 0, &fds[i]) != 0)
			break;

	if (i < n) {
		for (i--; i >= 0; i--)
			fdunused(fdp, fds[i]);
		return (EMFILE);
	}

	return (0);
}

/*
 * Create a new open file structure and allocate a file descriptor for the
 * process that refers to it.  We add one reference to the file for the
 * descriptor table and one reference for resultfp.  This is to prevent us
 * being preempted and the entry in the descriptor table closed after we
 * release the FILEDESC lock.
 */
int
falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
    struct filecaps *fcaps)
{
	struct file *fp;
	int error, fd;

	MPASS(resultfp != NULL);
	MPASS(resultfd != NULL);

	error = _falloc_noinstall(td, &fp, 2);
	if (__predict_false(error != 0)) {
		return (error);
	}

	error = finstall_refed(td, fp, &fd, flags, fcaps);
	if (__predict_false(error != 0)) {
		falloc_abort(td, fp);
		return (error);
	}

	*resultfp = fp;
	*resultfd = fd;

	return (0);
}

/*
 * Create a new open file structure without allocating a file descriptor.
 */
int
_falloc_noinstall(struct thread *td, struct file **resultfp, u_int n)
{
	struct file *fp;
	int maxuserfiles = maxfiles - (maxfiles / 20);
	int openfiles_new;
	static struct timeval lastfail;
	static int curfail;

	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
	MPASS(n > 0);

	openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
	if ((openfiles_new >= maxuserfiles &&
	    priv_check(td, PRIV_MAXFILES) != 0) ||
	    openfiles_new >= maxfiles) {
		atomic_subtract_int(&openfiles, 1);
		if (ppsratecheck(&lastfail, &curfail, 1)) {
			printf("kern.maxfiles limit exceeded by uid %i, (%s) "
			    "please see tuning(7).\n", td->td_ucred->cr_ruid,
			    td->td_proc->p_comm);
		}
		return (ENFILE);
	}
	fp = uma_zalloc(file_zone, M_WAITOK);
	bzero(fp, sizeof(*fp));
	refcount_init(&fp->f_count, n);
	fp->f_cred = crhold(td->td_ucred);
	fp->f_ops = &badfileops;
	*resultfp = fp;
	return (0);
}

void
falloc_abort(struct thread *td, struct file *fp)
{

	/*
	 * For assertion purposes.
	 */
	refcount_init(&fp->f_count, 0);
	_fdrop(fp, td);
}
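
/*
 * The reference counts line up as follows: falloc_caps() creates the file
 * with f_count == 2, one reference consumed by the descriptor table slot
 * and one returned to the caller via resultfp, so the caller must still
 * fdrop() its reference when done even though the descriptor remains open.
 */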
2150 */ 2151 void 2152 _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, 2153 struct filecaps *fcaps) 2154 { 2155 struct filedescent *fde; 2156 2157 MPASS(fp != NULL); 2158 if (fcaps != NULL) 2159 filecaps_validate(fcaps, __func__); 2160 FILEDESC_XLOCK_ASSERT(fdp); 2161 2162 fde = &fdp->fd_ofiles[fd]; 2163 #ifdef CAPABILITIES 2164 seqc_write_begin(&fde->fde_seqc); 2165 #endif 2166 fde->fde_file = fp; 2167 fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; 2168 if (fcaps != NULL) 2169 filecaps_move(fcaps, &fde->fde_caps); 2170 else 2171 filecaps_fill(&fde->fde_caps); 2172 #ifdef CAPABILITIES 2173 seqc_write_end(&fde->fde_seqc); 2174 #endif 2175 } 2176 2177 int 2178 finstall_refed(struct thread *td, struct file *fp, int *fd, int flags, 2179 struct filecaps *fcaps) 2180 { 2181 struct filedesc *fdp = td->td_proc->p_fd; 2182 int error; 2183 2184 MPASS(fd != NULL); 2185 2186 FILEDESC_XLOCK(fdp); 2187 error = fdalloc(td, 0, fd); 2188 if (__predict_true(error == 0)) { 2189 _finstall(fdp, fp, *fd, flags, fcaps); 2190 } 2191 FILEDESC_XUNLOCK(fdp); 2192 return (error); 2193 } 2194 2195 int 2196 finstall(struct thread *td, struct file *fp, int *fd, int flags, 2197 struct filecaps *fcaps) 2198 { 2199 int error; 2200 2201 MPASS(fd != NULL); 2202 2203 if (!fhold(fp)) 2204 return (EBADF); 2205 error = finstall_refed(td, fp, fd, flags, fcaps); 2206 if (__predict_false(error != 0)) { 2207 fdrop(fp, td); 2208 } 2209 return (error); 2210 } 2211 2212 /* 2213 * Allocate a new filedesc structure. 2214 * 2215 * The returned structure is referenced once and contains no open files. 2216 */ 2217 struct filedesc * 2218 fdinit(void) 2219 { 2220 struct filedesc0 *newfdp0; 2221 struct filedesc *newfdp; 2222 2223 newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); 2224 newfdp = &newfdp0->fd_fd; 2225 2226 /* Create the file descriptor table. */ 2227 FILEDESC_LOCK_INIT(newfdp); 2228 refcount_init(&newfdp->fd_refcnt, 1); 2229 refcount_init(&newfdp->fd_holdcnt, 1); 2230 newfdp->fd_map = newfdp0->fd_dmap; 2231 newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; 2232 newfdp->fd_files->fdt_nfiles = NDFILE; 2233 2234 return (newfdp); 2235 } 2236 2237 /* 2238 * Allocate a pwddesc structure, copying the pwd (current, root, and jail 2239 * root vnode) reference from pdp when pdp is not NULL. 2240 * 2241 * If keeplock is true and pdp is not NULL, return with pdp exclusively locked. 2242 */ 2243 struct pwddesc * 2244 pdinit(struct pwddesc *pdp, bool keeplock) 2245 { 2246 struct pwddesc *newpdp; 2247 struct pwd *newpwd; 2248 2249 newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO); 2250 2251 PWDDESC_LOCK_INIT(newpdp); 2252 refcount_init(&newpdp->pd_refcount, 1); 2253 newpdp->pd_cmask = CMASK; 2254 2255 if (pdp == NULL) { 2256 newpwd = pwd_alloc(); 2257 smr_serialized_store(&newpdp->pd_pwd, newpwd, true); 2258 return (newpdp); 2259 } 2260 2261 PWDDESC_XLOCK(pdp); 2262 newpwd = pwd_hold_pwddesc(pdp); 2263 smr_serialized_store(&newpdp->pd_pwd, newpwd, true); 2264 if (!keeplock) 2265 PWDDESC_XUNLOCK(pdp); 2266 return (newpdp); 2267 } 2268 2269 /* 2270 * Hold either filedesc or pwddesc of the passed process. 2271 * 2272 * The process lock is used to synchronize against the target exiting and 2273 * freeing the data. 2274 * 2275 * Clearing can be illustrated in 3 steps: 2276 * 1. set the pointer to NULL. Either routine can race against it, hence 2277 * atomic_load_ptr. 2278 * 2. observe the process lock as not taken. Until then fdhold/pdhold can 2279 * race to either still see the pointer or find NULL.
It is still safe to 2280 * grab a reference as clearing is stalled. 2281 * 3. after the lock is observed as not taken, any fdhold/pdhold calls are 2282 * guaranteed to see NULL, making it safe to finish clearing. 2283 */ 2284 static struct filedesc * 2285 fdhold(struct proc *p) 2286 { 2287 struct filedesc *fdp; 2288 2289 PROC_LOCK_ASSERT(p, MA_OWNED); 2290 fdp = atomic_load_ptr(&p->p_fd); 2291 if (fdp != NULL) 2292 refcount_acquire(&fdp->fd_holdcnt); 2293 return (fdp); 2294 } 2295 2296 static struct pwddesc * 2297 pdhold(struct proc *p) 2298 { 2299 struct pwddesc *pdp; 2300 2301 PROC_LOCK_ASSERT(p, MA_OWNED); 2302 pdp = atomic_load_ptr(&p->p_pd); 2303 if (pdp != NULL) 2304 refcount_acquire(&pdp->pd_refcount); 2305 return (pdp); 2306 } 2307 2308 static void 2309 fddrop(struct filedesc *fdp) 2310 { 2311 2312 if (refcount_load(&fdp->fd_holdcnt) > 1) { 2313 if (refcount_release(&fdp->fd_holdcnt) == 0) 2314 return; 2315 } 2316 2317 FILEDESC_LOCK_DESTROY(fdp); 2318 uma_zfree(filedesc0_zone, fdp); 2319 } 2320 2321 static void 2322 pddrop(struct pwddesc *pdp) 2323 { 2324 struct pwd *pwd; 2325 2326 if (refcount_release_if_not_last(&pdp->pd_refcount)) 2327 return; 2328 2329 PWDDESC_XLOCK(pdp); 2330 if (refcount_release(&pdp->pd_refcount) == 0) { 2331 PWDDESC_XUNLOCK(pdp); 2332 return; 2333 } 2334 pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 2335 pwd_set(pdp, NULL); 2336 PWDDESC_XUNLOCK(pdp); 2337 pwd_drop(pwd); 2338 2339 PWDDESC_LOCK_DESTROY(pdp); 2340 free(pdp, M_PWDDESC); 2341 } 2342 2343 /* 2344 * Share a filedesc structure. 2345 */ 2346 struct filedesc * 2347 fdshare(struct filedesc *fdp) 2348 { 2349 2350 refcount_acquire(&fdp->fd_refcnt); 2351 return (fdp); 2352 } 2353 2354 /* 2355 * Share a pwddesc structure. 2356 */ 2357 struct pwddesc * 2358 pdshare(struct pwddesc *pdp) 2359 { 2360 refcount_acquire(&pdp->pd_refcount); 2361 return (pdp); 2362 } 2363 2364 /* 2365 * Unshare a filedesc structure, if necessary by making a copy. 2366 */ 2367 void 2368 fdunshare(struct thread *td) 2369 { 2370 struct filedesc *tmp; 2371 struct proc *p = td->td_proc; 2372 2373 if (refcount_load(&p->p_fd->fd_refcnt) == 1) 2374 return; 2375 2376 tmp = fdcopy(p->p_fd); 2377 fdescfree(td); 2378 p->p_fd = tmp; 2379 } 2380 2381 /* 2382 * Unshare a pwddesc structure. 2383 */ 2384 void 2385 pdunshare(struct thread *td) 2386 { 2387 struct pwddesc *pdp; 2388 struct proc *p; 2389 2390 p = td->td_proc; 2391 /* Not shared. */ 2392 if (refcount_load(&p->p_pd->pd_refcount) == 1) 2393 return; 2394 2395 pdp = pdcopy(p->p_pd); 2396 pdescfree(td); 2397 p->p_pd = pdp; 2398 } 2399 2400 /* 2401 * Copy a filedesc structure. The source table must not be NULL; only 2402 * passable descriptors (see the loop below) are duplicated. 2403 */ 2404 struct filedesc * 2405 fdcopy(struct filedesc *fdp) 2406 { 2407 struct filedesc *newfdp; 2408 struct filedescent *nfde, *ofde; 2409 int i, lastfile; 2410 2411 MPASS(fdp != NULL); 2412 2413 newfdp = fdinit(); 2414 FILEDESC_SLOCK(fdp); 2415 for (;;) { 2416 lastfile = fdlastfile(fdp); 2417 if (lastfile < newfdp->fd_nfiles) 2418 break; 2419 FILEDESC_SUNLOCK(fdp); 2420 fdgrowtable(newfdp, lastfile + 1); 2421 FILEDESC_SLOCK(fdp); 2422 } 2423 /* copy all passable descriptors (i.e.
not kqueue) */ 2424 newfdp->fd_freefile = fdp->fd_freefile; 2425 FILEDESC_FOREACH_FDE(fdp, i, ofde) { 2426 if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || 2427 !fhold(ofde->fde_file)) { 2428 if (newfdp->fd_freefile == fdp->fd_freefile) 2429 newfdp->fd_freefile = i; 2430 continue; 2431 } 2432 nfde = &newfdp->fd_ofiles[i]; 2433 *nfde = *ofde; 2434 filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); 2435 fdused_init(newfdp, i); 2436 } 2437 MPASS(newfdp->fd_freefile != -1); 2438 FILEDESC_SUNLOCK(fdp); 2439 return (newfdp); 2440 } 2441 2442 /* 2443 * Copy a pwddesc structure. 2444 */ 2445 struct pwddesc * 2446 pdcopy(struct pwddesc *pdp) 2447 { 2448 struct pwddesc *newpdp; 2449 2450 MPASS(pdp != NULL); 2451 2452 newpdp = pdinit(pdp, true); 2453 newpdp->pd_cmask = pdp->pd_cmask; 2454 PWDDESC_XUNLOCK(pdp); 2455 return (newpdp); 2456 } 2457 2458 /* 2459 * Clear POSIX-style locks. This is only used when fdp loses a reference (i.e. 2460 * one of the processes using it exits) and the table used to be shared. 2461 */ 2462 static void 2463 fdclearlocks(struct thread *td) 2464 { 2465 struct filedesc *fdp; 2466 struct filedesc_to_leader *fdtol; 2467 struct flock lf; 2468 struct file *fp; 2469 struct proc *p; 2470 struct vnode *vp; 2471 int i; 2472 2473 p = td->td_proc; 2474 fdp = p->p_fd; 2475 fdtol = p->p_fdtol; 2476 MPASS(fdtol != NULL); 2477 2478 FILEDESC_XLOCK(fdp); 2479 KASSERT(fdtol->fdl_refcount > 0, 2480 ("filedesc_to_refcount botch: fdl_refcount=%d", 2481 fdtol->fdl_refcount)); 2482 if (fdtol->fdl_refcount == 1 && 2483 (p->p_leader->p_flag & P_ADVLOCK) != 0) { 2484 FILEDESC_FOREACH_FP(fdp, i, fp) { 2485 if (fp->f_type != DTYPE_VNODE || 2486 !fhold(fp)) 2487 continue; 2488 FILEDESC_XUNLOCK(fdp); 2489 lf.l_whence = SEEK_SET; 2490 lf.l_start = 0; 2491 lf.l_len = 0; 2492 lf.l_type = F_UNLCK; 2493 vp = fp->f_vnode; 2494 (void) VOP_ADVLOCK(vp, 2495 (caddr_t)p->p_leader, F_UNLCK, 2496 &lf, F_POSIX); 2497 FILEDESC_XLOCK(fdp); 2498 fdrop(fp, td); 2499 } 2500 } 2501 retry: 2502 if (fdtol->fdl_refcount == 1) { 2503 if (fdp->fd_holdleaderscount > 0 && 2504 (p->p_leader->p_flag & P_ADVLOCK) != 0) { 2505 /* 2506 * close() or kern_dup() has cleared a reference 2507 * in a shared file descriptor table. 2508 */ 2509 fdp->fd_holdleaderswakeup = 1; 2510 sx_sleep(&fdp->fd_holdleaderscount, 2511 FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); 2512 goto retry; 2513 } 2514 if (fdtol->fdl_holdcount > 0) { 2515 /* 2516 * Ensure that fdtol->fdl_leader remains 2517 * valid in closef(). 2518 */ 2519 fdtol->fdl_wakeup = 1; 2520 sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, 2521 "fdlhold", 0); 2522 goto retry; 2523 } 2524 } 2525 fdtol->fdl_refcount--; 2526 if (fdtol->fdl_refcount == 0 && 2527 fdtol->fdl_holdcount == 0) { 2528 fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; 2529 fdtol->fdl_prev->fdl_next = fdtol->fdl_next; 2530 } else 2531 fdtol = NULL; 2532 p->p_fdtol = NULL; 2533 FILEDESC_XUNLOCK(fdp); 2534 if (fdtol != NULL) 2535 free(fdtol, M_FILEDESC_TO_LEADER); 2536 } 2537 2538 /* 2539 * Release a filedesc structure. 2540 */ 2541 static void 2542 fdescfree_fds(struct thread *td, struct filedesc *fdp) 2543 { 2544 struct filedesc0 *fdp0; 2545 struct freetable *ft, *tft; 2546 struct filedescent *fde; 2547 struct file *fp; 2548 int i; 2549 2550 KASSERT(refcount_load(&fdp->fd_refcnt) == 0, 2551 ("%s: fd table %p carries references", __func__, fdp)); 2552 2553 /* 2554 * Serialize with threads iterating over the table, if any.
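 * Taking and immediately dropping the exclusive lock below acts as a
 * barrier: any thread still holding the lock across a table walk is
 * guaranteed to have finished before the entries are torn down.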
2555 */ 2556 if (refcount_load(&fdp->fd_holdcnt) > 1) { 2557 FILEDESC_XLOCK(fdp); 2558 FILEDESC_XUNLOCK(fdp); 2559 } 2560 2561 FILEDESC_FOREACH_FDE(fdp, i, fde) { 2562 fp = fde->fde_file; 2563 fdefree_last(fde); 2564 (void) closef(fp, td); 2565 } 2566 2567 if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) 2568 free(fdp->fd_map, M_FILEDESC); 2569 if (fdp->fd_nfiles > NDFILE) 2570 free(fdp->fd_files, M_FILEDESC); 2571 2572 fdp0 = (struct filedesc0 *)fdp; 2573 SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) 2574 free(ft->ft_table, M_FILEDESC); 2575 2576 fddrop(fdp); 2577 } 2578 2579 void 2580 fdescfree(struct thread *td) 2581 { 2582 struct proc *p; 2583 struct filedesc *fdp; 2584 2585 p = td->td_proc; 2586 fdp = p->p_fd; 2587 MPASS(fdp != NULL); 2588 2589 #ifdef RACCT 2590 if (RACCT_ENABLED()) 2591 racct_set_unlocked(p, RACCT_NOFILE, 0); 2592 #endif 2593 2594 if (p->p_fdtol != NULL) 2595 fdclearlocks(td); 2596 2597 /* 2598 * Check fdhold for an explanation. 2599 */ 2600 atomic_store_ptr(&p->p_fd, NULL); 2601 atomic_thread_fence_seq_cst(); 2602 PROC_WAIT_UNLOCKED(p); 2603 2604 if (refcount_release(&fdp->fd_refcnt) == 0) 2605 return; 2606 2607 fdescfree_fds(td, fdp); 2608 } 2609 2610 void 2611 pdescfree(struct thread *td) 2612 { 2613 struct proc *p; 2614 struct pwddesc *pdp; 2615 2616 p = td->td_proc; 2617 pdp = p->p_pd; 2618 MPASS(pdp != NULL); 2619 2620 /* 2621 * Check pdhold for an explanation. 2622 */ 2623 atomic_store_ptr(&p->p_pd, NULL); 2624 atomic_thread_fence_seq_cst(); 2625 PROC_WAIT_UNLOCKED(p); 2626 2627 pddrop(pdp); 2628 } 2629 2630 /* 2631 * For setugid programs, we don't want people to use that setugidness 2632 * to generate error messages which write to a file that would 2633 * otherwise be off-limits to the process. We check for filesystems where 2634 * the vnode can change out from under us after execve (like [lin]procfs). 2635 * 2636 * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is 2637 * sufficient. We also don't check for setugidness since we know we are. 2638 */ 2639 static bool 2640 is_unsafe(struct file *fp) 2641 { 2642 struct vnode *vp; 2643 2644 if (fp->f_type != DTYPE_VNODE) 2645 return (false); 2646 2647 vp = fp->f_vnode; 2648 return ((vp->v_vflag & VV_PROCDEP) != 0); 2649 } 2650 2651 /* 2652 * Make this setugid thing safe, if at all possible. 2653 */ 2654 void 2655 fdsetugidsafety(struct thread *td) 2656 { 2657 struct filedesc *fdp; 2658 struct file *fp; 2659 int i; 2660 2661 fdp = td->td_proc->p_fd; 2662 KASSERT(refcount_load(&fdp->fd_refcnt) == 1, 2663 ("the fdtable should not be shared")); 2664 MPASS(fdp->fd_nfiles >= 3); 2665 for (i = 0; i <= 2; i++) { 2666 fp = fdp->fd_ofiles[i].fde_file; 2667 if (fp != NULL && is_unsafe(fp)) { 2668 FILEDESC_XLOCK(fdp); 2669 knote_fdclose(td, i); 2670 /* 2671 * NULL-out descriptor prior to close to avoid 2672 * a race while close blocks. 2673 */ 2674 fdfree(fdp, i); 2675 FILEDESC_XUNLOCK(fdp); 2676 (void) closef(fp, td); 2677 } 2678 } 2679 } 2680 2681 /* 2682 * If a specific file object occupies a specific file descriptor, close the 2683 * file descriptor entry and drop a reference on the file object. This is a 2684 * convenience function for handling a subsequent error in a function that 2685 * calls falloc(); it handles the race that another thread might have closed the 2686 * file descriptor out from under the thread creating the file object.
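 *
 * A hypothetical error path might look like this (sketch; do_setup() is a
 * made-up stand-in for whatever fails after falloc()):
 *
 *	error = falloc(td, &fp, &fd, 0);
 *	if (error != 0)
 *		return (error);
 *	error = do_setup(fp);
 *	if (error != 0) {
 *		fdclose(td, fp, fd);
 *		fdrop(fp, td);
 *		return (error);
 *	}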
2687 */ 2688 void 2689 fdclose(struct thread *td, struct file *fp, int idx) 2690 { 2691 struct filedesc *fdp = td->td_proc->p_fd; 2692 2693 FILEDESC_XLOCK(fdp); 2694 if (fdp->fd_ofiles[idx].fde_file == fp) { 2695 fdfree(fdp, idx); 2696 FILEDESC_XUNLOCK(fdp); 2697 fdrop(fp, td); 2698 } else 2699 FILEDESC_XUNLOCK(fdp); 2700 } 2701 2702 /* 2703 * Close files marked close-on-exec (and POSIX message queues) on exec. 2704 */ 2705 void 2706 fdcloseexec(struct thread *td) 2707 { 2708 struct filedesc *fdp; 2709 struct filedescent *fde; 2710 struct file *fp; 2711 int i; 2712 2713 fdp = td->td_proc->p_fd; 2714 KASSERT(refcount_load(&fdp->fd_refcnt) == 1, 2715 ("the fdtable should not be shared")); 2716 FILEDESC_FOREACH_FDE(fdp, i, fde) { 2717 fp = fde->fde_file; 2718 if (fp->f_type == DTYPE_MQUEUE || 2719 (fde->fde_flags & UF_EXCLOSE)) { 2720 FILEDESC_XLOCK(fdp); 2721 fdfree(fdp, i); 2722 (void) closefp(fdp, i, fp, td, false, false); 2723 FILEDESC_UNLOCK_ASSERT(fdp); 2724 } 2725 } 2726 } 2727 2728 /* 2729 * It is unsafe for set[ug]id processes to be started with file 2730 * descriptors 0..2 closed, as these descriptors are given implicit 2731 * significance in the Standard C library. fdcheckstd() will create a 2732 * descriptor referencing /dev/null for each of stdin, stdout, and 2733 * stderr that is not already open. 2734 */ 2735 int 2736 fdcheckstd(struct thread *td) 2737 { 2738 struct filedesc *fdp; 2739 register_t save; 2740 int i, error, devnull; 2741 2742 fdp = td->td_proc->p_fd; 2743 KASSERT(refcount_load(&fdp->fd_refcnt) == 1, 2744 ("the fdtable should not be shared")); 2745 MPASS(fdp->fd_nfiles >= 3); 2746 devnull = -1; 2747 for (i = 0; i <= 2; i++) { 2748 if (fdp->fd_ofiles[i].fde_file != NULL) 2749 continue; 2750 2751 save = td->td_retval[0]; 2752 if (devnull != -1) { 2753 error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); 2754 } else { 2755 error = kern_openat(td, AT_FDCWD, "/dev/null", 2756 UIO_SYSSPACE, O_RDWR, 0); 2757 if (error == 0) { 2758 devnull = td->td_retval[0]; 2759 KASSERT(devnull == i, ("we didn't get our fd")); 2760 } 2761 } 2762 td->td_retval[0] = save; 2763 if (error != 0) 2764 return (error); 2765 } 2766 return (0); 2767 } 2768 2769 /* 2770 * Internal form of close. Decrement reference count on file structure. 2771 * Note: td must not be NULL; files without an owning thread are closed 2772 * via closef_nothread(). 2773 */ 2774 int 2775 closef(struct file *fp, struct thread *td) 2776 { 2777 struct vnode *vp; 2778 struct flock lf; 2779 struct filedesc_to_leader *fdtol; 2780 struct filedesc *fdp; 2781 2782 MPASS(td != NULL); 2783 2784 /* 2785 * POSIX record locking dictates that any close releases ALL 2786 * locks owned by this process. This is handled by setting 2787 * a flag in the unlock to free ONLY locks obeying POSIX 2788 * semantics, and not to free BSD-style file locks. 2789 * If the descriptor was in a message, POSIX-style locks 2790 * aren't passed with the descriptor; such files are closed 2791 * through closef_nothread() instead, which skips this 2792 * processing entirely, since without an owning context 2793 * there are no POSIX locks to be 2794 * released.
2795 */ 2796 if (fp->f_type == DTYPE_VNODE) { 2797 vp = fp->f_vnode; 2798 if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { 2799 lf.l_whence = SEEK_SET; 2800 lf.l_start = 0; 2801 lf.l_len = 0; 2802 lf.l_type = F_UNLCK; 2803 (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, 2804 F_UNLCK, &lf, F_POSIX); 2805 } 2806 fdtol = td->td_proc->p_fdtol; 2807 if (fdtol != NULL) { 2808 /* 2809 * Handle special case where file descriptor table is 2810 * shared between multiple process leaders. 2811 */ 2812 fdp = td->td_proc->p_fd; 2813 FILEDESC_XLOCK(fdp); 2814 for (fdtol = fdtol->fdl_next; 2815 fdtol != td->td_proc->p_fdtol; 2816 fdtol = fdtol->fdl_next) { 2817 if ((fdtol->fdl_leader->p_flag & 2818 P_ADVLOCK) == 0) 2819 continue; 2820 fdtol->fdl_holdcount++; 2821 FILEDESC_XUNLOCK(fdp); 2822 lf.l_whence = SEEK_SET; 2823 lf.l_start = 0; 2824 lf.l_len = 0; 2825 lf.l_type = F_UNLCK; 2826 vp = fp->f_vnode; 2827 (void) VOP_ADVLOCK(vp, 2828 (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, 2829 F_POSIX); 2830 FILEDESC_XLOCK(fdp); 2831 fdtol->fdl_holdcount--; 2832 if (fdtol->fdl_holdcount == 0 && 2833 fdtol->fdl_wakeup != 0) { 2834 fdtol->fdl_wakeup = 0; 2835 wakeup(fdtol); 2836 } 2837 } 2838 FILEDESC_XUNLOCK(fdp); 2839 } 2840 } 2841 return (fdrop_close(fp, td)); 2842 } 2843 2844 /* 2845 * Hack for file descriptor passing code. 2846 */ 2847 void 2848 closef_nothread(struct file *fp) 2849 { 2850 2851 fdrop(fp, NULL); 2852 } 2853 2854 /* 2855 * Initialize the file pointer with the specified properties. 2856 * 2857 * The ops are set with release semantics to be certain that the flags, type, 2858 * and data are visible when ops is. This is to prevent ops methods from being 2859 * called with bad data. 2860 */ 2861 void 2862 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) 2863 { 2864 fp->f_data = data; 2865 fp->f_flag = flag; 2866 fp->f_type = type; 2867 atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); 2868 } 2869 2870 void 2871 finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops) 2872 { 2873 fp->f_seqcount[UIO_READ] = 1; 2874 fp->f_seqcount[UIO_WRITE] = 1; 2875 finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, 2876 data, ops); 2877 } 2878 2879 int 2880 fget_cap_noref(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, 2881 struct file **fpp, struct filecaps *havecapsp) 2882 { 2883 struct filedescent *fde; 2884 int error; 2885 2886 FILEDESC_LOCK_ASSERT(fdp); 2887 2888 *fpp = NULL; 2889 fde = fdeget_noref(fdp, fd); 2890 if (fde == NULL) { 2891 error = EBADF; 2892 goto out; 2893 } 2894 2895 #ifdef CAPABILITIES 2896 error = cap_check(cap_rights_fde_inline(fde), needrightsp); 2897 if (error != 0) 2898 goto out; 2899 #endif 2900 2901 if (havecapsp != NULL) 2902 filecaps_copy(&fde->fde_caps, havecapsp, true); 2903 2904 *fpp = fde->fde_file; 2905 2906 error = 0; 2907 out: 2908 return (error); 2909 } 2910 2911 #ifdef CAPABILITIES 2912 int 2913 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, 2914 struct file **fpp, struct filecaps *havecapsp) 2915 { 2916 struct filedesc *fdp = td->td_proc->p_fd; 2917 int error; 2918 struct file *fp; 2919 seqc_t seq; 2920 2921 *fpp = NULL; 2922 for (;;) { 2923 error = fget_unlocked_seq(td, fd, needrightsp, &fp, &seq); 2924 if (error != 0) 2925 return (error); 2926 2927 if (havecapsp != NULL) { 2928 if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, 2929 havecapsp, false)) { 2930 fdrop(fp, td); 2931 goto get_locked; 2932 } 2933 } 2934 2935 if (!fd_modified(fdp, fd, seq)) 2936 break; 
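/* The entry changed while unlocked; drop the old reference and retry. */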
2937 fdrop(fp, td); 2938 } 2939 2940 *fpp = fp; 2941 return (0); 2942 2943 get_locked: 2944 FILEDESC_SLOCK(fdp); 2945 error = fget_cap_noref(fdp, fd, needrightsp, fpp, havecapsp); 2946 if (error == 0 && !fhold(*fpp)) 2947 error = EBADF; 2948 FILEDESC_SUNLOCK(fdp); 2949 return (error); 2950 } 2951 #else 2952 int 2953 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, 2954 struct file **fpp, struct filecaps *havecapsp) 2955 { 2956 int error; 2957 error = fget_unlocked(td, fd, needrightsp, fpp); 2958 if (havecapsp != NULL && error == 0) 2959 filecaps_fill(havecapsp); 2960 2961 return (error); 2962 } 2963 #endif 2964 2965 #ifdef CAPABILITIES 2966 int 2967 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) 2968 { 2969 const struct filedescent *fde; 2970 const struct fdescenttbl *fdt; 2971 struct filedesc *fdp; 2972 struct file *fp; 2973 struct vnode *vp; 2974 const cap_rights_t *haverights; 2975 cap_rights_t rights; 2976 seqc_t seq; 2977 2978 VFS_SMR_ASSERT_ENTERED(); 2979 2980 rights = *ndp->ni_rightsneeded; 2981 cap_rights_set_one(&rights, CAP_LOOKUP); 2982 2983 fdp = curproc->p_fd; 2984 fdt = fdp->fd_files; 2985 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 2986 return (EBADF); 2987 seq = seqc_read_notmodify(fd_seqc(fdt, fd)); 2988 fde = &fdt->fdt_ofiles[fd]; 2989 haverights = cap_rights_fde_inline(fde); 2990 fp = fde->fde_file; 2991 if (__predict_false(fp == NULL)) 2992 return (EAGAIN); 2993 if (__predict_false(cap_check_inline_transient(haverights, &rights))) 2994 return (EAGAIN); 2995 *fsearch = ((fp->f_flag & FSEARCH) != 0); 2996 vp = fp->f_vnode; 2997 if (__predict_false(vp == NULL)) { 2998 return (EAGAIN); 2999 } 3000 if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) { 3001 return (EAGAIN); 3002 } 3003 /* 3004 * Use an acquire barrier to force re-reading of fdt so it is 3005 * refreshed for verification. 3006 */ 3007 atomic_thread_fence_acq(); 3008 fdt = fdp->fd_files; 3009 if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) 3010 return (EAGAIN); 3011 /* 3012 * If file descriptor doesn't have all rights, 3013 * all lookups relative to it must also be 3014 * strictly relative. 3015 * 3016 * Not yet supported by fast path. 3017 */ 3018 CAP_ALL(&rights); 3019 if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || 3020 ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || 3021 ndp->ni_filecaps.fc_nioctls != -1) { 3022 #ifdef notyet 3023 ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; 3024 #else 3025 return (EAGAIN); 3026 #endif 3027 } 3028 *vpp = vp; 3029 return (0); 3030 } 3031 #else 3032 int 3033 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) 3034 { 3035 const struct fdescenttbl *fdt; 3036 struct filedesc *fdp; 3037 struct file *fp; 3038 struct vnode *vp; 3039 3040 VFS_SMR_ASSERT_ENTERED(); 3041 3042 fdp = curproc->p_fd; 3043 fdt = fdp->fd_files; 3044 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3045 return (EBADF); 3046 fp = fdt->fdt_ofiles[fd].fde_file; 3047 if (__predict_false(fp == NULL)) 3048 return (EAGAIN); 3049 *fsearch = ((fp->f_flag & FSEARCH) != 0); 3050 vp = fp->f_vnode; 3051 if (__predict_false(vp == NULL || vp->v_type != VDIR)) { 3052 return (EAGAIN); 3053 } 3054 /* 3055 * Use an acquire barrier to force re-reading of fdt so it is 3056 * refreshed for verification. 
3057 */ 3058 atomic_thread_fence_acq(); 3059 fdt = fdp->fd_files; 3060 if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) 3061 return (EAGAIN); 3062 filecaps_fill(&ndp->ni_filecaps); 3063 *vpp = vp; 3064 return (0); 3065 } 3066 #endif 3067 3068 int 3069 fgetvp_lookup(int fd, struct nameidata *ndp, struct vnode **vpp) 3070 { 3071 struct thread *td; 3072 struct file *fp; 3073 struct vnode *vp; 3074 struct componentname *cnp; 3075 cap_rights_t rights; 3076 int error; 3077 3078 td = curthread; 3079 rights = *ndp->ni_rightsneeded; 3080 cap_rights_set_one(&rights, CAP_LOOKUP); 3081 cnp = &ndp->ni_cnd; 3082 3083 error = fget_cap(td, ndp->ni_dirfd, &rights, &fp, &ndp->ni_filecaps); 3084 if (__predict_false(error != 0)) 3085 return (error); 3086 if (__predict_false(fp->f_ops == &badfileops)) { 3087 error = EBADF; 3088 goto out_free; 3089 } 3090 vp = fp->f_vnode; 3091 if (__predict_false(vp == NULL)) { 3092 error = ENOTDIR; 3093 goto out_free; 3094 } 3095 vrefact(vp); 3096 /* 3097 * XXX does not check for VDIR, handled by namei_setup 3098 */ 3099 if ((fp->f_flag & FSEARCH) != 0) 3100 cnp->cn_flags |= NOEXECCHECK; 3101 fdrop(fp, td); 3102 3103 #ifdef CAPABILITIES 3104 /* 3105 * If file descriptor doesn't have all rights, 3106 * all lookups relative to it must also be 3107 * strictly relative. 3108 */ 3109 CAP_ALL(&rights); 3110 if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || 3111 ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || 3112 ndp->ni_filecaps.fc_nioctls != -1) { 3113 ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; 3114 ndp->ni_resflags |= NIRES_STRICTREL; 3115 } 3116 #endif 3117 3118 /* 3119 * TODO: avoid copying ioctl caps if it can be helped to begin with 3120 */ 3121 if ((cnp->cn_flags & WANTIOCTLCAPS) == 0) 3122 filecaps_free_ioctl(&ndp->ni_filecaps); 3123 3124 *vpp = vp; 3125 return (0); 3126 3127 out_free: 3128 filecaps_free(&ndp->ni_filecaps); 3129 fdrop(fp, td); 3130 return (error); 3131 } 3132 3133 /* 3134 * Fetch the descriptor locklessly. 3135 * 3136 * We avoid fdrop() races by never raising a refcount back up from 0. To accomplish 3137 * this we have to use a cmpset loop rather than an atomic_add. The descriptor 3138 * must be re-verified once we acquire a reference to be certain that the 3139 * identity is still correct and we did not lose a race due to preemption. 3140 * 3141 * Force a reload of fdt when looping. Another thread could reallocate 3142 * the table before this fd was closed, so it is possible that there is 3143 * a stale fp pointer in the cached version.
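 *
 * In outline, the CAPABILITIES variant of the loop below does roughly
 * (pseudocode sketch, not verbatim):
 *
 *	seq = seqc_read_notmodify(fd_seqc(fdt, fd));
 *	fp = fdt->fdt_ofiles[fd].fde_file;
 *	if (!refcount_acquire_if_not_zero(&fp->f_count))
 *		reload fdt and retry;
 *	if (!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))
 *		fdrop(fp, td) and retry;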
3144 */ 3145 #ifdef CAPABILITIES 3146 static int 3147 fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, 3148 struct file **fpp, seqc_t *seqp) 3149 { 3150 struct filedesc *fdp; 3151 const struct filedescent *fde; 3152 const struct fdescenttbl *fdt; 3153 struct file *fp; 3154 seqc_t seq; 3155 cap_rights_t haverights; 3156 int error; 3157 3158 fdp = td->td_proc->p_fd; 3159 fdt = fdp->fd_files; 3160 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3161 return (EBADF); 3162 3163 for (;;) { 3164 seq = seqc_read_notmodify(fd_seqc(fdt, fd)); 3165 fde = &fdt->fdt_ofiles[fd]; 3166 haverights = *cap_rights_fde_inline(fde); 3167 fp = fde->fde_file; 3168 if (__predict_false(fp == NULL)) { 3169 if (seqc_consistent(fd_seqc(fdt, fd), seq)) 3170 return (EBADF); 3171 fdt = atomic_load_ptr(&fdp->fd_files); 3172 continue; 3173 } 3174 error = cap_check_inline(&haverights, needrightsp); 3175 if (__predict_false(error != 0)) { 3176 if (seqc_consistent(fd_seqc(fdt, fd), seq)) 3177 return (error); 3178 fdt = atomic_load_ptr(&fdp->fd_files); 3179 continue; 3180 } 3181 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { 3182 fdt = atomic_load_ptr(&fdp->fd_files); 3183 continue; 3184 } 3185 /* 3186 * Use an acquire barrier to force re-reading of fdt so it is 3187 * refreshed for verification. 3188 */ 3189 atomic_thread_fence_acq(); 3190 fdt = fdp->fd_files; 3191 if (seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)) 3192 break; 3193 fdrop(fp, td); 3194 } 3195 *fpp = fp; 3196 if (seqp != NULL) { 3197 *seqp = seq; 3198 } 3199 return (0); 3200 } 3201 #else 3202 static int 3203 fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, 3204 struct file **fpp, seqc_t *seqp __unused) 3205 { 3206 struct filedesc *fdp; 3207 const struct fdescenttbl *fdt; 3208 struct file *fp; 3209 3210 fdp = td->td_proc->p_fd; 3211 fdt = fdp->fd_files; 3212 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3213 return (EBADF); 3214 3215 for (;;) { 3216 fp = fdt->fdt_ofiles[fd].fde_file; 3217 if (__predict_false(fp == NULL)) 3218 return (EBADF); 3219 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { 3220 fdt = atomic_load_ptr(&fdp->fd_files); 3221 continue; 3222 } 3223 /* 3224 * Use an acquire barrier to force re-reading of fdt so it is 3225 * refreshed for verification. 3226 */ 3227 atomic_thread_fence_acq(); 3228 fdt = fdp->fd_files; 3229 if (__predict_true(fp == fdt->fdt_ofiles[fd].fde_file)) 3230 break; 3231 fdrop(fp, td); 3232 } 3233 *fpp = fp; 3234 return (0); 3235 } 3236 #endif 3237 3238 /* 3239 * See the comments in fget_unlocked_seq for an explanation of how this works. 3240 * 3241 * This is a simplified variant which bails out to the aforementioned routine 3242 * if anything goes wrong. In practice this only happens when userspace is 3243 * racing with itself. 
3244 */ 3245 int 3246 fget_unlocked(struct thread *td, int fd, cap_rights_t *needrightsp, 3247 struct file **fpp) 3248 { 3249 struct filedesc *fdp; 3250 #ifdef CAPABILITIES 3251 const struct filedescent *fde; 3252 #endif 3253 const struct fdescenttbl *fdt; 3254 struct file *fp; 3255 #ifdef CAPABILITIES 3256 seqc_t seq; 3257 const cap_rights_t *haverights; 3258 #endif 3259 3260 fdp = td->td_proc->p_fd; 3261 fdt = fdp->fd_files; 3262 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) { 3263 *fpp = NULL; 3264 return (EBADF); 3265 } 3266 #ifdef CAPABILITIES 3267 seq = seqc_read_notmodify(fd_seqc(fdt, fd)); 3268 fde = &fdt->fdt_ofiles[fd]; 3269 haverights = cap_rights_fde_inline(fde); 3270 fp = fde->fde_file; 3271 #else 3272 fp = fdt->fdt_ofiles[fd].fde_file; 3273 #endif 3274 if (__predict_false(fp == NULL)) 3275 goto out_fallback; 3276 #ifdef CAPABILITIES 3277 if (__predict_false(cap_check_inline_transient(haverights, needrightsp))) 3278 goto out_fallback; 3279 #endif 3280 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) 3281 goto out_fallback; 3282 3283 /* 3284 * Use an acquire barrier to force re-reading of fdt so it is 3285 * refreshed for verification. 3286 */ 3287 atomic_thread_fence_acq(); 3288 fdt = fdp->fd_files; 3289 #ifdef CAPABILITIES 3290 if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) 3291 #else 3292 if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) 3293 #endif 3294 goto out_fdrop; 3295 *fpp = fp; 3296 return (0); 3297 out_fdrop: 3298 fdrop(fp, td); 3299 out_fallback: 3300 *fpp = NULL; 3301 return (fget_unlocked_seq(td, fd, needrightsp, fpp, NULL)); 3302 } 3303 3304 /* 3305 * Translate fd -> file when the caller guarantees the file descriptor table 3306 * can't be changed by others. 3307 * 3308 * Note this does not mean the file object itself is only visible to the caller, 3309 * merely that it won't disappear, so no extra reference needs to be taken. 3310 * 3311 * Must be paired with fput_only_user. 3312 */ 3313 #ifdef CAPABILITIES 3314 int 3315 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, 3316 struct file **fpp) 3317 { 3318 const struct filedescent *fde; 3319 const struct fdescenttbl *fdt; 3320 const cap_rights_t *haverights; 3321 struct file *fp; 3322 int error; 3323 3324 MPASS(FILEDESC_IS_ONLY_USER(fdp)); 3325 3326 *fpp = NULL; 3327 if (__predict_false(fd >= fdp->fd_nfiles)) 3328 return (EBADF); 3329 3330 fdt = fdp->fd_files; 3331 fde = &fdt->fdt_ofiles[fd]; 3332 fp = fde->fde_file; 3333 if (__predict_false(fp == NULL)) 3334 return (EBADF); 3335 MPASS(refcount_load(&fp->f_count) > 0); 3336 haverights = cap_rights_fde_inline(fde); 3337 error = cap_check_inline(haverights, needrightsp); 3338 if (__predict_false(error != 0)) 3339 return (error); 3340 *fpp = fp; 3341 return (0); 3342 } 3343 #else 3344 int 3345 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, 3346 struct file **fpp) 3347 { 3348 struct file *fp; 3349 3350 MPASS(FILEDESC_IS_ONLY_USER(fdp)); 3351 3352 *fpp = NULL; 3353 if (__predict_false(fd >= fdp->fd_nfiles)) 3354 return (EBADF); 3355 3356 fp = fdp->fd_ofiles[fd].fde_file; 3357 if (__predict_false(fp == NULL)) 3358 return (EBADF); 3359 3360 MPASS(refcount_load(&fp->f_count) > 0); 3361 *fpp = fp; 3362 return (0); 3363 } 3364 #endif 3365 3366 /* 3367 * Extract the file pointer associated with the specified descriptor for the 3368 * current user process. 3369 * 3370 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is 3371 * returned.
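 * For example, fget_read() below passes FREAD and fget_write() passes
 * FWRITE as 'flags'.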
3372 * 3373 * File's rights will be checked against the capability rights mask. 3374 * 3375 * If an error occurred the non-zero error is returned and *fpp is set to 3376 * NULL. Otherwise *fpp is held and set and zero is returned. Caller is 3377 * responsible for fdrop(). 3378 */ 3379 static __inline int 3380 _fget(struct thread *td, int fd, struct file **fpp, int flags, 3381 cap_rights_t *needrightsp) 3382 { 3383 struct file *fp; 3384 int error; 3385 3386 *fpp = NULL; 3387 error = fget_unlocked(td, fd, needrightsp, &fp); 3388 if (__predict_false(error != 0)) 3389 return (error); 3390 if (__predict_false(fp->f_ops == &badfileops)) { 3391 fdrop(fp, td); 3392 return (EBADF); 3393 } 3394 3395 /* 3396 * FREAD and FWRITE failure return EBADF as per POSIX. 3397 */ 3398 error = 0; 3399 switch (flags) { 3400 case FREAD: 3401 case FWRITE: 3402 if ((fp->f_flag & flags) == 0) 3403 error = EBADF; 3404 break; 3405 case FEXEC: 3406 if (fp->f_ops != &path_fileops && 3407 ((fp->f_flag & (FREAD | FEXEC)) == 0 || 3408 (fp->f_flag & FWRITE) != 0)) 3409 error = EBADF; 3410 break; 3411 case 0: 3412 break; 3413 default: 3414 KASSERT(0, ("wrong flags")); 3415 } 3416 3417 if (error != 0) { 3418 fdrop(fp, td); 3419 return (error); 3420 } 3421 3422 *fpp = fp; 3423 return (0); 3424 } 3425 3426 int 3427 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 3428 { 3429 3430 return (_fget(td, fd, fpp, 0, rightsp)); 3431 } 3432 3433 int 3434 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, 3435 struct file **fpp) 3436 { 3437 int error; 3438 #ifndef CAPABILITIES 3439 error = _fget(td, fd, fpp, 0, rightsp); 3440 if (maxprotp != NULL) 3441 *maxprotp = VM_PROT_ALL; 3442 return (error); 3443 #else 3444 cap_rights_t fdrights; 3445 struct filedesc *fdp; 3446 struct file *fp; 3447 seqc_t seq; 3448 3449 *fpp = NULL; 3450 fdp = td->td_proc->p_fd; 3451 MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); 3452 for (;;) { 3453 error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); 3454 if (__predict_false(error != 0)) 3455 return (error); 3456 if (__predict_false(fp->f_ops == &badfileops)) { 3457 fdrop(fp, td); 3458 return (EBADF); 3459 } 3460 if (maxprotp != NULL) 3461 fdrights = *cap_rights(fdp, fd); 3462 if (!fd_modified(fdp, fd, seq)) 3463 break; 3464 fdrop(fp, td); 3465 } 3466 3467 /* 3468 * If requested, convert capability rights to access flags. 
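 * (e.g. a descriptor whose rights lack CAP_MMAP_W should not yield a
 * writable mapping; see cap_rights_to_vmprot().)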
3469 */ 3470 if (maxprotp != NULL) 3471 *maxprotp = cap_rights_to_vmprot(&fdrights); 3472 *fpp = fp; 3473 return (0); 3474 #endif 3475 } 3476 3477 int 3478 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 3479 { 3480 3481 return (_fget(td, fd, fpp, FREAD, rightsp)); 3482 } 3483 3484 int 3485 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 3486 { 3487 3488 return (_fget(td, fd, fpp, FWRITE, rightsp)); 3489 } 3490 3491 int 3492 fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, 3493 struct file **fpp) 3494 { 3495 #ifndef CAPABILITIES 3496 return (fget_unlocked(td, fd, rightsp, fpp)); 3497 #else 3498 struct filedesc *fdp = td->td_proc->p_fd; 3499 struct file *fp; 3500 int error; 3501 seqc_t seq; 3502 3503 *fpp = NULL; 3504 MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); 3505 for (;;) { 3506 error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); 3507 if (error != 0) 3508 return (error); 3509 error = cap_fcntl_check(fdp, fd, needfcntl); 3510 if (!fd_modified(fdp, fd, seq)) 3511 break; 3512 fdrop(fp, td); 3513 } 3514 if (error != 0) { 3515 fdrop(fp, td); 3516 return (error); 3517 } 3518 *fpp = fp; 3519 return (0); 3520 #endif 3521 } 3522 3523 /* 3524 * Like fget() but loads the underlying vnode, or returns an error if the 3525 * descriptor does not represent a vnode. Note that pipes use vnodes but 3526 * never have VM objects. The returned vnode will be vref()'d. 3527 * 3528 * XXX: what about the unused flags ? 3529 */ 3530 static __inline int 3531 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, 3532 struct vnode **vpp) 3533 { 3534 struct file *fp; 3535 int error; 3536 3537 *vpp = NULL; 3538 error = _fget(td, fd, &fp, flags, needrightsp); 3539 if (error != 0) 3540 return (error); 3541 if (fp->f_vnode == NULL) { 3542 error = EINVAL; 3543 } else { 3544 *vpp = fp->f_vnode; 3545 vrefact(*vpp); 3546 } 3547 fdrop(fp, td); 3548 3549 return (error); 3550 } 3551 3552 int 3553 fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) 3554 { 3555 3556 return (_fgetvp(td, fd, 0, rightsp, vpp)); 3557 } 3558 3559 int 3560 fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, 3561 struct filecaps *havecaps, struct vnode **vpp) 3562 { 3563 struct filecaps caps; 3564 struct file *fp; 3565 int error; 3566 3567 error = fget_cap(td, fd, needrightsp, &fp, &caps); 3568 if (error != 0) 3569 return (error); 3570 if (fp->f_ops == &badfileops) { 3571 error = EBADF; 3572 goto out; 3573 } 3574 if (fp->f_vnode == NULL) { 3575 error = EINVAL; 3576 goto out; 3577 } 3578 3579 *havecaps = caps; 3580 *vpp = fp->f_vnode; 3581 vrefact(*vpp); 3582 fdrop(fp, td); 3583 3584 return (0); 3585 out: 3586 filecaps_free(&caps); 3587 fdrop(fp, td); 3588 return (error); 3589 } 3590 3591 int 3592 fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) 3593 { 3594 3595 return (_fgetvp(td, fd, FREAD, rightsp, vpp)); 3596 } 3597 3598 int 3599 fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) 3600 { 3601 3602 return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); 3603 } 3604 3605 #ifdef notyet 3606 int 3607 fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, 3608 struct vnode **vpp) 3609 { 3610 3611 return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); 3612 } 3613 #endif 3614 3615 /* 3616 * Handle the last reference to a file being closed. 
3617 * 3618 * Without the noinline attribute, clang keeps inlining this function throughout 3619 * the file wherever fdrop() is used. 3620 */ 3621 int __noinline 3622 _fdrop(struct file *fp, struct thread *td) 3623 { 3624 int error; 3625 #ifdef INVARIANTS 3626 int count; 3627 3628 count = refcount_load(&fp->f_count); 3629 if (count != 0) 3630 panic("fdrop: fp %p count %d", fp, count); 3631 #endif 3632 error = fo_close(fp, td); 3633 atomic_subtract_int(&openfiles, 1); 3634 crfree(fp->f_cred); 3635 free(fp->f_advice, M_FADVISE); 3636 uma_zfree(file_zone, fp); 3637 3638 return (error); 3639 } 3640 3641 /* 3642 * Apply an advisory lock on a file descriptor. 3643 * 3644 * Just attempt to get a record lock of the requested type on the entire file 3645 * (l_whence = SEEK_SET, l_start = 0, l_len = 0). 3646 */ 3647 #ifndef _SYS_SYSPROTO_H_ 3648 struct flock_args { 3649 int fd; 3650 int how; 3651 }; 3652 #endif 3653 /* ARGSUSED */ 3654 int 3655 sys_flock(struct thread *td, struct flock_args *uap) 3656 { 3657 struct file *fp; 3658 struct vnode *vp; 3659 struct flock lf; 3660 int error; 3661 3662 error = fget(td, uap->fd, &cap_flock_rights, &fp); 3663 if (error != 0) 3664 return (error); 3665 error = EOPNOTSUPP; 3666 if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { 3667 goto done; 3668 } 3669 if (fp->f_ops == &path_fileops) { 3670 goto done; 3671 } 3672 3673 error = 0; 3674 vp = fp->f_vnode; 3675 lf.l_whence = SEEK_SET; 3676 lf.l_start = 0; 3677 lf.l_len = 0; 3678 if (uap->how & LOCK_UN) { 3679 lf.l_type = F_UNLCK; 3680 atomic_clear_int(&fp->f_flag, FHASLOCK); 3681 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); 3682 goto done; 3683 } 3684 if (uap->how & LOCK_EX) 3685 lf.l_type = F_WRLCK; 3686 else if (uap->how & LOCK_SH) 3687 lf.l_type = F_RDLCK; 3688 else { 3689 error = EBADF; 3690 goto done; 3691 } 3692 atomic_set_int(&fp->f_flag, FHASLOCK); 3693 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 3694 (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); 3695 done: 3696 fdrop(fp, td); 3697 return (error); 3698 } 3699 /* 3700 * Duplicate the specified descriptor to a free descriptor. 3701 */ 3702 int 3703 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, 3704 int openerror, int *indxp) 3705 { 3706 struct filedescent *newfde, *oldfde; 3707 struct file *fp; 3708 u_long *ioctls; 3709 int error, indx; 3710 3711 KASSERT(openerror == ENODEV || openerror == ENXIO, 3712 ("unexpected error %d in %s", openerror, __func__)); 3713 3714 /* 3715 * If the to-be-dup'd fd number is greater than the allowed number 3716 * of file descriptors, or the fd to be dup'd has already been 3717 * closed, then reject. 3718 */ 3719 FILEDESC_XLOCK(fdp); 3720 if ((fp = fget_noref(fdp, dfd)) == NULL) { 3721 FILEDESC_XUNLOCK(fdp); 3722 return (EBADF); 3723 } 3724 3725 error = fdalloc(td, 0, &indx); 3726 if (error != 0) { 3727 FILEDESC_XUNLOCK(fdp); 3728 return (error); 3729 } 3730 3731 /* 3732 * There are two cases of interest here. 3733 * 3734 * For ENODEV simply dup (dfd) to file descriptor (indx) and return. 3735 * 3736 * For ENXIO steal away the file structure from (dfd) and store it in 3737 * (indx). (dfd) is effectively closed by this operation. 3738 */ 3739 switch (openerror) { 3740 case ENODEV: 3741 /* 3742 * Check that the mode the file is being opened for is a 3743 * subset of the mode of the existing descriptor.
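 * For instance, an O_RDWR open cannot be satisfied by a descriptor that
 * was opened read-only: the check below rejects any FREAD/FWRITE bit not
 * already set in f_flag.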
3744 */ 3745 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { 3746 fdunused(fdp, indx); 3747 FILEDESC_XUNLOCK(fdp); 3748 return (EACCES); 3749 } 3750 if (!fhold(fp)) { 3751 fdunused(fdp, indx); 3752 FILEDESC_XUNLOCK(fdp); 3753 return (EBADF); 3754 } 3755 newfde = &fdp->fd_ofiles[indx]; 3756 oldfde = &fdp->fd_ofiles[dfd]; 3757 ioctls = filecaps_copy_prep(&oldfde->fde_caps); 3758 #ifdef CAPABILITIES 3759 seqc_write_begin(&newfde->fde_seqc); 3760 #endif 3761 fde_copy(oldfde, newfde); 3762 filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, 3763 ioctls); 3764 #ifdef CAPABILITIES 3765 seqc_write_end(&newfde->fde_seqc); 3766 #endif 3767 break; 3768 case ENXIO: 3769 /* 3770 * Steal away the file pointer from dfd and stuff it into indx. 3771 */ 3772 newfde = &fdp->fd_ofiles[indx]; 3773 oldfde = &fdp->fd_ofiles[dfd]; 3774 #ifdef CAPABILITIES 3775 seqc_write_begin(&oldfde->fde_seqc); 3776 seqc_write_begin(&newfde->fde_seqc); 3777 #endif 3778 fde_copy(oldfde, newfde); 3779 oldfde->fde_file = NULL; 3780 fdunused(fdp, dfd); 3781 #ifdef CAPABILITIES 3782 seqc_write_end(&newfde->fde_seqc); 3783 seqc_write_end(&oldfde->fde_seqc); 3784 #endif 3785 break; 3786 } 3787 FILEDESC_XUNLOCK(fdp); 3788 *indxp = indx; 3789 return (0); 3790 } 3791 3792 /* 3793 * This sysctl determines if we will allow a process to chroot(2) if it 3794 * has a directory open: 3795 * 0: disallowed for all processes. 3796 * 1: allowed for processes that were not already chroot(2)'ed. 3797 * 2: allowed for all processes. 3798 */ 3799 3800 static int chroot_allow_open_directories = 1; 3801 3802 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, 3803 &chroot_allow_open_directories, 0, 3804 "Allow a process to chroot(2) if it has a directory open"); 3805 3806 /* 3807 * Helper function for the stricter chroot(2) security settings: refuse if 3808 * any file descriptors are open directories.
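 * With the default setting (chroot_allow_open_directories == 1), this
 * check only applies to processes already running with an altered root
 * directory; see pwd_chroot() below.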
3809 */ 3810 static int 3811 chroot_refuse_vdir_fds(struct filedesc *fdp) 3812 { 3813 struct vnode *vp; 3814 struct file *fp; 3815 int i; 3816 3817 FILEDESC_LOCK_ASSERT(fdp); 3818 3819 FILEDESC_FOREACH_FP(fdp, i, fp) { 3820 if (fp->f_type == DTYPE_VNODE) { 3821 vp = fp->f_vnode; 3822 if (vp->v_type == VDIR) 3823 return (EPERM); 3824 } 3825 } 3826 return (0); 3827 } 3828 3829 static void 3830 pwd_fill(struct pwd *oldpwd, struct pwd *newpwd) 3831 { 3832 3833 if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) { 3834 vrefact(oldpwd->pwd_cdir); 3835 newpwd->pwd_cdir = oldpwd->pwd_cdir; 3836 } 3837 3838 if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) { 3839 vrefact(oldpwd->pwd_rdir); 3840 newpwd->pwd_rdir = oldpwd->pwd_rdir; 3841 } 3842 3843 if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) { 3844 vrefact(oldpwd->pwd_jdir); 3845 newpwd->pwd_jdir = oldpwd->pwd_jdir; 3846 } 3847 3848 if (newpwd->pwd_adir == NULL && oldpwd->pwd_adir != NULL) { 3849 vrefact(oldpwd->pwd_adir); 3850 newpwd->pwd_adir = oldpwd->pwd_adir; 3851 } 3852 } 3853 3854 struct pwd * 3855 pwd_hold_pwddesc(struct pwddesc *pdp) 3856 { 3857 struct pwd *pwd; 3858 3859 PWDDESC_ASSERT_XLOCKED(pdp); 3860 pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 3861 if (pwd != NULL) 3862 refcount_acquire(&pwd->pwd_refcount); 3863 return (pwd); 3864 } 3865 3866 bool 3867 pwd_hold_smr(struct pwd *pwd) 3868 { 3869 3870 MPASS(pwd != NULL); 3871 if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) { 3872 return (true); 3873 } 3874 return (false); 3875 } 3876 3877 struct pwd * 3878 pwd_hold(struct thread *td) 3879 { 3880 struct pwddesc *pdp; 3881 struct pwd *pwd; 3882 3883 pdp = td->td_proc->p_pd; 3884 3885 vfs_smr_enter(); 3886 pwd = vfs_smr_entered_load(&pdp->pd_pwd); 3887 if (pwd_hold_smr(pwd)) { 3888 vfs_smr_exit(); 3889 return (pwd); 3890 } 3891 vfs_smr_exit(); 3892 PWDDESC_XLOCK(pdp); 3893 pwd = pwd_hold_pwddesc(pdp); 3894 MPASS(pwd != NULL); 3895 PWDDESC_XUNLOCK(pdp); 3896 return (pwd); 3897 } 3898 3899 struct pwd * 3900 pwd_hold_proc(struct proc *p) 3901 { 3902 struct pwddesc *pdp; 3903 struct pwd *pwd; 3904 3905 PROC_ASSERT_HELD(p); 3906 PROC_LOCK(p); 3907 pdp = pdhold(p); 3908 MPASS(pdp != NULL); 3909 PROC_UNLOCK(p); 3910 3911 PWDDESC_XLOCK(pdp); 3912 pwd = pwd_hold_pwddesc(pdp); 3913 MPASS(pwd != NULL); 3914 PWDDESC_XUNLOCK(pdp); 3915 pddrop(pdp); 3916 return (pwd); 3917 } 3918 3919 static struct pwd * 3920 pwd_alloc(void) 3921 { 3922 struct pwd *pwd; 3923 3924 pwd = uma_zalloc_smr(pwd_zone, M_WAITOK); 3925 bzero(pwd, sizeof(*pwd)); 3926 refcount_init(&pwd->pwd_refcount, 1); 3927 return (pwd); 3928 } 3929 3930 void 3931 pwd_drop(struct pwd *pwd) 3932 { 3933 3934 if (!refcount_release(&pwd->pwd_refcount)) 3935 return; 3936 3937 if (pwd->pwd_cdir != NULL) 3938 vrele(pwd->pwd_cdir); 3939 if (pwd->pwd_rdir != NULL) 3940 vrele(pwd->pwd_rdir); 3941 if (pwd->pwd_jdir != NULL) 3942 vrele(pwd->pwd_jdir); 3943 if (pwd->pwd_adir != NULL) 3944 vrele(pwd->pwd_adir); 3945 uma_zfree_smr(pwd_zone, pwd); 3946 } 3947 3948 /* 3949 * The caller is responsible for invoking priv_check() and 3950 * mac_vnode_check_chroot() to authorize this operation. 
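 * (The chroot(2) syscall handler in vfs_syscalls.c is the expected caller.)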
3951 */ 3952 int 3953 pwd_chroot(struct thread *td, struct vnode *vp) 3954 { 3955 struct pwddesc *pdp; 3956 struct filedesc *fdp; 3957 struct pwd *newpwd, *oldpwd; 3958 int error; 3959 3960 fdp = td->td_proc->p_fd; 3961 pdp = td->td_proc->p_pd; 3962 newpwd = pwd_alloc(); 3963 FILEDESC_SLOCK(fdp); 3964 PWDDESC_XLOCK(pdp); 3965 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 3966 if (chroot_allow_open_directories == 0 || 3967 (chroot_allow_open_directories == 1 && 3968 oldpwd->pwd_rdir != rootvnode)) { 3969 error = chroot_refuse_vdir_fds(fdp); 3970 FILEDESC_SUNLOCK(fdp); 3971 if (error != 0) { 3972 PWDDESC_XUNLOCK(pdp); 3973 pwd_drop(newpwd); 3974 return (error); 3975 } 3976 } else { 3977 FILEDESC_SUNLOCK(fdp); 3978 } 3979 3980 vrefact(vp); 3981 newpwd->pwd_rdir = vp; 3982 vrefact(vp); 3983 newpwd->pwd_adir = vp; 3984 if (oldpwd->pwd_jdir == NULL) { 3985 vrefact(vp); 3986 newpwd->pwd_jdir = vp; 3987 } 3988 pwd_fill(oldpwd, newpwd); 3989 pwd_set(pdp, newpwd); 3990 PWDDESC_XUNLOCK(pdp); 3991 pwd_drop(oldpwd); 3992 return (0); 3993 } 3994 3995 void 3996 pwd_chdir(struct thread *td, struct vnode *vp) 3997 { 3998 struct pwddesc *pdp; 3999 struct pwd *newpwd, *oldpwd; 4000 4001 VNPASS(vp->v_usecount > 0, vp); 4002 4003 newpwd = pwd_alloc(); 4004 pdp = td->td_proc->p_pd; 4005 PWDDESC_XLOCK(pdp); 4006 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4007 newpwd->pwd_cdir = vp; 4008 pwd_fill(oldpwd, newpwd); 4009 pwd_set(pdp, newpwd); 4010 PWDDESC_XUNLOCK(pdp); 4011 pwd_drop(oldpwd); 4012 } 4013 4014 /* 4015 * Process is transitioning to/from a non-native ABI. 4016 */ 4017 void 4018 pwd_altroot(struct thread *td, struct vnode *altroot_vp) 4019 { 4020 struct pwddesc *pdp; 4021 struct pwd *newpwd, *oldpwd; 4022 4023 newpwd = pwd_alloc(); 4024 pdp = td->td_proc->p_pd; 4025 PWDDESC_XLOCK(pdp); 4026 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4027 if (altroot_vp != NULL) { 4028 /* 4029 * Native process to a non-native ABI. 4030 */ 4031 4032 vrefact(altroot_vp); 4033 newpwd->pwd_adir = altroot_vp; 4034 } else { 4035 /* 4036 * Non-native process to the native ABI. 4037 */ 4038 4039 vrefact(oldpwd->pwd_rdir); 4040 newpwd->pwd_adir = oldpwd->pwd_rdir; 4041 } 4042 pwd_fill(oldpwd, newpwd); 4043 pwd_set(pdp, newpwd); 4044 PWDDESC_XUNLOCK(pdp); 4045 pwd_drop(oldpwd); 4046 } 4047 4048 /* 4049 * jail_attach(2) changes both root and working directories. 
4050 */ 4051 int 4052 pwd_chroot_chdir(struct thread *td, struct vnode *vp) 4053 { 4054 struct pwddesc *pdp; 4055 struct filedesc *fdp; 4056 struct pwd *newpwd, *oldpwd; 4057 int error; 4058 4059 fdp = td->td_proc->p_fd; 4060 pdp = td->td_proc->p_pd; 4061 newpwd = pwd_alloc(); 4062 FILEDESC_SLOCK(fdp); 4063 PWDDESC_XLOCK(pdp); 4064 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4065 error = chroot_refuse_vdir_fds(fdp); 4066 FILEDESC_SUNLOCK(fdp); 4067 if (error != 0) { 4068 PWDDESC_XUNLOCK(pdp); 4069 pwd_drop(newpwd); 4070 return (error); 4071 } 4072 4073 vrefact(vp); 4074 newpwd->pwd_rdir = vp; 4075 vrefact(vp); 4076 newpwd->pwd_cdir = vp; 4077 if (oldpwd->pwd_jdir == NULL) { 4078 vrefact(vp); 4079 newpwd->pwd_jdir = vp; 4080 } 4081 vrefact(vp); 4082 newpwd->pwd_adir = vp; 4083 pwd_fill(oldpwd, newpwd); 4084 pwd_set(pdp, newpwd); 4085 PWDDESC_XUNLOCK(pdp); 4086 pwd_drop(oldpwd); 4087 return (0); 4088 } 4089 4090 void 4091 pwd_ensure_dirs(void) 4092 { 4093 struct pwddesc *pdp; 4094 struct pwd *oldpwd, *newpwd; 4095 4096 pdp = curproc->p_pd; 4097 PWDDESC_XLOCK(pdp); 4098 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4099 if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL && 4100 oldpwd->pwd_adir != NULL) { 4101 PWDDESC_XUNLOCK(pdp); 4102 return; 4103 } 4104 PWDDESC_XUNLOCK(pdp); 4105 4106 newpwd = pwd_alloc(); 4107 PWDDESC_XLOCK(pdp); 4108 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4109 pwd_fill(oldpwd, newpwd); 4110 if (newpwd->pwd_cdir == NULL) { 4111 vrefact(rootvnode); 4112 newpwd->pwd_cdir = rootvnode; 4113 } 4114 if (newpwd->pwd_rdir == NULL) { 4115 vrefact(rootvnode); 4116 newpwd->pwd_rdir = rootvnode; 4117 } 4118 if (newpwd->pwd_adir == NULL) { 4119 vrefact(rootvnode); 4120 newpwd->pwd_adir = rootvnode; 4121 } 4122 pwd_set(pdp, newpwd); 4123 PWDDESC_XUNLOCK(pdp); 4124 pwd_drop(oldpwd); 4125 } 4126 4127 void 4128 pwd_set_rootvnode(void) 4129 { 4130 struct pwddesc *pdp; 4131 struct pwd *oldpwd, *newpwd; 4132 4133 pdp = curproc->p_pd; 4134 4135 newpwd = pwd_alloc(); 4136 PWDDESC_XLOCK(pdp); 4137 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4138 vrefact(rootvnode); 4139 newpwd->pwd_cdir = rootvnode; 4140 vrefact(rootvnode); 4141 newpwd->pwd_rdir = rootvnode; 4142 vrefact(rootvnode); 4143 newpwd->pwd_adir = rootvnode; 4144 pwd_fill(oldpwd, newpwd); 4145 pwd_set(pdp, newpwd); 4146 PWDDESC_XUNLOCK(pdp); 4147 pwd_drop(oldpwd); 4148 } 4149 4150 /* 4151 * Scan all active processes and prisons to see if any of them have a current 4152 * or root directory of `olddp'. If so, replace them with the new mount point. 
4153 */ 4154 void 4155 mountcheckdirs(struct vnode *olddp, struct vnode *newdp) 4156 { 4157 struct pwddesc *pdp; 4158 struct pwd *newpwd, *oldpwd; 4159 struct prison *pr; 4160 struct proc *p; 4161 int nrele; 4162 4163 if (vrefcnt(olddp) == 1) 4164 return; 4165 nrele = 0; 4166 newpwd = pwd_alloc(); 4167 sx_slock(&allproc_lock); 4168 FOREACH_PROC_IN_SYSTEM(p) { 4169 PROC_LOCK(p); 4170 pdp = pdhold(p); 4171 PROC_UNLOCK(p); 4172 if (pdp == NULL) 4173 continue; 4174 PWDDESC_XLOCK(pdp); 4175 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4176 if (oldpwd == NULL || 4177 (oldpwd->pwd_cdir != olddp && 4178 oldpwd->pwd_rdir != olddp && 4179 oldpwd->pwd_jdir != olddp && 4180 oldpwd->pwd_adir != olddp)) { 4181 PWDDESC_XUNLOCK(pdp); 4182 pddrop(pdp); 4183 continue; 4184 } 4185 if (oldpwd->pwd_cdir == olddp) { 4186 vrefact(newdp); 4187 newpwd->pwd_cdir = newdp; 4188 } 4189 if (oldpwd->pwd_rdir == olddp) { 4190 vrefact(newdp); 4191 newpwd->pwd_rdir = newdp; 4192 } 4193 if (oldpwd->pwd_jdir == olddp) { 4194 vrefact(newdp); 4195 newpwd->pwd_jdir = newdp; 4196 } 4197 if (oldpwd->pwd_adir == olddp) { 4198 vrefact(newdp); 4199 newpwd->pwd_adir = newdp; 4200 } 4201 pwd_fill(oldpwd, newpwd); 4202 pwd_set(pdp, newpwd); 4203 PWDDESC_XUNLOCK(pdp); 4204 pwd_drop(oldpwd); 4205 pddrop(pdp); 4206 newpwd = pwd_alloc(); 4207 } 4208 sx_sunlock(&allproc_lock); 4209 pwd_drop(newpwd); 4210 if (rootvnode == olddp) { 4211 vrefact(newdp); 4212 rootvnode = newdp; 4213 nrele++; 4214 } 4215 mtx_lock(&prison0.pr_mtx); 4216 if (prison0.pr_root == olddp) { 4217 vrefact(newdp); 4218 prison0.pr_root = newdp; 4219 nrele++; 4220 } 4221 mtx_unlock(&prison0.pr_mtx); 4222 sx_slock(&allprison_lock); 4223 TAILQ_FOREACH(pr, &allprison, pr_list) { 4224 mtx_lock(&pr->pr_mtx); 4225 if (pr->pr_root == olddp) { 4226 vrefact(newdp); 4227 pr->pr_root = newdp; 4228 nrele++; 4229 } 4230 mtx_unlock(&pr->pr_mtx); 4231 } 4232 sx_sunlock(&allprison_lock); 4233 while (nrele--) 4234 vrele(olddp); 4235 } 4236 4237 int 4238 descrip_check_write_mp(struct filedesc *fdp, struct mount *mp) 4239 { 4240 struct file *fp; 4241 struct vnode *vp; 4242 int error, i; 4243 4244 error = 0; 4245 FILEDESC_SLOCK(fdp); 4246 FILEDESC_FOREACH_FP(fdp, i, fp) { 4247 if (fp->f_type != DTYPE_VNODE || 4248 (atomic_load_int(&fp->f_flag) & FWRITE) == 0) 4249 continue; 4250 vp = fp->f_vnode; 4251 if (vp->v_mount == mp) { 4252 error = EDEADLK; 4253 break; 4254 } 4255 } 4256 FILEDESC_SUNLOCK(fdp); 4257 return (error); 4258 } 4259 4260 struct filedesc_to_leader * 4261 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, 4262 struct proc *leader) 4263 { 4264 struct filedesc_to_leader *fdtol; 4265 4266 fdtol = malloc(sizeof(struct filedesc_to_leader), 4267 M_FILEDESC_TO_LEADER, M_WAITOK); 4268 fdtol->fdl_refcount = 1; 4269 fdtol->fdl_holdcount = 0; 4270 fdtol->fdl_wakeup = 0; 4271 fdtol->fdl_leader = leader; 4272 if (old != NULL) { 4273 FILEDESC_XLOCK(fdp); 4274 fdtol->fdl_next = old->fdl_next; 4275 fdtol->fdl_prev = old; 4276 old->fdl_next = fdtol; 4277 fdtol->fdl_next->fdl_prev = fdtol; 4278 FILEDESC_XUNLOCK(fdp); 4279 } else { 4280 fdtol->fdl_next = fdtol; 4281 fdtol->fdl_prev = fdtol; 4282 } 4283 return (fdtol); 4284 } 4285 4286 struct filedesc_to_leader * 4287 filedesc_to_leader_share(struct filedesc_to_leader *fdtol, struct filedesc *fdp) 4288 { 4289 FILEDESC_XLOCK(fdp); 4290 fdtol->fdl_refcount++; 4291 FILEDESC_XUNLOCK(fdp); 4292 return (fdtol); 4293 } 4294 4295 static int 4296 sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) 4297 { 4298 NDSLOTTYPE *map; 4299 struct 
filedesc *fdp; 4300 u_int namelen; 4301 int count, off, minoff; 4302 4303 namelen = arg2; 4304 if (namelen != 1) 4305 return (EINVAL); 4306 4307 if (*(int *)arg1 != 0) 4308 return (EINVAL); 4309 4310 fdp = curproc->p_fd; 4311 count = 0; 4312 FILEDESC_SLOCK(fdp); 4313 map = fdp->fd_map; 4314 off = NDSLOT(fdp->fd_nfiles - 1); 4315 for (minoff = NDSLOT(0); off >= minoff; --off) 4316 count += bitcountl(map[off]); 4317 FILEDESC_SUNLOCK(fdp); 4318 4319 return (SYSCTL_OUT(req, &count, sizeof(count))); 4320 } 4321 4322 static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, 4323 CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, 4324 "Number of open file descriptors"); 4325 4326 /* 4327 * Get file structures globally. 4328 */ 4329 static int 4330 sysctl_kern_file(SYSCTL_HANDLER_ARGS) 4331 { 4332 struct xfile xf; 4333 struct filedesc *fdp; 4334 struct file *fp; 4335 struct proc *p; 4336 int error, n; 4337 4338 error = sysctl_wire_old_buffer(req, 0); 4339 if (error != 0) 4340 return (error); 4341 if (req->oldptr == NULL) { 4342 n = 0; 4343 sx_slock(&allproc_lock); 4344 FOREACH_PROC_IN_SYSTEM(p) { 4345 PROC_LOCK(p); 4346 if (p->p_state == PRS_NEW) { 4347 PROC_UNLOCK(p); 4348 continue; 4349 } 4350 fdp = fdhold(p); 4351 PROC_UNLOCK(p); 4352 if (fdp == NULL) 4353 continue; 4354 /* overestimates sparse tables. */ 4355 n += fdp->fd_nfiles; 4356 fddrop(fdp); 4357 } 4358 sx_sunlock(&allproc_lock); 4359 return (SYSCTL_OUT(req, 0, n * sizeof(xf))); 4360 } 4361 error = 0; 4362 bzero(&xf, sizeof(xf)); 4363 xf.xf_size = sizeof(xf); 4364 sx_slock(&allproc_lock); 4365 FOREACH_PROC_IN_SYSTEM(p) { 4366 PROC_LOCK(p); 4367 if (p->p_state == PRS_NEW) { 4368 PROC_UNLOCK(p); 4369 continue; 4370 } 4371 if (p_cansee(req->td, p) != 0) { 4372 PROC_UNLOCK(p); 4373 continue; 4374 } 4375 xf.xf_pid = p->p_pid; 4376 xf.xf_uid = p->p_ucred->cr_uid; 4377 fdp = fdhold(p); 4378 PROC_UNLOCK(p); 4379 if (fdp == NULL) 4380 continue; 4381 FILEDESC_SLOCK(fdp); 4382 if (refcount_load(&fdp->fd_refcnt) == 0) 4383 goto nextproc; 4384 FILEDESC_FOREACH_FP(fdp, n, fp) { 4385 xf.xf_fd = n; 4386 xf.xf_file = (uintptr_t)fp; 4387 xf.xf_data = (uintptr_t)fp->f_data; 4388 xf.xf_vnode = (uintptr_t)fp->f_vnode; 4389 xf.xf_type = (uintptr_t)fp->f_type; 4390 xf.xf_count = refcount_load(&fp->f_count); 4391 xf.xf_msgcount = 0; 4392 xf.xf_offset = foffset_get(fp); 4393 xf.xf_flag = fp->f_flag; 4394 error = SYSCTL_OUT(req, &xf, sizeof(xf)); 4395 4396 /* 4397 * There is no need to re-check the fdtable refcount 4398 * here since the filedesc lock is not dropped in the 4399 * loop body. 
#ifdef KINFO_FILE_SIZE
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif

static int
xlate_fflags(int fflags)
{
	static const struct {
		int	fflag;
		int	kf_fflag;
	} fflags_table[] = {
		{ FAPPEND, KF_FLAG_APPEND },
		{ FASYNC, KF_FLAG_ASYNC },
		{ FFSYNC, KF_FLAG_FSYNC },
		{ FHASLOCK, KF_FLAG_HASLOCK },
		{ FNONBLOCK, KF_FLAG_NONBLOCK },
		{ FREAD, KF_FLAG_READ },
		{ FWRITE, KF_FLAG_WRITE },
		{ O_CREAT, KF_FLAG_CREAT },
		{ O_DIRECT, KF_FLAG_DIRECT },
		{ O_EXCL, KF_FLAG_EXCL },
		{ O_EXEC, KF_FLAG_EXEC },
		{ O_EXLOCK, KF_FLAG_EXLOCK },
		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
		{ O_SHLOCK, KF_FLAG_SHLOCK },
		{ O_TRUNC, KF_FLAG_TRUNC }
	};
	unsigned int i;
	int kflags;

	kflags = 0;
	for (i = 0; i < nitems(fflags_table); i++)
		if (fflags & fflags_table[i].fflag)
			kflags |= fflags_table[i].kf_fflag;
	return (kflags);
}

/* Trim unused data from kf_path by truncating the structure size. */
void
pack_kinfo(struct kinfo_file *kif)
{

	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
	    strlen(kif->kf_path) + 1;
	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
}
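/*
 * Illustrative only (not compiled here): after pack_kinfo(), records are
 * variable-length, so consumers of packed buffers must advance by
 * kf_structsize rather than by sizeof(struct kinfo_file); see the
 * userspace sketch after sysctl_kern_proc_filedesc() below.  For example,
 * a record whose path is "/tmp" packs down to:
 */
#if 0
	size_t sz = roundup(offsetof(struct kinfo_file, kf_path) +
	    strlen("/tmp") + 1, sizeof(uint64_t));
#endif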
static void
export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
    struct kinfo_file *kif, struct filedesc *fdp, int flags)
{
	int error;

	bzero(kif, sizeof(*kif));

	/* Set a default type to allow for empty fill_kinfo() methods. */
	kif->kf_type = KF_TYPE_UNKNOWN;
	kif->kf_flags = xlate_fflags(fp->f_flag);
	if (rightsp != NULL)
		kif->kf_cap_rights = *rightsp;
	else
		cap_rights_init_zero(&kif->kf_cap_rights);
	kif->kf_fd = fd;
	kif->kf_ref_count = refcount_load(&fp->f_count);
	kif->kf_offset = foffset_get(fp);

	/*
	 * This may drop the filedesc lock, so 'fp' cannot be
	 * accessed after this call.
	 */
	error = fo_fill_kinfo(fp, kif, fdp);
	if (error == 0)
		kif->kf_status |= KF_ATTR_VALID;
	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
		pack_kinfo(kif);
	else
		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
}

static void
export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
    struct kinfo_file *kif, int flags)
{
	int error;

	bzero(kif, sizeof(*kif));

	kif->kf_type = KF_TYPE_VNODE;
	error = vn_fill_kinfo_vnode(vp, kif);
	if (error == 0)
		kif->kf_status |= KF_ATTR_VALID;
	kif->kf_flags = xlate_fflags(fflags);
	cap_rights_init_zero(&kif->kf_cap_rights);
	kif->kf_fd = fd;
	kif->kf_ref_count = -1;
	kif->kf_offset = -1;
	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
		pack_kinfo(kif);
	else
		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
	vrele(vp);
}

struct export_fd_buf {
	struct filedesc	*fdp;
	struct pwddesc	*pdp;
	struct sbuf	*sb;
	ssize_t		remainder;
	struct kinfo_file kif;
	int		flags;
};

static int
export_kinfo_to_sb(struct export_fd_buf *efbuf)
{
	struct kinfo_file *kif;

	kif = &efbuf->kif;
	if (efbuf->remainder != -1) {
		if (efbuf->remainder < kif->kf_structsize)
			return (ENOMEM);
		efbuf->remainder -= kif->kf_structsize;
	}
	if (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) != 0)
		return (sbuf_error(efbuf->sb));
	return (0);
}

static int
export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
    struct export_fd_buf *efbuf)
{
	int error;

	if (efbuf->remainder == 0)
		return (ENOMEM);
	export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp,
	    efbuf->flags);
	FILEDESC_SUNLOCK(efbuf->fdp);
	error = export_kinfo_to_sb(efbuf);
	FILEDESC_SLOCK(efbuf->fdp);
	return (error);
}

static int
export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
    struct export_fd_buf *efbuf)
{
	int error;

	if (efbuf->remainder == 0)
		return (ENOMEM);
	if (efbuf->pdp != NULL)
		PWDDESC_XUNLOCK(efbuf->pdp);
	export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
	error = export_kinfo_to_sb(efbuf);
	if (efbuf->pdp != NULL)
		PWDDESC_XLOCK(efbuf->pdp);
	return (error);
}

/*
 * Store a process's file descriptor information in an sbuf.
 *
 * Takes a locked proc as argument, and returns with the proc unlocked.
 */
int
kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen,
    int flags)
{
	struct file *fp;
	struct filedesc *fdp;
	struct pwddesc *pdp;
	struct export_fd_buf *efbuf;
	struct vnode *cttyvp, *textvp, *tracevp;
	struct pwd *pwd;
	int error, i;
	cap_rights_t rights;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	/* ktrace vnode */
	tracevp = ktr_get_tracevp(p, true);
	/* text vnode */
	textvp = p->p_textvp;
	if (textvp != NULL)
		vrefact(textvp);
	/* Controlling tty. */
	cttyvp = NULL;
	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
		if (cttyvp != NULL)
			vrefact(cttyvp);
	}
	fdp = fdhold(p);
	pdp = pdhold(p);
	PROC_UNLOCK(p);

	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
	efbuf->fdp = NULL;
	efbuf->pdp = NULL;
	efbuf->sb = sb;
	efbuf->remainder = maxlen;
	efbuf->flags = flags;

	error = 0;
	if (tracevp != NULL)
		error = export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE,
		    FREAD | FWRITE, efbuf);
	if (error == 0 && textvp != NULL)
		error = export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD,
		    efbuf);
	if (error == 0 && cttyvp != NULL)
		error = export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY,
		    FREAD | FWRITE, efbuf);
	if (error != 0 || pdp == NULL || fdp == NULL)
		goto fail;
	efbuf->fdp = fdp;
	efbuf->pdp = pdp;
	PWDDESC_XLOCK(pdp);
	pwd = pwd_hold_pwddesc(pdp);
	if (pwd != NULL) {
		/* working directory */
		if (pwd->pwd_cdir != NULL) {
			vrefact(pwd->pwd_cdir);
			error = export_vnode_to_sb(pwd->pwd_cdir,
			    KF_FD_TYPE_CWD, FREAD, efbuf);
		}
		/* root directory */
		if (error == 0 && pwd->pwd_rdir != NULL) {
			vrefact(pwd->pwd_rdir);
			error = export_vnode_to_sb(pwd->pwd_rdir,
			    KF_FD_TYPE_ROOT, FREAD, efbuf);
		}
		/* jail directory */
		if (error == 0 && pwd->pwd_jdir != NULL) {
			vrefact(pwd->pwd_jdir);
			error = export_vnode_to_sb(pwd->pwd_jdir,
			    KF_FD_TYPE_JAIL, FREAD, efbuf);
		}
	}
	PWDDESC_XUNLOCK(pdp);
	if (error != 0)
		goto fail;
	if (pwd != NULL)
		pwd_drop(pwd);
	FILEDESC_SLOCK(fdp);
	if (refcount_load(&fdp->fd_refcnt) == 0)
		goto skip;
	FILEDESC_FOREACH_FP(fdp, i, fp) {
#ifdef CAPABILITIES
		rights = *cap_rights(fdp, i);
#else /* !CAPABILITIES */
		rights = cap_no_rights;
#endif
		/*
		 * Create sysctl entry.  It is OK to drop the filedesc
		 * lock inside of export_file_to_sb() as we will
		 * re-validate and re-evaluate its properties when the
		 * loop continues.
		 */
		error = export_file_to_sb(fp, i, &rights, efbuf);
		if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0)
			break;
	}
skip:
	FILEDESC_SUNLOCK(fdp);
fail:
	if (fdp != NULL)
		fddrop(fdp);
	if (pdp != NULL)
		pddrop(pdp);
	free(efbuf, M_TEMP);
	return (error);
}
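/*
 * Illustrative only (not compiled here): the expected calling pattern for
 * kern_proc_filedesc_out().  The proc must be locked on entry and is
 * unlocked on return; a maxlen of -1 means no output limit.  'p', 'sb',
 * and 'error' are assumed to be set up by the caller.
 */
#if 0
	PROC_LOCK(p);
	error = kern_proc_filedesc_out(p, sb, -1, KERN_FILEDESC_PACK_KINFO);
	/* 'p' is unlocked here regardless of the return value. */
#endif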
#define	FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)

/*
 * Get per-process file descriptors for use by procstat(1), et al.
 */
static int
sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sb;
	struct proc *p;
	ssize_t maxlen;
	u_int namelen;
	int error, error2, *name;

	namelen = arg2;
	if (namelen != 1)
		return (EINVAL);

	name = (int *)arg1;

	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
	if (error != 0) {
		sbuf_delete(&sb);
		return (error);
	}
	maxlen = req->oldptr != NULL ? req->oldlen : -1;
	error = kern_proc_filedesc_out(p, &sb, maxlen,
	    KERN_FILEDESC_PACK_KINFO);
	error2 = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error != 0 ? error : error2);
}
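/*
 * Illustrative userspace consumer (not compiled here): fetching the packed
 * kinfo_file records of a process via kern.proc.filedesc.<pid> and walking
 * them by kf_structsize, as pack_kinfo() requires.  A hedged sketch; real
 * consumers normally go through libprocstat instead of issuing the sysctl
 * directly.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/user.h>
#include <stdio.h>
#include <stdlib.h>

static void
print_fds(pid_t pid)
{
	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_FILEDESC, (int)pid };
	struct kinfo_file *kif;
	char *buf, *pos;
	size_t len;

	if (sysctl(mib, 4, NULL, &len, NULL, 0) == -1)
		return;
	len = len * 4 / 3;	/* slack: the table may grow underneath us */
	if ((buf = malloc(len)) == NULL)
		return;
	if (sysctl(mib, 4, buf, &len, NULL, 0) == 0) {
		for (pos = buf; pos < buf + len; pos += kif->kf_structsize) {
			kif = (struct kinfo_file *)(void *)pos;
			if (kif->kf_structsize <= 0)
				break;
			printf("fd %d: %s\n", kif->kf_fd, kif->kf_path);
		}
	}
	free(buf);
}
#endif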
#ifdef COMPAT_FREEBSD7
#ifdef KINFO_OFILE_SIZE
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
#endif

static void
kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
{

	okif->kf_structsize = sizeof(*okif);
	okif->kf_type = kif->kf_type;
	okif->kf_fd = kif->kf_fd;
	okif->kf_ref_count = kif->kf_ref_count;
	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
	okif->kf_offset = kif->kf_offset;
	if (kif->kf_type == KF_TYPE_VNODE)
		okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type;
	else
		okif->kf_vnode_type = KF_VTYPE_VNON;
	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
	if (kif->kf_type == KF_TYPE_SOCKET) {
		okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0;
		okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0;
		okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0;
		okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
		okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
	} else {
		okif->kf_sa_local.ss_family = AF_UNSPEC;
		okif->kf_sa_peer.ss_family = AF_UNSPEC;
	}
}

static int
export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
    struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req)
{
	int error;

	vrefact(vp);
	PWDDESC_XUNLOCK(pdp);
	export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
	kinfo_to_okinfo(kif, okif);
	error = SYSCTL_OUT(req, okif, sizeof(*okif));
	PWDDESC_XLOCK(pdp);
	return (error);
}
/*
 * Get per-process file descriptors for use by procstat(1), et al.
 */
static int
sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
{
	struct kinfo_ofile *okif;
	struct kinfo_file *kif;
	struct filedesc *fdp;
	struct pwddesc *pdp;
	struct pwd *pwd;
	u_int namelen;
	int error, i, *name;
	struct file *fp;
	struct proc *p;

	namelen = arg2;
	if (namelen != 1)
		return (EINVAL);

	name = (int *)arg1;
	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
	if (error != 0)
		return (error);
	fdp = fdhold(p);
	if (fdp != NULL)
		pdp = pdhold(p);
	PROC_UNLOCK(p);
	if (fdp == NULL || pdp == NULL) {
		if (fdp != NULL)
			fddrop(fdp);
		return (ENOENT);
	}
	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
	PWDDESC_XLOCK(pdp);
	pwd = pwd_hold_pwddesc(pdp);
	if (pwd != NULL) {
		if (pwd->pwd_cdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_cdir,
			    KF_FD_TYPE_CWD, kif, okif, pdp, req);
		if (pwd->pwd_rdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_rdir,
			    KF_FD_TYPE_ROOT, kif, okif, pdp, req);
		if (pwd->pwd_jdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_jdir,
			    KF_FD_TYPE_JAIL, kif, okif, pdp, req);
	}
	PWDDESC_XUNLOCK(pdp);
	if (pwd != NULL)
		pwd_drop(pwd);
	FILEDESC_SLOCK(fdp);
	if (refcount_load(&fdp->fd_refcnt) == 0)
		goto skip;
	FILEDESC_FOREACH_FP(fdp, i, fp) {
		export_file_to_kinfo(fp, i, NULL, kif, fdp,
		    KERN_FILEDESC_PACK_KINFO);
		FILEDESC_SUNLOCK(fdp);
		kinfo_to_okinfo(kif, okif);
		error = SYSCTL_OUT(req, okif, sizeof(*okif));
		FILEDESC_SLOCK(fdp);
		if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0)
			break;
	}
skip:
	FILEDESC_SUNLOCK(fdp);
	fddrop(fdp);
	pddrop(pdp);
	free(kif, M_TEMP);
	free(okif, M_TEMP);
	/* XXX: errors from SYSCTL_OUT end the loop but are not propagated. */
	return (0);
}

static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
    "Process ofiledesc entries");
#endif	/* COMPAT_FREEBSD7 */

int
vntype_to_kinfo(int vtype)
{
	struct {
		int	vtype;
		int	kf_vtype;
	} vtypes_table[] = {
		{ VBAD, KF_VTYPE_VBAD },
		{ VBLK, KF_VTYPE_VBLK },
		{ VCHR, KF_VTYPE_VCHR },
		{ VDIR, KF_VTYPE_VDIR },
		{ VFIFO, KF_VTYPE_VFIFO },
		{ VLNK, KF_VTYPE_VLNK },
		{ VNON, KF_VTYPE_VNON },
		{ VREG, KF_VTYPE_VREG },
		{ VSOCK, KF_VTYPE_VSOCK }
	};
	unsigned int i;

	/*
	 * Perform vtype translation.
	 */
	for (i = 0; i < nitems(vtypes_table); i++)
		if (vtypes_table[i].vtype == vtype)
			return (vtypes_table[i].kf_vtype);

	return (KF_VTYPE_UNKNOWN);
}

static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
    "Process filedesc entries");
/*
 * Store a process's current working directory information in an sbuf.
 *
 * Takes a locked proc as argument, and returns with the proc unlocked.
 */
int
kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen)
{
	struct pwddesc *pdp;
	struct pwd *pwd;
	struct export_fd_buf *efbuf;
	struct vnode *cdir;
	int error;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	pdp = pdhold(p);
	PROC_UNLOCK(p);
	if (pdp == NULL)
		return (EINVAL);

	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
	efbuf->fdp = NULL;
	efbuf->pdp = pdp;
	efbuf->sb = sb;
	efbuf->remainder = maxlen;
	efbuf->flags = 0;

	PWDDESC_XLOCK(pdp);
	pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
	cdir = pwd->pwd_cdir;
	if (cdir == NULL) {
		error = EINVAL;
	} else {
		vrefact(cdir);
		error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
	}
	PWDDESC_XUNLOCK(pdp);
	pddrop(pdp);
	free(efbuf, M_TEMP);
	return (error);
}

/*
 * Get per-process current working directory.
 */
static int
sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sb;
	struct proc *p;
	ssize_t maxlen;
	u_int namelen;
	int error, error2, *name;

	namelen = arg2;
	if (namelen != 1)
		return (EINVAL);

	name = (int *)arg1;

	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
	if (error != 0) {
		sbuf_delete(&sb);
		return (error);
	}
	maxlen = req->oldptr != NULL ? req->oldlen : -1;
	error = kern_proc_cwd_out(p, &sb, maxlen);
	error2 = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error != 0 ? error : error2);
}

static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
    sysctl_kern_proc_cwd, "Process current working directory");
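/*
 * Illustrative userspace consumer (not compiled here): reading another
 * process's working directory via kern.proc.cwd.<pid>.  The reply is a
 * single packed kinfo_file record whose kf_path holds the directory path.
 * A hedged sketch with minimal error handling.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/user.h>
#include <stdio.h>

static void
print_cwd(pid_t pid)
{
	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_CWD, (int)pid };
	struct kinfo_file kif;
	size_t len = sizeof(kif);

	if (sysctl(mib, 4, &kif, &len, NULL, 0) == 0)
		printf("cwd of %d: %s\n", (int)pid, kif.kf_path);
}
#endif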
#ifdef DDB
/*
 * For the purposes of debugging, generate a human-readable string for the
 * file type.
 */
static const char *
file_type_to_name(short type)
{

	switch (type) {
	case 0:
		return ("zero");
	case DTYPE_VNODE:
		return ("vnode");
	case DTYPE_SOCKET:
		return ("socket");
	case DTYPE_PIPE:
		return ("pipe");
	case DTYPE_FIFO:
		return ("fifo");
	case DTYPE_KQUEUE:
		return ("kqueue");
	case DTYPE_CRYPTO:
		return ("crypto");
	case DTYPE_MQUEUE:
		return ("mqueue");
	case DTYPE_SHM:
		return ("shm");
	case DTYPE_SEM:
		return ("ksem");
	case DTYPE_PTS:
		return ("pts");
	case DTYPE_DEV:
		return ("dev");
	case DTYPE_PROCDESC:
		return ("proc");
	case DTYPE_EVENTFD:
		return ("eventfd");
	case DTYPE_TIMERFD:
		return ("timerfd");
	default:
		return ("unkn");
	}
}

/*
 * For the purposes of debugging, identify a process (if any, perhaps one of
 * many) that references the passed file in its file descriptor array.  Return
 * NULL if none.
 */
static struct proc *
file_to_first_proc(struct file *fp)
{
	struct filedesc *fdp;
	struct proc *p;
	int n;

	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_state == PRS_NEW)
			continue;
		fdp = p->p_fd;
		if (fdp == NULL)
			continue;
		for (n = 0; n < fdp->fd_nfiles; n++) {
			if (fp == fdp->fd_ofiles[n].fde_file)
				return (p);
		}
	}
	return (NULL);
}

static void
db_print_file(struct file *fp, int header)
{
#define	XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4))
	struct proc *p;

	if (header)
		db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n",
		    XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag",
		    "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID",
		    "FCmd");
	p = file_to_first_proc(fp);
	db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH,
	    fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data,
	    fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH,
	    fp->f_vnode, p != NULL ? p->p_pid : -1,
	    p != NULL ? p->p_comm : "-");

#undef XPTRWIDTH
}

DB_SHOW_COMMAND(file, db_show_file)
{
	struct file *fp;

	if (!have_addr) {
		db_printf("usage: show file <addr>\n");
		return;
	}
	fp = (struct file *)addr;
	db_print_file(fp, 1);
}

DB_SHOW_COMMAND_FLAGS(files, db_show_files, DB_CMD_MEMSAFE)
{
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int header;
	int n;

	header = 1;
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_state == PRS_NEW)
			continue;
		if ((fdp = p->p_fd) == NULL)
			continue;
		for (n = 0; n < fdp->fd_nfiles; ++n) {
			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
				continue;
			db_print_file(fp, header);
			header = 0;
		}
	}
}
#endif
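/*
 * Example ddb(4) usage of the commands above (illustrative; the address
 * is made up):
 *
 *	db> show files
 *	db> show file 0xfffff80003bb2000
 *
 * "show files" walks every process's descriptor table and prints one
 * line per open file; "show file <addr>" prints the same line for a
 * single struct file.
 */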
SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc,
    CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
    &maxfilesperproc, 0, "Maximum files allowed open per process");

SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
    &maxfiles, 0, "Maximum number of files");

SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
    &openfiles, 0, "System-wide number of open files");

/* ARGSUSED*/
static void
filelistinit(void *dummy)
{

	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
	/*
	 * XXXMJG this is a temporary hack due to boot ordering issues against
	 * the vnode zone.
	 */
	vfs_smr = uma_zone_get_smr(pwd_zone);
	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);

/*-------------------------------------------------------------------*/

static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EBADF);
}

static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

static int
badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}

static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EBADF);
}

static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{

	return (EBADF);
}

static int
badfo_close(struct file *fp, struct thread *td)
{

	return (0);
}

static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

static int
badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    struct thread *td)
{

	return (EBADF);
}

static int
badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	return (0);
}

/*
 * Fileops installed on files whose backing object is gone or invalid:
 * every operation fails (mostly with EBADF) and close is a no-op.
 */
struct fileops badfileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_truncate = badfo_truncate,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = badfo_poll,
	.fo_kqfilter = badfo_kqfilter,
	.fo_stat = badfo_stat,
	.fo_close = badfo_close,
	.fo_chmod = badfo_chmod,
	.fo_chown = badfo_chown,
	.fo_sendfile = badfo_sendfile,
	.fo_fill_kinfo = badfo_fill_kinfo,
};

static int
path_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	return (POLLNVAL);
}

static int
path_close(struct file *fp, struct thread *td)
{
	MPASS(fp->f_type == DTYPE_VNODE);
	fp->f_ops = &badfileops;
	vrele(fp->f_vnode);
	return (0);
}

/*
 * Fileops for path (O_PATH) descriptors: metadata operations such as
 * stat and kqueue filters on the underlying vnode work, while I/O,
 * ioctl, and permission changes are rejected.
 */
struct fileops path_fileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_truncate = badfo_truncate,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = path_poll,
	.fo_kqfilter = vn_kqfilter_opath,
	.fo_stat = vn_statfile,
	.fo_close = path_close,
	.fo_chmod = badfo_chmod,
	.fo_chown = badfo_chown,
	.fo_sendfile = badfo_sendfile,
	.fo_fill_kinfo = vn_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE,
};
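/*
 * Illustrative userspace view of path_fileops (fragment, not compiled
 * here): a descriptor opened with O_PATH supports fstat() but rejects
 * I/O, matching the fo_stat and fo_read slots above.  A hedged sketch;
 * needs <sys/stat.h>, <fcntl.h>, and <unistd.h>.
 */
#if 0
	int fd = open("/etc/passwd", O_PATH);
	struct stat sb;
	char c;

	fstat(fd, &sb);			/* succeeds: fo_stat == vn_statfile */
	if (read(fd, &c, 1) == -1)	/* fails: fo_read == badfo_readwrite */
		;			/* errno is EBADF */
	close(fd);
#endif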
{

	return (EINVAL);
}

int
invfo_ioctl(struct file *fp, u_long com, void *data,
    struct ucred *active_cred, struct thread *td)
{

	return (ENOTTY);
}

int
invfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (poll_no_poll(events));
}

int
invfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EINVAL);
}

int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

int
invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    struct thread *td)
{

	return (EINVAL);
}

/*-------------------------------------------------------------------*/

/*
 * File Descriptor pseudo-device driver (/dev/fd/).
 *
 * Opening minor device N dup()s the file (if any) connected to file
 * descriptor N belonging to the calling process.  Note that this driver
 * consists of only the ``open()'' routine, because all subsequent
 * references to this file will be direct to the other driver.
 *
 * XXX: we could give this one a cloning event handler if necessary.
 */

/* ARGSUSED */
static int
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
{

	/*
	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
	 * file descriptor being sought for duplication.  The error return
	 * ensures that the vnode for this device will be released by
	 * vn_open.  Open will detect this special error and take the
	 * actions in dupfdopen below.  Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	td->td_dupfd = dev2unit(dev);
	return (ENODEV);
}

static struct cdevsw fildesc_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	fdopen,
	.d_name =	"FD",
};

static void
fildesc_drvinit(void *unused)
{
	struct cdev *dev;

	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
	make_dev_alias(dev, "stdin");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
	make_dev_alias(dev, "stdout");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
	make_dev_alias(dev, "stderr");
}

SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
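/*
 * Illustrative userspace behavior of /dev/fd (fragment, not compiled
 * here): opening /dev/fd/N yields a new descriptor referring to the same
 * open file as descriptor N, roughly equivalent to dup(N) modulo the
 * access-mode check performed in dupfdopen().  Needs <fcntl.h> and
 * <unistd.h>.
 */
#if 0
	int fd;

	/* Roughly equivalent to fd = dup(STDOUT_FILENO). */
	fd = open("/dev/fd/1", O_WRONLY);
	write(fd, "hello\n", 6);
	close(fd);
#endif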