/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/selinfo.h>
#include <sys/poll.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/signalvar.h>
#include <sys/kdb.h>
#include <sys/smr.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/ktrace.h>

#include <net/vnet.h>

#include <security/audit/audit.h>

#include <vm/uma.h>
#include <vm/vm.h>

#include <ddb/ddb.h>

static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes");
static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
    "file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");

MALLOC_DECLARE(M_FADVISE);

static __read_mostly uma_zone_t file_zone;
static __read_mostly uma_zone_t filedesc0_zone;
__read_mostly uma_zone_t pwd_zone;
VFS_SMR_DECLARE;

static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
		    struct thread *td, bool holdleaders, bool audit);
static void	export_file_to_kinfo(struct file *fp, int fd,
		    cap_rights_t *rightsp, struct kinfo_file *kif,
		    struct filedesc *fdp, int flags);
static int	fd_first_free(struct filedesc *fdp, int low, int size);
static void	fdgrowtable(struct filedesc *fdp, int nfd);
static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
static void	fdunused(struct filedesc *fdp, int fd);
static void	fdused(struct filedesc *fdp, int fd);
static int	fget_unlocked_seq(struct thread *td, int fd,
		    cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp);
static int	getmaxfd(struct thread *td);
static u_long	*filecaps_copy_prep(const struct filecaps *src);
static void	filecaps_copy_finish(const struct filecaps *src,
		    struct filecaps *dst, u_long *ioctls);
static u_long	*filecaps_free_prep(struct filecaps *fcaps);
static void	filecaps_free_finish(u_long *ioctls);

static struct pwd *pwd_alloc(void);

/*
 * Each process has:
 *
 * - An array of open file descriptors (fd_ofiles)
 * - An array of file flags (fd_ofileflags)
 * - A bitmap recording which descriptors are in use (fd_map)
 *
 * A process starts out with NDFILE descriptors.  The value of NDFILE has
 * been selected based on the historical limit of 20 open files, and an
 * assumption that the majority of processes, especially short-lived
 * processes like shells, will never need more.
 *
 * If this initial allocation is exhausted, a larger descriptor table and
 * map are allocated dynamically, and the pointers in the process's struct
 * filedesc are updated to point to those.  This is repeated every time
 * the process runs out of file descriptors (provided it hasn't hit its
 * resource limit).
 *
 * Since threads may hold references to individual descriptor table
 * entries, the tables are never freed.  Instead, they are placed on a
 * linked list and freed only when the struct filedesc is released.
 */
#define NDFILE		20
#define NDSLOTSIZE	sizeof(NDSLOTTYPE)
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
#define NDSLOT(x)	((x) / NDENTRIES)
#define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
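
/*
 * Worked example for the macros above (illustrative only, assuming
 * NDSLOTTYPE is a 64-bit type so that NDENTRIES == 64):
 *
 *	NDSLOT(70)  == 1		descriptor 70 lives in fd_map[1]
 *	NDBIT(70)   == (NDSLOTTYPE)1 << 6
 *	NDSLOTS(70) == 2		two bitmap words cover 70 fds
 */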

#define FILEDESC_FOREACH_FDE(fdp, _iterator, _fde)			\
	struct filedesc *_fdp = (fdp);					\
	int _lastfile = fdlastfile_single(_fdp);			\
	for (_iterator = 0; _iterator <= _lastfile; _iterator++)	\
		if ((_fde = &_fdp->fd_ofiles[_iterator])->fde_file != NULL)

#define FILEDESC_FOREACH_FP(fdp, _iterator, _fp)			\
	struct filedesc *_fdp = (fdp);					\
	int _lastfile = fdlastfile_single(_fdp);			\
	for (_iterator = 0; _iterator <= _lastfile; _iterator++)	\
		if ((_fp = _fdp->fd_ofiles[_iterator].fde_file) != NULL)

/*
 * SLIST entry used to keep track of ofiles which must be reclaimed when
 * the process exits.
 */
struct freetable {
	struct fdescenttbl *ft_table;
	SLIST_ENTRY(freetable) ft_next;
};

/*
 * Initial allocation: a filedesc structure + the head of SLIST used to
 * keep track of old ofiles + enough space for NDFILE descriptors.
 */

struct fdescenttbl0 {
	int	fdt_nfiles;
	struct	filedescent fdt_ofiles[NDFILE];
};

struct filedesc0 {
	struct filedesc fd_fd;
	SLIST_HEAD(, freetable) fd_free;
	struct	fdescenttbl0 fd_dfiles;
	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
};

/*
 * Descriptor management.
 */
static int __exclusive_cache_line openfiles; /* actual number of open files */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);

/*
 * If low >= size, just return low. Otherwise find the first zero bit in the
 * given bitmap, starting at low and not exceeding size - 1. Return size if
 * not found.
 */
static int
fd_first_free(struct filedesc *fdp, int low, int size)
{
	NDSLOTTYPE *map = fdp->fd_map;
	NDSLOTTYPE mask;
	int off, maxoff;

	if (low >= size)
		return (low);

	off = NDSLOT(low);
	if (low % NDENTRIES) {
		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
		if ((mask &= ~map[off]) != 0UL)
			return (off * NDENTRIES + ffsl(mask) - 1);
		++off;
	}
	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
		if (map[off] != ~0UL)
			return (off * NDENTRIES + ffsl(~map[off]) - 1);
	return (size);
}
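
/*
 * Worked example (illustrative only): if fd_map[0] == 0x17, i.e.
 * descriptors 0, 1, 2 and 4 are in use, then fd_first_free(fdp, 0, size)
 * returns 3, while fd_first_free(fdp, 5, size) masks off the bits below
 * 5 and returns 5.
 */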

/*
 * Find the last used fd.
 *
 * Call this variant if fdp can't be modified by anyone else (e.g., during
 * exec).  Otherwise use fdlastfile.
 */
int
fdlastfile_single(struct filedesc *fdp)
{
	NDSLOTTYPE *map = fdp->fd_map;
	int off, minoff;

	off = NDSLOT(fdp->fd_nfiles - 1);
	for (minoff = NDSLOT(0); off >= minoff; --off)
		if (map[off] != 0)
			return (off * NDENTRIES + flsl(map[off]) - 1);
	return (-1);
}

int
fdlastfile(struct filedesc *fdp)
{

	FILEDESC_LOCK_ASSERT(fdp);
	return (fdlastfile_single(fdp));
}

static int
fdisused(struct filedesc *fdp, int fd)
{

	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));

	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
}

/*
 * Mark a file descriptor as used.
 */
static void
fdused_init(struct filedesc *fdp, int fd)
{

	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));

	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
}

static void
fdused(struct filedesc *fdp, int fd)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	fdused_init(fdp, fd);
	if (fd == fdp->fd_freefile)
		fdp->fd_freefile++;
}

/*
 * Mark a file descriptor as unused.
 */
static void
fdunused(struct filedesc *fdp, int fd)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
	    ("fd=%d is still in use", fd));

	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
}

/*
 * Free a file descriptor.
 *
 * Avoid some work if fdp is about to be destroyed.
 */
static inline void
fdefree_last(struct filedescent *fde)
{

	filecaps_free(&fde->fde_caps);
}

static inline void
fdfree(struct filedesc *fdp, int fd)
{
	struct filedescent *fde;

	FILEDESC_XLOCK_ASSERT(fdp);
	fde = &fdp->fd_ofiles[fd];
#ifdef CAPABILITIES
	seqc_write_begin(&fde->fde_seqc);
#endif
	fde->fde_file = NULL;
#ifdef CAPABILITIES
	seqc_write_end(&fde->fde_seqc);
#endif
	fdefree_last(fde);
	fdunused(fdp, fd);
}

/*
 * System calls on descriptors.
 */
#ifndef _SYS_SYSPROTO_H_
struct getdtablesize_args {
	int	dummy;
};
#endif
/* ARGSUSED */
int
sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
{
#ifdef	RACCT
	uint64_t lim;
#endif

	td->td_retval[0] = getmaxfd(td);
#ifdef	RACCT
	PROC_LOCK(td->td_proc);
	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
	PROC_UNLOCK(td->td_proc);
	if (lim < td->td_retval[0])
		td->td_retval[0] = lim;
#endif
	return (0);
}

/*
 * Duplicate a file descriptor to a particular value.
 *
 * Note: keep in mind that a potential race condition exists when closing
 * descriptors from a shared descriptor table (via rfork).
 */
#ifndef _SYS_SYSPROTO_H_
struct dup2_args {
	u_int	from;
	u_int	to;
};
#endif
/* ARGSUSED */
int
sys_dup2(struct thread *td, struct dup2_args *uap)
{

	return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
}
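
/*
 * Summary of how the dup-family entry points here and in kern_fcntl()
 * map onto kern_dup() (illustrative, derived from the callers in this
 * file):
 *
 *	dup(fd)				kern_dup(td, FDDUP_NORMAL, 0, fd, 0)
 *	dup2(from, to)			kern_dup(td, FDDUP_FIXED, 0, from, to)
 *	fcntl(fd, F_DUPFD, arg)		kern_dup(td, FDDUP_FCNTL, 0, fd, arg)
 *	fcntl(fd, F_DUPFD_CLOEXEC, arg)	kern_dup(td, FDDUP_FCNTL,
 *					    FDDUP_FLAG_CLOEXEC, fd, arg)
 */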

/*
 * Duplicate a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct dup_args {
	u_int	fd;
};
#endif
/* ARGSUSED */
int
sys_dup(struct thread *td, struct dup_args *uap)
{

	return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
}

/*
 * The file control system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
	int	fd;
	int	cmd;
	long	arg;
};
#endif
/* ARGSUSED */
int
sys_fcntl(struct thread *td, struct fcntl_args *uap)
{

	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
}

int
kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
{
	struct flock fl;
	struct __oflock ofl;
	intptr_t arg1;
	int error, newcmd;

	error = 0;
	newcmd = cmd;
	switch (cmd) {
	case F_OGETLK:
	case F_OSETLK:
	case F_OSETLKW:
		/*
		 * Convert old flock structure to new.
		 */
		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
		fl.l_start = ofl.l_start;
		fl.l_len = ofl.l_len;
		fl.l_pid = ofl.l_pid;
		fl.l_type = ofl.l_type;
		fl.l_whence = ofl.l_whence;
		fl.l_sysid = 0;

		switch (cmd) {
		case F_OGETLK:
			newcmd = F_GETLK;
			break;
		case F_OSETLK:
			newcmd = F_SETLK;
			break;
		case F_OSETLKW:
			newcmd = F_SETLKW;
			break;
		}
		arg1 = (intptr_t)&fl;
		break;
	case F_GETLK:
	case F_SETLK:
	case F_SETLKW:
	case F_SETLK_REMOTE:
		error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
		arg1 = (intptr_t)&fl;
		break;
	default:
		arg1 = arg;
		break;
	}
	if (error)
		return (error);
	error = kern_fcntl(td, fd, newcmd, arg1);
	if (error)
		return (error);
	if (cmd == F_OGETLK) {
		ofl.l_start = fl.l_start;
		ofl.l_len = fl.l_len;
		ofl.l_pid = fl.l_pid;
		ofl.l_type = fl.l_type;
		ofl.l_whence = fl.l_whence;
		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
	} else if (cmd == F_GETLK) {
		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
	}
	return (error);
}

int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct filedesc *fdp;
	struct flock *flp;
	struct file *fp, *fp2;
	struct filedescent *fde;
	struct proc *p;
	struct vnode *vp;
	struct mount *mp;
	struct kinfo_file *kif;
	int error, flg, kif_sz, seals, tmp, got_set, got_cleared;
	uint64_t bsize;
	off_t foffset;

	error = 0;
	flg = F_POSIX;
	p = td->td_proc;
	fdp = p->p_fd;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(cmd);
	switch (cmd) {
	case F_DUPFD:
		tmp = arg;
		error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
		break;

	case F_DUPFD_CLOEXEC:
		tmp = arg;
		error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
		break;

	case F_DUP2FD:
		tmp = arg;
		error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
		break;

	case F_DUP2FD_CLOEXEC:
		tmp = arg;
		error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
		break;
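
	/*
	 * Illustrative note on the two cases below: the userlevel
	 * FD_CLOEXEC flag is stored as UF_EXCLOSE in fde_flags, so
	 * fcntl(fd, F_SETFD, FD_CLOEXEC) marks the entry close-on-exec
	 * and F_GETFD translates UF_EXCLOSE back to FD_CLOEXEC for the
	 * caller.
	 */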

	case F_GETFD:
		error = EBADF;
		FILEDESC_SLOCK(fdp);
		fde = fdeget_noref(fdp, fd);
		if (fde != NULL) {
			td->td_retval[0] =
			    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
			error = 0;
		}
		FILEDESC_SUNLOCK(fdp);
		break;

	case F_SETFD:
		error = EBADF;
		FILEDESC_XLOCK(fdp);
		fde = fdeget_noref(fdp, fd);
		if (fde != NULL) {
			fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
			    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
			error = 0;
		}
		FILEDESC_XUNLOCK(fdp);
		break;

	case F_GETFL:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
		if (error != 0)
			break;
		td->td_retval[0] = OFLAGS(fp->f_flag);
		fdrop(fp, td);
		break;

	case F_SETFL:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
		if (error != 0)
			break;
		if (fp->f_ops == &path_fileops) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		do {
			tmp = flg = fp->f_flag;
			tmp &= ~FCNTLFLAGS;
			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
		} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
		got_set = tmp & ~flg;
		got_cleared = flg & ~tmp;
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		if (error != 0)
			goto revert_f_setfl;
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		if (error == 0) {
			fdrop(fp, td);
			break;
		}
		atomic_clear_int(&fp->f_flag, FNONBLOCK);
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
revert_f_setfl:
		do {
			tmp = flg = fp->f_flag;
			tmp &= ~FCNTLFLAGS;
			tmp |= got_cleared;
			tmp &= ~got_set;
		} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
		fdrop(fp, td);
		break;

	case F_GETOWN:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
		if (error != 0)
			break;
		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
		fdrop(fp, td);
		break;

	case F_SETOWN:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
		if (error != 0)
			break;
		tmp = arg;
		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_SETLK_REMOTE:
		error = priv_check(td, PRIV_NFS_LOCKD);
		if (error != 0)
			return (error);
		flg = F_REMOTE;
		goto do_setlk;

	case F_SETLKW:
		flg |= F_WAIT;
		/* FALLTHROUGH F_SETLK */

	case F_SETLK:
	do_setlk:
		flp = (struct flock *)arg;
		if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
			error = EINVAL;
			break;
		}

		error = fget_unlocked(td, fd, &cap_flock_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			error = EBADF;
			fdrop(fp, td);
			break;
		}

		if (flp->l_whence == SEEK_CUR) {
			foffset = foffset_get(fp);
			if (foffset < 0 ||
			    (flp->l_start > 0 &&
			     foffset > OFF_MAX - flp->l_start)) {
				error = EOVERFLOW;
				fdrop(fp, td);
				break;
			}
			flp->l_start += foffset;
		}

		vp = fp->f_vnode;
		switch (flp->l_type) {
		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
				PROC_LOCK(p->p_leader);
				p->p_leader->p_flag |= P_ADVLOCK;
				PROC_UNLOCK(p->p_leader);
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
				PROC_LOCK(p->p_leader);
				p->p_leader->p_flag |= P_ADVLOCK;
				PROC_UNLOCK(p->p_leader);
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
			    flp, flg);
			break;
		case F_UNLCKSYS:
			if (flg != F_REMOTE) {
				error = EINVAL;
				break;
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
			    F_UNLCKSYS, flp, flg);
			break;
		default:
			error = EINVAL;
			break;
		}
		if (error != 0 || flp->l_type == F_UNLCK ||
		    flp->l_type == F_UNLCKSYS) {
			fdrop(fp, td);
			break;
		}

		/*
		 * Check for a race with close.
		 *
		 * The vnode is now advisory locked (or unlocked, but this case
		 * is not really important) as the caller requested.
		 * We had to drop the filedesc lock, so we need to recheck if
		 * the descriptor is still valid, because if it was closed
		 * in the meantime we need to remove advisory lock from the
		 * vnode - close on any descriptor leading to an advisory
		 * locked vnode, removes that lock.
		 * We will return 0 on purpose in that case, as the result of
		 * successful advisory lock might have been externally visible
		 * already. This is fine - effectively we pretend to the caller
		 * that the closing thread was a bit slower and that the
		 * advisory lock succeeded before the close.
		 */
		error = fget_unlocked(td, fd, &cap_no_rights, &fp2);
		if (error != 0) {
			fdrop(fp, td);
			break;
		}
		if (fp != fp2) {
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
			    F_UNLCK, flp, F_POSIX);
		}
		fdrop(fp, td);
		fdrop(fp2, td);
		break;

	case F_GETLK:
		error = fget_unlocked(td, fd, &cap_flock_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			error = EBADF;
			fdrop(fp, td);
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
		    flp->l_type != F_UNLCK) {
			error = EINVAL;
			fdrop(fp, td);
			break;
		}
		if (flp->l_whence == SEEK_CUR) {
			foffset = foffset_get(fp);
			if ((flp->l_start > 0 &&
			     foffset > OFF_MAX - flp->l_start) ||
			    (flp->l_start < 0 &&
			     foffset < OFF_MIN - flp->l_start)) {
				error = EOVERFLOW;
				fdrop(fp, td);
				break;
			}
			flp->l_start += foffset;
		}
		vp = fp->f_vnode;
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
		    F_POSIX);
		fdrop(fp, td);
		break;

	case F_ADD_SEALS:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		error = fo_add_seals(fp, arg);
		fdrop(fp, td);
		break;

	case F_GET_SEALS:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fo_get_seals(fp, &seals) == 0)
			td->td_retval[0] = seals;
		else
			error = EINVAL;
		fdrop(fp, td);
		break;
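
	/*
	 * Illustrative numbers for the readahead math in the cases below:
	 * with an f_iosize of 32KB and an argument of 128KB (the F_RDAHEAD
	 * default), f_seqcount[UIO_READ] becomes (131072 + 32767) / 32768,
	 * i.e. 4 blocks of readahead, assuming IO_SEQMAX is at least 4.
	 */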
	case F_RDAHEAD:
		arg = arg ? 128 * 1024 : 0;
		/* FALLTHROUGH */
	case F_READAHEAD:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		vp = fp->f_vnode;
		if (vp->v_type != VREG) {
			fdrop(fp, td);
			error = ENOTTY;
			break;
		}

		/*
		 * Exclusive lock synchronizes against f_seqcount reads and
		 * writes in sequential_heuristic().
		 */
		error = vn_lock(vp, LK_EXCLUSIVE);
		if (error != 0) {
			fdrop(fp, td);
			break;
		}
		if (arg >= 0) {
			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
			arg = MIN(arg, INT_MAX - bsize + 1);
			fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX,
			    (arg + bsize - 1) / bsize);
			atomic_set_int(&fp->f_flag, FRDAHEAD);
		} else {
			atomic_clear_int(&fp->f_flag, FRDAHEAD);
		}
		VOP_UNLOCK(vp);
		fdrop(fp, td);
		break;

	case F_ISUNIONSTACK:
		/*
		 * Check if the vnode is part of a union stack (either the
		 * "union" flag from mount(2) or unionfs).
		 *
		 * Prior to introduction of this op libc's readdir would call
		 * fstatfs(2), in effect unnecessarily copying kilobytes of
		 * data just to check fs name and a mount flag.
		 *
		 * Fixing the code to handle everything in the kernel instead
		 * is a non-trivial endeavor and has low priority, thus this
		 * horrible kludge facilitates the current behavior in a much
		 * cheaper manner until someone(tm) sorts this out.
		 */
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		vp = fp->f_vnode;
		/*
		 * Since we don't prevent dooming the vnode, even a non-NULL
		 * mp found here can become stale immediately.  This is
		 * tolerable since mount points are type-stable (providing
		 * safe memory access) and any vfs op on this vnode going
		 * forward will return an error (meaning the return value in
		 * this case is meaningless).
		 */
		mp = atomic_load_ptr(&vp->v_mount);
		if (__predict_false(mp == NULL)) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		td->td_retval[0] = 0;
		if (mp->mnt_kern_flag & MNTK_UNIONFS ||
		    mp->mnt_flag & MNT_UNION)
			td->td_retval[0] = 1;
		fdrop(fp, td);
		break;

	case F_KINFO:
#ifdef CAPABILITY_MODE
		if (IN_CAPABILITY_MODE(td)) {
			error = ECAPMODE;
			break;
		}
#endif
		error = copyin((void *)arg, &kif_sz, sizeof(kif_sz));
		if (error != 0)
			break;
		if (kif_sz != sizeof(*kif)) {
			error = EINVAL;
			break;
		}
		kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK | M_ZERO);
		FILEDESC_SLOCK(fdp);
		error = fget_cap_noref(fdp, fd, &cap_fcntl_rights, &fp, NULL);
		if (error == 0 && fhold(fp)) {
			export_file_to_kinfo(fp, fd, NULL, kif, fdp, 0);
			FILEDESC_SUNLOCK(fdp);
			fdrop(fp, td);
			if ((kif->kf_status & KF_ATTR_VALID) != 0) {
				kif->kf_structsize = sizeof(*kif);
				error = copyout(kif, (void *)arg, sizeof(*kif));
			} else {
				error = EBADF;
			}
		} else {
			FILEDESC_SUNLOCK(fdp);
			if (error == 0)
				error = EBADF;
		}
		free(kif, M_TEMP);
		break;

	default:
		error = EINVAL;
		break;
	}
	return (error);
}

static int
getmaxfd(struct thread *td)
{

	return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
}
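
/*
 * Example (illustrative): with RLIMIT_NOFILE set to 1024 and
 * kern.maxfilesperproc left at a larger default, getmaxfd() returns 1024;
 * raising the rlimit past maxfilesperproc has no further effect, as the
 * smaller of the two values always wins.
 */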

/*
 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
 */
int
kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
{
	struct filedesc *fdp;
	struct filedescent *oldfde, *newfde;
	struct proc *p;
	struct file *delfp, *oldfp;
	u_long *oioctls, *nioctls;
	int error, maxfd;

	p = td->td_proc;
	fdp = p->p_fd;
	oioctls = NULL;

	MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
	MPASS(mode < FDDUP_LASTMODE);

	AUDIT_ARG_FD(old);
	/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
	 * return EINVAL when the new descriptor is out of bounds.
	 */
	if (old < 0)
		return (EBADF);
	if (new < 0)
		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
	maxfd = getmaxfd(td);
	if (new >= maxfd)
		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);

	error = EBADF;
	FILEDESC_XLOCK(fdp);
	if (fget_noref(fdp, old) == NULL)
		goto unlock;
	if (mode == FDDUP_FIXED && old == new) {
		td->td_retval[0] = new;
		if (flags & FDDUP_FLAG_CLOEXEC)
			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
		error = 0;
		goto unlock;
	}

	oldfde = &fdp->fd_ofiles[old];
	oldfp = oldfde->fde_file;
	if (!fhold(oldfp))
		goto unlock;

	/*
	 * If the caller specified a file descriptor, make sure the file
	 * table is large enough to hold it, and grab it.  Otherwise, just
	 * allocate a new descriptor the usual way.
	 */
	switch (mode) {
	case FDDUP_NORMAL:
	case FDDUP_FCNTL:
		if ((error = fdalloc(td, new, &new)) != 0) {
			fdrop(oldfp, td);
			goto unlock;
		}
		break;
	case FDDUP_FIXED:
		if (new >= fdp->fd_nfiles) {
			/*
			 * The resource limits are here instead of e.g.
			 * fdalloc(), because the file descriptor table may be
			 * shared between processes, so we can't really use
			 * racct_add()/racct_sub(). Instead of counting the
			 * number of actually allocated descriptors, just put
			 * the limit on the size of the file descriptor table.
			 */
#ifdef RACCT
			if (RACCT_ENABLED()) {
				error = racct_set_unlocked(p, RACCT_NOFILE, new + 1);
				if (error != 0) {
					error = EMFILE;
					fdrop(oldfp, td);
					goto unlock;
				}
			}
#endif
			fdgrowtable_exp(fdp, new + 1);
		}
		if (!fdisused(fdp, new))
			fdused(fdp, new);
		break;
	default:
		KASSERT(0, ("%s unsupported mode %d", __func__, mode));
	}

	KASSERT(old != new, ("new fd is same as old"));

	/* Refetch oldfde because the table may have grown and old one freed. */
	oldfde = &fdp->fd_ofiles[old];
	KASSERT(oldfp == oldfde->fde_file,
	    ("fdt_ofiles shift from growth observed at fd %d",
	    old));

	newfde = &fdp->fd_ofiles[new];
	delfp = newfde->fde_file;

	nioctls = filecaps_copy_prep(&oldfde->fde_caps);

	/*
	 * Duplicate the source descriptor.
	 */
#ifdef CAPABILITIES
	seqc_write_begin(&newfde->fde_seqc);
#endif
	oioctls = filecaps_free_prep(&newfde->fde_caps);
	fde_copy(oldfde, newfde);
	filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
	    nioctls);
	if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
	else
		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
#ifdef CAPABILITIES
	seqc_write_end(&newfde->fde_seqc);
#endif
	td->td_retval[0] = new;

	error = 0;

	if (delfp != NULL) {
		(void) closefp(fdp, new, delfp, td, true, false);
		FILEDESC_UNLOCK_ASSERT(fdp);
	} else {
unlock:
		FILEDESC_XUNLOCK(fdp);
	}

	filecaps_free_finish(oioctls);
	return (error);
}

static void
sigiofree(struct sigio *sigio)
{
	crfree(sigio->sio_ucred);
	free(sigio, M_SIGIO);
}

static struct sigio *
funsetown_locked(struct sigio *sigio)
{
	struct proc *p;
	struct pgrp *pg;

	SIGIO_ASSERT_LOCKED();

	if (sigio == NULL)
		return (NULL);
	*sigio->sio_myref = NULL;
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		PGRP_LOCK(pg);
		SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio);
		PGRP_UNLOCK(pg);
	} else {
		p = sigio->sio_proc;
		PROC_LOCK(p);
		SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio);
		PROC_UNLOCK(p);
	}
	return (sigio);
}

/*
 * If sigio is on the list associated with a process or process group,
 * disable signalling from the device, remove sigio from the list and
 * free sigio.
 */
void
funsetown(struct sigio **sigiop)
{
	struct sigio *sigio;

	/* Racy check, consumers must provide synchronization. */
	if (*sigiop == NULL)
		return;

	SIGIO_LOCK();
	sigio = funsetown_locked(*sigiop);
	SIGIO_UNLOCK();
	if (sigio != NULL)
		sigiofree(sigio);
}

/*
 * Free a list of sigio structures.  The caller must ensure that new sigio
 * structures cannot be added after this point.  For process groups this is
 * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves
 * as an interlock.
 */
void
funsetownlst(struct sigiolst *sigiolst)
{
	struct proc *p;
	struct pgrp *pg;
	struct sigio *sigio, *tmp;

	/* Racy check. */
	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL)
		return;

	p = NULL;
	pg = NULL;

	SIGIO_LOCK();
	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL) {
		SIGIO_UNLOCK();
		return;
	}

	/*
	 * Every entry of the list should belong to a single proc or pgrp.
	 */
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		sx_assert(&proctree_lock, SX_XLOCKED);
		PGRP_LOCK(pg);
	} else /* if (sigio->sio_pgid > 0) */ {
		p = sigio->sio_proc;
		PROC_LOCK(p);
		KASSERT((p->p_flag & P_WEXIT) != 0,
		    ("%s: process %p is not exiting", __func__, p));
	}

	SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) {
		*sigio->sio_myref = NULL;
		if (pg != NULL) {
			KASSERT(sigio->sio_pgid < 0,
			    ("Proc sigio in pgrp sigio list"));
			KASSERT(sigio->sio_pgrp == pg,
			    ("Bogus pgrp in sigio list"));
		} else /* if (p != NULL) */ {
			KASSERT(sigio->sio_pgid > 0,
			    ("Pgrp sigio in proc sigio list"));
			KASSERT(sigio->sio_proc == p,
			    ("Bogus proc in sigio list"));
		}
	}

	if (pg != NULL)
		PGRP_UNLOCK(pg);
	else
		PROC_UNLOCK(p);
	SIGIO_UNLOCK();

	SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp)
		sigiofree(sigio);
}

/*
 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 *
 * After permission checking, add a sigio structure to the sigio list for
 * the process or process group.
 */
int
fsetown(pid_t pgid, struct sigio **sigiop)
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *osigio, *sigio;
	int ret;

	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	ret = 0;
	if (pgid > 0) {
		ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc);
		SIGIO_LOCK();
		osigio = funsetown_locked(*sigiop);
		if (ret == 0) {
			PROC_LOCK(proc);
			_PRELE(proc);
			if ((proc->p_flag & P_WEXIT) != 0) {
				ret = ESRCH;
			} else if (proc->p_session !=
			    curthread->td_proc->p_session) {
				/*
				 * Policy - Don't allow a process to FSETOWN a
				 * process in another session.
				 *
				 * Remove this test to allow maximum flexibility
				 * or restrict FSETOWN to the current process or
				 * process group for maximum safety.
				 */
				ret = EPERM;
			} else {
				sigio->sio_proc = proc;
				SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio,
				    sio_pgsigio);
			}
			PROC_UNLOCK(proc);
		}
	} else /* if (pgid < 0) */ {
		sx_slock(&proctree_lock);
		SIGIO_LOCK();
		osigio = funsetown_locked(*sigiop);
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
		} else {
			if (pgrp->pg_session != curthread->td_proc->p_session) {
				/*
				 * Policy - Don't allow a process to FSETOWN a
				 * process in another session.
				 *
				 * Remove this test to allow maximum flexibility
				 * or restrict FSETOWN to the current process or
				 * process group for maximum safety.
				 */
				ret = EPERM;
			} else {
				sigio->sio_pgrp = pgrp;
				SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio,
				    sio_pgsigio);
			}
			PGRP_UNLOCK(pgrp);
		}
		sx_sunlock(&proctree_lock);
	}
	if (ret == 0)
		*sigiop = sigio;
	SIGIO_UNLOCK();
	if (osigio != NULL)
		sigiofree(osigio);
	return (ret);
}

/*
 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
 */
pid_t
fgetown(struct sigio **sigiop)
{
	pid_t pgid;

	SIGIO_LOCK();
	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
	SIGIO_UNLOCK();
	return (pgid);
}
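
/*
 * Usage sketch (illustrative): fcntl(fd, F_SETOWN, pid) reaches fsetown()
 * with a positive pgid and targets that process, while a negative value
 * (-pgid) targets the process group, as fsetown() distinguishes the two
 * cases by the sign of pgid.  fgetown() hands the same signed value back
 * to F_GETOWN callers.
 */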

static int
closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool audit)
{
	int error;

	FILEDESC_XLOCK_ASSERT(fdp);

	/*
	 * We now hold the fp reference that used to be owned by the
	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
	 * knote_fdclose to prevent a race of the fd getting opened, a knote
	 * added, and deleting a knote for the new fd.
	 */
	if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist)))
		knote_fdclose(td, fd);

	/*
	 * We need to notify mqueue if the object is of type mqueue.
	 */
	if (__predict_false(fp->f_type == DTYPE_MQUEUE))
		mq_fdclose(td, fd, fp);
	FILEDESC_XUNLOCK(fdp);

#ifdef AUDIT
	if (AUDITING_TD(td) && audit)
		audit_sysclose(td, fd, fp);
#endif
	error = closef(fp, td);

	/*
	 * All paths leading up to closefp() will have already removed or
	 * replaced the fd in the filedesc table, so a restart would not
	 * operate on the same file.
	 */
	if (error == ERESTART)
		error = EINTR;

	return (error);
}

static int
closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool holdleaders, bool audit)
{
	int error;

	FILEDESC_XLOCK_ASSERT(fdp);

	if (holdleaders) {
		if (td->td_proc->p_fdtol != NULL) {
			/*
			 * Ask fdfree() to sleep to ensure that all relevant
			 * process leaders can be traversed in closef().
			 */
			fdp->fd_holdleaderscount++;
		} else {
			holdleaders = false;
		}
	}

	error = closefp_impl(fdp, fd, fp, td, audit);
	if (holdleaders) {
		FILEDESC_XLOCK(fdp);
		fdp->fd_holdleaderscount--;
		if (fdp->fd_holdleaderscount == 0 &&
		    fdp->fd_holdleaderswakeup != 0) {
			fdp->fd_holdleaderswakeup = 0;
			wakeup(&fdp->fd_holdleaderscount);
		}
		FILEDESC_XUNLOCK(fdp);
	}
	return (error);
}

static int
closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool holdleaders, bool audit)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	if (__predict_false(td->td_proc->p_fdtol != NULL)) {
		return (closefp_hl(fdp, fd, fp, td, holdleaders, audit));
	} else {
		return (closefp_impl(fdp, fd, fp, td, audit));
	}
}

/*
 * Close a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int	fd;
};
#endif
/* ARGSUSED */
int
sys_close(struct thread *td, struct close_args *uap)
{

	return (kern_close(td, uap->fd));
}

int
kern_close(struct thread *td, int fd)
{
	struct filedesc *fdp;
	struct file *fp;

	fdp = td->td_proc->p_fd;

	FILEDESC_XLOCK(fdp);
	if ((fp = fget_noref(fdp, fd)) == NULL) {
		FILEDESC_XUNLOCK(fdp);
		return (EBADF);
	}
	fdfree(fdp, fd);

	/* closefp() drops the FILEDESC lock for us. */
	return (closefp(fdp, fd, fp, td, true, true));
}
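
/*
 * Example (illustrative): close_range(3, ~0U, 0) closes every descriptor
 * from 3 upward via close_range_impl(), while passing CLOSE_RANGE_CLOEXEC
 * instead merely marks those descriptors UF_EXCLOSE in
 * close_range_cloexec() below, deferring the actual close to execve(2).
 */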

static int
close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd)
{
	struct filedesc *fdp;
	struct fdescenttbl *fdt;
	struct filedescent *fde;
	int fd;

	fdp = td->td_proc->p_fd;
	FILEDESC_XLOCK(fdp);
	fdt = atomic_load_ptr(&fdp->fd_files);
	highfd = MIN(highfd, fdt->fdt_nfiles - 1);
	fd = lowfd;
	if (__predict_false(fd > highfd)) {
		goto out_locked;
	}
	for (; fd <= highfd; fd++) {
		fde = &fdt->fdt_ofiles[fd];
		if (fde->fde_file != NULL)
			fde->fde_flags |= UF_EXCLOSE;
	}
out_locked:
	FILEDESC_XUNLOCK(fdp);
	return (0);
}

static int
close_range_impl(struct thread *td, u_int lowfd, u_int highfd)
{
	struct filedesc *fdp;
	const struct fdescenttbl *fdt;
	struct file *fp;
	int fd;

	fdp = td->td_proc->p_fd;
	FILEDESC_XLOCK(fdp);
	fdt = atomic_load_ptr(&fdp->fd_files);
	highfd = MIN(highfd, fdt->fdt_nfiles - 1);
	fd = lowfd;
	if (__predict_false(fd > highfd)) {
		goto out_locked;
	}
	for (;;) {
		fp = fdt->fdt_ofiles[fd].fde_file;
		if (fp == NULL) {
			if (fd == highfd)
				goto out_locked;
		} else {
			fdfree(fdp, fd);
			(void) closefp(fdp, fd, fp, td, true, true);
			if (fd == highfd)
				goto out_unlocked;
			FILEDESC_XLOCK(fdp);
			fdt = atomic_load_ptr(&fdp->fd_files);
		}
		fd++;
	}
out_locked:
	FILEDESC_XUNLOCK(fdp);
out_unlocked:
	return (0);
}

int
kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd)
{

	/*
	 * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2
	 * open should not be a usage error.  From a close_range() perspective,
	 * close_range(3, ~0U, 0) in the same scenario should also likely not
	 * be a usage error as all fds from 3 up are in fact already closed.
	 */
	if (highfd < lowfd) {
		return (EINVAL);
	}

	if ((flags & CLOSE_RANGE_CLOEXEC) != 0)
		return (close_range_cloexec(td, lowfd, highfd));

	return (close_range_impl(td, lowfd, highfd));
}

#ifndef _SYS_SYSPROTO_H_
struct close_range_args {
	u_int	lowfd;
	u_int	highfd;
	int	flags;
};
#endif
int
sys_close_range(struct thread *td, struct close_range_args *uap)
{

	AUDIT_ARG_FD(uap->lowfd);
	AUDIT_ARG_CMD(uap->highfd);
	AUDIT_ARG_FFLAGS(uap->flags);

	if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC)) != 0)
		return (EINVAL);
	return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd));
}

#ifdef COMPAT_FREEBSD12
/*
 * Close open file descriptors.
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd12_closefrom_args {
	int	lowfd;
};
#endif
/* ARGSUSED */
int
freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap)
{
	u_int lowfd;

	AUDIT_ARG_FD(uap->lowfd);

	/*
	 * Treat negative starting file descriptor values identical to
	 * closefrom(0) which closes all files.
	 */
	lowfd = MAX(0, uap->lowfd);
	return (kern_close_range(td, 0, lowfd, ~0U));
}
#endif	/* COMPAT_FREEBSD12 */

#if defined(COMPAT_43)
/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct ostat oub;
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error == 0) {
		cvtstat(&ub, &oub);
		error = copyout(&oub, uap->sb, sizeof(oub));
	}
	return (error);
}
#endif /* COMPAT_43 */

#if defined(COMPAT_FREEBSD11)
int
freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap)
{
	struct stat sb;
	struct freebsd11_stat osb;
	int error;

	error = kern_fstat(td, uap->fd, &sb);
	if (error != 0)
		return (error);
	error = freebsd11_cvtstat(&sb, &osb);
	if (error == 0)
		error = copyout(&osb, uap->sb, sizeof(osb));
	return (error);
}
#endif	/* COMPAT_FREEBSD11 */

/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct fstat_args {
	int	fd;
	struct	stat *sb;
};
#endif
/* ARGSUSED */
int
sys_fstat(struct thread *td, struct fstat_args *uap)
{
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error == 0)
		error = copyout(&ub, uap->sb, sizeof(ub));
	return (error);
}

int
kern_fstat(struct thread *td, int fd, struct stat *sbp)
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);

	error = fget(td, fd, &cap_fstat_rights, &fp);
	if (__predict_false(error != 0))
		return (error);

	AUDIT_ARG_FILE(td->td_proc, fp);

	error = fo_stat(fp, sbp, td->td_ucred);
	fdrop(fp, td);
#ifdef __STAT_TIME_T_EXT
	sbp->st_atim_ext = 0;
	sbp->st_mtim_ext = 0;
	sbp->st_ctim_ext = 0;
	sbp->st_btim_ext = 0;
#endif
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrstat_error(sbp, error);
#endif
	return (error);
}

#if defined(COMPAT_FREEBSD11)
/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd11_nfstat_args {
	int	fd;
	struct	nstat *sb;
};
#endif
/* ARGSUSED */
int
freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap)
{
	struct nstat nub;
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error != 0)
		return (error);
	error = freebsd11_cvtnstat(&ub, &nub);
	if (error == 0)
		error = copyout(&nub, uap->sb, sizeof(nub));
	return (error);
}
#endif /* COMPAT_FREEBSD11 */
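
/*
 * Illustrative note on the compat shims above: sys_fstat() and its
 * predecessors all funnel into kern_fstat(), which performs the
 * fget()/fo_stat()/fdrop() sequence exactly once; only the userland
 * structure conversion (cvtstat, freebsd11_cvtstat, freebsd11_cvtnstat)
 * differs per ABI.
 */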

/*
 * Return pathconf information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct fpathconf_args {
	int	fd;
	int	name;
};
#endif
/* ARGSUSED */
int
sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
{
	long value;
	int error;

	error = kern_fpathconf(td, uap->fd, uap->name, &value);
	if (error == 0)
		td->td_retval[0] = value;
	return (error);
}

int
kern_fpathconf(struct thread *td, int fd, int name, long *valuep)
{
	struct file *fp;
	struct vnode *vp;
	int error;

	error = fget(td, fd, &cap_fpathconf_rights, &fp);
	if (error != 0)
		return (error);

	if (name == _PC_ASYNC_IO) {
		*valuep = _POSIX_ASYNCHRONOUS_IO;
		goto out;
	}
	vp = fp->f_vnode;
	if (vp != NULL) {
		vn_lock(vp, LK_SHARED | LK_RETRY);
		error = VOP_PATHCONF(vp, name, valuep);
		VOP_UNLOCK(vp);
	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
		if (name != _PC_PIPE_BUF) {
			error = EINVAL;
		} else {
			*valuep = PIPE_BUF;
			error = 0;
		}
	} else {
		error = EOPNOTSUPP;
	}
out:
	fdrop(fp, td);
	return (error);
}

/*
 * Copy filecaps structure allocating memory for ioctls array if needed.
 *
 * The last parameter indicates whether the fdtable is locked. If it is not and
 * ioctls are encountered, copying fails and the caller must lock the table.
 *
 * Note that if the table was not locked, the caller has to check the relevant
 * sequence counter to determine whether the operation was successful.
 */
bool
filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
{
	size_t size;

	if (src->fc_ioctls != NULL && !locked)
		return (false);
	memcpy(dst, src, sizeof(*src));
	if (src->fc_ioctls == NULL)
		return (true);

	KASSERT(src->fc_nioctls > 0,
	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
	memcpy(dst->fc_ioctls, src->fc_ioctls, size);
	return (true);
}

static u_long *
filecaps_copy_prep(const struct filecaps *src)
{
	u_long *ioctls;
	size_t size;

	if (__predict_true(src->fc_ioctls == NULL))
		return (NULL);

	KASSERT(src->fc_nioctls > 0,
	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	ioctls = malloc(size, M_FILECAPS, M_WAITOK);
	return (ioctls);
}

static void
filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst,
    u_long *ioctls)
{
	size_t size;

	*dst = *src;
	if (__predict_true(src->fc_ioctls == NULL)) {
		MPASS(ioctls == NULL);
		return;
	}

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	dst->fc_ioctls = ioctls;
	bcopy(src->fc_ioctls, dst->fc_ioctls, size);
}

/*
 * Move filecaps structure to the new place and clear the old place.
 */
void
filecaps_move(struct filecaps *src, struct filecaps *dst)
{

	*dst = *src;
	bzero(src, sizeof(*src));
}
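
/*
 * Note (illustrative): filecaps_copy() and the prep/finish pair above
 * duplicate the fc_ioctls array, so source and destination can be freed
 * independently; the prep/finish split exists so the M_WAITOK malloc()
 * can happen before the fd table lock is taken.  filecaps_move() instead
 * transfers ownership and leaves the source zeroed.
 */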

/*
 * Fill the given filecaps structure with full rights.
 */
static void
filecaps_fill(struct filecaps *fcaps)
{

	CAP_ALL(&fcaps->fc_rights);
	fcaps->fc_ioctls = NULL;
	fcaps->fc_nioctls = -1;
	fcaps->fc_fcntls = CAP_FCNTL_ALL;
}

/*
 * Free memory allocated within filecaps structure.
 */
static void
filecaps_free_ioctl(struct filecaps *fcaps)
{

	free(fcaps->fc_ioctls, M_FILECAPS);
	fcaps->fc_ioctls = NULL;
}

void
filecaps_free(struct filecaps *fcaps)
{

	filecaps_free_ioctl(fcaps);
	bzero(fcaps, sizeof(*fcaps));
}

static u_long *
filecaps_free_prep(struct filecaps *fcaps)
{
	u_long *ioctls;

	ioctls = fcaps->fc_ioctls;
	bzero(fcaps, sizeof(*fcaps));
	return (ioctls);
}

static void
filecaps_free_finish(u_long *ioctls)
{

	free(ioctls, M_FILECAPS);
}

/*
 * Validate the given filecaps structure.
 */
static void
filecaps_validate(const struct filecaps *fcaps, const char *func)
{

	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
	    ("%s: invalid rights", func));
	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
	    ("%s: invalid fcntls", func));
	KASSERT(fcaps->fc_fcntls == 0 ||
	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
	    ("%s: fcntls without CAP_FCNTL", func));
	/*
	 * open calls without WANTIOCTLCAPS free caps but leave the counter
	 */
#if 0
	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
	    ("%s: invalid ioctls", func));
#endif
	KASSERT(fcaps->fc_nioctls == 0 ||
	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
	    ("%s: ioctls without CAP_IOCTL", func));
}

static void
fdgrowtable_exp(struct filedesc *fdp, int nfd)
{
	int nfd1;

	FILEDESC_XLOCK_ASSERT(fdp);

	nfd1 = fdp->fd_nfiles * 2;
	if (nfd1 < nfd)
		nfd1 = nfd;
	fdgrowtable(fdp, nfd1);
}
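
/*
 * Growth example (illustrative, assuming a 64-bit NDSLOTTYPE): a table of
 * NDFILE (20) slots asked to hold fd 25 is first doubled to 40 by
 * fdgrowtable_exp(), then rounded up to a multiple of NDENTRIES by
 * fdgrowtable() below, yielding 64 slots.
 */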

/*
 * Grow the file table to accommodate (at least) nfd descriptors.
 */
static void
fdgrowtable(struct filedesc *fdp, int nfd)
{
	struct filedesc0 *fdp0;
	struct freetable *ft;
	struct fdescenttbl *ntable;
	struct fdescenttbl *otable;
	int nnfiles, onfiles;
	NDSLOTTYPE *nmap, *omap;

	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));

	/* save old values */
	onfiles = fdp->fd_nfiles;
	otable = fdp->fd_files;
	omap = fdp->fd_map;

	/* compute the size of the new table */
	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
	if (nnfiles <= onfiles)
		/* the table is already large enough */
		return;

	/*
	 * Allocate a new table.  We need enough space for the number of
	 * entries, file entries themselves and the struct freetable we will
	 * use when we decommission the table and place it on the freelist.
	 * We place the struct freetable in the middle so we don't have
	 * to worry about padding.
	 */
	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
	    sizeof(struct freetable),
	    M_FILEDESC, M_ZERO | M_WAITOK);
	/* copy the old data */
	ntable->fdt_nfiles = nnfiles;
	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
	    onfiles * sizeof(ntable->fdt_ofiles[0]));

	/*
	 * Allocate a new map only if the old is not large enough.  It will
	 * grow at a slower rate than the table as it can map more
	 * entries than the table can hold.
	 */
	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
		    M_ZERO | M_WAITOK);
		/* copy over the old data and update the pointer */
		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
		fdp->fd_map = nmap;
	}

	/*
	 * Make sure that ntable is correctly initialized before we replace
	 * the fd_files pointer.  Otherwise fget_unlocked() may see
	 * inconsistent data.
	 */
	atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);

	/*
	 * Free the old file table when not shared by other threads or
	 * processes.  The old file table is considered to be shared when
	 * either is true:
	 * - The process has more than one thread.
	 * - The file descriptor table has been shared via fdshare().
	 *
	 * When shared, the old file table will be placed on a freelist
	 * which will be processed when the struct filedesc is released.
	 *
	 * Note that if onfiles == NDFILE, we're dealing with the original
	 * static allocation contained within (struct filedesc0 *)fdp,
	 * which must not be freed.
	 */
	if (onfiles > NDFILE) {
		/*
		 * Note we may be called here from fdinit while allocating a
		 * table for a new process in which case ->p_fd points
		 * elsewhere.
		 */
		if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) {
			free(otable, M_FILEDESC);
		} else {
			ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
			fdp0 = (struct filedesc0 *)fdp;
			ft->ft_table = otable;
			SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
		}
	}
	/*
	 * The map does not have the same possibility of threads still
	 * holding references to it. So always free it as long as it
	 * does not reference the original static allocation.
	 */
	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
		free(omap, M_FILEDESC);
}

/*
 * Allocate a file descriptor for the process.
 */
int
fdalloc(struct thread *td, int minfd, int *result)
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	int fd, maxfd, allocfd;
#ifdef RACCT
	int error;
#endif

	FILEDESC_XLOCK_ASSERT(fdp);

	if (fdp->fd_freefile > minfd)
		minfd = fdp->fd_freefile;

	maxfd = getmaxfd(td);

	/*
	 * Search the bitmap for a free descriptor starting at minfd.
	 * If none is found, grow the file table.
	 */
	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
	if (__predict_false(fd >= maxfd))
		return (EMFILE);
	if (__predict_false(fd >= fdp->fd_nfiles)) {
		allocfd = min(fd * 2, maxfd);
#ifdef RACCT
		if (RACCT_ENABLED()) {
			error = racct_set_unlocked(p, RACCT_NOFILE, allocfd);
			if (error != 0)
				return (EMFILE);
		}
#endif
		/*
		 * fd is already equal to first free descriptor >= minfd, so
		 * we only need to grow the table and we are done.
		 */
		fdgrowtable_exp(fdp, allocfd);
	}

	/*
	 * Perform some sanity checks, then mark the file descriptor as
	 * used and return it to the caller.
	 */
	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
	    ("invalid descriptor %d", fd));
	KASSERT(!fdisused(fdp, fd),
	    ("fd_first_free() returned non-free descriptor"));
	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
	    ("file descriptor isn't free"));
	fdused(fdp, fd);
	*result = fd;
	return (0);
}
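
/*
 * Illustrative call (hypothetical): fdalloc(td, 3, &fd) returns the
 * lowest free descriptor >= max(3, fd_freefile), growing the table first
 * when the search runs past fd_nfiles, and failing with EMFILE once the
 * RLIMIT_NOFILE/maxfilesperproc ceiling from getmaxfd() is reached.
 */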

/*
 * Allocate n file descriptors for the process.
 */
int
fdallocn(struct thread *td, int minfd, int *fds, int n)
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	int i;

	FILEDESC_XLOCK_ASSERT(fdp);

	for (i = 0; i < n; i++)
		if (fdalloc(td, 0, &fds[i]) != 0)
			break;

	if (i < n) {
		for (i--; i >= 0; i--)
			fdunused(fdp, fds[i]);
		return (EMFILE);
	}

	return (0);
}

/*
 * Create a new open file structure and allocate a file descriptor for the
 * process that refers to it.  We add one reference to the file for the
 * descriptor table and one reference for resultfp. This is to prevent us
 * being preempted and the entry in the descriptor table closed after we
 * release the FILEDESC lock.
 */
int
falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
    struct filecaps *fcaps)
{
	struct file *fp;
	int error, fd;

	MPASS(resultfp != NULL);
	MPASS(resultfd != NULL);

	error = _falloc_noinstall(td, &fp, 2);
	if (__predict_false(error != 0)) {
		return (error);
	}

	error = finstall_refed(td, fp, &fd, flags, fcaps);
	if (__predict_false(error != 0)) {
		falloc_abort(td, fp);
		return (error);
	}

	*resultfp = fp;
	*resultfd = fd;

	return (0);
}

/*
 * Create a new open file structure without allocating a file descriptor.
 */
int
_falloc_noinstall(struct thread *td, struct file **resultfp, u_int n)
{
	struct file *fp;
	int maxuserfiles = maxfiles - (maxfiles / 20);
	int openfiles_new;
	static struct timeval lastfail;
	static int curfail;

	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
	MPASS(n > 0);

	openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
	if ((openfiles_new >= maxuserfiles &&
	    priv_check(td, PRIV_MAXFILES) != 0) ||
	    openfiles_new >= maxfiles) {
		atomic_subtract_int(&openfiles, 1);
		if (ppsratecheck(&lastfail, &curfail, 1)) {
			printf("kern.maxfiles limit exceeded by uid %i, (%s) "
			    "please see tuning(7).\n", td->td_ucred->cr_ruid,
			    td->td_proc->p_comm);
		}
		return (ENFILE);
	}
	fp = uma_zalloc(file_zone, M_WAITOK);
	bzero(fp, sizeof(*fp));
	refcount_init(&fp->f_count, n);
	fp->f_cred = crhold(td->td_ucred);
	fp->f_ops = &badfileops;
	*resultfp = fp;
	return (0);
}

void
falloc_abort(struct thread *td, struct file *fp)
{

	/*
	 * For assertion purposes.
	 */
	refcount_init(&fp->f_count, 0);
	_fdrop(fp, td);
}
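
/*
 * Typical usage (illustrative): falloc_caps() above pairs
 * _falloc_noinstall(td, &fp, 2) with finstall_refed(); the initial
 * refcount of 2 covers the descriptor table's reference and the fp
 * handed back to the caller, and falloc_abort() is the undo path when
 * installation fails.
 */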
2152 */ 2153 void 2154 _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, 2155 struct filecaps *fcaps) 2156 { 2157 struct filedescent *fde; 2158 2159 MPASS(fp != NULL); 2160 if (fcaps != NULL) 2161 filecaps_validate(fcaps, __func__); 2162 FILEDESC_XLOCK_ASSERT(fdp); 2163 2164 fde = &fdp->fd_ofiles[fd]; 2165 #ifdef CAPABILITIES 2166 seqc_write_begin(&fde->fde_seqc); 2167 #endif 2168 fde->fde_file = fp; 2169 fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; 2170 if (fcaps != NULL) 2171 filecaps_move(fcaps, &fde->fde_caps); 2172 else 2173 filecaps_fill(&fde->fde_caps); 2174 #ifdef CAPABILITIES 2175 seqc_write_end(&fde->fde_seqc); 2176 #endif 2177 } 2178 2179 int 2180 finstall_refed(struct thread *td, struct file *fp, int *fd, int flags, 2181 struct filecaps *fcaps) 2182 { 2183 struct filedesc *fdp = td->td_proc->p_fd; 2184 int error; 2185 2186 MPASS(fd != NULL); 2187 2188 FILEDESC_XLOCK(fdp); 2189 error = fdalloc(td, 0, fd); 2190 if (__predict_true(error == 0)) { 2191 _finstall(fdp, fp, *fd, flags, fcaps); 2192 } 2193 FILEDESC_XUNLOCK(fdp); 2194 return (error); 2195 } 2196 2197 int 2198 finstall(struct thread *td, struct file *fp, int *fd, int flags, 2199 struct filecaps *fcaps) 2200 { 2201 int error; 2202 2203 MPASS(fd != NULL); 2204 2205 if (!fhold(fp)) 2206 return (EBADF); 2207 error = finstall_refed(td, fp, fd, flags, fcaps); 2208 if (__predict_false(error != 0)) { 2209 fdrop(fp, td); 2210 } 2211 return (error); 2212 } 2213 2214 /* 2215 * Allocate a new filedesc structure. 2216 * 2217 * The table starts out empty; copying an existing one is fdcopy()'s job. 2218 */ 2219 struct filedesc * 2220 fdinit(void) 2221 { 2222 struct filedesc0 *newfdp0; 2223 struct filedesc *newfdp; 2224 2225 newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); 2226 newfdp = &newfdp0->fd_fd; 2227 2228 /* Create the file descriptor table. */ 2229 FILEDESC_LOCK_INIT(newfdp); 2230 refcount_init(&newfdp->fd_refcnt, 1); 2231 refcount_init(&newfdp->fd_holdcnt, 1); 2232 newfdp->fd_map = newfdp0->fd_dmap; 2233 newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; 2234 newfdp->fd_files->fdt_nfiles = NDFILE; 2235 2236 return (newfdp); 2237 } 2238 2239 /* 2240 * Build a pwddesc structure from another. 2241 * Copy the current, root, and jail root vnode references. 2242 * 2243 * If pdp is not NULL and keeplock is true, return with pdp exclusively locked. 2244 */ 2245 struct pwddesc * 2246 pdinit(struct pwddesc *pdp, bool keeplock) 2247 { 2248 struct pwddesc *newpdp; 2249 struct pwd *newpwd; 2250 2251 newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO); 2252 2253 PWDDESC_LOCK_INIT(newpdp); 2254 refcount_init(&newpdp->pd_refcount, 1); 2255 newpdp->pd_cmask = CMASK; 2256 2257 if (pdp == NULL) { 2258 newpwd = pwd_alloc(); 2259 smr_serialized_store(&newpdp->pd_pwd, newpwd, true); 2260 return (newpdp); 2261 } 2262 2263 PWDDESC_XLOCK(pdp); 2264 newpwd = pwd_hold_pwddesc(pdp); 2265 smr_serialized_store(&newpdp->pd_pwd, newpwd, true); 2266 if (!keeplock) 2267 PWDDESC_XUNLOCK(pdp); 2268 return (newpdp); 2269 } 2270 2271 /* 2272 * Hold either filedesc or pwddesc of the passed process. 2273 * 2274 * The process lock is used to synchronize against the target exiting and 2275 * freeing the data. 2276 * 2277 * Clearing can be illustrated in 3 steps: 2278 * 1. set the pointer to NULL. Either routine can race against it, hence 2279 * atomic_load_ptr. 2280 * 2. observe the process lock as not taken. Until then fdhold/pdhold can 2281 * race to either still see the pointer or find NULL.
It is still safe to 2282 * grab a reference as clearing is stalled. 2283 * 3. after the lock is observed as not taken, any fdhold/pdhold calls are 2284 * guaranteed to see NULL, making it safe to finish clearing. 2285 */ 2286 static struct filedesc * 2287 fdhold(struct proc *p) 2288 { 2289 struct filedesc *fdp; 2290 2291 PROC_LOCK_ASSERT(p, MA_OWNED); 2292 fdp = atomic_load_ptr(&p->p_fd); 2293 if (fdp != NULL) 2294 refcount_acquire(&fdp->fd_holdcnt); 2295 return (fdp); 2296 } 2297 2298 static struct pwddesc * 2299 pdhold(struct proc *p) 2300 { 2301 struct pwddesc *pdp; 2302 2303 PROC_LOCK_ASSERT(p, MA_OWNED); 2304 pdp = atomic_load_ptr(&p->p_pd); 2305 if (pdp != NULL) 2306 refcount_acquire(&pdp->pd_refcount); 2307 return (pdp); 2308 } 2309 2310 static void 2311 fddrop(struct filedesc *fdp) 2312 { 2313 2314 if (refcount_load(&fdp->fd_holdcnt) > 1) { 2315 if (refcount_release(&fdp->fd_holdcnt) == 0) 2316 return; 2317 } 2318 2319 FILEDESC_LOCK_DESTROY(fdp); 2320 uma_zfree(filedesc0_zone, fdp); 2321 } 2322 2323 static void 2324 pddrop(struct pwddesc *pdp) 2325 { 2326 struct pwd *pwd; 2327 2328 if (refcount_release_if_not_last(&pdp->pd_refcount)) 2329 return; 2330 2331 PWDDESC_XLOCK(pdp); 2332 if (refcount_release(&pdp->pd_refcount) == 0) { 2333 PWDDESC_XUNLOCK(pdp); 2334 return; 2335 } 2336 pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 2337 pwd_set(pdp, NULL); 2338 PWDDESC_XUNLOCK(pdp); 2339 pwd_drop(pwd); 2340 2341 PWDDESC_LOCK_DESTROY(pdp); 2342 free(pdp, M_PWDDESC); 2343 } 2344 2345 /* 2346 * Share a filedesc structure. 2347 */ 2348 struct filedesc * 2349 fdshare(struct filedesc *fdp) 2350 { 2351 2352 refcount_acquire(&fdp->fd_refcnt); 2353 return (fdp); 2354 } 2355 2356 /* 2357 * Share a pwddesc structure. 2358 */ 2359 struct pwddesc * 2360 pdshare(struct pwddesc *pdp) 2361 { 2362 refcount_acquire(&pdp->pd_refcount); 2363 return (pdp); 2364 } 2365 2366 /* 2367 * Unshare a filedesc structure, if necessary by making a copy. 2368 */ 2369 void 2370 fdunshare(struct thread *td) 2371 { 2372 struct filedesc *tmp; 2373 struct proc *p = td->td_proc; 2374 2375 if (refcount_load(&p->p_fd->fd_refcnt) == 1) 2376 return; 2377 2378 tmp = fdcopy(p->p_fd); 2379 fdescfree(td); 2380 p->p_fd = tmp; 2381 } 2382 2383 /* 2384 * Unshare a pwddesc structure. 2385 */ 2386 void 2387 pdunshare(struct thread *td) 2388 { 2389 struct pwddesc *pdp; 2390 struct proc *p; 2391 2392 p = td->td_proc; 2393 /* Not shared. */ 2394 if (refcount_load(&p->p_pd->pd_refcount) == 1) 2395 return; 2396 2397 pdp = pdcopy(p->p_pd); 2398 pdescfree(td); 2399 p->p_pd = pdp; 2400 } 2401 2402 /* 2403 * Copy a filedesc structure. The passed table must not be NULL; only 2404 * passable descriptors are duplicated into the new table. 2405 */ 2406 struct filedesc * 2407 fdcopy(struct filedesc *fdp) 2408 { 2409 struct filedesc *newfdp; 2410 struct filedescent *nfde, *ofde; 2411 int i, lastfile; 2412 2413 MPASS(fdp != NULL); 2414 2415 newfdp = fdinit(); 2416 FILEDESC_SLOCK(fdp); 2417 for (;;) { 2418 lastfile = fdlastfile(fdp); 2419 if (lastfile < newfdp->fd_nfiles) 2420 break; 2421 FILEDESC_SUNLOCK(fdp); 2422 fdgrowtable(newfdp, lastfile + 1); 2423 FILEDESC_SLOCK(fdp); 2424 } 2425 /* copy all passable descriptors (i.e.
not kqueue) */ 2426 newfdp->fd_freefile = fdp->fd_freefile; 2427 FILEDESC_FOREACH_FDE(fdp, i, ofde) { 2428 if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || 2429 !fhold(ofde->fde_file)) { 2430 if (newfdp->fd_freefile == fdp->fd_freefile) 2431 newfdp->fd_freefile = i; 2432 continue; 2433 } 2434 nfde = &newfdp->fd_ofiles[i]; 2435 *nfde = *ofde; 2436 filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); 2437 fdused_init(newfdp, i); 2438 } 2439 MPASS(newfdp->fd_freefile != -1); 2440 FILEDESC_SUNLOCK(fdp); 2441 return (newfdp); 2442 } 2443 2444 /* 2445 * Copy a pwddesc structure. 2446 */ 2447 struct pwddesc * 2448 pdcopy(struct pwddesc *pdp) 2449 { 2450 struct pwddesc *newpdp; 2451 2452 MPASS(pdp != NULL); 2453 2454 newpdp = pdinit(pdp, true); 2455 newpdp->pd_cmask = pdp->pd_cmask; 2456 PWDDESC_XUNLOCK(pdp); 2457 return (newpdp); 2458 } 2459 2460 /* 2461 * Clear POSIX style locks. This is only used when fdp loses a reference (i.e. 2462 * one of the processes using it exits) and the table used to be shared. 2463 */ 2464 static void 2465 fdclearlocks(struct thread *td) 2466 { 2467 struct filedesc *fdp; 2468 struct filedesc_to_leader *fdtol; 2469 struct flock lf; 2470 struct file *fp; 2471 struct proc *p; 2472 struct vnode *vp; 2473 int i; 2474 2475 p = td->td_proc; 2476 fdp = p->p_fd; 2477 fdtol = p->p_fdtol; 2478 MPASS(fdtol != NULL); 2479 2480 FILEDESC_XLOCK(fdp); 2481 KASSERT(fdtol->fdl_refcount > 0, 2482 ("filedesc_to_refcount botch: fdl_refcount=%d", 2483 fdtol->fdl_refcount)); 2484 if (fdtol->fdl_refcount == 1 && 2485 (p->p_leader->p_flag & P_ADVLOCK) != 0) { 2486 FILEDESC_FOREACH_FP(fdp, i, fp) { 2487 if (fp->f_type != DTYPE_VNODE || 2488 !fhold(fp)) 2489 continue; 2490 FILEDESC_XUNLOCK(fdp); 2491 lf.l_whence = SEEK_SET; 2492 lf.l_start = 0; 2493 lf.l_len = 0; 2494 lf.l_type = F_UNLCK; 2495 vp = fp->f_vnode; 2496 (void) VOP_ADVLOCK(vp, 2497 (caddr_t)p->p_leader, F_UNLCK, 2498 &lf, F_POSIX); 2499 FILEDESC_XLOCK(fdp); 2500 fdrop(fp, td); 2501 } 2502 } 2503 retry: 2504 if (fdtol->fdl_refcount == 1) { 2505 if (fdp->fd_holdleaderscount > 0 && 2506 (p->p_leader->p_flag & P_ADVLOCK) != 0) { 2507 /* 2508 * close() or kern_dup() has cleared a reference 2509 * in a shared file descriptor table. 2510 */ 2511 fdp->fd_holdleaderswakeup = 1; 2512 sx_sleep(&fdp->fd_holdleaderscount, 2513 FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); 2514 goto retry; 2515 } 2516 if (fdtol->fdl_holdcount > 0) { 2517 /* 2518 * Ensure that fdtol->fdl_leader remains 2519 * valid in closef(). 2520 */ 2521 fdtol->fdl_wakeup = 1; 2522 sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, 2523 "fdlhold", 0); 2524 goto retry; 2525 } 2526 } 2527 fdtol->fdl_refcount--; 2528 if (fdtol->fdl_refcount == 0 && 2529 fdtol->fdl_holdcount == 0) { 2530 fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; 2531 fdtol->fdl_prev->fdl_next = fdtol->fdl_next; 2532 } else 2533 fdtol = NULL; 2534 p->p_fdtol = NULL; 2535 FILEDESC_XUNLOCK(fdp); 2536 if (fdtol != NULL) 2537 free(fdtol, M_FILEDESC_TO_LEADER); 2538 } 2539 2540 /* 2541 * Release a filedesc structure. 2542 */ 2543 static void 2544 fdescfree_fds(struct thread *td, struct filedesc *fdp) 2545 { 2546 struct filedesc0 *fdp0; 2547 struct freetable *ft, *tft; 2548 struct filedescent *fde; 2549 struct file *fp; 2550 int i; 2551 2552 KASSERT(refcount_load(&fdp->fd_refcnt) == 0, 2553 ("%s: fd table %p carries references", __func__, fdp)); 2554 2555 /* 2556 * Serialize with threads iterating over the table, if any.
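 *
 * The empty lock/unlock pair below is a deliberate barrier: any thread
 * still iterating with the table lock held gets to finish before the
 * entries are torn down.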
2557 */ 2558 if (refcount_load(&fdp->fd_holdcnt) > 1) { 2559 FILEDESC_XLOCK(fdp); 2560 FILEDESC_XUNLOCK(fdp); 2561 } 2562 2563 FILEDESC_FOREACH_FDE(fdp, i, fde) { 2564 fp = fde->fde_file; 2565 fdefree_last(fde); 2566 (void) closef(fp, td); 2567 } 2568 2569 if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) 2570 free(fdp->fd_map, M_FILEDESC); 2571 if (fdp->fd_nfiles > NDFILE) 2572 free(fdp->fd_files, M_FILEDESC); 2573 2574 fdp0 = (struct filedesc0 *)fdp; 2575 SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) 2576 free(ft->ft_table, M_FILEDESC); 2577 2578 fddrop(fdp); 2579 } 2580 2581 void 2582 fdescfree(struct thread *td) 2583 { 2584 struct proc *p; 2585 struct filedesc *fdp; 2586 2587 p = td->td_proc; 2588 fdp = p->p_fd; 2589 MPASS(fdp != NULL); 2590 2591 #ifdef RACCT 2592 if (RACCT_ENABLED()) 2593 racct_set_unlocked(p, RACCT_NOFILE, 0); 2594 #endif 2595 2596 if (p->p_fdtol != NULL) 2597 fdclearlocks(td); 2598 2599 /* 2600 * Check fdhold for an explanation. 2601 */ 2602 atomic_store_ptr(&p->p_fd, NULL); 2603 atomic_thread_fence_seq_cst(); 2604 PROC_WAIT_UNLOCKED(p); 2605 2606 if (refcount_release(&fdp->fd_refcnt) == 0) 2607 return; 2608 2609 fdescfree_fds(td, fdp); 2610 } 2611 2612 void 2613 pdescfree(struct thread *td) 2614 { 2615 struct proc *p; 2616 struct pwddesc *pdp; 2617 2618 p = td->td_proc; 2619 pdp = p->p_pd; 2620 MPASS(pdp != NULL); 2621 2622 /* 2623 * Check pdhold for an explanation. 2624 */ 2625 atomic_store_ptr(&p->p_pd, NULL); 2626 atomic_thread_fence_seq_cst(); 2627 PROC_WAIT_UNLOCKED(p); 2628 2629 pddrop(pdp); 2630 } 2631 2632 /* 2633 * For setugid programs, we don't want people to use that setugidness 2634 * to generate error messages which write to a file which would 2635 * otherwise be off-limits to the process. We check for filesystems where 2636 * the vnode can change out from under us after execve (like [lin]procfs). 2637 * 2638 * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is 2639 * sufficient. We also don't check for setugidness since we know we are. 2640 */ 2641 static bool 2642 is_unsafe(struct file *fp) 2643 { 2644 struct vnode *vp; 2645 2646 if (fp->f_type != DTYPE_VNODE) 2647 return (false); 2648 2649 vp = fp->f_vnode; 2650 return ((vp->v_vflag & VV_PROCDEP) != 0); 2651 } 2652 2653 /* 2654 * Make this setugid thing safe, if at all possible. 2655 */ 2656 void 2657 fdsetugidsafety(struct thread *td) 2658 { 2659 struct filedesc *fdp; 2660 struct file *fp; 2661 int i; 2662 2663 fdp = td->td_proc->p_fd; 2664 KASSERT(refcount_load(&fdp->fd_refcnt) == 1, 2665 ("the fdtable should not be shared")); 2666 MPASS(fdp->fd_nfiles >= 3); 2667 for (i = 0; i <= 2; i++) { 2668 fp = fdp->fd_ofiles[i].fde_file; 2669 if (fp != NULL && is_unsafe(fp)) { 2670 FILEDESC_XLOCK(fdp); 2671 knote_fdclose(td, i); 2672 /* 2673 * NULL-out descriptor prior to close to avoid 2674 * a race while close blocks. 2675 */ 2676 fdfree(fdp, i); 2677 FILEDESC_XUNLOCK(fdp); 2678 (void) closef(fp, td); 2679 } 2680 } 2681 } 2682 2683 /* 2684 * If a specific file object occupies a specific file descriptor, close the 2685 * file descriptor entry and drop a reference on the file object. This is a 2686 * convenience function for handling a subsequent error in a function that 2687 * calls falloc(); it copes with the race where another thread closed the 2688 * file descriptor out from under the thread creating the file object.
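 *
 * A sketch of the intended use, where do_setup() stands in for any
 * fallible post-falloc() initialization (a hypothetical helper, shown
 * only for illustration):
 *
 *	error = falloc(td, &fp, &fd, 0);
 *	if (error != 0)
 *		return (error);
 *	error = do_setup(fp);
 *	if (error != 0)
 *		fdclose(td, fp, fd);
 *	fdrop(fp, td);
 *	return (error);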
2689 */ 2690 void 2691 fdclose(struct thread *td, struct file *fp, int idx) 2692 { 2693 struct filedesc *fdp = td->td_proc->p_fd; 2694 2695 FILEDESC_XLOCK(fdp); 2696 if (fdp->fd_ofiles[idx].fde_file == fp) { 2697 fdfree(fdp, idx); 2698 FILEDESC_XUNLOCK(fdp); 2699 fdrop(fp, td); 2700 } else 2701 FILEDESC_XUNLOCK(fdp); 2702 } 2703 2704 /* 2705 * Close files on exec: descriptors marked UF_EXCLOSE, plus POSIX message queues. 2706 */ 2707 void 2708 fdcloseexec(struct thread *td) 2709 { 2710 struct filedesc *fdp; 2711 struct filedescent *fde; 2712 struct file *fp; 2713 int i; 2714 2715 fdp = td->td_proc->p_fd; 2716 KASSERT(refcount_load(&fdp->fd_refcnt) == 1, 2717 ("the fdtable should not be shared")); 2718 FILEDESC_FOREACH_FDE(fdp, i, fde) { 2719 fp = fde->fde_file; 2720 if (fp->f_type == DTYPE_MQUEUE || 2721 (fde->fde_flags & UF_EXCLOSE)) { 2722 FILEDESC_XLOCK(fdp); 2723 fdfree(fdp, i); 2724 (void) closefp(fdp, i, fp, td, false, false); 2725 FILEDESC_UNLOCK_ASSERT(fdp); 2726 } 2727 } 2728 } 2729 2730 /* 2731 * It is unsafe for set[ug]id processes to be started with file 2732 * descriptors 0..2 closed, as these descriptors are given implicit 2733 * significance in the Standard C library. fdcheckstd() will create a 2734 * descriptor referencing /dev/null for each of stdin, stdout, and 2735 * stderr that is not already open. 2736 */ 2737 int 2738 fdcheckstd(struct thread *td) 2739 { 2740 struct filedesc *fdp; 2741 register_t save; 2742 int i, error, devnull; 2743 2744 fdp = td->td_proc->p_fd; 2745 KASSERT(refcount_load(&fdp->fd_refcnt) == 1, 2746 ("the fdtable should not be shared")); 2747 MPASS(fdp->fd_nfiles >= 3); 2748 devnull = -1; 2749 for (i = 0; i <= 2; i++) { 2750 if (fdp->fd_ofiles[i].fde_file != NULL) 2751 continue; 2752 2753 save = td->td_retval[0]; 2754 if (devnull != -1) { 2755 error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); 2756 } else { 2757 error = kern_openat(td, AT_FDCWD, "/dev/null", 2758 UIO_SYSSPACE, O_RDWR, 0); 2759 if (error == 0) { 2760 devnull = td->td_retval[0]; 2761 KASSERT(devnull == i, ("we didn't get our fd")); 2762 } 2763 } 2764 td->td_retval[0] = save; 2765 if (error != 0) 2766 return (error); 2767 } 2768 return (0); 2769 } 2770 2771 /* 2772 * Internal form of close. Decrement reference count on file structure. 2773 * Note: td must not be NULL; a file being passed in a message with no 2774 * owning thread is closed via closef_nothread() instead. 2775 */ 2776 int 2777 closef(struct file *fp, struct thread *td) 2778 { 2779 struct vnode *vp; 2780 struct flock lf; 2781 struct filedesc_to_leader *fdtol; 2782 struct filedesc *fdp; 2783 2784 MPASS(td != NULL); 2785 2786 /* 2787 * POSIX record locking dictates that any close releases ALL 2788 * locks owned by this process. This is handled by setting 2789 * a flag in the unlock to free ONLY locks obeying POSIX 2790 * semantics, and not to free BSD-style file locks. 2791 * If the descriptor was in a message, POSIX-style locks 2792 * aren't passed with the descriptor; such a file has no owning 2793 * lock context and is closed via closef_nothread() instead 2794 * (hence the MPASS above). Conversely, a file whose owner might 2795 * hold locks must come through here, or those locks would be 2796 * leaked.
2797 */ 2798 if (fp->f_type == DTYPE_VNODE) { 2799 vp = fp->f_vnode; 2800 if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { 2801 lf.l_whence = SEEK_SET; 2802 lf.l_start = 0; 2803 lf.l_len = 0; 2804 lf.l_type = F_UNLCK; 2805 (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, 2806 F_UNLCK, &lf, F_POSIX); 2807 } 2808 fdtol = td->td_proc->p_fdtol; 2809 if (fdtol != NULL) { 2810 /* 2811 * Handle special case where file descriptor table is 2812 * shared between multiple process leaders. 2813 */ 2814 fdp = td->td_proc->p_fd; 2815 FILEDESC_XLOCK(fdp); 2816 for (fdtol = fdtol->fdl_next; 2817 fdtol != td->td_proc->p_fdtol; 2818 fdtol = fdtol->fdl_next) { 2819 if ((fdtol->fdl_leader->p_flag & 2820 P_ADVLOCK) == 0) 2821 continue; 2822 fdtol->fdl_holdcount++; 2823 FILEDESC_XUNLOCK(fdp); 2824 lf.l_whence = SEEK_SET; 2825 lf.l_start = 0; 2826 lf.l_len = 0; 2827 lf.l_type = F_UNLCK; 2828 vp = fp->f_vnode; 2829 (void) VOP_ADVLOCK(vp, 2830 (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, 2831 F_POSIX); 2832 FILEDESC_XLOCK(fdp); 2833 fdtol->fdl_holdcount--; 2834 if (fdtol->fdl_holdcount == 0 && 2835 fdtol->fdl_wakeup != 0) { 2836 fdtol->fdl_wakeup = 0; 2837 wakeup(fdtol); 2838 } 2839 } 2840 FILEDESC_XUNLOCK(fdp); 2841 } 2842 } 2843 return (fdrop_close(fp, td)); 2844 } 2845 2846 /* 2847 * Hack for file descriptor passing code. 2848 */ 2849 void 2850 closef_nothread(struct file *fp) 2851 { 2852 2853 fdrop(fp, NULL); 2854 } 2855 2856 /* 2857 * Initialize the file pointer with the specified properties. 2858 * 2859 * The ops are set with release semantics to be certain that the flags, type, 2860 * and data are visible when ops is. This is to prevent ops methods from being 2861 * called with bad data. 2862 */ 2863 void 2864 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) 2865 { 2866 fp->f_data = data; 2867 fp->f_flag = flag; 2868 fp->f_type = type; 2869 atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); 2870 } 2871 2872 void 2873 finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops) 2874 { 2875 fp->f_seqcount[UIO_READ] = 1; 2876 fp->f_seqcount[UIO_WRITE] = 1; 2877 finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, 2878 data, ops); 2879 } 2880 2881 int 2882 fget_cap_noref(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, 2883 struct file **fpp, struct filecaps *havecapsp) 2884 { 2885 struct filedescent *fde; 2886 int error; 2887 2888 FILEDESC_LOCK_ASSERT(fdp); 2889 2890 *fpp = NULL; 2891 fde = fdeget_noref(fdp, fd); 2892 if (fde == NULL) { 2893 error = EBADF; 2894 goto out; 2895 } 2896 2897 #ifdef CAPABILITIES 2898 error = cap_check(cap_rights_fde_inline(fde), needrightsp); 2899 if (error != 0) 2900 goto out; 2901 #endif 2902 2903 if (havecapsp != NULL) 2904 filecaps_copy(&fde->fde_caps, havecapsp, true); 2905 2906 *fpp = fde->fde_file; 2907 2908 error = 0; 2909 out: 2910 return (error); 2911 } 2912 2913 #ifdef CAPABILITIES 2914 int 2915 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, 2916 struct file **fpp, struct filecaps *havecapsp) 2917 { 2918 struct filedesc *fdp = td->td_proc->p_fd; 2919 int error; 2920 struct file *fp; 2921 seqc_t seq; 2922 2923 *fpp = NULL; 2924 for (;;) { 2925 error = fget_unlocked_seq(td, fd, needrightsp, &fp, &seq); 2926 if (error != 0) 2927 return (error); 2928 2929 if (havecapsp != NULL) { 2930 if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, 2931 havecapsp, false)) { 2932 fdrop(fp, td); 2933 goto get_locked; 2934 } 2935 } 2936 2937 if (!fd_modified(fdp, fd, seq)) 2938 break; 
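		/*
		 * The descriptor entry changed while the capabilities were
		 * being copied: drop the reference and retry the lookup.
		 */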
2939 fdrop(fp, td); 2940 } 2941 2942 *fpp = fp; 2943 return (0); 2944 2945 get_locked: 2946 FILEDESC_SLOCK(fdp); 2947 error = fget_cap_noref(fdp, fd, needrightsp, fpp, havecapsp); 2948 if (error == 0 && !fhold(*fpp)) 2949 error = EBADF; 2950 FILEDESC_SUNLOCK(fdp); 2951 return (error); 2952 } 2953 #else 2954 int 2955 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, 2956 struct file **fpp, struct filecaps *havecapsp) 2957 { 2958 int error; 2959 error = fget_unlocked(td, fd, needrightsp, fpp); 2960 if (havecapsp != NULL && error == 0) 2961 filecaps_fill(havecapsp); 2962 2963 return (error); 2964 } 2965 #endif 2966 2967 #ifdef CAPABILITIES 2968 int 2969 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) 2970 { 2971 const struct filedescent *fde; 2972 const struct fdescenttbl *fdt; 2973 struct filedesc *fdp; 2974 struct file *fp; 2975 struct vnode *vp; 2976 const cap_rights_t *haverights; 2977 cap_rights_t rights; 2978 seqc_t seq; 2979 2980 VFS_SMR_ASSERT_ENTERED(); 2981 2982 rights = *ndp->ni_rightsneeded; 2983 cap_rights_set_one(&rights, CAP_LOOKUP); 2984 2985 fdp = curproc->p_fd; 2986 fdt = fdp->fd_files; 2987 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 2988 return (EBADF); 2989 seq = seqc_read_notmodify(fd_seqc(fdt, fd)); 2990 fde = &fdt->fdt_ofiles[fd]; 2991 haverights = cap_rights_fde_inline(fde); 2992 fp = fde->fde_file; 2993 if (__predict_false(fp == NULL)) 2994 return (EAGAIN); 2995 if (__predict_false(cap_check_inline_transient(haverights, &rights))) 2996 return (EAGAIN); 2997 *fsearch = ((fp->f_flag & FSEARCH) != 0); 2998 vp = fp->f_vnode; 2999 if (__predict_false(vp == NULL)) { 3000 return (EAGAIN); 3001 } 3002 if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) { 3003 return (EAGAIN); 3004 } 3005 /* 3006 * Use an acquire barrier to force re-reading of fdt so it is 3007 * refreshed for verification. 3008 */ 3009 atomic_thread_fence_acq(); 3010 fdt = fdp->fd_files; 3011 if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) 3012 return (EAGAIN); 3013 /* 3014 * If file descriptor doesn't have all rights, 3015 * all lookups relative to it must also be 3016 * strictly relative. 3017 * 3018 * Not yet supported by fast path. 3019 */ 3020 CAP_ALL(&rights); 3021 if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || 3022 ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || 3023 ndp->ni_filecaps.fc_nioctls != -1) { 3024 #ifdef notyet 3025 ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; 3026 #else 3027 return (EAGAIN); 3028 #endif 3029 } 3030 *vpp = vp; 3031 return (0); 3032 } 3033 #else 3034 int 3035 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) 3036 { 3037 const struct fdescenttbl *fdt; 3038 struct filedesc *fdp; 3039 struct file *fp; 3040 struct vnode *vp; 3041 3042 VFS_SMR_ASSERT_ENTERED(); 3043 3044 fdp = curproc->p_fd; 3045 fdt = fdp->fd_files; 3046 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3047 return (EBADF); 3048 fp = fdt->fdt_ofiles[fd].fde_file; 3049 if (__predict_false(fp == NULL)) 3050 return (EAGAIN); 3051 *fsearch = ((fp->f_flag & FSEARCH) != 0); 3052 vp = fp->f_vnode; 3053 if (__predict_false(vp == NULL || vp->v_type != VDIR)) { 3054 return (EAGAIN); 3055 } 3056 /* 3057 * Use an acquire barrier to force re-reading of fdt so it is 3058 * refreshed for verification. 
3059 */ 3060 atomic_thread_fence_acq(); 3061 fdt = fdp->fd_files; 3062 if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) 3063 return (EAGAIN); 3064 filecaps_fill(&ndp->ni_filecaps); 3065 *vpp = vp; 3066 return (0); 3067 } 3068 #endif 3069 3070 int 3071 fgetvp_lookup(int fd, struct nameidata *ndp, struct vnode **vpp) 3072 { 3073 struct thread *td; 3074 struct file *fp; 3075 struct vnode *vp; 3076 struct componentname *cnp; 3077 cap_rights_t rights; 3078 int error; 3079 3080 td = curthread; 3081 rights = *ndp->ni_rightsneeded; 3082 cap_rights_set_one(&rights, CAP_LOOKUP); 3083 cnp = &ndp->ni_cnd; 3084 3085 error = fget_cap(td, ndp->ni_dirfd, &rights, &fp, &ndp->ni_filecaps); 3086 if (__predict_false(error != 0)) 3087 return (error); 3088 if (__predict_false(fp->f_ops == &badfileops)) { 3089 error = EBADF; 3090 goto out_free; 3091 } 3092 vp = fp->f_vnode; 3093 if (__predict_false(vp == NULL)) { 3094 error = ENOTDIR; 3095 goto out_free; 3096 } 3097 vrefact(vp); 3098 /* 3099 * XXX does not check for VDIR, handled by namei_setup 3100 */ 3101 if ((fp->f_flag & FSEARCH) != 0) 3102 cnp->cn_flags |= NOEXECCHECK; 3103 fdrop(fp, td); 3104 3105 #ifdef CAPABILITIES 3106 /* 3107 * If file descriptor doesn't have all rights, 3108 * all lookups relative to it must also be 3109 * strictly relative. 3110 */ 3111 CAP_ALL(&rights); 3112 if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || 3113 ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || 3114 ndp->ni_filecaps.fc_nioctls != -1) { 3115 ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; 3116 ndp->ni_resflags |= NIRES_STRICTREL; 3117 } 3118 #endif 3119 3120 /* 3121 * TODO: avoid copying ioctl caps if it can be helped to begin with 3122 */ 3123 if ((cnp->cn_flags & WANTIOCTLCAPS) == 0) 3124 filecaps_free_ioctl(&ndp->ni_filecaps); 3125 3126 *vpp = vp; 3127 return (0); 3128 3129 out_free: 3130 filecaps_free(&ndp->ni_filecaps); 3131 fdrop(fp, td); 3132 return (error); 3133 } 3134 3135 /* 3136 * Fetch the descriptor locklessly. 3137 * 3138 * We avoid fdrop() races by never raising a refcount back up from 0. To 3139 * accomplish this we have to use a cmpset loop rather than an atomic_add. The 3140 * descriptor must be re-verified once we acquire a reference to be certain 3141 * that the identity is still correct and we did not lose a race due to 3142 * preemption. 3143 * 3144 * Force a reload of fdt when looping. Another thread could reallocate 3145 * the table before this fd was closed, so it is possible that there is a stale fp pointer in the cached version.
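 *
 * The window being closed looks like this (sketch):
 *	thread A: loads fp from the table
 *	thread B: closes the descriptor; the final fdrop() drops
 *	    f_count to 0 and the file may be recycled
 *	thread A: refcount_acquire_if_not_zero() observes 0 and fails,
 *	    so the lookup restarts with a freshly loaded fdt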
3146 */ 3147 #ifdef CAPABILITIES 3148 static int 3149 fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, 3150 struct file **fpp, seqc_t *seqp) 3151 { 3152 struct filedesc *fdp; 3153 const struct filedescent *fde; 3154 const struct fdescenttbl *fdt; 3155 struct file *fp; 3156 seqc_t seq; 3157 cap_rights_t haverights; 3158 int error; 3159 3160 fdp = td->td_proc->p_fd; 3161 fdt = fdp->fd_files; 3162 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3163 return (EBADF); 3164 3165 for (;;) { 3166 seq = seqc_read_notmodify(fd_seqc(fdt, fd)); 3167 fde = &fdt->fdt_ofiles[fd]; 3168 haverights = *cap_rights_fde_inline(fde); 3169 fp = fde->fde_file; 3170 if (__predict_false(fp == NULL)) { 3171 if (seqc_consistent(fd_seqc(fdt, fd), seq)) 3172 return (EBADF); 3173 fdt = atomic_load_ptr(&fdp->fd_files); 3174 continue; 3175 } 3176 error = cap_check_inline(&haverights, needrightsp); 3177 if (__predict_false(error != 0)) { 3178 if (seqc_consistent(fd_seqc(fdt, fd), seq)) 3179 return (error); 3180 fdt = atomic_load_ptr(&fdp->fd_files); 3181 continue; 3182 } 3183 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { 3184 fdt = atomic_load_ptr(&fdp->fd_files); 3185 continue; 3186 } 3187 /* 3188 * Use an acquire barrier to force re-reading of fdt so it is 3189 * refreshed for verification. 3190 */ 3191 atomic_thread_fence_acq(); 3192 fdt = fdp->fd_files; 3193 if (seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)) 3194 break; 3195 fdrop(fp, td); 3196 } 3197 *fpp = fp; 3198 if (seqp != NULL) { 3199 *seqp = seq; 3200 } 3201 return (0); 3202 } 3203 #else 3204 static int 3205 fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, 3206 struct file **fpp, seqc_t *seqp __unused) 3207 { 3208 struct filedesc *fdp; 3209 const struct fdescenttbl *fdt; 3210 struct file *fp; 3211 3212 fdp = td->td_proc->p_fd; 3213 fdt = fdp->fd_files; 3214 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3215 return (EBADF); 3216 3217 for (;;) { 3218 fp = fdt->fdt_ofiles[fd].fde_file; 3219 if (__predict_false(fp == NULL)) 3220 return (EBADF); 3221 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { 3222 fdt = atomic_load_ptr(&fdp->fd_files); 3223 continue; 3224 } 3225 /* 3226 * Use an acquire barrier to force re-reading of fdt so it is 3227 * refreshed for verification. 3228 */ 3229 atomic_thread_fence_acq(); 3230 fdt = fdp->fd_files; 3231 if (__predict_true(fp == fdt->fdt_ofiles[fd].fde_file)) 3232 break; 3233 fdrop(fp, td); 3234 } 3235 *fpp = fp; 3236 return (0); 3237 } 3238 #endif 3239 3240 /* 3241 * See the comments in fget_unlocked_seq for an explanation of how this works. 3242 * 3243 * This is a simplified variant which bails out to the aforementioned routine 3244 * if anything goes wrong. In practice this only happens when userspace is 3245 * racing with itself. 
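 *
 * The fast path below is one seqc-validated snapshot of the entry plus
 * a single refcount_acquire_if_not_zero(); any inconsistency is not
 * retried inline but punted to fget_unlocked_seq().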
3246 */ 3247 int 3248 fget_unlocked(struct thread *td, int fd, cap_rights_t *needrightsp, 3249 struct file **fpp) 3250 { 3251 struct filedesc *fdp; 3252 #ifdef CAPABILITIES 3253 const struct filedescent *fde; 3254 #endif 3255 const struct fdescenttbl *fdt; 3256 struct file *fp; 3257 #ifdef CAPABILITIES 3258 seqc_t seq; 3259 const cap_rights_t *haverights; 3260 #endif 3261 3262 fdp = td->td_proc->p_fd; 3263 fdt = fdp->fd_files; 3264 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) { 3265 *fpp = NULL; 3266 return (EBADF); 3267 } 3268 #ifdef CAPABILITIES 3269 seq = seqc_read_notmodify(fd_seqc(fdt, fd)); 3270 fde = &fdt->fdt_ofiles[fd]; 3271 haverights = cap_rights_fde_inline(fde); 3272 fp = fde->fde_file; 3273 #else 3274 fp = fdt->fdt_ofiles[fd].fde_file; 3275 #endif 3276 if (__predict_false(fp == NULL)) 3277 goto out_fallback; 3278 #ifdef CAPABILITIES 3279 if (__predict_false(cap_check_inline_transient(haverights, needrightsp))) 3280 goto out_fallback; 3281 #endif 3282 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) 3283 goto out_fallback; 3284 3285 /* 3286 * Use an acquire barrier to force re-reading of fdt so it is 3287 * refreshed for verification. 3288 */ 3289 atomic_thread_fence_acq(); 3290 fdt = fdp->fd_files; 3291 #ifdef CAPABILITIES 3292 if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) 3293 #else 3294 if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) 3295 #endif 3296 goto out_fdrop; 3297 *fpp = fp; 3298 return (0); 3299 out_fdrop: 3300 fdrop(fp, td); 3301 out_fallback: 3302 *fpp = NULL; 3303 return (fget_unlocked_seq(td, fd, needrightsp, fpp, NULL)); 3304 } 3305 3306 /* 3307 * Translate fd -> file when the caller guarantees the file descriptor table 3308 * can't be changed by others. 3309 * 3310 * Note that this does not mean the file object itself is only visible to the 3311 * caller, merely that it won't go away even though the lookup takes no reference. 3312 * 3313 * Must be paired with fput_only_user. 3314 */ 3315 #ifdef CAPABILITIES 3316 int 3317 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, 3318 struct file **fpp) 3319 { 3320 const struct filedescent *fde; 3321 const struct fdescenttbl *fdt; 3322 const cap_rights_t *haverights; 3323 struct file *fp; 3324 int error; 3325 3326 MPASS(FILEDESC_IS_ONLY_USER(fdp)); 3327 3328 *fpp = NULL; 3329 if (__predict_false(fd >= fdp->fd_nfiles)) 3330 return (EBADF); 3331 3332 fdt = fdp->fd_files; 3333 fde = &fdt->fdt_ofiles[fd]; 3334 fp = fde->fde_file; 3335 if (__predict_false(fp == NULL)) 3336 return (EBADF); 3337 MPASS(refcount_load(&fp->f_count) > 0); 3338 haverights = cap_rights_fde_inline(fde); 3339 error = cap_check_inline(haverights, needrightsp); 3340 if (__predict_false(error != 0)) 3341 return (error); 3342 *fpp = fp; 3343 return (0); 3344 } 3345 #else 3346 int 3347 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, 3348 struct file **fpp) 3349 { 3350 struct file *fp; 3351 3352 MPASS(FILEDESC_IS_ONLY_USER(fdp)); 3353 3354 *fpp = NULL; 3355 if (__predict_false(fd >= fdp->fd_nfiles)) 3356 return (EBADF); 3357 3358 fp = fdp->fd_ofiles[fd].fde_file; 3359 if (__predict_false(fp == NULL)) 3360 return (EBADF); 3361 3362 MPASS(refcount_load(&fp->f_count) > 0); 3363 *fpp = fp; 3364 return (0); 3365 } 3366 #endif 3367 3368 /* 3369 * Extract the file pointer associated with the specified descriptor for the 3370 * current user process. 3371 * 3372 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is 3373 * returned.
3374 * 3375 * File's rights will be checked against the capability rights mask. 3376 * 3377 * If an error occurred the non-zero error is returned and *fpp is set to 3378 * NULL. Otherwise *fpp is held and set and zero is returned. Caller is 3379 * responsible for fdrop(). 3380 */ 3381 static __inline int 3382 _fget(struct thread *td, int fd, struct file **fpp, int flags, 3383 cap_rights_t *needrightsp) 3384 { 3385 struct file *fp; 3386 int error; 3387 3388 *fpp = NULL; 3389 error = fget_unlocked(td, fd, needrightsp, &fp); 3390 if (__predict_false(error != 0)) 3391 return (error); 3392 if (__predict_false(fp->f_ops == &badfileops)) { 3393 fdrop(fp, td); 3394 return (EBADF); 3395 } 3396 3397 /* 3398 * FREAD and FWRITE failure return EBADF as per POSIX. 3399 */ 3400 error = 0; 3401 switch (flags) { 3402 case FREAD: 3403 case FWRITE: 3404 if ((fp->f_flag & flags) == 0) 3405 error = EBADF; 3406 break; 3407 case FEXEC: 3408 if (fp->f_ops != &path_fileops && 3409 ((fp->f_flag & (FREAD | FEXEC)) == 0 || 3410 (fp->f_flag & FWRITE) != 0)) 3411 error = EBADF; 3412 break; 3413 case 0: 3414 break; 3415 default: 3416 KASSERT(0, ("wrong flags")); 3417 } 3418 3419 if (error != 0) { 3420 fdrop(fp, td); 3421 return (error); 3422 } 3423 3424 *fpp = fp; 3425 return (0); 3426 } 3427 3428 int 3429 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 3430 { 3431 3432 return (_fget(td, fd, fpp, 0, rightsp)); 3433 } 3434 3435 int 3436 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, 3437 struct file **fpp) 3438 { 3439 int error; 3440 #ifndef CAPABILITIES 3441 error = _fget(td, fd, fpp, 0, rightsp); 3442 if (maxprotp != NULL) 3443 *maxprotp = VM_PROT_ALL; 3444 return (error); 3445 #else 3446 cap_rights_t fdrights; 3447 struct filedesc *fdp; 3448 struct file *fp; 3449 seqc_t seq; 3450 3451 *fpp = NULL; 3452 fdp = td->td_proc->p_fd; 3453 MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); 3454 for (;;) { 3455 error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); 3456 if (__predict_false(error != 0)) 3457 return (error); 3458 if (__predict_false(fp->f_ops == &badfileops)) { 3459 fdrop(fp, td); 3460 return (EBADF); 3461 } 3462 if (maxprotp != NULL) 3463 fdrights = *cap_rights(fdp, fd); 3464 if (!fd_modified(fdp, fd, seq)) 3465 break; 3466 fdrop(fp, td); 3467 } 3468 3469 /* 3470 * If requested, convert capability rights to access flags. 
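 * For instance, a descriptor holding only CAP_MMAP_R yields
 * VM_PROT_READ, so writable or executable mappings cannot be created
 * from it (see cap_rights_to_vmprot()).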
3471 */ 3472 if (maxprotp != NULL) 3473 *maxprotp = cap_rights_to_vmprot(&fdrights); 3474 *fpp = fp; 3475 return (0); 3476 #endif 3477 } 3478 3479 int 3480 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 3481 { 3482 3483 return (_fget(td, fd, fpp, FREAD, rightsp)); 3484 } 3485 3486 int 3487 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 3488 { 3489 3490 return (_fget(td, fd, fpp, FWRITE, rightsp)); 3491 } 3492 3493 int 3494 fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, 3495 struct file **fpp) 3496 { 3497 #ifndef CAPABILITIES 3498 return (fget_unlocked(td, fd, rightsp, fpp)); 3499 #else 3500 struct filedesc *fdp = td->td_proc->p_fd; 3501 struct file *fp; 3502 int error; 3503 seqc_t seq; 3504 3505 *fpp = NULL; 3506 MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); 3507 for (;;) { 3508 error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); 3509 if (error != 0) 3510 return (error); 3511 error = cap_fcntl_check(fdp, fd, needfcntl); 3512 if (!fd_modified(fdp, fd, seq)) 3513 break; 3514 fdrop(fp, td); 3515 } 3516 if (error != 0) { 3517 fdrop(fp, td); 3518 return (error); 3519 } 3520 *fpp = fp; 3521 return (0); 3522 #endif 3523 } 3524 3525 /* 3526 * Like fget() but loads the underlying vnode, or returns an error if the 3527 * descriptor does not represent a vnode. Note that pipes use vnodes but 3528 * never have VM objects. The returned vnode will be vref()'d. 3529 * 3530 * XXX: what about the unused flags ? 3531 */ 3532 static __inline int 3533 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, 3534 struct vnode **vpp) 3535 { 3536 struct file *fp; 3537 int error; 3538 3539 *vpp = NULL; 3540 error = _fget(td, fd, &fp, flags, needrightsp); 3541 if (error != 0) 3542 return (error); 3543 if (fp->f_vnode == NULL) { 3544 error = EINVAL; 3545 } else { 3546 *vpp = fp->f_vnode; 3547 vrefact(*vpp); 3548 } 3549 fdrop(fp, td); 3550 3551 return (error); 3552 } 3553 3554 int 3555 fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) 3556 { 3557 3558 return (_fgetvp(td, fd, 0, rightsp, vpp)); 3559 } 3560 3561 int 3562 fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, 3563 struct filecaps *havecaps, struct vnode **vpp) 3564 { 3565 struct filecaps caps; 3566 struct file *fp; 3567 int error; 3568 3569 error = fget_cap(td, fd, needrightsp, &fp, &caps); 3570 if (error != 0) 3571 return (error); 3572 if (fp->f_ops == &badfileops) { 3573 error = EBADF; 3574 goto out; 3575 } 3576 if (fp->f_vnode == NULL) { 3577 error = EINVAL; 3578 goto out; 3579 } 3580 3581 *havecaps = caps; 3582 *vpp = fp->f_vnode; 3583 vrefact(*vpp); 3584 fdrop(fp, td); 3585 3586 return (0); 3587 out: 3588 filecaps_free(&caps); 3589 fdrop(fp, td); 3590 return (error); 3591 } 3592 3593 int 3594 fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) 3595 { 3596 3597 return (_fgetvp(td, fd, FREAD, rightsp, vpp)); 3598 } 3599 3600 int 3601 fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) 3602 { 3603 3604 return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); 3605 } 3606 3607 #ifdef notyet 3608 int 3609 fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, 3610 struct vnode **vpp) 3611 { 3612 3613 return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); 3614 } 3615 #endif 3616 3617 /* 3618 * Handle the last reference to a file being closed. 
* 3620 * Without the noinline attribute clang keeps inlining the function throughout 3621 * this file when fdrop is used. 3622 */ 3623 int __noinline 3624 _fdrop(struct file *fp, struct thread *td) 3625 { 3626 int error; 3627 #ifdef INVARIANTS 3628 int count; 3629 3630 count = refcount_load(&fp->f_count); 3631 if (count != 0) 3632 panic("fdrop: fp %p count %d", fp, count); 3633 #endif 3634 error = fo_close(fp, td); 3635 atomic_subtract_int(&openfiles, 1); 3636 crfree(fp->f_cred); 3637 free(fp->f_advice, M_FADVISE); 3638 uma_zfree(file_zone, fp); 3639 3640 return (error); 3641 } 3642 3643 /* 3644 * Apply an advisory lock on a file descriptor. 3645 * 3646 * Just attempt to get a record lock of the requested type on the entire file 3647 * (l_whence = SEEK_SET, l_start = 0, l_len = 0). 3648 */ 3649 #ifndef _SYS_SYSPROTO_H_ 3650 struct flock_args { 3651 int fd; 3652 int how; 3653 }; 3654 #endif 3655 /* ARGSUSED */ 3656 int 3657 sys_flock(struct thread *td, struct flock_args *uap) 3658 { 3659 struct file *fp; 3660 struct vnode *vp; 3661 struct flock lf; 3662 int error; 3663 3664 error = fget(td, uap->fd, &cap_flock_rights, &fp); 3665 if (error != 0) 3666 return (error); 3667 error = EOPNOTSUPP; 3668 if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { 3669 goto done; 3670 } 3671 if (fp->f_ops == &path_fileops) { 3672 goto done; 3673 } 3674 3675 error = 0; 3676 vp = fp->f_vnode; 3677 lf.l_whence = SEEK_SET; 3678 lf.l_start = 0; 3679 lf.l_len = 0; 3680 if (uap->how & LOCK_UN) { 3681 lf.l_type = F_UNLCK; 3682 atomic_clear_int(&fp->f_flag, FHASLOCK); 3683 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); 3684 goto done; 3685 } 3686 if (uap->how & LOCK_EX) 3687 lf.l_type = F_WRLCK; 3688 else if (uap->how & LOCK_SH) 3689 lf.l_type = F_RDLCK; 3690 else { 3691 error = EBADF; 3692 goto done; 3693 } 3694 atomic_set_int(&fp->f_flag, FHASLOCK); 3695 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 3696 (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); 3697 done: 3698 fdrop(fp, td); 3699 return (error); 3700 } 3701 /* 3702 * Duplicate the specified descriptor to a free descriptor. 3703 */ 3704 int 3705 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, 3706 int openerror, int *indxp) 3707 { 3708 struct filedescent *newfde, *oldfde; 3709 struct file *fp; 3710 u_long *ioctls; 3711 int error, indx; 3712 3713 KASSERT(openerror == ENODEV || openerror == ENXIO, 3714 ("unexpected error %d in %s", openerror, __func__)); 3715 3716 /* 3717 * If the to-be-dup'd fd number is greater than the allowed number 3718 * of file descriptors, or the fd to be dup'd has already been 3719 * closed, then reject. 3720 */ 3721 FILEDESC_XLOCK(fdp); 3722 if ((fp = fget_noref(fdp, dfd)) == NULL) { 3723 FILEDESC_XUNLOCK(fdp); 3724 return (EBADF); 3725 } 3726 3727 error = fdalloc(td, 0, &indx); 3728 if (error != 0) { 3729 FILEDESC_XUNLOCK(fdp); 3730 return (error); 3731 } 3732 3733 /* 3734 * There are two cases of interest here. 3735 * 3736 * For ENODEV simply dup (dfd) to file descriptor (indx) and return. 3737 * 3738 * For ENXIO steal away the file structure from (dfd) and store it in 3739 * (indx). (dfd) is effectively closed by this operation. 3740 */ 3741 switch (openerror) { 3742 case ENODEV: 3743 /* 3744 * Check that the mode the file is being opened for is a 3745 * subset of the mode of the existing descriptor.
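 * For example, if dfd was opened read-only, a request with FWRITE set
 * is rejected with EACCES instead of silently widening access.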
3746 */ 3747 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { 3748 fdunused(fdp, indx); 3749 FILEDESC_XUNLOCK(fdp); 3750 return (EACCES); 3751 } 3752 if (!fhold(fp)) { 3753 fdunused(fdp, indx); 3754 FILEDESC_XUNLOCK(fdp); 3755 return (EBADF); 3756 } 3757 newfde = &fdp->fd_ofiles[indx]; 3758 oldfde = &fdp->fd_ofiles[dfd]; 3759 ioctls = filecaps_copy_prep(&oldfde->fde_caps); 3760 #ifdef CAPABILITIES 3761 seqc_write_begin(&newfde->fde_seqc); 3762 #endif 3763 fde_copy(oldfde, newfde); 3764 filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, 3765 ioctls); 3766 #ifdef CAPABILITIES 3767 seqc_write_end(&newfde->fde_seqc); 3768 #endif 3769 break; 3770 case ENXIO: 3771 /* 3772 * Steal away the file pointer from dfd and stuff it into indx. 3773 */ 3774 newfde = &fdp->fd_ofiles[indx]; 3775 oldfde = &fdp->fd_ofiles[dfd]; 3776 #ifdef CAPABILITIES 3777 seqc_write_begin(&oldfde->fde_seqc); 3778 seqc_write_begin(&newfde->fde_seqc); 3779 #endif 3780 fde_copy(oldfde, newfde); 3781 oldfde->fde_file = NULL; 3782 fdunused(fdp, dfd); 3783 #ifdef CAPABILITIES 3784 seqc_write_end(&newfde->fde_seqc); 3785 seqc_write_end(&oldfde->fde_seqc); 3786 #endif 3787 break; 3788 } 3789 FILEDESC_XUNLOCK(fdp); 3790 *indxp = indx; 3791 return (0); 3792 } 3793 3794 /* 3795 * This sysctl determines if we will allow a process to chroot(2) if it 3796 * has a directory open: 3797 * 0: disallowed for all processes. 3798 * 1: allowed for processes that were not already chroot(2)'ed. 3799 * 2: allowed for all processes. 3800 */ 3801 3802 static int chroot_allow_open_directories = 1; 3803 3804 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, 3805 &chroot_allow_open_directories, 0, 3806 "Allow a process to chroot(2) if it has a directory open"); 3807 3808 /* 3809 * Helper function for the raised chroot(2) security level: refuse the 3810 * operation if any file descriptors are open directories.
3811 */ 3812 static int 3813 chroot_refuse_vdir_fds(struct filedesc *fdp) 3814 { 3815 struct vnode *vp; 3816 struct file *fp; 3817 int i; 3818 3819 FILEDESC_LOCK_ASSERT(fdp); 3820 3821 FILEDESC_FOREACH_FP(fdp, i, fp) { 3822 if (fp->f_type == DTYPE_VNODE) { 3823 vp = fp->f_vnode; 3824 if (vp->v_type == VDIR) 3825 return (EPERM); 3826 } 3827 } 3828 return (0); 3829 } 3830 3831 static void 3832 pwd_fill(struct pwd *oldpwd, struct pwd *newpwd) 3833 { 3834 3835 if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) { 3836 vrefact(oldpwd->pwd_cdir); 3837 newpwd->pwd_cdir = oldpwd->pwd_cdir; 3838 } 3839 3840 if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) { 3841 vrefact(oldpwd->pwd_rdir); 3842 newpwd->pwd_rdir = oldpwd->pwd_rdir; 3843 } 3844 3845 if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) { 3846 vrefact(oldpwd->pwd_jdir); 3847 newpwd->pwd_jdir = oldpwd->pwd_jdir; 3848 } 3849 3850 if (newpwd->pwd_adir == NULL && oldpwd->pwd_adir != NULL) { 3851 vrefact(oldpwd->pwd_adir); 3852 newpwd->pwd_adir = oldpwd->pwd_adir; 3853 } 3854 } 3855 3856 struct pwd * 3857 pwd_hold_pwddesc(struct pwddesc *pdp) 3858 { 3859 struct pwd *pwd; 3860 3861 PWDDESC_ASSERT_XLOCKED(pdp); 3862 pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 3863 if (pwd != NULL) 3864 refcount_acquire(&pwd->pwd_refcount); 3865 return (pwd); 3866 } 3867 3868 bool 3869 pwd_hold_smr(struct pwd *pwd) 3870 { 3871 3872 MPASS(pwd != NULL); 3873 if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) { 3874 return (true); 3875 } 3876 return (false); 3877 } 3878 3879 struct pwd * 3880 pwd_hold(struct thread *td) 3881 { 3882 struct pwddesc *pdp; 3883 struct pwd *pwd; 3884 3885 pdp = td->td_proc->p_pd; 3886 3887 vfs_smr_enter(); 3888 pwd = vfs_smr_entered_load(&pdp->pd_pwd); 3889 if (pwd_hold_smr(pwd)) { 3890 vfs_smr_exit(); 3891 return (pwd); 3892 } 3893 vfs_smr_exit(); 3894 PWDDESC_XLOCK(pdp); 3895 pwd = pwd_hold_pwddesc(pdp); 3896 MPASS(pwd != NULL); 3897 PWDDESC_XUNLOCK(pdp); 3898 return (pwd); 3899 } 3900 3901 struct pwd * 3902 pwd_hold_proc(struct proc *p) 3903 { 3904 struct pwddesc *pdp; 3905 struct pwd *pwd; 3906 3907 PROC_ASSERT_HELD(p); 3908 PROC_LOCK(p); 3909 pdp = pdhold(p); 3910 MPASS(pdp != NULL); 3911 PROC_UNLOCK(p); 3912 3913 PWDDESC_XLOCK(pdp); 3914 pwd = pwd_hold_pwddesc(pdp); 3915 MPASS(pwd != NULL); 3916 PWDDESC_XUNLOCK(pdp); 3917 pddrop(pdp); 3918 return (pwd); 3919 } 3920 3921 static struct pwd * 3922 pwd_alloc(void) 3923 { 3924 struct pwd *pwd; 3925 3926 pwd = uma_zalloc_smr(pwd_zone, M_WAITOK); 3927 bzero(pwd, sizeof(*pwd)); 3928 refcount_init(&pwd->pwd_refcount, 1); 3929 return (pwd); 3930 } 3931 3932 void 3933 pwd_drop(struct pwd *pwd) 3934 { 3935 3936 if (!refcount_release(&pwd->pwd_refcount)) 3937 return; 3938 3939 if (pwd->pwd_cdir != NULL) 3940 vrele(pwd->pwd_cdir); 3941 if (pwd->pwd_rdir != NULL) 3942 vrele(pwd->pwd_rdir); 3943 if (pwd->pwd_jdir != NULL) 3944 vrele(pwd->pwd_jdir); 3945 if (pwd->pwd_adir != NULL) 3946 vrele(pwd->pwd_adir); 3947 uma_zfree_smr(pwd_zone, pwd); 3948 } 3949 3950 /* 3951 * The caller is responsible for invoking priv_check() and 3952 * mac_vnode_check_chroot() to authorize this operation. 
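 * The chroot_allow_open_directories policy above is applied here in
 * addition; it complements rather than replaces those checks.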
3953 */ 3954 int 3955 pwd_chroot(struct thread *td, struct vnode *vp) 3956 { 3957 struct pwddesc *pdp; 3958 struct filedesc *fdp; 3959 struct pwd *newpwd, *oldpwd; 3960 int error; 3961 3962 fdp = td->td_proc->p_fd; 3963 pdp = td->td_proc->p_pd; 3964 newpwd = pwd_alloc(); 3965 FILEDESC_SLOCK(fdp); 3966 PWDDESC_XLOCK(pdp); 3967 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 3968 if (chroot_allow_open_directories == 0 || 3969 (chroot_allow_open_directories == 1 && 3970 oldpwd->pwd_rdir != rootvnode)) { 3971 error = chroot_refuse_vdir_fds(fdp); 3972 FILEDESC_SUNLOCK(fdp); 3973 if (error != 0) { 3974 PWDDESC_XUNLOCK(pdp); 3975 pwd_drop(newpwd); 3976 return (error); 3977 } 3978 } else { 3979 FILEDESC_SUNLOCK(fdp); 3980 } 3981 3982 vrefact(vp); 3983 newpwd->pwd_rdir = vp; 3984 vrefact(vp); 3985 newpwd->pwd_adir = vp; 3986 if (oldpwd->pwd_jdir == NULL) { 3987 vrefact(vp); 3988 newpwd->pwd_jdir = vp; 3989 } 3990 pwd_fill(oldpwd, newpwd); 3991 pwd_set(pdp, newpwd); 3992 PWDDESC_XUNLOCK(pdp); 3993 pwd_drop(oldpwd); 3994 return (0); 3995 } 3996 3997 void 3998 pwd_chdir(struct thread *td, struct vnode *vp) 3999 { 4000 struct pwddesc *pdp; 4001 struct pwd *newpwd, *oldpwd; 4002 4003 VNPASS(vp->v_usecount > 0, vp); 4004 4005 newpwd = pwd_alloc(); 4006 pdp = td->td_proc->p_pd; 4007 PWDDESC_XLOCK(pdp); 4008 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4009 newpwd->pwd_cdir = vp; 4010 pwd_fill(oldpwd, newpwd); 4011 pwd_set(pdp, newpwd); 4012 PWDDESC_XUNLOCK(pdp); 4013 pwd_drop(oldpwd); 4014 } 4015 4016 /* 4017 * Process is transitioning to/from a non-native ABI. 4018 */ 4019 void 4020 pwd_altroot(struct thread *td, struct vnode *altroot_vp) 4021 { 4022 struct pwddesc *pdp; 4023 struct pwd *newpwd, *oldpwd; 4024 4025 newpwd = pwd_alloc(); 4026 pdp = td->td_proc->p_pd; 4027 PWDDESC_XLOCK(pdp); 4028 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4029 if (altroot_vp != NULL) { 4030 /* 4031 * Native process to a non-native ABI. 4032 */ 4033 4034 vrefact(altroot_vp); 4035 newpwd->pwd_adir = altroot_vp; 4036 } else { 4037 /* 4038 * Non-native process to the native ABI. 4039 */ 4040 4041 vrefact(oldpwd->pwd_rdir); 4042 newpwd->pwd_adir = oldpwd->pwd_rdir; 4043 } 4044 pwd_fill(oldpwd, newpwd); 4045 pwd_set(pdp, newpwd); 4046 PWDDESC_XUNLOCK(pdp); 4047 pwd_drop(oldpwd); 4048 } 4049 4050 /* 4051 * jail_attach(2) changes both root and working directories. 
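 * Unlike pwd_chroot(), open directories always cause the operation to
 * be refused: chroot_refuse_vdir_fds() is called unconditionally below.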
4052 */ 4053 int 4054 pwd_chroot_chdir(struct thread *td, struct vnode *vp) 4055 { 4056 struct pwddesc *pdp; 4057 struct filedesc *fdp; 4058 struct pwd *newpwd, *oldpwd; 4059 int error; 4060 4061 fdp = td->td_proc->p_fd; 4062 pdp = td->td_proc->p_pd; 4063 newpwd = pwd_alloc(); 4064 FILEDESC_SLOCK(fdp); 4065 PWDDESC_XLOCK(pdp); 4066 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4067 error = chroot_refuse_vdir_fds(fdp); 4068 FILEDESC_SUNLOCK(fdp); 4069 if (error != 0) { 4070 PWDDESC_XUNLOCK(pdp); 4071 pwd_drop(newpwd); 4072 return (error); 4073 } 4074 4075 vrefact(vp); 4076 newpwd->pwd_rdir = vp; 4077 vrefact(vp); 4078 newpwd->pwd_cdir = vp; 4079 if (oldpwd->pwd_jdir == NULL) { 4080 vrefact(vp); 4081 newpwd->pwd_jdir = vp; 4082 } 4083 vrefact(vp); 4084 newpwd->pwd_adir = vp; 4085 pwd_fill(oldpwd, newpwd); 4086 pwd_set(pdp, newpwd); 4087 PWDDESC_XUNLOCK(pdp); 4088 pwd_drop(oldpwd); 4089 return (0); 4090 } 4091 4092 void 4093 pwd_ensure_dirs(void) 4094 { 4095 struct pwddesc *pdp; 4096 struct pwd *oldpwd, *newpwd; 4097 4098 pdp = curproc->p_pd; 4099 PWDDESC_XLOCK(pdp); 4100 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4101 if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL && 4102 oldpwd->pwd_adir != NULL) { 4103 PWDDESC_XUNLOCK(pdp); 4104 return; 4105 } 4106 PWDDESC_XUNLOCK(pdp); 4107 4108 newpwd = pwd_alloc(); 4109 PWDDESC_XLOCK(pdp); 4110 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4111 pwd_fill(oldpwd, newpwd); 4112 if (newpwd->pwd_cdir == NULL) { 4113 vrefact(rootvnode); 4114 newpwd->pwd_cdir = rootvnode; 4115 } 4116 if (newpwd->pwd_rdir == NULL) { 4117 vrefact(rootvnode); 4118 newpwd->pwd_rdir = rootvnode; 4119 } 4120 if (newpwd->pwd_adir == NULL) { 4121 vrefact(rootvnode); 4122 newpwd->pwd_adir = rootvnode; 4123 } 4124 pwd_set(pdp, newpwd); 4125 PWDDESC_XUNLOCK(pdp); 4126 pwd_drop(oldpwd); 4127 } 4128 4129 void 4130 pwd_set_rootvnode(void) 4131 { 4132 struct pwddesc *pdp; 4133 struct pwd *oldpwd, *newpwd; 4134 4135 pdp = curproc->p_pd; 4136 4137 newpwd = pwd_alloc(); 4138 PWDDESC_XLOCK(pdp); 4139 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4140 vrefact(rootvnode); 4141 newpwd->pwd_cdir = rootvnode; 4142 vrefact(rootvnode); 4143 newpwd->pwd_rdir = rootvnode; 4144 vrefact(rootvnode); 4145 newpwd->pwd_adir = rootvnode; 4146 pwd_fill(oldpwd, newpwd); 4147 pwd_set(pdp, newpwd); 4148 PWDDESC_XUNLOCK(pdp); 4149 pwd_drop(oldpwd); 4150 } 4151 4152 /* 4153 * Scan all active processes and prisons to see if any of them have a current 4154 * or root directory of `olddp'. If so, replace them with the new mount point. 
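 * References given up by prison0, the prison list, and rootvnode are
 * tallied in nrele and released in one batch at the end; per-process
 * references are dropped via pwd_drop().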
4155 */ 4156 void 4157 mountcheckdirs(struct vnode *olddp, struct vnode *newdp) 4158 { 4159 struct pwddesc *pdp; 4160 struct pwd *newpwd, *oldpwd; 4161 struct prison *pr; 4162 struct proc *p; 4163 int nrele; 4164 4165 if (vrefcnt(olddp) == 1) 4166 return; 4167 nrele = 0; 4168 newpwd = pwd_alloc(); 4169 sx_slock(&allproc_lock); 4170 FOREACH_PROC_IN_SYSTEM(p) { 4171 PROC_LOCK(p); 4172 pdp = pdhold(p); 4173 PROC_UNLOCK(p); 4174 if (pdp == NULL) 4175 continue; 4176 PWDDESC_XLOCK(pdp); 4177 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4178 if (oldpwd == NULL || 4179 (oldpwd->pwd_cdir != olddp && 4180 oldpwd->pwd_rdir != olddp && 4181 oldpwd->pwd_jdir != olddp && 4182 oldpwd->pwd_adir != olddp)) { 4183 PWDDESC_XUNLOCK(pdp); 4184 pddrop(pdp); 4185 continue; 4186 } 4187 if (oldpwd->pwd_cdir == olddp) { 4188 vrefact(newdp); 4189 newpwd->pwd_cdir = newdp; 4190 } 4191 if (oldpwd->pwd_rdir == olddp) { 4192 vrefact(newdp); 4193 newpwd->pwd_rdir = newdp; 4194 } 4195 if (oldpwd->pwd_jdir == olddp) { 4196 vrefact(newdp); 4197 newpwd->pwd_jdir = newdp; 4198 } 4199 if (oldpwd->pwd_adir == olddp) { 4200 vrefact(newdp); 4201 newpwd->pwd_adir = newdp; 4202 } 4203 pwd_fill(oldpwd, newpwd); 4204 pwd_set(pdp, newpwd); 4205 PWDDESC_XUNLOCK(pdp); 4206 pwd_drop(oldpwd); 4207 pddrop(pdp); 4208 newpwd = pwd_alloc(); 4209 } 4210 sx_sunlock(&allproc_lock); 4211 pwd_drop(newpwd); 4212 if (rootvnode == olddp) { 4213 vrefact(newdp); 4214 rootvnode = newdp; 4215 nrele++; 4216 } 4217 mtx_lock(&prison0.pr_mtx); 4218 if (prison0.pr_root == olddp) { 4219 vrefact(newdp); 4220 prison0.pr_root = newdp; 4221 nrele++; 4222 } 4223 mtx_unlock(&prison0.pr_mtx); 4224 sx_slock(&allprison_lock); 4225 TAILQ_FOREACH(pr, &allprison, pr_list) { 4226 mtx_lock(&pr->pr_mtx); 4227 if (pr->pr_root == olddp) { 4228 vrefact(newdp); 4229 pr->pr_root = newdp; 4230 nrele++; 4231 } 4232 mtx_unlock(&pr->pr_mtx); 4233 } 4234 sx_sunlock(&allprison_lock); 4235 while (nrele--) 4236 vrele(olddp); 4237 } 4238 4239 int 4240 descrip_check_write_mp(struct filedesc *fdp, struct mount *mp) 4241 { 4242 struct file *fp; 4243 struct vnode *vp; 4244 int error, i; 4245 4246 error = 0; 4247 FILEDESC_SLOCK(fdp); 4248 FILEDESC_FOREACH_FP(fdp, i, fp) { 4249 if (fp->f_type != DTYPE_VNODE || 4250 (atomic_load_int(&fp->f_flag) & FWRITE) == 0) 4251 continue; 4252 vp = fp->f_vnode; 4253 if (vp->v_mount == mp) { 4254 error = EDEADLK; 4255 break; 4256 } 4257 } 4258 FILEDESC_SUNLOCK(fdp); 4259 return (error); 4260 } 4261 4262 struct filedesc_to_leader * 4263 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, 4264 struct proc *leader) 4265 { 4266 struct filedesc_to_leader *fdtol; 4267 4268 fdtol = malloc(sizeof(struct filedesc_to_leader), 4269 M_FILEDESC_TO_LEADER, M_WAITOK); 4270 fdtol->fdl_refcount = 1; 4271 fdtol->fdl_holdcount = 0; 4272 fdtol->fdl_wakeup = 0; 4273 fdtol->fdl_leader = leader; 4274 if (old != NULL) { 4275 FILEDESC_XLOCK(fdp); 4276 fdtol->fdl_next = old->fdl_next; 4277 fdtol->fdl_prev = old; 4278 old->fdl_next = fdtol; 4279 fdtol->fdl_next->fdl_prev = fdtol; 4280 FILEDESC_XUNLOCK(fdp); 4281 } else { 4282 fdtol->fdl_next = fdtol; 4283 fdtol->fdl_prev = fdtol; 4284 } 4285 return (fdtol); 4286 } 4287 4288 struct filedesc_to_leader * 4289 filedesc_to_leader_share(struct filedesc_to_leader *fdtol, struct filedesc *fdp) 4290 { 4291 FILEDESC_XLOCK(fdp); 4292 fdtol->fdl_refcount++; 4293 FILEDESC_XUNLOCK(fdp); 4294 return (fdtol); 4295 } 4296 4297 static int 4298 sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) 4299 { 4300 NDSLOTTYPE *map; 4301 struct 
filedesc *fdp; 4302 u_int namelen; 4303 int count, off, minoff; 4304 4305 namelen = arg2; 4306 if (namelen != 1) 4307 return (EINVAL); 4308 4309 if (*(int *)arg1 != 0) 4310 return (EINVAL); 4311 4312 fdp = curproc->p_fd; 4313 count = 0; 4314 FILEDESC_SLOCK(fdp); 4315 map = fdp->fd_map; 4316 off = NDSLOT(fdp->fd_nfiles - 1); 4317 for (minoff = NDSLOT(0); off >= minoff; --off) 4318 count += bitcountl(map[off]); 4319 FILEDESC_SUNLOCK(fdp); 4320 4321 return (SYSCTL_OUT(req, &count, sizeof(count))); 4322 } 4323 4324 static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, 4325 CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, 4326 "Number of open file descriptors"); 4327 4328 /* 4329 * Get file structures globally. 4330 */ 4331 static int 4332 sysctl_kern_file(SYSCTL_HANDLER_ARGS) 4333 { 4334 struct xfile xf; 4335 struct filedesc *fdp; 4336 struct file *fp; 4337 struct proc *p; 4338 int error, n; 4339 4340 error = sysctl_wire_old_buffer(req, 0); 4341 if (error != 0) 4342 return (error); 4343 if (req->oldptr == NULL) { 4344 n = 0; 4345 sx_slock(&allproc_lock); 4346 FOREACH_PROC_IN_SYSTEM(p) { 4347 PROC_LOCK(p); 4348 if (p->p_state == PRS_NEW) { 4349 PROC_UNLOCK(p); 4350 continue; 4351 } 4352 fdp = fdhold(p); 4353 PROC_UNLOCK(p); 4354 if (fdp == NULL) 4355 continue; 4356 /* overestimates sparse tables. */ 4357 n += fdp->fd_nfiles; 4358 fddrop(fdp); 4359 } 4360 sx_sunlock(&allproc_lock); 4361 return (SYSCTL_OUT(req, 0, n * sizeof(xf))); 4362 } 4363 error = 0; 4364 bzero(&xf, sizeof(xf)); 4365 xf.xf_size = sizeof(xf); 4366 sx_slock(&allproc_lock); 4367 FOREACH_PROC_IN_SYSTEM(p) { 4368 PROC_LOCK(p); 4369 if (p->p_state == PRS_NEW) { 4370 PROC_UNLOCK(p); 4371 continue; 4372 } 4373 if (p_cansee(req->td, p) != 0) { 4374 PROC_UNLOCK(p); 4375 continue; 4376 } 4377 xf.xf_pid = p->p_pid; 4378 xf.xf_uid = p->p_ucred->cr_uid; 4379 fdp = fdhold(p); 4380 PROC_UNLOCK(p); 4381 if (fdp == NULL) 4382 continue; 4383 FILEDESC_SLOCK(fdp); 4384 if (refcount_load(&fdp->fd_refcnt) == 0) 4385 goto nextproc; 4386 FILEDESC_FOREACH_FP(fdp, n, fp) { 4387 xf.xf_fd = n; 4388 xf.xf_file = (uintptr_t)fp; 4389 xf.xf_data = (uintptr_t)fp->f_data; 4390 xf.xf_vnode = (uintptr_t)fp->f_vnode; 4391 xf.xf_type = (uintptr_t)fp->f_type; 4392 xf.xf_count = refcount_load(&fp->f_count); 4393 xf.xf_msgcount = 0; 4394 xf.xf_offset = foffset_get(fp); 4395 xf.xf_flag = fp->f_flag; 4396 error = SYSCTL_OUT(req, &xf, sizeof(xf)); 4397 4398 /* 4399 * There is no need to re-check the fdtable refcount 4400 * here since the filedesc lock is not dropped in the 4401 * loop body. 
4402 */ 4403 if (error != 0) 4404 break; 4405 } 4406 nextproc: 4407 FILEDESC_SUNLOCK(fdp); 4408 fddrop(fdp); 4409 if (error) 4410 break; 4411 } 4412 sx_sunlock(&allproc_lock); 4413 return (error); 4414 } 4415 4416 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 4417 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); 4418 4419 #ifdef KINFO_FILE_SIZE 4420 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); 4421 #endif 4422 4423 static int 4424 xlate_fflags(int fflags) 4425 { 4426 static const struct { 4427 int fflag; 4428 int kf_fflag; 4429 } fflags_table[] = { 4430 { FAPPEND, KF_FLAG_APPEND }, 4431 { FASYNC, KF_FLAG_ASYNC }, 4432 { FFSYNC, KF_FLAG_FSYNC }, 4433 { FHASLOCK, KF_FLAG_HASLOCK }, 4434 { FNONBLOCK, KF_FLAG_NONBLOCK }, 4435 { FREAD, KF_FLAG_READ }, 4436 { FWRITE, KF_FLAG_WRITE }, 4437 { O_CREAT, KF_FLAG_CREAT }, 4438 { O_DIRECT, KF_FLAG_DIRECT }, 4439 { O_EXCL, KF_FLAG_EXCL }, 4440 { O_EXEC, KF_FLAG_EXEC }, 4441 { O_EXLOCK, KF_FLAG_EXLOCK }, 4442 { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, 4443 { O_SHLOCK, KF_FLAG_SHLOCK }, 4444 { O_TRUNC, KF_FLAG_TRUNC } 4445 }; 4446 unsigned int i; 4447 int kflags; 4448 4449 kflags = 0; 4450 for (i = 0; i < nitems(fflags_table); i++) 4451 if (fflags & fflags_table[i].fflag) 4452 kflags |= fflags_table[i].kf_fflag; 4453 return (kflags); 4454 } 4455 4456 /* Trim unused data from kf_path by truncating the structure size. */ 4457 void 4458 pack_kinfo(struct kinfo_file *kif) 4459 { 4460 4461 kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + 4462 strlen(kif->kf_path) + 1; 4463 kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); 4464 } 4465 4466 static void 4467 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, 4468 struct kinfo_file *kif, struct filedesc *fdp, int flags) 4469 { 4470 int error; 4471 4472 bzero(kif, sizeof(*kif)); 4473 4474 /* Set a default type to allow for empty fill_kinfo() methods. */ 4475 kif->kf_type = KF_TYPE_UNKNOWN; 4476 kif->kf_flags = xlate_fflags(fp->f_flag); 4477 if (rightsp != NULL) 4478 kif->kf_cap_rights = *rightsp; 4479 else 4480 cap_rights_init_zero(&kif->kf_cap_rights); 4481 kif->kf_fd = fd; 4482 kif->kf_ref_count = refcount_load(&fp->f_count); 4483 kif->kf_offset = foffset_get(fp); 4484 4485 /* 4486 * This may drop the filedesc lock, so the 'fp' cannot be 4487 * accessed after this call. 
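 *
 * (fo_fill_kinfo() dispatches through fp->f_ops, so whether the lock
 * is actually dropped depends on the file type; vnode-backed files,
 * for instance, go through vn_fill_kinfo(), which may need to drop
 * it while locking the vnode.)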
4488 */ 4489 error = fo_fill_kinfo(fp, kif, fdp); 4490 if (error == 0) 4491 kif->kf_status |= KF_ATTR_VALID; 4492 if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) 4493 pack_kinfo(kif); 4494 else 4495 kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); 4496 } 4497 4498 static void 4499 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, 4500 struct kinfo_file *kif, int flags) 4501 { 4502 int error; 4503 4504 bzero(kif, sizeof(*kif)); 4505 4506 kif->kf_type = KF_TYPE_VNODE; 4507 error = vn_fill_kinfo_vnode(vp, kif); 4508 if (error == 0) 4509 kif->kf_status |= KF_ATTR_VALID; 4510 kif->kf_flags = xlate_fflags(fflags); 4511 cap_rights_init_zero(&kif->kf_cap_rights); 4512 kif->kf_fd = fd; 4513 kif->kf_ref_count = -1; 4514 kif->kf_offset = -1; 4515 if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) 4516 pack_kinfo(kif); 4517 else 4518 kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); 4519 vrele(vp); 4520 } 4521 4522 struct export_fd_buf { 4523 struct filedesc *fdp; 4524 struct pwddesc *pdp; 4525 struct sbuf *sb; 4526 ssize_t remainder; 4527 struct kinfo_file kif; 4528 int flags; 4529 }; 4530 4531 static int 4532 export_kinfo_to_sb(struct export_fd_buf *efbuf) 4533 { 4534 struct kinfo_file *kif; 4535 4536 kif = &efbuf->kif; 4537 if (efbuf->remainder != -1) { 4538 if (efbuf->remainder < kif->kf_structsize) 4539 return (ENOMEM); 4540 efbuf->remainder -= kif->kf_structsize; 4541 } 4542 if (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) != 0) 4543 return (sbuf_error(efbuf->sb)); 4544 return (0); 4545 } 4546 4547 static int 4548 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, 4549 struct export_fd_buf *efbuf) 4550 { 4551 int error; 4552 4553 if (efbuf->remainder == 0) 4554 return (ENOMEM); 4555 export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, 4556 efbuf->flags); 4557 FILEDESC_SUNLOCK(efbuf->fdp); 4558 error = export_kinfo_to_sb(efbuf); 4559 FILEDESC_SLOCK(efbuf->fdp); 4560 return (error); 4561 } 4562 4563 static int 4564 export_vnode_to_sb(struct vnode *vp, int fd, int fflags, 4565 struct export_fd_buf *efbuf) 4566 { 4567 int error; 4568 4569 if (efbuf->remainder == 0) 4570 return (ENOMEM); 4571 if (efbuf->pdp != NULL) 4572 PWDDESC_XUNLOCK(efbuf->pdp); 4573 export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); 4574 error = export_kinfo_to_sb(efbuf); 4575 if (efbuf->pdp != NULL) 4576 PWDDESC_XLOCK(efbuf->pdp); 4577 return (error); 4578 } 4579 4580 /* 4581 * Store a process's file descriptor information into an sbuf. 4582 * 4583 * Takes a locked proc as argument, and returns with the proc unlocked. 4584 */ 4585 int 4586 kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, 4587 int flags) 4588 { 4589 struct file *fp; 4590 struct filedesc *fdp; 4591 struct pwddesc *pdp; 4592 struct export_fd_buf *efbuf; 4593 struct vnode *cttyvp, *textvp, *tracevp; 4594 struct pwd *pwd; 4595 int error, i; 4596 cap_rights_t rights; 4597 4598 PROC_LOCK_ASSERT(p, MA_OWNED); 4599 4600 /* ktrace vnode */ 4601 tracevp = ktr_get_tracevp(p, true); 4602 /* text vnode */ 4603 textvp = p->p_textvp; 4604 if (textvp != NULL) 4605 vrefact(textvp); 4606 /* Controlling tty. */
4607 cttyvp = NULL; 4608 if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { 4609 cttyvp = p->p_pgrp->pg_session->s_ttyvp; 4610 if (cttyvp != NULL) 4611 vrefact(cttyvp); 4612 } 4613 fdp = fdhold(p); 4614 pdp = pdhold(p); 4615 PROC_UNLOCK(p); 4616 4617 efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); 4618 efbuf->fdp = NULL; 4619 efbuf->pdp = NULL; 4620 efbuf->sb = sb; 4621 efbuf->remainder = maxlen; 4622 efbuf->flags = flags; 4623 4624 error = 0; 4625 if (tracevp != NULL) 4626 error = export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, 4627 FREAD | FWRITE, efbuf); 4628 if (error == 0 && textvp != NULL) 4629 error = export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, 4630 efbuf); 4631 if (error == 0 && cttyvp != NULL) 4632 error = export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, 4633 FREAD | FWRITE, efbuf); 4634 if (error != 0 || pdp == NULL || fdp == NULL) 4635 goto fail; 4636 efbuf->fdp = fdp; 4637 efbuf->pdp = pdp; 4638 PWDDESC_XLOCK(pdp); 4639 pwd = pwd_hold_pwddesc(pdp); 4640 if (pwd != NULL) { 4641 /* working directory */ 4642 if (pwd->pwd_cdir != NULL) { 4643 vrefact(pwd->pwd_cdir); 4644 error = export_vnode_to_sb(pwd->pwd_cdir, 4645 KF_FD_TYPE_CWD, FREAD, efbuf); 4646 } 4647 /* root directory */ 4648 if (error == 0 && pwd->pwd_rdir != NULL) { 4649 vrefact(pwd->pwd_rdir); 4650 error = export_vnode_to_sb(pwd->pwd_rdir, 4651 KF_FD_TYPE_ROOT, FREAD, efbuf); 4652 } 4653 /* jail directory */ 4654 if (error == 0 && pwd->pwd_jdir != NULL) { 4655 vrefact(pwd->pwd_jdir); 4656 error = export_vnode_to_sb(pwd->pwd_jdir, 4657 KF_FD_TYPE_JAIL, FREAD, efbuf); 4658 } 4659 } 4660 PWDDESC_XUNLOCK(pdp); 4661 if (pwd != NULL) 4662 pwd_drop(pwd); 4663 if (error != 0) 4664 goto fail; 4665 FILEDESC_SLOCK(fdp); 4666 if (refcount_load(&fdp->fd_refcnt) == 0) 4667 goto skip; 4668 FILEDESC_FOREACH_FP(fdp, i, fp) { 4669 #ifdef CAPABILITIES 4670 rights = *cap_rights(fdp, i); 4671 #else /* !CAPABILITIES */ 4672 rights = cap_no_rights; 4673 #endif 4674 /* 4675 * Create the sysctl entry. It is OK to drop the filedesc 4676 * lock inside export_file_to_sb(), as the file's properties 4677 * are re-validated and re-evaluated when the 4678 * loop continues. 4679 */ 4680 error = export_file_to_sb(fp, i, &rights, efbuf); 4681 if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0) 4682 break; 4683 } 4684 skip: 4685 FILEDESC_SUNLOCK(fdp); 4686 fail: 4687 if (fdp != NULL) 4688 fddrop(fdp); 4689 if (pdp != NULL) 4690 pddrop(pdp); 4691 free(efbuf, M_TEMP); 4692 return (error); 4693 } 4694 4695 #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) 4696 4697 /* 4698 * Get per-process file descriptors for use by procstat(1), et al. 4699 */ 4700 static int 4701 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) 4702 { 4703 struct sbuf sb; 4704 struct proc *p; 4705 ssize_t maxlen; 4706 u_int namelen; 4707 int error, error2, *name; 4708 4709 namelen = arg2; 4710 if (namelen != 1) 4711 return (EINVAL); 4712 4713 name = (int *)arg1; 4714 4715 sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); 4716 sbuf_clear_flags(&sb, SBUF_INCLUDENUL); 4717 error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); 4718 if (error != 0) { 4719 sbuf_delete(&sb); 4720 return (error); 4721 } 4722 maxlen = req->oldptr != NULL ? req->oldlen : -1; 4723 error = kern_proc_filedesc_out(p, &sb, maxlen, 4724 KERN_FILEDESC_PACK_KINFO); 4725 error2 = sbuf_finish(&sb); 4726 sbuf_delete(&sb); 4727 return (error != 0 ?
error : error2); 4728 } 4729 4730 #ifdef COMPAT_FREEBSD7 4731 #ifdef KINFO_OFILE_SIZE 4732 CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); 4733 #endif 4734 4735 static void 4736 kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) 4737 { 4738 4739 okif->kf_structsize = sizeof(*okif); 4740 okif->kf_type = kif->kf_type; 4741 okif->kf_fd = kif->kf_fd; 4742 okif->kf_ref_count = kif->kf_ref_count; 4743 okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | 4744 KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | 4745 KF_FLAG_DIRECT | KF_FLAG_HASLOCK); 4746 okif->kf_offset = kif->kf_offset; 4747 if (kif->kf_type == KF_TYPE_VNODE) 4748 okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type; 4749 else 4750 okif->kf_vnode_type = KF_VTYPE_VNON; 4751 strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); 4752 if (kif->kf_type == KF_TYPE_SOCKET) { 4753 okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0; 4754 okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0; 4755 okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0; 4756 okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local; 4757 okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer; 4758 } else { 4759 okif->kf_sa_local.ss_family = AF_UNSPEC; 4760 okif->kf_sa_peer.ss_family = AF_UNSPEC; 4761 } 4762 } 4763 4764 static int 4765 export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, 4766 struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req) 4767 { 4768 int error; 4769 4770 vrefact(vp); 4771 PWDDESC_XUNLOCK(pdp); 4772 export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO); 4773 kinfo_to_okinfo(kif, okif); 4774 error = SYSCTL_OUT(req, okif, sizeof(*okif)); 4775 PWDDESC_XLOCK(pdp); 4776 return (error); 4777 } 4778 4779 /* 4780 * Get per-process file descriptors for use by procstat(1), et al. 
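 *
 * A minimal userland sketch of reading this node (an illustrative
 * snippet, not part of this file; 'pid' and 'buf' are placeholders,
 * the size-then-fetch pattern is the usual sysctl(3) idiom, and
 * error handling is elided):
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_OFILEDESC, pid };
 *	size_t len = 0;
 *	sysctl(mib, 4, NULL, &len, NULL, 0);
 *	struct kinfo_ofile *buf = malloc(len);
 *	sysctl(mib, 4, buf, &len, NULL, 0);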
4781 */ 4782 static int 4783 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) 4784 { 4785 struct kinfo_ofile *okif; 4786 struct kinfo_file *kif; 4787 struct filedesc *fdp; 4788 struct pwddesc *pdp; 4789 struct pwd *pwd; 4790 u_int namelen; 4791 int error, i, *name; 4792 struct file *fp; 4793 struct proc *p; 4794 4795 namelen = arg2; 4796 if (namelen != 1) 4797 return (EINVAL); 4798 4799 name = (int *)arg1; 4800 error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); 4801 if (error != 0) 4802 return (error); 4803 fdp = fdhold(p); 4804 if (fdp != NULL) 4805 pdp = pdhold(p); 4806 PROC_UNLOCK(p); 4807 if (fdp == NULL || pdp == NULL) { 4808 if (fdp != NULL) 4809 fddrop(fdp); 4810 return (ENOENT); 4811 } 4812 kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); 4813 okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); 4814 PWDDESC_XLOCK(pdp); 4815 pwd = pwd_hold_pwddesc(pdp); 4816 if (pwd != NULL) { 4817 if (pwd->pwd_cdir != NULL) 4818 export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif, 4819 okif, pdp, req); 4820 if (pwd->pwd_rdir != NULL) 4821 export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif, 4822 okif, pdp, req); 4823 if (pwd->pwd_jdir != NULL) 4824 export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif, 4825 okif, pdp, req); 4826 } 4827 PWDDESC_XUNLOCK(pdp); 4828 if (pwd != NULL) 4829 pwd_drop(pwd); 4830 FILEDESC_SLOCK(fdp); 4831 if (refcount_load(&fdp->fd_refcnt) == 0) 4832 goto skip; 4833 FILEDESC_FOREACH_FP(fdp, i, fp) { 4834 export_file_to_kinfo(fp, i, NULL, kif, fdp, 4835 KERN_FILEDESC_PACK_KINFO); 4836 FILEDESC_SUNLOCK(fdp); 4837 kinfo_to_okinfo(kif, okif); 4838 error = SYSCTL_OUT(req, okif, sizeof(*okif)); 4839 FILEDESC_SLOCK(fdp); 4840 if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0) 4841 break; 4842 } 4843 skip: 4844 FILEDESC_SUNLOCK(fdp); 4845 fddrop(fdp); 4846 pddrop(pdp); 4847 free(kif, M_TEMP); 4848 free(okif, M_TEMP); 4849 return (error); 4850 } 4851 4852 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, 4853 CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, 4854 "Process ofiledesc entries"); 4855 #endif /* COMPAT_FREEBSD7 */ 4856 4857 int 4858 vntype_to_kinfo(int vtype) 4859 { 4860 struct { 4861 int vtype; 4862 int kf_vtype; 4863 } vtypes_table[] = { 4864 { VBAD, KF_VTYPE_VBAD }, 4865 { VBLK, KF_VTYPE_VBLK }, 4866 { VCHR, KF_VTYPE_VCHR }, 4867 { VDIR, KF_VTYPE_VDIR }, 4868 { VFIFO, KF_VTYPE_VFIFO }, 4869 { VLNK, KF_VTYPE_VLNK }, 4870 { VNON, KF_VTYPE_VNON }, 4871 { VREG, KF_VTYPE_VREG }, 4872 { VSOCK, KF_VTYPE_VSOCK } 4873 }; 4874 unsigned int i; 4875 4876 /* 4877 * Perform vtype translation. 4878 */ 4879 for (i = 0; i < nitems(vtypes_table); i++) 4880 if (vtypes_table[i].vtype == vtype) 4881 return (vtypes_table[i].kf_vtype); 4882 4883 return (KF_VTYPE_UNKNOWN); 4884 } 4885 4886 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, 4887 CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, 4888 "Process filedesc entries"); 4889 4890 /* 4891 * Store a process's current working directory information into an sbuf. 4892 * 4893 * Takes a locked proc as argument, and returns with the proc unlocked.
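 *
 * A typical call sequence, mirroring sysctl_kern_proc_cwd() below
 * (sketch only; pget() returns with the proc locked):
 *
 *	error = pget(pid, PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 *	if (error == 0)
 *		error = kern_proc_cwd_out(p, &sb, maxlen);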
4894 */ 4895 int 4896 kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) 4897 { 4898 struct pwddesc *pdp; 4899 struct pwd *pwd; 4900 struct export_fd_buf *efbuf; 4901 struct vnode *cdir; 4902 int error; 4903 4904 PROC_LOCK_ASSERT(p, MA_OWNED); 4905 4906 pdp = pdhold(p); 4907 PROC_UNLOCK(p); 4908 if (pdp == NULL) 4909 return (EINVAL); 4910 4911 efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); 4912 efbuf->fdp = NULL; 4913 efbuf->pdp = pdp; 4914 efbuf->sb = sb; 4915 efbuf->remainder = maxlen; 4916 efbuf->flags = 0; 4917 4918 PWDDESC_XLOCK(pdp); 4919 pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4920 cdir = pwd->pwd_cdir; 4921 if (cdir == NULL) { 4922 error = EINVAL; 4923 } else { 4924 vrefact(cdir); 4925 error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf); 4926 } 4927 PWDDESC_XUNLOCK(pdp); 4928 pddrop(pdp); 4929 free(efbuf, M_TEMP); 4930 return (error); 4931 } 4932 4933 /* 4934 * Get per-process current working directory. 4935 */ 4936 static int 4937 sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) 4938 { 4939 struct sbuf sb; 4940 struct proc *p; 4941 ssize_t maxlen; 4942 u_int namelen; 4943 int error, error2, *name; 4944 4945 namelen = arg2; 4946 if (namelen != 1) 4947 return (EINVAL); 4948 4949 name = (int *)arg1; 4950 4951 sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); 4952 sbuf_clear_flags(&sb, SBUF_INCLUDENUL); 4953 error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); 4954 if (error != 0) { 4955 sbuf_delete(&sb); 4956 return (error); 4957 } 4958 maxlen = req->oldptr != NULL ? req->oldlen : -1; 4959 error = kern_proc_cwd_out(p, &sb, maxlen); 4960 error2 = sbuf_finish(&sb); 4961 sbuf_delete(&sb); 4962 return (error != 0 ? error : error2); 4963 } 4964 4965 static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, 4966 sysctl_kern_proc_cwd, "Process current working directory"); 4967 4968 #ifdef DDB 4969 /* 4970 * For the purposes of debugging, generate a human-readable string for the 4971 * file type. 4972 */ 4973 static const char * 4974 file_type_to_name(short type) 4975 { 4976 4977 switch (type) { 4978 case 0: 4979 return ("zero"); 4980 case DTYPE_VNODE: 4981 return ("vnode"); 4982 case DTYPE_SOCKET: 4983 return ("socket"); 4984 case DTYPE_PIPE: 4985 return ("pipe"); 4986 case DTYPE_FIFO: 4987 return ("fifo"); 4988 case DTYPE_KQUEUE: 4989 return ("kqueue"); 4990 case DTYPE_CRYPTO: 4991 return ("crypto"); 4992 case DTYPE_MQUEUE: 4993 return ("mqueue"); 4994 case DTYPE_SHM: 4995 return ("shm"); 4996 case DTYPE_SEM: 4997 return ("ksem"); 4998 case DTYPE_PTS: 4999 return ("pts"); 5000 case DTYPE_DEV: 5001 return ("dev"); 5002 case DTYPE_PROCDESC: 5003 return ("proc"); 5004 case DTYPE_EVENTFD: 5005 return ("eventfd"); 5006 case DTYPE_LINUXTFD: 5007 return ("ltimer"); 5008 default: 5009 return ("unkn"); 5010 } 5011 } 5012 5013 /* 5014 * For the purposes of debugging, identify a process (if any, perhaps one of 5015 * many) that references the passed file in its file descriptor array. Return 5016 * NULL if none. 
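 *
 * The scan takes no locks: this helper is compiled only under DDB
 * and runs from the kernel debugger, with the rest of the system
 * stopped.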
5017 */ 5018 static struct proc * 5019 file_to_first_proc(struct file *fp) 5020 { 5021 struct filedesc *fdp; 5022 struct proc *p; 5023 int n; 5024 5025 FOREACH_PROC_IN_SYSTEM(p) { 5026 if (p->p_state == PRS_NEW) 5027 continue; 5028 fdp = p->p_fd; 5029 if (fdp == NULL) 5030 continue; 5031 for (n = 0; n < fdp->fd_nfiles; n++) { 5032 if (fp == fdp->fd_ofiles[n].fde_file) 5033 return (p); 5034 } 5035 } 5036 return (NULL); 5037 } 5038 5039 static void 5040 db_print_file(struct file *fp, int header) 5041 { 5042 #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4)) 5043 struct proc *p; 5044 5045 if (header) 5046 db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n", 5047 XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag", 5048 "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID", 5049 "FCmd"); 5050 p = file_to_first_proc(fp); 5051 db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH, 5052 fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data, 5053 fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode, 5054 p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); 5055 5056 #undef XPTRWIDTH 5057 } 5058 5059 DB_SHOW_COMMAND(file, db_show_file) 5060 { 5061 struct file *fp; 5062 5063 if (!have_addr) { 5064 db_printf("usage: show file <addr>\n"); 5065 return; 5066 } 5067 fp = (struct file *)addr; 5068 db_print_file(fp, 1); 5069 } 5070 5071 DB_SHOW_COMMAND_FLAGS(files, db_show_files, DB_CMD_MEMSAFE) 5072 { 5073 struct filedesc *fdp; 5074 struct file *fp; 5075 struct proc *p; 5076 int header; 5077 int n; 5078 5079 header = 1; 5080 FOREACH_PROC_IN_SYSTEM(p) { 5081 if (p->p_state == PRS_NEW) 5082 continue; 5083 if ((fdp = p->p_fd) == NULL) 5084 continue; 5085 for (n = 0; n < fdp->fd_nfiles; ++n) { 5086 if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) 5087 continue; 5088 db_print_file(fp, header); 5089 header = 0; 5090 } 5091 } 5092 } 5093 #endif 5094 5095 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, 5096 &maxfilesperproc, 0, "Maximum files allowed open per process"); 5097 5098 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, 5099 &maxfiles, 0, "Maximum number of files"); 5100 5101 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, 5102 &openfiles, 0, "System-wide number of open files"); 5103 5104 /* ARGSUSED*/ 5105 static void 5106 filelistinit(void *dummy) 5107 { 5108 5109 file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, 5110 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 5111 filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), 5112 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 5113 pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL, 5114 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR); 5115 /* 5116 * XXXMJG this is a temporary hack due to boot ordering issues against 5117 * the vnode zone. 
5118 */ 5119 vfs_smr = uma_zone_get_smr(pwd_zone); 5120 mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); 5121 } 5122 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); 5123 5124 /*-------------------------------------------------------------------*/ 5125 5126 static int 5127 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, 5128 int flags, struct thread *td) 5129 { 5130 5131 return (EBADF); 5132 } 5133 5134 static int 5135 badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, 5136 struct thread *td) 5137 { 5138 5139 return (EINVAL); 5140 } 5141 5142 static int 5143 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, 5144 struct thread *td) 5145 { 5146 5147 return (EBADF); 5148 } 5149 5150 static int 5151 badfo_poll(struct file *fp, int events, struct ucred *active_cred, 5152 struct thread *td) 5153 { 5154 5155 return (0); 5156 } 5157 5158 static int 5159 badfo_kqfilter(struct file *fp, struct knote *kn) 5160 { 5161 5162 return (EBADF); 5163 } 5164 5165 static int 5166 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) 5167 { 5168 5169 return (EBADF); 5170 } 5171 5172 static int 5173 badfo_close(struct file *fp, struct thread *td) 5174 { 5175 5176 return (0); 5177 } 5178 5179 static int 5180 badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 5181 struct thread *td) 5182 { 5183 5184 return (EBADF); 5185 } 5186 5187 static int 5188 badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 5189 struct thread *td) 5190 { 5191 5192 return (EBADF); 5193 } 5194 5195 static int 5196 badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, 5197 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, 5198 struct thread *td) 5199 { 5200 5201 return (EBADF); 5202 } 5203 5204 static int 5205 badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 5206 { 5207 5208 return (0); 5209 } 5210 5211 struct fileops badfileops = { 5212 .fo_read = badfo_readwrite, 5213 .fo_write = badfo_readwrite, 5214 .fo_truncate = badfo_truncate, 5215 .fo_ioctl = badfo_ioctl, 5216 .fo_poll = badfo_poll, 5217 .fo_kqfilter = badfo_kqfilter, 5218 .fo_stat = badfo_stat, 5219 .fo_close = badfo_close, 5220 .fo_chmod = badfo_chmod, 5221 .fo_chown = badfo_chown, 5222 .fo_sendfile = badfo_sendfile, 5223 .fo_fill_kinfo = badfo_fill_kinfo, 5224 }; 5225 5226 static int 5227 path_poll(struct file *fp, int events, struct ucred *active_cred, 5228 struct thread *td) 5229 { 5230 return (POLLNVAL); 5231 } 5232 5233 static int 5234 path_close(struct file *fp, struct thread *td) 5235 { 5236 MPASS(fp->f_type == DTYPE_VNODE); 5237 fp->f_ops = &badfileops; 5238 vrele(fp->f_vnode); 5239 return (0); 5240 } 5241 5242 struct fileops path_fileops = { 5243 .fo_read = badfo_readwrite, 5244 .fo_write = badfo_readwrite, 5245 .fo_truncate = badfo_truncate, 5246 .fo_ioctl = badfo_ioctl, 5247 .fo_poll = path_poll, 5248 .fo_kqfilter = vn_kqfilter_opath, 5249 .fo_stat = vn_statfile, 5250 .fo_close = path_close, 5251 .fo_chmod = badfo_chmod, 5252 .fo_chown = badfo_chown, 5253 .fo_sendfile = badfo_sendfile, 5254 .fo_fill_kinfo = vn_fill_kinfo, 5255 .fo_flags = DFLAG_PASSABLE, 5256 }; 5257 5258 int 5259 invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, 5260 int flags, struct thread *td) 5261 { 5262 5263 return (EOPNOTSUPP); 5264 } 5265 5266 int 5267 invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, 5268 struct thread *td) 
5269 { 5270 5271 return (EINVAL); 5272 } 5273 5274 int 5275 invfo_ioctl(struct file *fp, u_long com, void *data, 5276 struct ucred *active_cred, struct thread *td) 5277 { 5278 5279 return (ENOTTY); 5280 } 5281 5282 int 5283 invfo_poll(struct file *fp, int events, struct ucred *active_cred, 5284 struct thread *td) 5285 { 5286 5287 return (poll_no_poll(events)); 5288 } 5289 5290 int 5291 invfo_kqfilter(struct file *fp, struct knote *kn) 5292 { 5293 5294 return (EINVAL); 5295 } 5296 5297 int 5298 invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 5299 struct thread *td) 5300 { 5301 5302 return (EINVAL); 5303 } 5304 5305 int 5306 invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 5307 struct thread *td) 5308 { 5309 5310 return (EINVAL); 5311 } 5312 5313 int 5314 invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, 5315 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, 5316 struct thread *td) 5317 { 5318 5319 return (EINVAL); 5320 } 5321 5322 /*-------------------------------------------------------------------*/ 5323 5324 /* 5325 * File Descriptor pseudo-device driver (/dev/fd/). 5326 * 5327 * Opening minor device N dup()s the file (if any) connected to file 5328 * descriptor N belonging to the calling process. Note that this driver 5329 * consists of only the ``open()'' routine, because all subsequent 5330 * references to this file will be direct to the other driver. 5331 * 5332 * XXX: we could give this one a cloning event handler if necessary. 5333 */ 5334 5335 /* ARGSUSED */ 5336 static int 5337 fdopen(struct cdev *dev, int mode, int type, struct thread *td) 5338 { 5339 5340 /* 5341 * XXX Kludge: set curthread->td_dupfd to contain the value of the 5342 * the file descriptor being sought for duplication. The error 5343 * return ensures that the vnode for this device will be released 5344 * by vn_open. Open will detect this special error and take the 5345 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN 5346 * will simply report the error. 5347 */ 5348 td->td_dupfd = dev2unit(dev); 5349 return (ENODEV); 5350 } 5351 5352 static struct cdevsw fildesc_cdevsw = { 5353 .d_version = D_VERSION, 5354 .d_open = fdopen, 5355 .d_name = "FD", 5356 }; 5357 5358 static void 5359 fildesc_drvinit(void *unused) 5360 { 5361 struct cdev *dev; 5362 5363 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, 5364 UID_ROOT, GID_WHEEL, 0666, "fd/0"); 5365 make_dev_alias(dev, "stdin"); 5366 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, 5367 UID_ROOT, GID_WHEEL, 0666, "fd/1"); 5368 make_dev_alias(dev, "stdout"); 5369 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, 5370 UID_ROOT, GID_WHEEL, 0666, "fd/2"); 5371 make_dev_alias(dev, "stderr"); 5372 } 5373 5374 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); 5375