/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_capsicum.h"
#include "opt_ddb.h"
#include "opt_ktrace.h"

#define	EXTERR_CATEGORY	EXTERR_CAT_FILEDESC
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/exterrvar.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/selinfo.h>
#include <sys/poll.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/signalvar.h>
#include <sys/kdb.h>
#include <sys/smr.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/ktrace.h>

#include <net/vnet.h>

#include <security/audit/audit.h>

#include <vm/uma.h>
#include <vm/vm.h>

#include <ddb/ddb.h>

static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes");
static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
    "file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");

MALLOC_DECLARE(M_FADVISE);

static __read_mostly uma_zone_t file_zone;
static __read_mostly uma_zone_t filedesc0_zone;
__read_mostly uma_zone_t pwd_zone;
VFS_SMR_DECLARE;

static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
		    struct thread *td, bool holdleaders, bool audit);
static void	export_file_to_kinfo(struct file *fp, int fd,
		    cap_rights_t *rightsp, struct kinfo_file *kif,
		    struct filedesc *fdp, int flags);
static int	fd_first_free(struct filedesc *fdp, int low, int size);
static void	fdgrowtable(struct filedesc *fdp, int nfd);
static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
static void	fdunused(struct filedesc *fdp, int fd);
static void	fdused(struct filedesc *fdp, int fd);
static int	fget_unlocked_seq(struct thread *td, int fd,
		    const cap_rights_t *needrightsp, uint8_t *flagsp,
		    struct file **fpp, seqc_t *seqp);
static int	getmaxfd(struct thread *td);
static u_long	*filecaps_copy_prep(const struct filecaps *src);
static void	filecaps_copy_finish(const struct filecaps *src,
		    struct filecaps *dst, u_long *ioctls);
static u_long	*filecaps_free_prep(struct filecaps *fcaps);
static void	filecaps_free_finish(u_long *ioctls);

static struct pwd *pwd_alloc(void);

/*
 * Each process has:
 *
 * - An array of open file descriptors (fd_ofiles)
 * - An array of file flags (fd_ofileflags)
 * - A bitmap recording which descriptors are in use (fd_map)
 *
 * A process starts out with NDFILE descriptors.  The value of NDFILE has
 * been selected based on the historical limit of 20 open files, and an
 * assumption that the majority of processes, especially short-lived
 * processes like shells, will never need more.
 *
 * If this initial allocation is exhausted, a larger descriptor table and
 * map are allocated dynamically, and the pointers in the process's struct
 * filedesc are updated to point to those.  This is repeated every time
 * the process runs out of file descriptors (provided it hasn't hit its
 * resource limit).
 *
 * Since threads may hold references to individual descriptor table
 * entries, the tables are never freed.  Instead, they are placed on a
 * linked list and freed only when the struct filedesc is released.
 */
#define	NDFILE		20
#define	NDSLOTSIZE	sizeof(NDSLOTTYPE)
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
#define	NDSLOT(x)	((x) / NDENTRIES)
#define	NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)

#define	FILEDESC_FOREACH_FDE(fdp, _iterator, _fde)			\
	struct filedesc *_fdp = (fdp);					\
	int _lastfile = fdlastfile_single(_fdp);			\
	for (_iterator = 0; _iterator <= _lastfile; _iterator++)	\
		if ((_fde = &_fdp->fd_ofiles[_iterator])->fde_file != NULL)

#define	FILEDESC_FOREACH_FP(fdp, _iterator, _fp)			\
	struct filedesc *_fdp = (fdp);					\
	int _lastfile = fdlastfile_single(_fdp);			\
	for (_iterator = 0; _iterator <= _lastfile; _iterator++)	\
		if ((_fp = _fdp->fd_ofiles[_iterator].fde_file) != NULL)

/*
 * SLIST entry used to keep track of ofiles which must be reclaimed when
 * the process exits.
 */
struct freetable {
	struct fdescenttbl *ft_table;
	SLIST_ENTRY(freetable) ft_next;
};

/*
 * Initial allocation: a filedesc structure + the head of SLIST used to
 * keep track of old ofiles + enough space for NDFILE descriptors.
 */

struct fdescenttbl0 {
	int	fdt_nfiles;
	struct	filedescent fdt_ofiles[NDFILE];
};

struct filedesc0 {
	struct filedesc fd_fd;
	SLIST_HEAD(, freetable) fd_free;
	struct	fdescenttbl0 fd_dfiles;
	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
};

/*
 * Descriptor management.
 */
static int __exclusive_cache_line openfiles; /* actual number of open files */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);

/*
 * If low >= size, just return low. Otherwise find the first zero bit in the
 * given bitmap, starting at low and not exceeding size - 1. Return size if
 * not found.
 */
static int
fd_first_free(struct filedesc *fdp, int low, int size)
{
	NDSLOTTYPE *map = fdp->fd_map;
	NDSLOTTYPE mask;
	int off, maxoff;

	if (low >= size)
		return (low);

	off = NDSLOT(low);
	if (low % NDENTRIES) {
		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
		if ((mask &= ~map[off]) != 0UL)
			return (off * NDENTRIES + ffsl(mask) - 1);
		++off;
	}
	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
		if (map[off] != ~0UL)
			return (off * NDENTRIES + ffsl(~map[off]) - 1);
	return (size);
}
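/*
 * Illustrative sketch (not compiled into the kernel): how the NDSLOT
 * macros and fd_first_free() cooperate.  Assume NDSLOTTYPE is a 64-bit
 * word, so NDENTRIES == 64.  For fd = 70:
 *
 *	NDSLOT(70) == 70 / 64 == 1		(second word of fd_map)
 *	NDBIT(70)  == 1UL << (70 % 64) == 1UL << 6
 *
 * With descriptors 0..63 and 64..69 in use, fd_map[0] is all ones and
 * fd_map[1] == 0x3f, so fd_first_free(fdp, 0, nfiles) skips word 0
 * (== ~0UL) and returns 1 * 64 + ffsl(~0x3f) - 1 == 70, the lowest
 * free descriptor, matching the POSIX lowest-fd allocation rule.
 */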
/*
 * Find the last used fd.
 *
 * Call this variant if fdp can't be modified by anyone else (e.g., during
 * exec).  Otherwise use fdlastfile.
 */
int
fdlastfile_single(struct filedesc *fdp)
{
	NDSLOTTYPE *map = fdp->fd_map;
	int off, minoff;

	off = NDSLOT(fdp->fd_nfiles - 1);
	for (minoff = NDSLOT(0); off >= minoff; --off)
		if (map[off] != 0)
			return (off * NDENTRIES + flsl(map[off]) - 1);
	return (-1);
}

int
fdlastfile(struct filedesc *fdp)
{

	FILEDESC_LOCK_ASSERT(fdp);
	return (fdlastfile_single(fdp));
}

static int
fdisused(struct filedesc *fdp, int fd)
{

	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));

	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
}

/*
 * Mark a file descriptor as used.
 */
static void
fdused_init(struct filedesc *fdp, int fd)
{

	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));

	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
}

static void
fdused(struct filedesc *fdp, int fd)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	fdused_init(fdp, fd);
	if (fd == fdp->fd_freefile)
		fdp->fd_freefile++;
}

/*
 * Mark a file descriptor as unused.
 */
static void
fdunused(struct filedesc *fdp, int fd)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
	    ("fd=%d is still in use", fd));

	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
}

/*
 * Free a file descriptor.
 *
 * Avoid some work if fdp is about to be destroyed.
 */
static inline void
fdefree_last(struct filedescent *fde)
{

	filecaps_free(&fde->fde_caps);
}

static inline void
fdfree(struct filedesc *fdp, int fd)
{
	struct filedescent *fde;

	FILEDESC_XLOCK_ASSERT(fdp);
	fde = &fdp->fd_ofiles[fd];
#ifdef CAPABILITIES
	seqc_write_begin(&fde->fde_seqc);
#endif
	fde->fde_file = NULL;
#ifdef CAPABILITIES
	seqc_write_end(&fde->fde_seqc);
#endif
	fdefree_last(fde);
	fdunused(fdp, fd);
}

/*
 * System calls on descriptors.
 */
#ifndef _SYS_SYSPROTO_H_
struct getdtablesize_args {
	int	dummy;
};
#endif
/* ARGSUSED */
int
sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
{
#ifdef	RACCT
	uint64_t lim;
#endif

	td->td_retval[0] = getmaxfd(td);
#ifdef	RACCT
	PROC_LOCK(td->td_proc);
	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
	PROC_UNLOCK(td->td_proc);
	if (lim < td->td_retval[0])
		td->td_retval[0] = lim;
#endif
	return (0);
}

/*
 * Duplicate a file descriptor to a particular value.
 *
 * Note: keep in mind that a potential race condition exists when closing
 * descriptors from a shared descriptor table (via rfork).
 */
#ifndef _SYS_SYSPROTO_H_
struct dup2_args {
	u_int	from;
	u_int	to;
};
#endif
/* ARGSUSED */
int
sys_dup2(struct thread *td, struct dup2_args *uap)
{

	return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
}

/*
 * Duplicate a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct dup_args {
	u_int	fd;
};
#endif
/* ARGSUSED */
int
sys_dup(struct thread *td, struct dup_args *uap)
{

	return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
}

/*
 * The file control system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
	int	fd;
	int	cmd;
	long	arg;
};
#endif
/* ARGSUSED */
int
sys_fcntl(struct thread *td, struct fcntl_args *uap)
{

	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
}

int
kern_fcntl_freebsd(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct flock fl;
	struct __oflock ofl;
	intptr_t arg1;
	int error, newcmd;

	error = 0;
	newcmd = cmd;
	switch (cmd) {
	case F_OGETLK:
	case F_OSETLK:
	case F_OSETLKW:
		/*
		 * Convert old flock structure to new.
		 */
		error = copyin((void *)arg, &ofl, sizeof(ofl));
		fl.l_start = ofl.l_start;
		fl.l_len = ofl.l_len;
		fl.l_pid = ofl.l_pid;
		fl.l_type = ofl.l_type;
		fl.l_whence = ofl.l_whence;
		fl.l_sysid = 0;

		switch (cmd) {
		case F_OGETLK:
			newcmd = F_GETLK;
			break;
		case F_OSETLK:
			newcmd = F_SETLK;
			break;
		case F_OSETLKW:
			newcmd = F_SETLKW;
			break;
		}
		arg1 = (intptr_t)&fl;
		break;
	case F_GETLK:
	case F_SETLK:
	case F_SETLKW:
	case F_SETLK_REMOTE:
		error = copyin((void *)arg, &fl, sizeof(fl));
		arg1 = (intptr_t)&fl;
		break;
	default:
		arg1 = arg;
		break;
	}
	if (error)
		return (error);
	error = kern_fcntl(td, fd, newcmd, arg1);
	if (error)
		return (error);
	if (cmd == F_OGETLK) {
		ofl.l_start = fl.l_start;
		ofl.l_len = fl.l_len;
		ofl.l_pid = fl.l_pid;
		ofl.l_type = fl.l_type;
		ofl.l_whence = fl.l_whence;
		error = copyout(&ofl, (void *)arg, sizeof(ofl));
	} else if (cmd == F_GETLK) {
		error = copyout(&fl, (void *)arg, sizeof(fl));
	}
	return (error);
}

struct flags_trans_elem {
	u_int f;
	u_int t;
};

static u_int
flags_trans(const struct flags_trans_elem *ftes, int nitems, u_int from_flags)
{
	u_int res;
	int i;

	res = 0;
	for (i = 0; i < nitems; i++) {
		if ((from_flags & ftes[i].f) != 0)
			res |= ftes[i].t;
	}
	return (res);
}

static uint8_t
fd_to_fde_flags(int fd_flags)
{
	static const struct flags_trans_elem fd_to_fde_flags_s[] = {
		{ .f = FD_CLOEXEC, .t = UF_EXCLOSE },
		{ .f = FD_CLOFORK, .t = UF_FOCLOSE },
		{ .f = FD_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH },
	};

	return (flags_trans(fd_to_fde_flags_s, nitems(fd_to_fde_flags_s),
	    fd_flags));
}

static int
fde_to_fd_flags(uint8_t fde_flags)
{
	static const struct flags_trans_elem fde_to_fd_flags_s[] = {
		{ .f = UF_EXCLOSE, .t = FD_CLOEXEC },
		{ .f = UF_FOCLOSE, .t = FD_CLOFORK },
		{ .f = UF_RESOLVE_BENEATH, .t = FD_RESOLVE_BENEATH },
	};

	return (flags_trans(fde_to_fd_flags_s, nitems(fde_to_fd_flags_s),
	    fde_flags));
}

static uint8_t
fddup_to_fde_flags(int fddup_flags)
{
	static const struct flags_trans_elem fddup_to_fde_flags_s[] = {
		{ .f = FDDUP_FLAG_CLOEXEC, .t = UF_EXCLOSE },
		{ .f = FDDUP_FLAG_CLOFORK, .t = UF_FOCLOSE },
	};

	return (flags_trans(fddup_to_fde_flags_s, nitems(fddup_to_fde_flags_s),
	    fddup_flags));
}
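/*
 * Illustrative sketch (assumptions, not kernel code): the flags_trans()
 * translation tables used throughout this file all follow the same
 * table-driven pattern, so a new flag pair only requires a new entry:
 *
 *	static const struct flags_trans_elem example_s[] = {
 *		{ .f = FD_CLOEXEC, .t = UF_EXCLOSE },
 *	};
 *	uint8_t fde = flags_trans(example_s, nitems(example_s), FD_CLOEXEC);
 *
 * Here fde == UF_EXCLOSE; bits absent from the table are simply dropped.
 */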
static uint8_t
close_range_to_fde_flags(int close_range_flags)
{
	static const struct flags_trans_elem close_range_to_fde_flags_s[] = {
		{ .f = CLOSE_RANGE_CLOEXEC, .t = UF_EXCLOSE },
		{ .f = CLOSE_RANGE_CLOFORK, .t = UF_FOCLOSE },
	};

	return (flags_trans(close_range_to_fde_flags_s,
	    nitems(close_range_to_fde_flags_s), close_range_flags));
}

static uint8_t
open_to_fde_flags(int open_flags, bool sticky_orb)
{
	static const struct flags_trans_elem open_to_fde_flags_s[] = {
		{ .f = O_CLOEXEC, .t = UF_EXCLOSE },
		{ .f = O_CLOFORK, .t = UF_FOCLOSE },
		{ .f = O_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH },
	};
#if defined(__clang__) && __clang_major__ >= 19
	_Static_assert(open_to_fde_flags_s[nitems(open_to_fde_flags_s) - 1].f ==
	    O_RESOLVE_BENEATH, "O_RESOLVE_BENEATH must be last, for sticky_orb");
#endif

	return (flags_trans(open_to_fde_flags_s, nitems(open_to_fde_flags_s) -
	    (sticky_orb ? 0 : 1), open_flags));
}

int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct filedesc *fdp;
	struct flock *flp;
	struct file *fp, *fp2;
	struct filedescent *fde;
	struct proc *p;
	struct vnode *vp;
	struct mount *mp;
	struct kinfo_file *kif;
	int error, flg, kif_sz, seals, tmp, got_set, got_cleared;
	uint64_t bsize;
	off_t foffset;
	int flags;

	error = 0;
	flg = F_POSIX;
	p = td->td_proc;
	fdp = p->p_fd;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(cmd);
	switch (cmd) {
	case F_DUPFD:
		tmp = arg;
		error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
		break;

	case F_DUPFD_CLOEXEC:
		tmp = arg;
		error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
		break;

	case F_DUPFD_CLOFORK:
		tmp = arg;
		error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOFORK, fd, tmp);
		break;

	case F_DUP2FD:
		tmp = arg;
		error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
		break;

	case F_DUP2FD_CLOEXEC:
		tmp = arg;
		error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
		break;

	case F_GETFD:
		error = EBADF;
		FILEDESC_SLOCK(fdp);
		fde = fdeget_noref(fdp, fd);
		if (fde != NULL) {
			td->td_retval[0] = fde_to_fd_flags(fde->fde_flags);
			error = 0;
		}
		FILEDESC_SUNLOCK(fdp);
		break;

	case F_SETFD:
		error = EBADF;
		FILEDESC_XLOCK(fdp);
		fde = fdeget_noref(fdp, fd);
		if (fde != NULL) {
			/*
			 * UF_RESOLVE_BENEATH is sticky and cannot be cleared.
			 */
			fde->fde_flags = (fde->fde_flags &
			    ~(UF_EXCLOSE | UF_FOCLOSE)) | fd_to_fde_flags(arg);
			error = 0;
		}
		FILEDESC_XUNLOCK(fdp);
		break;

	case F_GETFL:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
		if (error != 0)
			break;
		td->td_retval[0] = OFLAGS(fp->f_flag);
		fdrop(fp, td);
		break;

	case F_SETFL:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
		if (error != 0)
			break;
		if (fp->f_ops == &path_fileops) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		do {
			tmp = flg = fp->f_flag;
			tmp &= ~FCNTLFLAGS;
			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
		} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
		got_set = tmp & ~flg;
		got_cleared = flg & ~tmp;
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		if (error != 0)
			goto revert_f_setfl;
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		if (error == 0) {
			fdrop(fp, td);
			break;
		}
		atomic_clear_int(&fp->f_flag, FNONBLOCK);
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
revert_f_setfl:
		do {
			tmp = flg = fp->f_flag;
			tmp &= ~FCNTLFLAGS;
			tmp |= got_cleared;
			tmp &= ~got_set;
		} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
		fdrop(fp, td);
		break;

	case F_GETOWN:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
		if (error != 0)
			break;
		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
		fdrop(fp, td);
		break;

	case F_SETOWN:
		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
		if (error != 0)
			break;
		tmp = arg;
		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_SETLK_REMOTE:
		error = priv_check(td, PRIV_NFS_LOCKD);
		if (error != 0)
			return (error);
		flg = F_REMOTE;
		goto do_setlk;

	case F_SETLKW:
		flg |= F_WAIT;
		/* FALLTHROUGH F_SETLK */

	case F_SETLK:
	do_setlk:
		flp = (struct flock *)arg;
		if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
			error = EINVAL;
			break;
		}

		error = fget_unlocked(td, fd, &cap_flock_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			error = EBADF;
			fdrop(fp, td);
			break;
		}

		if (flp->l_whence == SEEK_CUR) {
			foffset = foffset_get(fp);
			if (foffset < 0 ||
			    (flp->l_start > 0 &&
			     foffset > OFF_MAX - flp->l_start)) {
				error = EOVERFLOW;
				fdrop(fp, td);
				break;
			}
			flp->l_start += foffset;
		}

		vp = fp->f_vnode;
		switch (flp->l_type) {
		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
				PROC_LOCK(p->p_leader);
				p->p_leader->p_flag |= P_ADVLOCK;
				PROC_UNLOCK(p->p_leader);
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
				PROC_LOCK(p->p_leader);
				p->p_leader->p_flag |= P_ADVLOCK;
				PROC_UNLOCK(p->p_leader);
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
			    flp, flg);
			break;
		case F_UNLCKSYS:
			if (flg != F_REMOTE) {
				error = EINVAL;
				break;
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
			    F_UNLCKSYS, flp, flg);
			break;
		default:
			error = EINVAL;
			break;
		}
		if (error != 0 || flp->l_type == F_UNLCK ||
		    flp->l_type == F_UNLCKSYS) {
			fdrop(fp, td);
			break;
		}

		/*
		 * Check for a race with close.
		 *
		 * The vnode is now advisory locked (or unlocked, but this case
		 * is not really important) as the caller requested.
		 * We had to drop the filedesc lock, so we need to recheck if
		 * the descriptor is still valid, because if it was closed
		 * in the meantime we need to remove advisory lock from the
		 * vnode - close on any descriptor leading to an advisory
		 * locked vnode, removes that lock.
		 * We will return 0 on purpose in that case, as the result of
		 * successful advisory lock might have been externally visible
		 * already.  This is fine - effectively we pretend to the caller
		 * that the closing thread was a bit slower and that the
		 * advisory lock succeeded before the close.
		 */
		error = fget_unlocked(td, fd, &cap_no_rights, &fp2);
		if (error != 0) {
			fdrop(fp, td);
			break;
		}
		if (fp != fp2) {
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
			    F_UNLCK, flp, F_POSIX);
		}
		fdrop(fp, td);
		fdrop(fp2, td);
		break;

	case F_GETLK:
		error = fget_unlocked(td, fd, &cap_flock_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			error = EBADF;
			fdrop(fp, td);
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
		    flp->l_type != F_UNLCK) {
			error = EINVAL;
			fdrop(fp, td);
			break;
		}
		if (flp->l_whence == SEEK_CUR) {
			foffset = foffset_get(fp);
			if ((flp->l_start > 0 &&
			     foffset > OFF_MAX - flp->l_start) ||
			    (flp->l_start < 0 &&
			     foffset < OFF_MIN - flp->l_start)) {
				error = EOVERFLOW;
				fdrop(fp, td);
				break;
			}
			flp->l_start += foffset;
		}
		vp = fp->f_vnode;
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
		    F_POSIX);
		fdrop(fp, td);
		break;

	case F_ADD_SEALS:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		error = fo_add_seals(fp, arg);
		fdrop(fp, td);
		break;

	case F_GET_SEALS:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fo_get_seals(fp, &seals) == 0)
			td->td_retval[0] = seals;
		else
			error = EINVAL;
		fdrop(fp, td);
		break;

	case F_RDAHEAD:
		arg = arg ? 128 * 1024 : 0;
		/* FALLTHROUGH */
	case F_READAHEAD:
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		vp = fp->f_vnode;
		if (vp->v_type != VREG) {
			fdrop(fp, td);
			error = ENOTTY;
			break;
		}

		/*
		 * Exclusive lock synchronizes against f_seqcount reads and
		 * writes in sequential_heuristic().
		 */
		error = vn_lock(vp, LK_EXCLUSIVE);
		if (error != 0) {
			fdrop(fp, td);
			break;
		}
		if (arg >= 0) {
			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
			arg = MIN(arg, INT_MAX - bsize + 1);
			fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX,
			    (arg + bsize - 1) / bsize);
			atomic_set_int(&fp->f_flag, FRDAHEAD);
		} else {
			atomic_clear_int(&fp->f_flag, FRDAHEAD);
		}
		VOP_UNLOCK(vp);
		fdrop(fp, td);
		break;

	case F_ISUNIONSTACK:
		/*
		 * Check if the vnode is part of a union stack (either the
		 * "union" flag from mount(2) or unionfs).
		 *
		 * Prior to introduction of this op libc's readdir would call
		 * fstatfs(2), in effect unnecessarily copying kilobytes of
		 * data just to check fs name and a mount flag.
		 *
		 * Fixing the code to handle everything in the kernel instead
		 * is a non-trivial endeavor and has low priority, thus this
		 * horrible kludge facilitates the current behavior in a much
		 * cheaper manner until someone(tm) sorts this out.
		 */
		error = fget_unlocked(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			break;
		if (fp->f_type != DTYPE_VNODE) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		vp = fp->f_vnode;
		/*
		 * Since we don't prevent dooming the vnode even non-null mp
		 * found can become immediately stale.  This is tolerable since
		 * mount points are type-stable (providing safe memory access)
		 * and any vfs op on this vnode going forward will return an
		 * error (meaning return value in this case is meaningless).
		 */
		mp = atomic_load_ptr(&vp->v_mount);
		if (__predict_false(mp == NULL)) {
			fdrop(fp, td);
			error = EBADF;
			break;
		}
		td->td_retval[0] = 0;
		if (mp->mnt_kern_flag & MNTK_UNIONFS ||
		    mp->mnt_flag & MNT_UNION)
			td->td_retval[0] = 1;
		fdrop(fp, td);
		break;

	case F_KINFO:
#ifdef CAPABILITY_MODE
		if (CAP_TRACING(td))
			ktrcapfail(CAPFAIL_SYSCALL, &cmd);
		if (IN_CAPABILITY_MODE(td)) {
			error = ECAPMODE;
			break;
		}
#endif
		error = copyin((void *)arg, &kif_sz, sizeof(kif_sz));
		if (error != 0)
			break;
		if (kif_sz != sizeof(*kif)) {
			error = EINVAL;
			break;
		}
		kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK | M_ZERO);
		FILEDESC_SLOCK(fdp);
		error = fget_cap_noref(fdp, fd, &cap_fcntl_rights, &fp, NULL);
		if (error == 0 && fhold(fp)) {
			export_file_to_kinfo(fp, fd, NULL, kif, fdp, 0);
			FILEDESC_SUNLOCK(fdp);
			fdrop(fp, td);
			if ((kif->kf_status & KF_ATTR_VALID) != 0) {
				kif->kf_structsize = sizeof(*kif);
				error = copyout(kif, (void *)arg, sizeof(*kif));
			} else {
				error = EBADF;
			}
		} else {
			FILEDESC_SUNLOCK(fdp);
			if (error == 0)
				error = EBADF;
		}
		free(kif, M_TEMP);
		break;

	default:
		if ((cmd & ((1u << F_DUP3FD_SHIFT) - 1)) != F_DUP3FD)
			return (EXTERROR(EINVAL, "invalid fcntl cmd"));
		/* Handle F_DUP3FD */
		flags = (cmd >> F_DUP3FD_SHIFT);
		if ((flags & ~(FD_CLOEXEC | FD_CLOFORK)) != 0)
			return (EXTERROR(EINVAL, "invalid flags for F_DUP3FD"));
		tmp = arg;
		error = kern_dup(td, FDDUP_FIXED,
		    ((flags & FD_CLOEXEC) != 0 ? FDDUP_FLAG_CLOEXEC : 0) |
		    ((flags & FD_CLOFORK) != 0 ? FDDUP_FLAG_CLOFORK : 0),
		    fd, tmp);
		break;
	}
	return (error);
}
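/*
 * Illustrative sketch (not kernel code): the F_DUP3FD handling in the
 * default case above decodes a command word in which the flag bits are
 * packed above F_DUP3FD_SHIFT.  A caller would encode it along the
 * lines of:
 *
 *	int cmd = F_DUP3FD | (FD_CLOEXEC << F_DUP3FD_SHIFT);
 *	fcntl(oldfd, cmd, newfd);	// dup3(oldfd, newfd, O_CLOEXEC)-like
 *
 * so a single int carries both the operation and its flags.
 */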
static int
getmaxfd(struct thread *td)
{

	return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
}

/*
 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
 */
int
kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
{
	struct filedesc *fdp;
	struct filedescent *oldfde, *newfde;
	struct proc *p;
	struct file *delfp, *oldfp;
	u_long *oioctls, *nioctls;
	int error, maxfd;

	p = td->td_proc;
	fdp = p->p_fd;
	oioctls = NULL;

	MPASS((flags & ~(FDDUP_FLAG_CLOEXEC | FDDUP_FLAG_CLOFORK)) == 0);
	MPASS(mode < FDDUP_LASTMODE);

	AUDIT_ARG_FD(old);
	/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to.  Unlike dup() and dup2(), fcntl()'s F_DUPFD should
	 * return EINVAL when the new descriptor is out of bounds.
	 */
	if (old < 0)
		return (EBADF);
	if (new < 0)
		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
	maxfd = getmaxfd(td);
	if (new >= maxfd)
		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);

	error = EBADF;
	FILEDESC_XLOCK(fdp);
	if (fget_noref(fdp, old) == NULL)
		goto unlock;
	if (mode == FDDUP_FIXED && old == new) {
		td->td_retval[0] = new;
		fdp->fd_ofiles[new].fde_flags |= fddup_to_fde_flags(flags);
		error = 0;
		goto unlock;
	}

	oldfde = &fdp->fd_ofiles[old];
	oldfp = oldfde->fde_file;
	if (!fhold(oldfp))
		goto unlock;

	/*
	 * If the caller specified a file descriptor, make sure the file
	 * table is large enough to hold it, and grab it.  Otherwise, just
	 * allocate a new descriptor the usual way.
	 */
	switch (mode) {
	case FDDUP_NORMAL:
	case FDDUP_FCNTL:
		if ((error = fdalloc(td, new, &new)) != 0) {
			fdrop(oldfp, td);
			goto unlock;
		}
		break;
	case FDDUP_FIXED:
		if (new >= fdp->fd_nfiles) {
			/*
			 * The resource limits are here instead of e.g.
			 * fdalloc(), because the file descriptor table may be
			 * shared between processes, so we can't really use
			 * racct_add()/racct_sub().  Instead of counting the
			 * number of actually allocated descriptors, just put
			 * the limit on the size of the file descriptor table.
			 */
#ifdef RACCT
			if (RACCT_ENABLED()) {
				error = racct_set_unlocked(p, RACCT_NOFILE,
				    new + 1);
				if (error != 0) {
					error = EMFILE;
					fdrop(oldfp, td);
					goto unlock;
				}
			}
#endif
			fdgrowtable_exp(fdp, new + 1);
		}
		if (!fdisused(fdp, new))
			fdused(fdp, new);
		break;
	default:
		KASSERT(0, ("%s unsupported mode %d", __func__, mode));
	}

	KASSERT(old != new, ("new fd is same as old"));

	/* Refetch oldfde because the table may have grown and old one freed. */
	oldfde = &fdp->fd_ofiles[old];
	KASSERT(oldfp == oldfde->fde_file,
	    ("fdt_ofiles shift from growth observed at fd %d",
	    old));

	newfde = &fdp->fd_ofiles[new];
	delfp = newfde->fde_file;

	nioctls = filecaps_copy_prep(&oldfde->fde_caps);

	/*
	 * Duplicate the source descriptor.
	 */
#ifdef CAPABILITIES
	seqc_write_begin(&newfde->fde_seqc);
#endif
	oioctls = filecaps_free_prep(&newfde->fde_caps);
	fde_copy(oldfde, newfde);
	filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
	    nioctls);
	newfde->fde_flags = (oldfde->fde_flags & ~(UF_EXCLOSE | UF_FOCLOSE)) |
	    fddup_to_fde_flags(flags);
#ifdef CAPABILITIES
	seqc_write_end(&newfde->fde_seqc);
#endif
	td->td_retval[0] = new;

	error = 0;

	if (delfp != NULL) {
		(void) closefp(fdp, new, delfp, td, true, false);
		FILEDESC_UNLOCK_ASSERT(fdp);
	} else {
unlock:
		FILEDESC_XUNLOCK(fdp);
	}

	filecaps_free_finish(oioctls);
	return (error);
}
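/*
 * Illustrative sketch (userland, not kernel code): the mode-dependent
 * error returns above are visible to applications roughly as:
 *
 *	dup2(fd, 1000000);		// EBADF: fixed target out of range
 *	fcntl(fd, F_DUPFD, 1000000);	// EINVAL: minimum fd out of range
 *
 * assuming 1000000 exceeds the RLIMIT_NOFILE-derived maxfd.
 */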
static void
sigiofree(struct sigio *sigio)
{
	crfree(sigio->sio_ucred);
	free(sigio, M_SIGIO);
}

static struct sigio *
funsetown_locked(struct sigio *sigio)
{
	struct proc *p;
	struct pgrp *pg;

	SIGIO_ASSERT_LOCKED();

	if (sigio == NULL)
		return (NULL);
	*sigio->sio_myref = NULL;
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		PGRP_LOCK(pg);
		SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio);
		PGRP_UNLOCK(pg);
	} else {
		p = sigio->sio_proc;
		PROC_LOCK(p);
		SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio);
		PROC_UNLOCK(p);
	}
	return (sigio);
}

/*
 * If sigio is on the list associated with a process or process group,
 * disable signalling from the device, remove sigio from the list and
 * free sigio.
 */
void
funsetown(struct sigio **sigiop)
{
	struct sigio *sigio;

	/* Racy check, consumers must provide synchronization. */
	if (*sigiop == NULL)
		return;

	SIGIO_LOCK();
	sigio = funsetown_locked(*sigiop);
	SIGIO_UNLOCK();
	if (sigio != NULL)
		sigiofree(sigio);
}

/*
 * Free a list of sigio structures.  The caller must ensure that new sigio
 * structures cannot be added after this point.  For process groups this is
 * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves
 * as an interlock.
 */
void
funsetownlst(struct sigiolst *sigiolst)
{
	struct proc *p;
	struct pgrp *pg;
	struct sigio *sigio, *tmp;

	/* Racy check. */
	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL)
		return;

	p = NULL;
	pg = NULL;

	SIGIO_LOCK();
	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL) {
		SIGIO_UNLOCK();
		return;
	}

	/*
	 * Every entry of the list should belong to a single proc or pgrp.
	 */
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		sx_assert(&proctree_lock, SX_XLOCKED);
		PGRP_LOCK(pg);
	} else /* if (sigio->sio_pgid > 0) */ {
		p = sigio->sio_proc;
		PROC_LOCK(p);
		KASSERT((p->p_flag & P_WEXIT) != 0,
		    ("%s: process %p is not exiting", __func__, p));
	}

	SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) {
		*sigio->sio_myref = NULL;
		if (pg != NULL) {
			KASSERT(sigio->sio_pgid < 0,
			    ("Proc sigio in pgrp sigio list"));
			KASSERT(sigio->sio_pgrp == pg,
			    ("Bogus pgrp in sigio list"));
		} else /* if (p != NULL) */ {
			KASSERT(sigio->sio_pgid > 0,
			    ("Pgrp sigio in proc sigio list"));
			KASSERT(sigio->sio_proc == p,
			    ("Bogus proc in sigio list"));
		}
	}

	if (pg != NULL)
		PGRP_UNLOCK(pg);
	else
		PROC_UNLOCK(p);
	SIGIO_UNLOCK();

	SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp)
		sigiofree(sigio);
}

/*
 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 *
 * After permission checking, add a sigio structure to the sigio list for
 * the process or process group.
 */
int
fsetown(pid_t pgid, struct sigio **sigiop)
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *osigio, *sigio;
	int ret;

	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	ret = 0;
	if (pgid > 0) {
		ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc);
		SIGIO_LOCK();
		osigio = funsetown_locked(*sigiop);
		if (ret == 0) {
			PROC_LOCK(proc);
			_PRELE(proc);
			if ((proc->p_flag & P_WEXIT) != 0) {
				ret = ESRCH;
			} else if (proc->p_session !=
			    curthread->td_proc->p_session) {
				/*
				 * Policy - Don't allow a process to FSETOWN a
				 * process in another session.
				 *
				 * Remove this test to allow maximum flexibility
				 * or restrict FSETOWN to the current process or
				 * process group for maximum safety.
				 */
				ret = EPERM;
			} else {
				sigio->sio_proc = proc;
				SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio,
				    sio_pgsigio);
			}
			PROC_UNLOCK(proc);
		}
	} else /* if (pgid < 0) */ {
		sx_slock(&proctree_lock);
		SIGIO_LOCK();
		osigio = funsetown_locked(*sigiop);
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
		} else {
			if (pgrp->pg_session != curthread->td_proc->p_session) {
				/*
				 * Policy - Don't allow a process to FSETOWN a
				 * process in another session.
				 *
				 * Remove this test to allow maximum flexibility
				 * or restrict FSETOWN to the current process or
				 * process group for maximum safety.
				 */
				ret = EPERM;
			} else {
				sigio->sio_pgrp = pgrp;
				SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio,
				    sio_pgsigio);
			}
			PGRP_UNLOCK(pgrp);
		}
		sx_sunlock(&proctree_lock);
	}
	if (ret == 0)
		*sigiop = sigio;
	SIGIO_UNLOCK();
	if (osigio != NULL)
		sigiofree(osigio);
	return (ret);
}
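/*
 * Illustrative sketch (userland, not kernel code): fsetown() keys off
 * the sign of the pgid argument, mirroring the classic F_SETOWN
 * convention:
 *
 *	fcntl(fd, F_SETOWN, getpid());		// SIGIO to this process
 *	fcntl(fd, F_SETOWN, -getpgrp());	// SIGIO to the whole group
 *	fcntl(fd, F_SETOWN, 0);			// clear ownership (funsetown)
 */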
/*
 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
 */
pid_t
fgetown(struct sigio **sigiop)
{
	pid_t pgid;

	SIGIO_LOCK();
	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
	SIGIO_UNLOCK();
	return (pgid);
}

static int
closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool audit)
{
	int error;

	FILEDESC_XLOCK_ASSERT(fdp);

	/*
	 * We now hold the fp reference that used to be owned by the
	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
	 * knote_fdclose to prevent a race of the fd getting opened, a knote
	 * added, and deleting a knote for the new fd.
	 */
	if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist)))
		knote_fdclose(td, fd);

	/*
	 * We need to notify mqueue if the object is of type mqueue.
	 */
	if (__predict_false(fp->f_type == DTYPE_MQUEUE))
		mq_fdclose(td, fd, fp);
	FILEDESC_XUNLOCK(fdp);

#ifdef AUDIT
	if (AUDITING_TD(td) && audit)
		audit_sysclose(td, fd, fp);
#endif
	error = closef(fp, td);

	/*
	 * All paths leading up to closefp() will have already removed or
	 * replaced the fd in the filedesc table, so a restart would not
	 * operate on the same file.
	 */
	if (error == ERESTART)
		error = EINTR;

	return (error);
}

static int
closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool holdleaders, bool audit)
{
	int error;

	FILEDESC_XLOCK_ASSERT(fdp);

	if (holdleaders) {
		if (td->td_proc->p_fdtol != NULL) {
			/*
			 * Ask fdfree() to sleep to ensure that all relevant
			 * process leaders can be traversed in closef().
			 */
			fdp->fd_holdleaderscount++;
		} else {
			holdleaders = false;
		}
	}

	error = closefp_impl(fdp, fd, fp, td, audit);
	if (holdleaders) {
		FILEDESC_XLOCK(fdp);
		fdp->fd_holdleaderscount--;
		if (fdp->fd_holdleaderscount == 0 &&
		    fdp->fd_holdleaderswakeup != 0) {
			fdp->fd_holdleaderswakeup = 0;
			wakeup(&fdp->fd_holdleaderscount);
		}
		FILEDESC_XUNLOCK(fdp);
	}
	return (error);
}

static int
closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    bool holdleaders, bool audit)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	if (__predict_false(td->td_proc->p_fdtol != NULL)) {
		return (closefp_hl(fdp, fd, fp, td, holdleaders, audit));
	} else {
		return (closefp_impl(fdp, fd, fp, td, audit));
	}
}

/*
 * Close a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int	fd;
};
#endif
/* ARGSUSED */
int
sys_close(struct thread *td, struct close_args *uap)
{

	return (kern_close(td, uap->fd));
}

int
kern_close(struct thread *td, int fd)
{
	struct filedesc *fdp;
	struct file *fp;

	fdp = td->td_proc->p_fd;

	FILEDESC_XLOCK(fdp);
	if ((fp = fget_noref(fdp, fd)) == NULL) {
		FILEDESC_XUNLOCK(fdp);
		return (EBADF);
	}
	fdfree(fdp, fd);

	/* closefp() drops the FILEDESC lock for us. */
	return (closefp(fdp, fd, fp, td, true, true));
}

static int
close_range_flags(struct thread *td, u_int lowfd, u_int highfd, int flags)
{
	struct filedesc *fdp;
	struct fdescenttbl *fdt;
	struct filedescent *fde;
	int fd, fde_flags;

	fde_flags = close_range_to_fde_flags(flags);
	fdp = td->td_proc->p_fd;
	FILEDESC_XLOCK(fdp);
	fdt = atomic_load_ptr(&fdp->fd_files);
	highfd = MIN(highfd, fdt->fdt_nfiles - 1);
	fd = lowfd;
	if (__predict_false(fd > highfd)) {
		goto out_locked;
	}
	for (; fd <= highfd; fd++) {
		fde = &fdt->fdt_ofiles[fd];
		if (fde->fde_file != NULL)
			fde->fde_flags |= fde_flags;
	}
out_locked:
	FILEDESC_XUNLOCK(fdp);
	return (0);
}

static int
close_range_impl(struct thread *td, u_int lowfd, u_int highfd)
{
	struct filedesc *fdp;
	const struct fdescenttbl *fdt;
	struct file *fp;
	int fd;

	fdp = td->td_proc->p_fd;
	FILEDESC_XLOCK(fdp);
	fdt = atomic_load_ptr(&fdp->fd_files);
	highfd = MIN(highfd, fdt->fdt_nfiles - 1);
	fd = lowfd;
	if (__predict_false(fd > highfd)) {
		goto out_locked;
	}
	for (;;) {
		fp = fdt->fdt_ofiles[fd].fde_file;
		if (fp == NULL) {
			if (fd == highfd)
				goto out_locked;
		} else {
			fdfree(fdp, fd);
			(void) closefp(fdp, fd, fp, td, true, true);
			if (fd == highfd)
				goto out_unlocked;
			FILEDESC_XLOCK(fdp);
			fdt = atomic_load_ptr(&fdp->fd_files);
		}
		fd++;
	}
out_locked:
	FILEDESC_XUNLOCK(fdp);
out_unlocked:
	return (0);
}

int
kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd)
{

	/*
	 * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2
	 * open should not be a usage error.  From a close_range() perspective,
	 * close_range(3, ~0U, 0) in the same scenario should also likely not
	 * be a usage error as all fd above 3 are in-fact already closed.
	 */
	if (highfd < lowfd) {
		return (EINVAL);
	}

	if ((flags & (CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0)
		return (close_range_flags(td, lowfd, highfd, flags));

	return (close_range_impl(td, lowfd, highfd));
}

#ifndef _SYS_SYSPROTO_H_
struct close_range_args {
	u_int	lowfd;
	u_int	highfd;
	int	flags;
};
#endif
int
sys_close_range(struct thread *td, struct close_range_args *uap)
{

	AUDIT_ARG_FD(uap->lowfd);
	AUDIT_ARG_CMD(uap->highfd);
	AUDIT_ARG_FFLAGS(uap->flags);

	if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0)
		return (EINVAL);
	return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd));
}

#ifdef COMPAT_FREEBSD12
/*
 * Close open file descriptors.
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd12_closefrom_args {
	int	lowfd;
};
#endif
/* ARGSUSED */
int
freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap)
{
	u_int lowfd;

	AUDIT_ARG_FD(uap->lowfd);

	/*
	 * Treat negative starting file descriptor values identical to
	 * closefrom(0) which closes all files.
	 */
	lowfd = MAX(0, uap->lowfd);
	return (kern_close_range(td, 0, lowfd, ~0U));
}
#endif	/* COMPAT_FREEBSD12 */
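/*
 * Illustrative sketch (userland, not kernel code): typical uses of the
 * close_range() paths above.
 *
 *	close_range(3, ~0U, 0);				// close all fd >= 3
 *	close_range(3, ~0U, CLOSE_RANGE_CLOEXEC);	// only mark them
 *
 * The second form takes the close_range_flags() path and never drops a
 * descriptor; it just sets UF_EXCLOSE on each populated entry.
 */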
#if defined(COMPAT_43)
/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct ostat oub;
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error == 0) {
		cvtstat(&ub, &oub);
		error = copyout(&oub, uap->sb, sizeof(oub));
	}
	return (error);
}
#endif /* COMPAT_43 */

#if defined(COMPAT_FREEBSD11)
int
freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap)
{
	struct stat sb;
	struct freebsd11_stat osb;
	int error;

	error = kern_fstat(td, uap->fd, &sb);
	if (error != 0)
		return (error);
	error = freebsd11_cvtstat(&sb, &osb);
	if (error == 0)
		error = copyout(&osb, uap->sb, sizeof(osb));
	return (error);
}
#endif	/* COMPAT_FREEBSD11 */

/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct fstat_args {
	int	fd;
	struct stat *sb;
};
#endif
/* ARGSUSED */
int
sys_fstat(struct thread *td, struct fstat_args *uap)
{
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error == 0)
		error = copyout(&ub, uap->sb, sizeof(ub));
	return (error);
}

int
kern_fstat(struct thread *td, int fd, struct stat *sbp)
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);

	error = fget(td, fd, &cap_fstat_rights, &fp);
	if (__predict_false(error != 0))
		return (error);

	AUDIT_ARG_FILE(td->td_proc, fp);

	sbp->st_filerev = 0;
	sbp->st_bsdflags = 0;
	error = fo_stat(fp, sbp, td->td_ucred);
	fdrop(fp, td);
#ifdef __STAT_TIME_T_EXT
	sbp->st_atim_ext = 0;
	sbp->st_mtim_ext = 0;
	sbp->st_ctim_ext = 0;
	sbp->st_btim_ext = 0;
#endif
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrstat_error(sbp, error);
#endif
	return (error);
}

#if defined(COMPAT_FREEBSD11)
/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd11_nfstat_args {
	int	fd;
	struct nstat *sb;
};
#endif
/* ARGSUSED */
int
freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap)
{
	struct nstat nub;
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error != 0)
		return (error);
	error = freebsd11_cvtnstat(&ub, &nub);
	if (error == 0)
		error = copyout(&nub, uap->sb, sizeof(nub));
	return (error);
}
#endif /* COMPAT_FREEBSD11 */

/*
 * Return pathconf information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct fpathconf_args {
	int	fd;
	int	name;
};
#endif
/* ARGSUSED */
int
sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
{
	long value;
	int error;

	error = kern_fpathconf(td, uap->fd, uap->name, &value);
	if (error == 0)
		td->td_retval[0] = value;
	return (error);
}

int
kern_fpathconf(struct thread *td, int fd, int name, long *valuep)
{
	struct file *fp;
	struct vnode *vp;
	int error;

	error = fget(td, fd, &cap_fpathconf_rights, &fp);
	if (error != 0)
		return (error);

	if (name == _PC_ASYNC_IO) {
		*valuep = _POSIX_ASYNCHRONOUS_IO;
		goto out;
	}
	vp = fp->f_vnode;
	if (vp != NULL) {
		vn_lock(vp, LK_SHARED | LK_RETRY);
		error = VOP_PATHCONF(vp, name, valuep);
		VOP_UNLOCK(vp);
	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
		if (name != _PC_PIPE_BUF) {
			error = EINVAL;
		} else {
			*valuep = PIPE_BUF;
			error = 0;
		}
	} else {
		error = EOPNOTSUPP;
	}
out:
	fdrop(fp, td);
	return (error);
}

/*
 * Copy filecaps structure allocating memory for ioctls array if needed.
 *
 * The last parameter indicates whether the fdtable is locked. If it is not and
 * ioctls are encountered, copying fails and the caller must lock the table.
 *
 * Note that if the table was not locked, the caller has to check the relevant
 * sequence counter to determine whether the operation was successful.
 */
bool
filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
{
	size_t size;

	if (src->fc_ioctls != NULL && !locked)
		return (false);
	memcpy(dst, src, sizeof(*src));
	if (src->fc_ioctls == NULL)
		return (true);

	KASSERT(src->fc_nioctls > 0,
	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
	memcpy(dst->fc_ioctls, src->fc_ioctls, size);
	return (true);
}

static u_long *
filecaps_copy_prep(const struct filecaps *src)
{
	u_long *ioctls;
	size_t size;

	if (__predict_true(src->fc_ioctls == NULL))
		return (NULL);

	KASSERT(src->fc_nioctls > 0,
	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	ioctls = malloc(size, M_FILECAPS, M_WAITOK);
	return (ioctls);
}

static void
filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst,
    u_long *ioctls)
{
	size_t size;

	*dst = *src;
	if (__predict_true(src->fc_ioctls == NULL)) {
		MPASS(ioctls == NULL);
		return;
	}

	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
	dst->fc_ioctls = ioctls;
	bcopy(src->fc_ioctls, dst->fc_ioctls, size);
}

/*
 * Move filecaps structure to the new place and clear the old place.
 */
void
filecaps_move(struct filecaps *src, struct filecaps *dst)
{

	*dst = *src;
	bzero(src, sizeof(*src));
}
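/*
 * Illustrative sketch (not kernel code): the prep/finish split above
 * keeps the sleepable M_WAITOK allocation out of the seqc write
 * section, during which lockless fget_unlocked() readers would spin:
 *
 *	ioctls = filecaps_copy_prep(&oldfde->fde_caps);	// may sleep
 *	seqc_write_begin(&newfde->fde_seqc);
 *	filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, ioctls);
 *	seqc_write_end(&newfde->fde_seqc);
 *
 * kern_dup() above follows this shape when duplicating a descriptor.
 */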
/*
 * Fill the given filecaps structure with full rights.
 */
static void
filecaps_fill(struct filecaps *fcaps)
{

	CAP_ALL(&fcaps->fc_rights);
	fcaps->fc_ioctls = NULL;
	fcaps->fc_nioctls = -1;
	fcaps->fc_fcntls = CAP_FCNTL_ALL;
}

/*
 * Free memory allocated within filecaps structure.
 */
static void
filecaps_free_ioctl(struct filecaps *fcaps)
{

	free(fcaps->fc_ioctls, M_FILECAPS);
	fcaps->fc_ioctls = NULL;
}

void
filecaps_free(struct filecaps *fcaps)
{

	filecaps_free_ioctl(fcaps);
	bzero(fcaps, sizeof(*fcaps));
}

static u_long *
filecaps_free_prep(struct filecaps *fcaps)
{
	u_long *ioctls;

	ioctls = fcaps->fc_ioctls;
	bzero(fcaps, sizeof(*fcaps));
	return (ioctls);
}

static void
filecaps_free_finish(u_long *ioctls)
{

	free(ioctls, M_FILECAPS);
}

/*
 * Validate the given filecaps structure.
 */
static void
filecaps_validate(const struct filecaps *fcaps, const char *func)
{

	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
	    ("%s: invalid rights", func));
	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
	    ("%s: invalid fcntls", func));
	KASSERT(fcaps->fc_fcntls == 0 ||
	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
	    ("%s: fcntls without CAP_FCNTL", func));
	/*
	 * open calls without WANTIOCTLCAPS free caps but leave the counter
	 */
#if 0
	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
	    ("%s: invalid ioctls", func));
#endif
	KASSERT(fcaps->fc_nioctls == 0 ||
	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
	    ("%s: ioctls without CAP_IOCTL", func));
}

static void
fdgrowtable_exp(struct filedesc *fdp, int nfd)
{
	int nfd1;

	FILEDESC_XLOCK_ASSERT(fdp);

	nfd1 = fdp->fd_nfiles * 2;
	if (nfd1 < nfd)
		nfd1 = nfd;
	fdgrowtable(fdp, nfd1);
}

/*
 * Grow the file table to accommodate (at least) nfd descriptors.
 */
static void
fdgrowtable(struct filedesc *fdp, int nfd)
{
	struct filedesc0 *fdp0;
	struct freetable *ft;
	struct fdescenttbl *ntable;
	struct fdescenttbl *otable;
	int nnfiles, onfiles;
	NDSLOTTYPE *nmap, *omap;

	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));

	/* save old values */
	onfiles = fdp->fd_nfiles;
	otable = fdp->fd_files;
	omap = fdp->fd_map;

	/* compute the size of the new table */
	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
	if (nnfiles <= onfiles)
		/* the table is already large enough */
		return;

	/*
	 * Allocate a new table.  We need enough space for the number of
	 * entries, file entries themselves and the struct freetable we will
	 * use when we decommission the table and place it on the freelist.
	 * We place the struct freetable in the middle so we don't have
	 * to worry about padding.
	 */
	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
	    sizeof(struct freetable),
	    M_FILEDESC, M_ZERO | M_WAITOK);
	/* copy the old data */
	ntable->fdt_nfiles = nnfiles;
	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
	    onfiles * sizeof(ntable->fdt_ofiles[0]));

	/*
	 * Allocate a new map only if the old is not large enough.
	 * It will grow at a slower rate than the table as it can map more
	 * entries than the table can hold.
	 */
	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
		    M_ZERO | M_WAITOK);
		/* copy over the old data and update the pointer */
		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
		fdp->fd_map = nmap;
	}

	/*
	 * Make sure that ntable is correctly initialized before we replace
	 * the fd_files pointer.  Otherwise fget_unlocked() may see
	 * inconsistent data.
	 */
	atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);

	/*
	 * Free the old file table when not shared by other threads or
	 * processes.  The old file table is considered to be shared when
	 * either is true:
	 * - The process has more than one thread.
	 * - The file descriptor table has been shared via fdshare().
	 *
	 * When shared, the old file table will be placed on a freelist
	 * which will be processed when the struct filedesc is released.
	 *
	 * Note that if onfiles == NDFILE, we're dealing with the original
	 * static allocation contained within (struct filedesc0 *)fdp,
	 * which must not be freed.
	 */
	if (onfiles > NDFILE) {
		/*
		 * Note we may be called here from fdinit while allocating a
		 * table for a new process in which case ->p_fd points
		 * elsewhere.
		 */
		if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) {
			free(otable, M_FILEDESC);
		} else {
			ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
			fdp0 = (struct filedesc0 *)fdp;
			ft->ft_table = otable;
			SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
		}
	}
	/*
	 * The map does not have the same possibility of threads still
	 * holding references to it.  So always free it as long as it
	 * does not reference the original static allocation.
	 */
	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
		free(omap, M_FILEDESC);
}
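/*
 * Illustrative sketch (not kernel code): the release store above pairs
 * with an acquire load on the reader side, so a lockless reader only
 * observes fdt_nfiles and fdt_ofiles[] after they were fully
 * initialized, along the lines of:
 *
 *	fdt = (struct fdescenttbl *)atomic_load_acq_ptr(
 *	    (volatile uintptr_t *)&fdp->fd_files);
 *	if (fd < fdt->fdt_nfiles)
 *		fp = fdt->fdt_ofiles[fd].fde_file;
 *
 * without taking the FILEDESC lock; this is the essence of what
 * fget_unlocked() relies on.
 */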
2135 */ 2136 KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), 2137 ("invalid descriptor %d", fd)); 2138 KASSERT(!fdisused(fdp, fd), 2139 ("fd_first_free() returned non-free descriptor")); 2140 KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, 2141 ("file descriptor isn't free")); 2142 fdused(fdp, fd); 2143 *result = fd; 2144 return (0); 2145 } 2146 2147 /* 2148 * Allocate n file descriptors for the process. 2149 */ 2150 int 2151 fdallocn(struct thread *td, int minfd, int *fds, int n) 2152 { 2153 struct proc *p = td->td_proc; 2154 struct filedesc *fdp = p->p_fd; 2155 int i; 2156 2157 FILEDESC_XLOCK_ASSERT(fdp); 2158 2159 for (i = 0; i < n; i++) 2160 if (fdalloc(td, 0, &fds[i]) != 0) 2161 break; 2162 2163 if (i < n) { 2164 for (i--; i >= 0; i--) 2165 fdunused(fdp, fds[i]); 2166 return (EMFILE); 2167 } 2168 2169 return (0); 2170 } 2171 2172 /* 2173 * Create a new open file structure and allocate a file descriptor for the 2174 * process that refers to it. We add one reference to the file for the 2175 * descriptor table and one reference for resultfp. This guards against the 2176 * entry in the descriptor table being closed out from under us (e.g. if we 2177 * are preempted) after we release the FILEDESC lock. 2178 */ 2179 int 2180 falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, 2181 struct filecaps *fcaps) 2182 { 2183 struct file *fp; 2184 int error, fd; 2185 2186 MPASS(resultfp != NULL); 2187 MPASS(resultfd != NULL); 2188 2189 error = _falloc_noinstall(td, &fp, 2); 2190 if (__predict_false(error != 0)) { 2191 return (error); 2192 } 2193 2194 error = finstall_refed(td, fp, &fd, flags, fcaps); 2195 if (__predict_false(error != 0)) { 2196 falloc_abort(td, fp); 2197 return (error); 2198 } 2199 2200 *resultfp = fp; 2201 *resultfd = fd; 2202 2203 return (0); 2204 } 2205 2206 /* 2207 * Create a new open file structure without allocating a file descriptor. 2208 */ 2209 int 2210 _falloc_noinstall(struct thread *td, struct file **resultfp, u_int n) 2211 { 2212 struct file *fp; 2213 int maxuserfiles = maxfiles - (maxfiles / 20); 2214 int openfiles_new; 2215 static struct timeval lastfail; 2216 static int curfail; 2217 2218 KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); 2219 MPASS(n > 0); 2220 2221 openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; 2222 if ((openfiles_new >= maxuserfiles && 2223 priv_check(td, PRIV_MAXFILES) != 0) || 2224 openfiles_new >= maxfiles) { 2225 atomic_subtract_int(&openfiles, 1); 2226 if (ppsratecheck(&lastfail, &curfail, 1)) { 2227 printf("kern.maxfiles limit exceeded by uid %i (%s), " 2228 "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); 2229 } 2230 return (ENFILE); 2231 } 2232 fp = uma_zalloc(file_zone, M_WAITOK); 2233 bzero(fp, sizeof(*fp)); 2234 refcount_init(&fp->f_count, n); 2235 fp->f_cred = crhold(td->td_ucred); 2236 fp->f_ops = &badfileops; 2237 *resultfp = fp; 2238 return (0); 2239 } 2240 2241 void 2242 falloc_abort(struct thread *td, struct file *fp) 2243 { 2244 2245 /* 2246 * For assertion purposes. 2247 */ 2248 refcount_init(&fp->f_count, 0); 2249 _fdrop(fp, td); 2250 } 2251 2252 /* 2253 * Install a file in a file descriptor table.
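 *
 * The slot must have been reserved with fdalloc() and the table must stay
 * exclusively locked across both calls.  A sketch of the pattern, which is
 * what finstall_refed() below implements (illustrative):
 *
 *	FILEDESC_XLOCK(fdp);
 *	error = fdalloc(td, 0, &fd);
 *	if (error == 0)
 *		_finstall(fdp, fp, fd, flags, fcaps);
 *	FILEDESC_XUNLOCK(fdp);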
2254 */ 2255 void 2256 _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, 2257 struct filecaps *fcaps) 2258 { 2259 struct filedescent *fde; 2260 2261 MPASS(fp != NULL); 2262 if (fcaps != NULL) 2263 filecaps_validate(fcaps, __func__); 2264 FILEDESC_XLOCK_ASSERT(fdp); 2265 2266 fde = &fdp->fd_ofiles[fd]; 2267 #ifdef CAPABILITIES 2268 seqc_write_begin(&fde->fde_seqc); 2269 #endif 2270 fde->fde_file = fp; 2271 fde->fde_flags = open_to_fde_flags(flags, true); 2272 if (fcaps != NULL) 2273 filecaps_move(fcaps, &fde->fde_caps); 2274 else 2275 filecaps_fill(&fde->fde_caps); 2276 #ifdef CAPABILITIES 2277 seqc_write_end(&fde->fde_seqc); 2278 #endif 2279 } 2280 2281 int 2282 finstall_refed(struct thread *td, struct file *fp, int *fd, int flags, 2283 struct filecaps *fcaps) 2284 { 2285 struct filedesc *fdp = td->td_proc->p_fd; 2286 int error; 2287 2288 MPASS(fd != NULL); 2289 2290 FILEDESC_XLOCK(fdp); 2291 error = fdalloc(td, 0, fd); 2292 if (__predict_true(error == 0)) { 2293 _finstall(fdp, fp, *fd, flags, fcaps); 2294 } 2295 FILEDESC_XUNLOCK(fdp); 2296 return (error); 2297 } 2298 2299 int 2300 finstall(struct thread *td, struct file *fp, int *fd, int flags, 2301 struct filecaps *fcaps) 2302 { 2303 int error; 2304 2305 MPASS(fd != NULL); 2306 2307 if (!fhold(fp)) 2308 return (EBADF); 2309 error = finstall_refed(td, fp, fd, flags, fcaps); 2310 if (__predict_false(error != 0)) { 2311 fdrop(fp, td); 2312 } 2313 return (error); 2314 } 2315 2316 /* 2317 * Build a new, empty filedesc structure. 2318 * 2319 * The table starts with room for NDFILE descriptors and is not shared. 2320 */ 2321 struct filedesc * 2322 fdinit(void) 2323 { 2324 struct filedesc0 *newfdp0; 2325 struct filedesc *newfdp; 2326 2327 newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); 2328 newfdp = &newfdp0->fd_fd; 2329 2330 /* Create the file descriptor table. */ 2331 FILEDESC_LOCK_INIT(newfdp); 2332 refcount_init(&newfdp->fd_refcnt, 1); 2333 refcount_init(&newfdp->fd_holdcnt, 1); 2334 newfdp->fd_map = newfdp0->fd_dmap; 2335 newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; 2336 newfdp->fd_files->fdt_nfiles = NDFILE; 2337 2338 return (newfdp); 2339 } 2340 2341 /* 2342 * Build a pwddesc structure from another. 2343 * Copy the current, root, and jail root vnode references. 2344 * 2345 * If pdp is not NULL and keeplock is true, return with it (exclusively) locked. 2346 */ 2347 struct pwddesc * 2348 pdinit(struct pwddesc *pdp, bool keeplock) 2349 { 2350 struct pwddesc *newpdp; 2351 struct pwd *newpwd; 2352 2353 newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO); 2354 2355 PWDDESC_LOCK_INIT(newpdp); 2356 refcount_init(&newpdp->pd_refcount, 1); 2357 newpdp->pd_cmask = CMASK; 2358 2359 if (pdp == NULL) { 2360 newpwd = pwd_alloc(); 2361 smr_serialized_store(&newpdp->pd_pwd, newpwd, true); 2362 return (newpdp); 2363 } 2364 2365 PWDDESC_XLOCK(pdp); 2366 newpwd = pwd_hold_pwddesc(pdp); 2367 smr_serialized_store(&newpdp->pd_pwd, newpwd, true); 2368 if (!keeplock) 2369 PWDDESC_XUNLOCK(pdp); 2370 return (newpdp); 2371 } 2372 2373 /* 2374 * Hold either filedesc or pwddesc of the passed process. 2375 * 2376 * The process lock is used to synchronize against the target exiting and 2377 * freeing the data. 2378 * 2379 * Clearing can be illustrated in 3 steps: 2380 * 1. set the pointer to NULL. Either routine can race against it, hence 2381 * atomic_load_ptr. 2382 * 2. observe the process lock as not taken. Until then fdhold/pdhold can 2383 * race to either still see the pointer or find NULL.
It is still safe to 2384 * grab a reference as clearing is stalled. 2385 * 3. after the lock is observed as not taken, any fdhold/pdhold calls are 2386 * guaranteed to see NULL, making it safe to finish clearing. 2387 */ 2388 static struct filedesc * 2389 fdhold(struct proc *p) 2390 { 2391 struct filedesc *fdp; 2392 2393 PROC_LOCK_ASSERT(p, MA_OWNED); 2394 fdp = atomic_load_ptr(&p->p_fd); 2395 if (fdp != NULL) 2396 refcount_acquire(&fdp->fd_holdcnt); 2397 return (fdp); 2398 } 2399 2400 static struct pwddesc * 2401 pdhold(struct proc *p) 2402 { 2403 struct pwddesc *pdp; 2404 2405 PROC_LOCK_ASSERT(p, MA_OWNED); 2406 pdp = atomic_load_ptr(&p->p_pd); 2407 if (pdp != NULL) 2408 refcount_acquire(&pdp->pd_refcount); 2409 return (pdp); 2410 } 2411 2412 static void 2413 fddrop(struct filedesc *fdp) 2414 { 2415 2416 if (refcount_load(&fdp->fd_holdcnt) > 1) { 2417 if (refcount_release(&fdp->fd_holdcnt) == 0) 2418 return; 2419 } 2420 2421 FILEDESC_LOCK_DESTROY(fdp); 2422 uma_zfree(filedesc0_zone, fdp); 2423 } 2424 2425 static void 2426 pddrop(struct pwddesc *pdp) 2427 { 2428 struct pwd *pwd; 2429 2430 if (refcount_release_if_not_last(&pdp->pd_refcount)) 2431 return; 2432 2433 PWDDESC_XLOCK(pdp); 2434 if (refcount_release(&pdp->pd_refcount) == 0) { 2435 PWDDESC_XUNLOCK(pdp); 2436 return; 2437 } 2438 pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 2439 pwd_set(pdp, NULL); 2440 PWDDESC_XUNLOCK(pdp); 2441 pwd_drop(pwd); 2442 2443 PWDDESC_LOCK_DESTROY(pdp); 2444 free(pdp, M_PWDDESC); 2445 } 2446 2447 /* 2448 * Share a filedesc structure. 2449 */ 2450 struct filedesc * 2451 fdshare(struct filedesc *fdp) 2452 { 2453 2454 refcount_acquire(&fdp->fd_refcnt); 2455 return (fdp); 2456 } 2457 2458 /* 2459 * Share a pwddesc structure. 2460 */ 2461 struct pwddesc * 2462 pdshare(struct pwddesc *pdp) 2463 { 2464 refcount_acquire(&pdp->pd_refcount); 2465 return (pdp); 2466 } 2467 2468 /* 2469 * Unshare a filedesc structure, if necessary by making a copy. 2470 */ 2471 void 2472 fdunshare(struct thread *td) 2473 { 2474 struct filedesc *tmp; 2475 struct proc *p = td->td_proc; 2476 2477 if (refcount_load(&p->p_fd->fd_refcnt) == 1) 2478 return; 2479 2480 tmp = fdcopy(p->p_fd); 2481 fdescfree(td); 2482 p->p_fd = tmp; 2483 } 2484 2485 /* 2486 * Unshare a pwddesc structure. 2487 */ 2488 void 2489 pdunshare(struct thread *td) 2490 { 2491 struct pwddesc *pdp; 2492 struct proc *p; 2493 2494 p = td->td_proc; 2495 /* Not shared. */ 2496 if (refcount_load(&p->p_pd->pd_refcount) == 1) 2497 return; 2498 2499 pdp = pdcopy(p->p_pd); 2500 pdescfree(td); 2501 p->p_pd = pdp; 2502 } 2503 2504 /* 2505 * Copy a filedesc structure. The source table must not be NULL; only 2506 * passable descriptors are carried over into the new table. 2507 */ 2508 struct filedesc * 2509 fdcopy(struct filedesc *fdp) 2510 { 2511 struct filedesc *newfdp; 2512 struct filedescent *nfde, *ofde; 2513 int i, lastfile; 2514 2515 MPASS(fdp != NULL); 2516 2517 newfdp = fdinit(); 2518 FILEDESC_SLOCK(fdp); 2519 for (;;) { 2520 lastfile = fdlastfile(fdp); 2521 if (lastfile < newfdp->fd_nfiles) 2522 break; 2523 FILEDESC_SUNLOCK(fdp); 2524 fdgrowtable(newfdp, lastfile + 1); 2525 FILEDESC_SLOCK(fdp); 2526 } 2527 /* copy all passable descriptors (i.e.
not kqueue) */ 2528 newfdp->fd_freefile = fdp->fd_freefile; 2529 FILEDESC_FOREACH_FDE(fdp, i, ofde) { 2530 if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || 2531 (ofde->fde_flags & UF_FOCLOSE) != 0 || 2532 !fhold(ofde->fde_file)) { 2533 if (newfdp->fd_freefile == fdp->fd_freefile) 2534 newfdp->fd_freefile = i; 2535 continue; 2536 } 2537 nfde = &newfdp->fd_ofiles[i]; 2538 *nfde = *ofde; 2539 filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); 2540 fdused_init(newfdp, i); 2541 } 2542 MPASS(newfdp->fd_freefile != -1); 2543 FILEDESC_SUNLOCK(fdp); 2544 return (newfdp); 2545 } 2546 2547 /* 2548 * Copy a pwddesc structure. 2549 */ 2550 struct pwddesc * 2551 pdcopy(struct pwddesc *pdp) 2552 { 2553 struct pwddesc *newpdp; 2554 2555 MPASS(pdp != NULL); 2556 2557 newpdp = pdinit(pdp, true); 2558 newpdp->pd_cmask = pdp->pd_cmask; 2559 PWDDESC_XUNLOCK(pdp); 2560 return (newpdp); 2561 } 2562 2563 /* 2564 * Clear POSIX-style locks. This is only used when fdp loses a reference (i.e. 2565 * one of the processes using it exits) and the table used to be shared. 2566 */ 2567 static void 2568 fdclearlocks(struct thread *td) 2569 { 2570 struct filedesc *fdp; 2571 struct filedesc_to_leader *fdtol; 2572 struct flock lf; 2573 struct file *fp; 2574 struct proc *p; 2575 struct vnode *vp; 2576 int i; 2577 2578 p = td->td_proc; 2579 fdp = p->p_fd; 2580 fdtol = p->p_fdtol; 2581 MPASS(fdtol != NULL); 2582 2583 FILEDESC_XLOCK(fdp); 2584 KASSERT(fdtol->fdl_refcount > 0, 2585 ("filedesc_to_refcount botch: fdl_refcount=%d", 2586 fdtol->fdl_refcount)); 2587 if (fdtol->fdl_refcount == 1 && 2588 (p->p_leader->p_flag & P_ADVLOCK) != 0) { 2589 FILEDESC_FOREACH_FP(fdp, i, fp) { 2590 if (fp->f_type != DTYPE_VNODE || 2591 !fhold(fp)) 2592 continue; 2593 FILEDESC_XUNLOCK(fdp); 2594 lf.l_whence = SEEK_SET; 2595 lf.l_start = 0; 2596 lf.l_len = 0; 2597 lf.l_type = F_UNLCK; 2598 vp = fp->f_vnode; 2599 (void) VOP_ADVLOCK(vp, 2600 (caddr_t)p->p_leader, F_UNLCK, 2601 &lf, F_POSIX); 2602 FILEDESC_XLOCK(fdp); 2603 fdrop(fp, td); 2604 } 2605 } 2606 retry: 2607 if (fdtol->fdl_refcount == 1) { 2608 if (fdp->fd_holdleaderscount > 0 && 2609 (p->p_leader->p_flag & P_ADVLOCK) != 0) { 2610 /* 2611 * close() or kern_dup() has cleared a reference 2612 * in a shared file descriptor table. 2613 */ 2614 fdp->fd_holdleaderswakeup = 1; 2615 sx_sleep(&fdp->fd_holdleaderscount, 2616 FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); 2617 goto retry; 2618 } 2619 if (fdtol->fdl_holdcount > 0) { 2620 /* 2621 * Ensure that fdtol->fdl_leader remains 2622 * valid in closef(). 2623 */ 2624 fdtol->fdl_wakeup = 1; 2625 sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, 2626 "fdlhold", 0); 2627 goto retry; 2628 } 2629 } 2630 fdtol->fdl_refcount--; 2631 if (fdtol->fdl_refcount == 0 && 2632 fdtol->fdl_holdcount == 0) { 2633 fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; 2634 fdtol->fdl_prev->fdl_next = fdtol->fdl_next; 2635 } else 2636 fdtol = NULL; 2637 p->p_fdtol = NULL; 2638 FILEDESC_XUNLOCK(fdp); 2639 if (fdtol != NULL) 2640 free(fdtol, M_FILEDESC_TO_LEADER); 2641 } 2642 2643 /* 2644 * Release a filedesc structure. 2645 */ 2646 static void 2647 fdescfree_fds(struct thread *td, struct filedesc *fdp) 2648 { 2649 struct filedesc0 *fdp0; 2650 struct freetable *ft, *tft; 2651 struct filedescent *fde; 2652 struct file *fp; 2653 int i; 2654 2655 KASSERT(refcount_load(&fdp->fd_refcnt) == 0, 2656 ("%s: fd table %p carries references", __func__, fdp)); 2657 2658 /* 2659 * Serialize with threads iterating over the table, if any.
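 * (A hedged reading of the lock/unlock pair below: iterators hold the
 * table lock and re-check fd_refcnt under it, so once we have acquired
 * and released the lock, no iterator that started before the refcount
 * dropped to zero can still be inside the table.)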
2660 */ 2661 if (refcount_load(&fdp->fd_holdcnt) > 1) { 2662 FILEDESC_XLOCK(fdp); 2663 FILEDESC_XUNLOCK(fdp); 2664 } 2665 2666 FILEDESC_FOREACH_FDE(fdp, i, fde) { 2667 fp = fde->fde_file; 2668 fdefree_last(fde); 2669 (void) closef(fp, td); 2670 } 2671 2672 if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) 2673 free(fdp->fd_map, M_FILEDESC); 2674 if (fdp->fd_nfiles > NDFILE) 2675 free(fdp->fd_files, M_FILEDESC); 2676 2677 fdp0 = (struct filedesc0 *)fdp; 2678 SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) 2679 free(ft->ft_table, M_FILEDESC); 2680 2681 fddrop(fdp); 2682 } 2683 2684 void 2685 fdescfree(struct thread *td) 2686 { 2687 struct proc *p; 2688 struct filedesc *fdp; 2689 2690 p = td->td_proc; 2691 fdp = p->p_fd; 2692 MPASS(fdp != NULL); 2693 2694 #ifdef RACCT 2695 if (RACCT_ENABLED()) 2696 racct_set_unlocked(p, RACCT_NOFILE, 0); 2697 #endif 2698 2699 if (p->p_fdtol != NULL) 2700 fdclearlocks(td); 2701 2702 /* 2703 * Check fdhold for an explanation. 2704 */ 2705 atomic_store_ptr(&p->p_fd, NULL); 2706 atomic_thread_fence_seq_cst(); 2707 PROC_WAIT_UNLOCKED(p); 2708 2709 if (refcount_release(&fdp->fd_refcnt) == 0) 2710 return; 2711 2712 fdescfree_fds(td, fdp); 2713 } 2714 2715 void 2716 pdescfree(struct thread *td) 2717 { 2718 struct proc *p; 2719 struct pwddesc *pdp; 2720 2721 p = td->td_proc; 2722 pdp = p->p_pd; 2723 MPASS(pdp != NULL); 2724 2725 /* 2726 * Check pdhold for an explanation. 2727 */ 2728 atomic_store_ptr(&p->p_pd, NULL); 2729 atomic_thread_fence_seq_cst(); 2730 PROC_WAIT_UNLOCKED(p); 2731 2732 pddrop(pdp); 2733 } 2734 2735 /* 2736 * For setugid programs, we don't want people to use that setugidness 2737 * to generate error messages which write to a file that would 2738 * otherwise be off-limits to the process. We check for filesystems where 2739 * the vnode can change out from under us after execve (like [lin]procfs). 2740 * 2741 * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is 2742 * sufficient. We also don't check for setugidness since we know we are. 2743 */ 2744 static bool 2745 is_unsafe(struct file *fp) 2746 { 2747 struct vnode *vp; 2748 2749 if (fp->f_type != DTYPE_VNODE) 2750 return (false); 2751 2752 vp = fp->f_vnode; 2753 return ((vp->v_vflag & VV_PROCDEP) != 0); 2754 } 2755 2756 /* 2757 * Make this setugid thing safe, if at all possible. 2758 */ 2759 void 2760 fdsetugidsafety(struct thread *td) 2761 { 2762 struct filedesc *fdp; 2763 struct file *fp; 2764 int i; 2765 2766 fdp = td->td_proc->p_fd; 2767 KASSERT(refcount_load(&fdp->fd_refcnt) == 1, 2768 ("the fdtable should not be shared")); 2769 MPASS(fdp->fd_nfiles >= 3); 2770 for (i = 0; i <= 2; i++) { 2771 fp = fdp->fd_ofiles[i].fde_file; 2772 if (fp != NULL && is_unsafe(fp)) { 2773 FILEDESC_XLOCK(fdp); 2774 knote_fdclose(td, i); 2775 /* 2776 * NULL-out descriptor prior to close to avoid 2777 * a race while close blocks. 2778 */ 2779 fdfree(fdp, i); 2780 FILEDESC_XUNLOCK(fdp); 2781 (void) closef(fp, td); 2782 } 2783 } 2784 } 2785 2786 /* 2787 * If a specific file object occupies a specific file descriptor, close the 2788 * file descriptor entry and drop a reference on the file object. This is a 2789 * convenience function for handling a subsequent error in a function that 2790 * calls falloc(); it copes with the race where another thread has closed the 2791 * file descriptor out from under the thread creating the file object.
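 *
 * A sketch of the intended use on an error path (illustrative; the
 * do_setup() step is hypothetical):
 *
 *	error = falloc(td, &fp, &fd, 0);
 *	if (error != 0)
 *		return (error);
 *	error = do_setup(fp);
 *	if (error != 0) {
 *		fdclose(td, fp, fd);	-- closes fd only if it still holds fp
 *		fdrop(fp, td);		-- drops the caller's reference
 *		return (error);
 *	}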
2792 */ 2793 void 2794 fdclose(struct thread *td, struct file *fp, int idx) 2795 { 2796 struct filedesc *fdp = td->td_proc->p_fd; 2797 2798 FILEDESC_XLOCK(fdp); 2799 if (fdp->fd_ofiles[idx].fde_file == fp) { 2800 fdfree(fdp, idx); 2801 FILEDESC_XUNLOCK(fdp); 2802 fdrop(fp, td); 2803 } else 2804 FILEDESC_XUNLOCK(fdp); 2805 } 2806 2807 /* 2808 * Close descriptors marked close-on-exec, as well as POSIX message queue descriptors, on exec; the close-on-fork flag is cleared rather than preserved. 2809 */ 2810 void 2811 fdcloseexec(struct thread *td) 2812 { 2813 struct filedesc *fdp; 2814 struct filedescent *fde; 2815 struct file *fp; 2816 int i; 2817 2818 fdp = td->td_proc->p_fd; 2819 KASSERT(refcount_load(&fdp->fd_refcnt) == 1, 2820 ("the fdtable should not be shared")); 2821 FILEDESC_FOREACH_FDE(fdp, i, fde) { 2822 fp = fde->fde_file; 2823 if (fp->f_type == DTYPE_MQUEUE || 2824 (fde->fde_flags & UF_EXCLOSE)) { 2825 FILEDESC_XLOCK(fdp); 2826 fdfree(fdp, i); 2827 (void) closefp(fdp, i, fp, td, false, false); 2828 FILEDESC_UNLOCK_ASSERT(fdp); 2829 } else if (fde->fde_flags & UF_FOCLOSE) { 2830 /* 2831 * https://austingroupbugs.net/view.php?id=1851 2832 * FD_CLOFORK should not be preserved across exec 2833 */ 2834 fde->fde_flags &= ~UF_FOCLOSE; 2835 } 2836 } 2837 } 2838 2839 /* 2840 * It is unsafe for set[ug]id processes to be started with file 2841 * descriptors 0..2 closed, as these descriptors are given implicit 2842 * significance in the Standard C library. fdcheckstd() will create a 2843 * descriptor referencing /dev/null for each of stdin, stdout, and 2844 * stderr that is not already open. 2845 */ 2846 int 2847 fdcheckstd(struct thread *td) 2848 { 2849 struct filedesc *fdp; 2850 register_t save; 2851 int i, error, devnull; 2852 2853 fdp = td->td_proc->p_fd; 2854 KASSERT(refcount_load(&fdp->fd_refcnt) == 1, 2855 ("the fdtable should not be shared")); 2856 MPASS(fdp->fd_nfiles >= 3); 2857 devnull = -1; 2858 for (i = 0; i <= 2; i++) { 2859 if (fdp->fd_ofiles[i].fde_file != NULL) 2860 continue; 2861 2862 save = td->td_retval[0]; 2863 if (devnull != -1) { 2864 error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); 2865 } else { 2866 error = kern_openat(td, AT_FDCWD, "/dev/null", 2867 UIO_SYSSPACE, O_RDWR, 0); 2868 if (error == 0) { 2869 devnull = td->td_retval[0]; 2870 KASSERT(devnull == i, ("we didn't get our fd")); 2871 } 2872 } 2873 td->td_retval[0] = save; 2874 if (error != 0) 2875 return (error); 2876 } 2877 return (0); 2878 } 2879 2880 /* 2881 * Internal form of close. Decrement reference count on file structure. 2882 * Note: td must not be NULL; a file that was being passed in a message 2883 * and has no owning thread is closed with closef_nothread() instead. 2884 */ 2885 int 2886 closef(struct file *fp, struct thread *td) 2887 { 2888 struct vnode *vp; 2889 struct flock lf; 2890 struct filedesc_to_leader *fdtol; 2891 struct filedesc *fdp; 2892 2893 MPASS(td != NULL); 2894 2895 /* 2896 * POSIX record locking dictates that any close releases ALL 2897 * locks owned by this process. This is handled by setting 2898 * a flag in the unlock to free ONLY locks obeying POSIX 2899 * semantics, and not to free BSD-style file locks. 2900 * If the descriptor was in a message, POSIX-style locks 2901 * aren't passed with the descriptor. Such a file has no 2902 * owning context that might hold locks; it must be closed 2903 * with closef_nothread() instead of this function, which 2904 * is why a NULL thread pointer is 2905 * never accepted here.
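 *
 * Example of the POSIX semantics implemented here (illustrative): if a
 * process holds an fcntl(F_SETLK) lock obtained through descriptor 3 on
 * a vnode that is also open as descriptor 4, then close(4) releases the
 * lock even though descriptor 3 remains open.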
2906 */ 2907 if (fp->f_type == DTYPE_VNODE) { 2908 vp = fp->f_vnode; 2909 if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { 2910 lf.l_whence = SEEK_SET; 2911 lf.l_start = 0; 2912 lf.l_len = 0; 2913 lf.l_type = F_UNLCK; 2914 (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, 2915 F_UNLCK, &lf, F_POSIX); 2916 } 2917 fdtol = td->td_proc->p_fdtol; 2918 if (fdtol != NULL) { 2919 /* 2920 * Handle special case where file descriptor table is 2921 * shared between multiple process leaders. 2922 */ 2923 fdp = td->td_proc->p_fd; 2924 FILEDESC_XLOCK(fdp); 2925 for (fdtol = fdtol->fdl_next; 2926 fdtol != td->td_proc->p_fdtol; 2927 fdtol = fdtol->fdl_next) { 2928 if ((fdtol->fdl_leader->p_flag & 2929 P_ADVLOCK) == 0) 2930 continue; 2931 fdtol->fdl_holdcount++; 2932 FILEDESC_XUNLOCK(fdp); 2933 lf.l_whence = SEEK_SET; 2934 lf.l_start = 0; 2935 lf.l_len = 0; 2936 lf.l_type = F_UNLCK; 2937 vp = fp->f_vnode; 2938 (void) VOP_ADVLOCK(vp, 2939 (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, 2940 F_POSIX); 2941 FILEDESC_XLOCK(fdp); 2942 fdtol->fdl_holdcount--; 2943 if (fdtol->fdl_holdcount == 0 && 2944 fdtol->fdl_wakeup != 0) { 2945 fdtol->fdl_wakeup = 0; 2946 wakeup(fdtol); 2947 } 2948 } 2949 FILEDESC_XUNLOCK(fdp); 2950 } 2951 } 2952 return (fdrop_close(fp, td)); 2953 } 2954 2955 /* 2956 * Hack for file descriptor passing code. 2957 */ 2958 void 2959 closef_nothread(struct file *fp) 2960 { 2961 2962 fdrop(fp, NULL); 2963 } 2964 2965 /* 2966 * Initialize the file pointer with the specified properties. 2967 * 2968 * The ops are set with release semantics to be certain that the flags, type, 2969 * and data are visible when ops is. This is to prevent ops methods from being 2970 * called with bad data. 2971 */ 2972 void 2973 finit(struct file *fp, u_int flag, short type, void *data, 2974 const struct fileops *ops) 2975 { 2976 fp->f_data = data; 2977 fp->f_flag = flag; 2978 fp->f_type = type; 2979 atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); 2980 } 2981 2982 void 2983 finit_vnode(struct file *fp, u_int flag, void *data, const struct fileops *ops) 2984 { 2985 fp->f_seqcount[UIO_READ] = 1; 2986 fp->f_seqcount[UIO_WRITE] = 1; 2987 finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, 2988 data, ops); 2989 } 2990 2991 int 2992 fget_cap_noref(struct filedesc *fdp, int fd, const cap_rights_t *needrightsp, 2993 struct file **fpp, struct filecaps *havecapsp) 2994 { 2995 struct filedescent *fde; 2996 int error; 2997 2998 FILEDESC_LOCK_ASSERT(fdp); 2999 3000 *fpp = NULL; 3001 fde = fdeget_noref(fdp, fd); 3002 if (fde == NULL) { 3003 error = EBADF; 3004 goto out; 3005 } 3006 3007 #ifdef CAPABILITIES 3008 error = cap_check(cap_rights_fde_inline(fde), needrightsp); 3009 if (error != 0) 3010 goto out; 3011 #endif 3012 3013 if (havecapsp != NULL) 3014 filecaps_copy(&fde->fde_caps, havecapsp, true); 3015 3016 *fpp = fde->fde_file; 3017 3018 error = 0; 3019 out: 3020 return (error); 3021 } 3022 3023 #ifdef CAPABILITIES 3024 int 3025 fget_cap(struct thread *td, int fd, const cap_rights_t *needrightsp, 3026 uint8_t *flagsp, struct file **fpp, struct filecaps *havecapsp) 3027 { 3028 struct filedesc *fdp = td->td_proc->p_fd; 3029 int error; 3030 struct file *fp; 3031 seqc_t seq; 3032 3033 *fpp = NULL; 3034 for (;;) { 3035 error = fget_unlocked_seq(td, fd, needrightsp, flagsp, &fp, 3036 &seq); 3037 if (error != 0) 3038 return (error); 3039 3040 if (havecapsp != NULL) { 3041 if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, 3042 havecapsp, false)) { 3043 fdrop(fp, td); 3044 goto get_locked; 3045 } 
3046 } 3047 3048 if (!fd_modified(fdp, fd, seq)) 3049 break; 3050 fdrop(fp, td); 3051 } 3052 3053 *fpp = fp; 3054 return (0); 3055 3056 get_locked: 3057 FILEDESC_SLOCK(fdp); 3058 error = fget_cap_noref(fdp, fd, needrightsp, fpp, havecapsp); 3059 if (error == 0 && !fhold(*fpp)) 3060 error = EBADF; 3061 FILEDESC_SUNLOCK(fdp); 3062 return (error); 3063 } 3064 #else 3065 int 3066 fget_cap(struct thread *td, int fd, const cap_rights_t *needrightsp, 3067 uint8_t *flagsp, struct file **fpp, struct filecaps *havecapsp) 3068 { 3069 int error; 3070 error = fget_unlocked_flags(td, fd, needrightsp, flagsp, fpp); 3071 if (havecapsp != NULL && error == 0) 3072 filecaps_fill(havecapsp); 3073 3074 return (error); 3075 } 3076 #endif 3077 3078 int 3079 fget_remote(struct thread *td, struct proc *p, int fd, struct file **fpp) 3080 { 3081 struct filedesc *fdp; 3082 struct file *fp; 3083 int error; 3084 3085 if (p == td->td_proc) /* curproc */ 3086 return (fget_unlocked(td, fd, &cap_no_rights, fpp)); 3087 3088 PROC_LOCK(p); 3089 fdp = fdhold(p); 3090 PROC_UNLOCK(p); 3091 if (fdp == NULL) 3092 return (ENOENT); 3093 FILEDESC_SLOCK(fdp); 3094 if (refcount_load(&fdp->fd_refcnt) != 0) { 3095 fp = fget_noref(fdp, fd); 3096 if (fp != NULL && fhold(fp)) { 3097 *fpp = fp; 3098 error = 0; 3099 } else { 3100 error = EBADF; 3101 } 3102 } else { 3103 error = ENOENT; 3104 } 3105 FILEDESC_SUNLOCK(fdp); 3106 fddrop(fdp); 3107 return (error); 3108 } 3109 3110 int 3111 fget_remote_foreach(struct thread *td, struct proc *p, 3112 int (*fn)(struct proc *, int, struct file *, void *), void *arg) 3113 { 3114 struct filedesc *fdp; 3115 struct fdescenttbl *fdt; 3116 struct file *fp; 3117 int error, error1, fd, highfd; 3118 3119 error = 0; 3120 PROC_LOCK(p); 3121 fdp = fdhold(p); 3122 PROC_UNLOCK(p); 3123 if (fdp == NULL) 3124 return (ENOENT); 3125 3126 FILEDESC_SLOCK(fdp); 3127 if (refcount_load(&fdp->fd_refcnt) != 0) { 3128 fdt = atomic_load_ptr(&fdp->fd_files); 3129 highfd = fdt->fdt_nfiles - 1; 3130 FILEDESC_SUNLOCK(fdp); 3131 } else { 3132 error = ENOENT; 3133 FILEDESC_SUNLOCK(fdp); 3134 goto out; 3135 } 3136 3137 for (fd = 0; fd <= highfd; fd++) { 3138 error1 = fget_remote(td, p, fd, &fp); 3139 if (error1 != 0) 3140 continue; 3141 error = fn(p, fd, fp, arg); 3142 fdrop(fp, td); 3143 if (error != 0) 3144 break; 3145 } 3146 out: 3147 fddrop(fdp); 3148 return (error); 3149 } 3150 3151 #ifdef CAPABILITIES 3152 int 3153 fgetvp_lookup_smr(struct nameidata *ndp, struct vnode **vpp, int *flagsp) 3154 { 3155 const struct filedescent *fde; 3156 const struct fdescenttbl *fdt; 3157 struct filedesc *fdp; 3158 struct file *fp; 3159 struct vnode *vp; 3160 const cap_rights_t *haverights; 3161 cap_rights_t rights; 3162 seqc_t seq; 3163 int fd, flags; 3164 3165 VFS_SMR_ASSERT_ENTERED(); 3166 3167 fd = ndp->ni_dirfd; 3168 rights = *ndp->ni_rightsneeded; 3169 cap_rights_set_one(&rights, CAP_LOOKUP); 3170 3171 fdp = curproc->p_fd; 3172 fdt = fdp->fd_files; 3173 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3174 return (EBADF); 3175 seq = seqc_read_notmodify(fd_seqc(fdt, fd)); 3176 fde = &fdt->fdt_ofiles[fd]; 3177 haverights = cap_rights_fde_inline(fde); 3178 fp = fde->fde_file; 3179 if (__predict_false(fp == NULL)) 3180 return (EAGAIN); 3181 if (__predict_false(cap_check_inline_transient(haverights, &rights))) 3182 return (EAGAIN); 3183 flags = fp->f_flag & FSEARCH; 3184 flags |= (fde->fde_flags & UF_RESOLVE_BENEATH) != 0 ? 
3185 O_RESOLVE_BENEATH : 0; 3186 vp = fp->f_vnode; 3187 if (__predict_false(vp == NULL)) { 3188 return (EAGAIN); 3189 } 3190 if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) { 3191 return (EAGAIN); 3192 } 3193 /* 3194 * Use an acquire barrier to force re-reading of fdt so it is 3195 * refreshed for verification. 3196 */ 3197 atomic_thread_fence_acq(); 3198 fdt = fdp->fd_files; 3199 if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) 3200 return (EAGAIN); 3201 /* 3202 * If file descriptor doesn't have all rights, 3203 * all lookups relative to it must also be 3204 * strictly relative. 3205 * 3206 * Not yet supported by fast path. 3207 */ 3208 CAP_ALL(&rights); 3209 if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || 3210 ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || 3211 ndp->ni_filecaps.fc_nioctls != -1) { 3212 #ifdef notyet 3213 ndp->ni_lcf |= NI_LCF_STRICTREL; 3214 #else 3215 return (EAGAIN); 3216 #endif 3217 } 3218 *vpp = vp; 3219 *flagsp = flags; 3220 return (0); 3221 } 3222 #else 3223 int 3224 fgetvp_lookup_smr(struct nameidata *ndp, struct vnode **vpp, int *flagsp) 3225 { 3226 const struct filedescent *fde; 3227 const struct fdescenttbl *fdt; 3228 struct filedesc *fdp; 3229 struct file *fp; 3230 struct vnode *vp; 3231 int fd, flags; 3232 3233 VFS_SMR_ASSERT_ENTERED(); 3234 3235 fd = ndp->ni_dirfd; 3236 fdp = curproc->p_fd; 3237 fdt = fdp->fd_files; 3238 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3239 return (EBADF); 3240 fde = &fdt->fdt_ofiles[fd]; 3241 fp = fde->fde_file; 3242 if (__predict_false(fp == NULL)) 3243 return (EAGAIN); 3244 flags = fp->f_flag & FSEARCH; 3245 flags |= (fde->fde_flags & UF_RESOLVE_BENEATH) != 0 ? 3246 O_RESOLVE_BENEATH : 0; 3247 vp = fp->f_vnode; 3248 if (__predict_false(vp == NULL || vp->v_type != VDIR)) { 3249 return (EAGAIN); 3250 } 3251 /* 3252 * Use an acquire barrier to force re-reading of fdt so it is 3253 * refreshed for verification. 3254 */ 3255 atomic_thread_fence_acq(); 3256 fdt = fdp->fd_files; 3257 if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) 3258 return (EAGAIN); 3259 filecaps_fill(&ndp->ni_filecaps); 3260 *vpp = vp; 3261 *flagsp = flags; 3262 return (0); 3263 } 3264 #endif 3265 3266 int 3267 fgetvp_lookup(struct nameidata *ndp, struct vnode **vpp) 3268 { 3269 struct thread *td; 3270 struct file *fp; 3271 struct vnode *vp; 3272 struct componentname *cnp; 3273 cap_rights_t rights; 3274 int error; 3275 uint8_t flags; 3276 3277 td = curthread; 3278 rights = *ndp->ni_rightsneeded; 3279 cap_rights_set_one(&rights, CAP_LOOKUP); 3280 cnp = &ndp->ni_cnd; 3281 3282 error = fget_cap(td, ndp->ni_dirfd, &rights, &flags, &fp, 3283 &ndp->ni_filecaps); 3284 if (__predict_false(error != 0)) 3285 return (error); 3286 if (__predict_false(fp->f_ops == &badfileops)) { 3287 error = EBADF; 3288 goto out_free; 3289 } 3290 vp = fp->f_vnode; 3291 if (__predict_false(vp == NULL)) { 3292 error = ENOTDIR; 3293 goto out_free; 3294 } 3295 vrefact(vp); 3296 /* 3297 * XXX does not check for VDIR, handled by namei_setup 3298 */ 3299 if ((fp->f_flag & FSEARCH) != 0) 3300 cnp->cn_flags |= NOEXECCHECK; 3301 if ((flags & UF_RESOLVE_BENEATH) != 0) { 3302 cnp->cn_flags |= RBENEATH; 3303 ndp->ni_resflags |= NIRES_BENEATH; 3304 } 3305 fdrop(fp, td); 3306 3307 #ifdef CAPABILITIES 3308 /* 3309 * If file descriptor doesn't have all rights, 3310 * all lookups relative to it must also be 3311 * strictly relative. 
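 * (Strictly relative means, for example, that absolute paths and ".."
 * escapes from the directory are rejected for the duration of the
 * lookup.)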
3312 */ 3313 CAP_ALL(&rights); 3314 if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || 3315 ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || 3316 ndp->ni_filecaps.fc_nioctls != -1) { 3317 ndp->ni_lcf |= NI_LCF_STRICTREL; 3318 ndp->ni_resflags |= NIRES_STRICTREL; 3319 } 3320 #endif 3321 3322 /* 3323 * TODO: avoid copying ioctl caps if it can be helped to begin with 3324 */ 3325 if ((cnp->cn_flags & WANTIOCTLCAPS) == 0) 3326 filecaps_free_ioctl(&ndp->ni_filecaps); 3327 3328 *vpp = vp; 3329 return (0); 3330 3331 out_free: 3332 filecaps_free(&ndp->ni_filecaps); 3333 fdrop(fp, td); 3334 return (error); 3335 } 3336 3337 /* 3338 * Fetch the descriptor locklessly. 3339 * 3340 * We avoid fdrop() races by never resurrecting a reference count that has 3341 * already reached 0; to accomplish this we have to use a cmpset loop rather than an atomic_add. The descriptor 3342 * must be re-verified once we acquire a reference to be certain that the 3343 * identity is still correct and we did not lose a race due to preemption. 3344 * 3345 * Force a reload of fdt when looping. Another thread could reallocate 3346 * the table before this fd was closed, so it is possible that there is 3347 * a stale fp pointer in the cached version. 3348 */ 3349 #ifdef CAPABILITIES 3350 static int 3351 fget_unlocked_seq(struct thread *td, int fd, const cap_rights_t *needrightsp, 3352 uint8_t *flagsp, struct file **fpp, seqc_t *seqp) 3353 { 3354 struct filedesc *fdp; 3355 const struct filedescent *fde; 3356 const struct fdescenttbl *fdt; 3357 struct file *fp; 3358 seqc_t seq; 3359 cap_rights_t haverights; 3360 int error; 3361 uint8_t flags; 3362 3363 fdp = td->td_proc->p_fd; 3364 fdt = fdp->fd_files; 3365 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3366 return (EBADF); 3367 3368 for (;;) { 3369 seq = seqc_read_notmodify(fd_seqc(fdt, fd)); 3370 fde = &fdt->fdt_ofiles[fd]; 3371 haverights = *cap_rights_fde_inline(fde); 3372 fp = fde->fde_file; 3373 flags = fde->fde_flags; 3374 if (__predict_false(fp == NULL)) { 3375 if (seqc_consistent(fd_seqc(fdt, fd), seq)) 3376 return (EBADF); 3377 fdt = atomic_load_ptr(&fdp->fd_files); 3378 continue; 3379 } 3380 error = cap_check_inline(&haverights, needrightsp); 3381 if (__predict_false(error != 0)) { 3382 if (seqc_consistent(fd_seqc(fdt, fd), seq)) 3383 return (error); 3384 fdt = atomic_load_ptr(&fdp->fd_files); 3385 continue; 3386 } 3387 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { 3388 fdt = atomic_load_ptr(&fdp->fd_files); 3389 continue; 3390 } 3391 /* 3392 * Use an acquire barrier to force re-reading of fdt so it is 3393 * refreshed for verification.
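 * (A hedged note: the barrier pairs with the reference acquisition above,
 * so the re-read of fd_files and the seqc check below observe state from
 * after the reference was taken; a slot recycled in the meantime is
 * detected and retried.)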
3394 */ 3395 atomic_thread_fence_acq(); 3396 fdt = fdp->fd_files; 3397 if (seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)) 3398 break; 3399 fdrop(fp, td); 3400 } 3401 *fpp = fp; 3402 if (flagsp != NULL) 3403 *flagsp = flags; 3404 if (seqp != NULL) 3405 *seqp = seq; 3406 return (0); 3407 } 3408 #else 3409 static int 3410 fget_unlocked_seq(struct thread *td, int fd, const cap_rights_t *needrightsp, 3411 uint8_t *flagsp, struct file **fpp, seqc_t *seqp __unused) 3412 { 3413 struct filedesc *fdp; 3414 const struct fdescenttbl *fdt; 3415 struct file *fp; 3416 uint8_t flags; 3417 3418 fdp = td->td_proc->p_fd; 3419 fdt = fdp->fd_files; 3420 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) 3421 return (EBADF); 3422 3423 for (;;) { 3424 fp = fdt->fdt_ofiles[fd].fde_file; 3425 flags = fdt->fdt_ofiles[fd].fde_flags; 3426 if (__predict_false(fp == NULL)) 3427 return (EBADF); 3428 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { 3429 fdt = atomic_load_ptr(&fdp->fd_files); 3430 continue; 3431 } 3432 /* 3433 * Use an acquire barrier to force re-reading of fdt so it is 3434 * refreshed for verification. 3435 */ 3436 atomic_thread_fence_acq(); 3437 fdt = fdp->fd_files; 3438 if (__predict_true(fp == fdt->fdt_ofiles[fd].fde_file)) 3439 break; 3440 fdrop(fp, td); 3441 } 3442 if (flagsp != NULL) 3443 *flagsp = flags; 3444 *fpp = fp; 3445 return (0); 3446 } 3447 #endif 3448 3449 /* 3450 * See the comments in fget_unlocked_seq for an explanation of how this works. 3451 * 3452 * This is a simplified variant which bails out to the aforementioned routine 3453 * if anything goes wrong. In practice this only happens when userspace is 3454 * racing with itself. 3455 */ 3456 int 3457 fget_unlocked_flags(struct thread *td, int fd, const cap_rights_t *needrightsp, 3458 uint8_t *flagsp, struct file **fpp) 3459 { 3460 struct filedesc *fdp; 3461 #ifdef CAPABILITIES 3462 const struct filedescent *fde; 3463 #endif 3464 const struct fdescenttbl *fdt; 3465 struct file *fp; 3466 #ifdef CAPABILITIES 3467 seqc_t seq; 3468 const cap_rights_t *haverights; 3469 #endif 3470 uint8_t flags; 3471 3472 fdp = td->td_proc->p_fd; 3473 fdt = fdp->fd_files; 3474 if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) { 3475 *fpp = NULL; 3476 return (EBADF); 3477 } 3478 #ifdef CAPABILITIES 3479 seq = seqc_read_notmodify(fd_seqc(fdt, fd)); 3480 fde = &fdt->fdt_ofiles[fd]; 3481 haverights = cap_rights_fde_inline(fde); 3482 fp = fde->fde_file; 3483 flags = fde->fde_flags; 3484 #else 3485 fp = fdt->fdt_ofiles[fd].fde_file; 3486 flags = fdt->fdt_ofiles[fd].fde_flags; 3487 #endif 3488 if (__predict_false(fp == NULL)) 3489 goto out_fallback; 3490 #ifdef CAPABILITIES 3491 if (__predict_false(cap_check_inline_transient(haverights, needrightsp))) 3492 goto out_fallback; 3493 #endif 3494 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) 3495 goto out_fallback; 3496 3497 /* 3498 * Use an acquire barrier to force re-reading of fdt so it is 3499 * refreshed for verification. 
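 * (Illustrative race this defends against: another thread closes fd and
 * the slot is reused for a different file; the reference taken above may
 * then be on the wrong object, which the re-check below catches before
 * falling back to fget_unlocked_seq().)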
3500 */ 3501 atomic_thread_fence_acq(); 3502 fdt = fdp->fd_files; 3503 #ifdef CAPABILITIES 3504 if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) 3505 #else 3506 if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) 3507 #endif 3508 goto out_fdrop; 3509 *fpp = fp; 3510 if (flagsp != NULL) 3511 *flagsp = flags; 3512 return (0); 3513 out_fdrop: 3514 fdrop(fp, td); 3515 out_fallback: 3516 *fpp = NULL; 3517 return (fget_unlocked_seq(td, fd, needrightsp, flagsp, fpp, NULL)); 3518 } 3519 3520 int 3521 fget_unlocked(struct thread *td, int fd, const cap_rights_t *needrightsp, 3522 struct file **fpp) 3523 { 3524 return (fget_unlocked_flags(td, fd, needrightsp, NULL, fpp)); 3525 } 3526 3527 /* 3528 * Translate fd -> file when the caller guarantees the file descriptor table 3529 * can't be changed by others. 3530 * 3531 * Note this does not mean the file object itself is only visible to the caller, 3532 * merely that it won't disappear while in use, so the caller need not take a reference. 3533 * 3534 * Must be paired with fput_only_user. 3535 */ 3536 #ifdef CAPABILITIES 3537 int 3538 fget_only_user(struct filedesc *fdp, int fd, const cap_rights_t *needrightsp, 3539 struct file **fpp) 3540 { 3541 const struct filedescent *fde; 3542 const struct fdescenttbl *fdt; 3543 const cap_rights_t *haverights; 3544 struct file *fp; 3545 int error; 3546 3547 MPASS(FILEDESC_IS_ONLY_USER(fdp)); 3548 3549 *fpp = NULL; 3550 if (__predict_false(fd >= fdp->fd_nfiles)) 3551 return (EBADF); 3552 3553 fdt = fdp->fd_files; 3554 fde = &fdt->fdt_ofiles[fd]; 3555 fp = fde->fde_file; 3556 if (__predict_false(fp == NULL)) 3557 return (EBADF); 3558 MPASS(refcount_load(&fp->f_count) > 0); 3559 haverights = cap_rights_fde_inline(fde); 3560 error = cap_check_inline(haverights, needrightsp); 3561 if (__predict_false(error != 0)) 3562 return (error); 3563 *fpp = fp; 3564 return (0); 3565 } 3566 #else 3567 int 3568 fget_only_user(struct filedesc *fdp, int fd, const cap_rights_t *needrightsp, 3569 struct file **fpp) 3570 { 3571 struct file *fp; 3572 3573 MPASS(FILEDESC_IS_ONLY_USER(fdp)); 3574 3575 *fpp = NULL; 3576 if (__predict_false(fd >= fdp->fd_nfiles)) 3577 return (EBADF); 3578 3579 fp = fdp->fd_ofiles[fd].fde_file; 3580 if (__predict_false(fp == NULL)) 3581 return (EBADF); 3582 3583 MPASS(refcount_load(&fp->f_count) > 0); 3584 *fpp = fp; 3585 return (0); 3586 } 3587 #endif 3588 3589 /* 3590 * Extract the file pointer associated with the specified descriptor for the 3591 * current user process. 3592 * 3593 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is 3594 * returned. 3595 * 3596 * The file's rights will be checked against the capability rights mask. 3597 * 3598 * On error the non-zero error number is returned and *fpp is set to 3599 * NULL. Otherwise *fpp is held and set and zero is returned. The caller is 3600 * responsible for fdrop(). 3601 */ 3602 static __inline int 3603 _fget(struct thread *td, int fd, struct file **fpp, int flags, 3604 const cap_rights_t *needrightsp) 3605 { 3606 struct file *fp; 3607 int error; 3608 3609 *fpp = NULL; 3610 error = fget_unlocked(td, fd, needrightsp, &fp); 3611 if (__predict_false(error != 0)) 3612 return (error); 3613 if (__predict_false(fp->f_ops == &badfileops)) { 3614 fdrop(fp, td); 3615 return (EBADF); 3616 } 3617 3618 /* 3619 * FREAD and FWRITE failure return EBADF as per POSIX.
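 * For example, fget_write() on a descriptor opened O_RDONLY fails with
 * EBADF rather than EINVAL.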
3620 */ 3621 error = 0; 3622 switch (flags) { 3623 case FREAD: 3624 case FWRITE: 3625 if ((fp->f_flag & flags) == 0) 3626 error = EBADF; 3627 break; 3628 case FEXEC: 3629 if (fp->f_ops != &path_fileops && 3630 ((fp->f_flag & (FREAD | FEXEC)) == 0 || 3631 (fp->f_flag & FWRITE) != 0)) 3632 error = EBADF; 3633 break; 3634 case 0: 3635 break; 3636 default: 3637 KASSERT(0, ("wrong flags")); 3638 } 3639 3640 if (error != 0) { 3641 fdrop(fp, td); 3642 return (error); 3643 } 3644 3645 *fpp = fp; 3646 return (0); 3647 } 3648 3649 int 3650 fget(struct thread *td, int fd, const cap_rights_t *rightsp, struct file **fpp) 3651 { 3652 3653 return (_fget(td, fd, fpp, 0, rightsp)); 3654 } 3655 3656 int 3657 fget_mmap(struct thread *td, int fd, const cap_rights_t *rightsp, 3658 vm_prot_t *maxprotp, struct file **fpp) 3659 { 3660 int error; 3661 #ifndef CAPABILITIES 3662 error = _fget(td, fd, fpp, 0, rightsp); 3663 if (maxprotp != NULL) 3664 *maxprotp = VM_PROT_ALL; 3665 return (error); 3666 #else 3667 cap_rights_t fdrights; 3668 struct filedesc *fdp; 3669 struct file *fp; 3670 seqc_t seq; 3671 3672 *fpp = NULL; 3673 fdp = td->td_proc->p_fd; 3674 MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); 3675 for (;;) { 3676 error = fget_unlocked_seq(td, fd, rightsp, NULL, &fp, &seq); 3677 if (__predict_false(error != 0)) 3678 return (error); 3679 if (__predict_false(fp->f_ops == &badfileops)) { 3680 fdrop(fp, td); 3681 return (EBADF); 3682 } 3683 if (maxprotp != NULL) 3684 fdrights = *cap_rights(fdp, fd); 3685 if (!fd_modified(fdp, fd, seq)) 3686 break; 3687 fdrop(fp, td); 3688 } 3689 3690 /* 3691 * If requested, convert capability rights to access flags. 3692 */ 3693 if (maxprotp != NULL) 3694 *maxprotp = cap_rights_to_vmprot(&fdrights); 3695 *fpp = fp; 3696 return (0); 3697 #endif 3698 } 3699 3700 int 3701 fget_read(struct thread *td, int fd, const cap_rights_t *rightsp, 3702 struct file **fpp) 3703 { 3704 3705 return (_fget(td, fd, fpp, FREAD, rightsp)); 3706 } 3707 3708 int 3709 fget_write(struct thread *td, int fd, const cap_rights_t *rightsp, 3710 struct file **fpp) 3711 { 3712 3713 return (_fget(td, fd, fpp, FWRITE, rightsp)); 3714 } 3715 3716 int 3717 fget_fcntl(struct thread *td, int fd, const cap_rights_t *rightsp, 3718 int needfcntl, struct file **fpp) 3719 { 3720 #ifndef CAPABILITIES 3721 return (fget_unlocked(td, fd, rightsp, fpp)); 3722 #else 3723 struct filedesc *fdp = td->td_proc->p_fd; 3724 struct file *fp; 3725 int error; 3726 seqc_t seq; 3727 3728 *fpp = NULL; 3729 MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); 3730 for (;;) { 3731 error = fget_unlocked_seq(td, fd, rightsp, NULL, &fp, &seq); 3732 if (error != 0) 3733 return (error); 3734 error = cap_fcntl_check(fdp, fd, needfcntl); 3735 if (!fd_modified(fdp, fd, seq)) 3736 break; 3737 fdrop(fp, td); 3738 } 3739 if (error != 0) { 3740 fdrop(fp, td); 3741 return (error); 3742 } 3743 *fpp = fp; 3744 return (0); 3745 #endif 3746 } 3747 3748 /* 3749 * Like fget() but loads the underlying vnode, or returns an error if the 3750 * descriptor does not represent a vnode. Note that pipes use vnodes but 3751 * never have VM objects. The returned vnode will be vref()'d. 3752 * 3753 * XXX: what about the unused flags ? 
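 *
 * Typical use (illustrative): fgetvp_read(td, fd, &cap_read_rights, &vp)
 * returns the referenced vnode behind fd provided the descriptor was
 * opened for reading; the caller is responsible for the matching vrele().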
3754 */ 3755 static __inline int 3756 _fgetvp(struct thread *td, int fd, int flags, const cap_rights_t *needrightsp, 3757 struct vnode **vpp) 3758 { 3759 struct file *fp; 3760 int error; 3761 3762 *vpp = NULL; 3763 error = _fget(td, fd, &fp, flags, needrightsp); 3764 if (error != 0) 3765 return (error); 3766 if (fp->f_vnode == NULL) { 3767 error = EINVAL; 3768 } else { 3769 *vpp = fp->f_vnode; 3770 vrefact(*vpp); 3771 } 3772 fdrop(fp, td); 3773 3774 return (error); 3775 } 3776 3777 int 3778 fgetvp(struct thread *td, int fd, const cap_rights_t *rightsp, 3779 struct vnode **vpp) 3780 { 3781 3782 return (_fgetvp(td, fd, 0, rightsp, vpp)); 3783 } 3784 3785 int 3786 fgetvp_rights(struct thread *td, int fd, const cap_rights_t *needrightsp, 3787 struct filecaps *havecaps, struct vnode **vpp) 3788 { 3789 struct filecaps caps; 3790 struct file *fp; 3791 int error; 3792 3793 error = fget_cap(td, fd, needrightsp, NULL, &fp, &caps); 3794 if (error != 0) 3795 return (error); 3796 if (fp->f_ops == &badfileops) { 3797 error = EBADF; 3798 goto out; 3799 } 3800 if (fp->f_vnode == NULL) { 3801 error = EINVAL; 3802 goto out; 3803 } 3804 3805 *havecaps = caps; 3806 *vpp = fp->f_vnode; 3807 vrefact(*vpp); 3808 fdrop(fp, td); 3809 3810 return (0); 3811 out: 3812 filecaps_free(&caps); 3813 fdrop(fp, td); 3814 return (error); 3815 } 3816 3817 int 3818 fgetvp_read(struct thread *td, int fd, const cap_rights_t *rightsp, 3819 struct vnode **vpp) 3820 { 3821 3822 return (_fgetvp(td, fd, FREAD, rightsp, vpp)); 3823 } 3824 3825 int 3826 fgetvp_exec(struct thread *td, int fd, const cap_rights_t *rightsp, 3827 struct vnode **vpp) 3828 { 3829 3830 return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); 3831 } 3832 3833 #ifdef notyet 3834 int 3835 fgetvp_write(struct thread *td, int fd, const cap_rights_t *rightsp, 3836 struct vnode **vpp) 3837 { 3838 3839 return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); 3840 } 3841 #endif 3842 3843 /* 3844 * Handle the last reference to a file being closed. 3845 * 3846 * Without the noinline attribute clang keeps inlining the function throughout 3847 * this file wherever fdrop is used. 3848 */ 3849 int __noinline 3850 _fdrop(struct file *fp, struct thread *td) 3851 { 3852 int error; 3853 3854 KASSERT(refcount_load(&fp->f_count) == 0, 3855 ("fdrop: fp %p count %d", fp, refcount_load(&fp->f_count))); 3856 3857 error = fo_close(fp, td); 3858 atomic_subtract_int(&openfiles, 1); 3859 crfree(fp->f_cred); 3860 free(fp->f_advice, M_FADVISE); 3861 uma_zfree(file_zone, fp); 3862 3863 return (error); 3864 } 3865 3866 /* 3867 * Apply an advisory lock on a file descriptor. 3868 * 3869 * Just attempt to get a record lock of the requested type on the entire file 3870 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
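 *
 * For example (illustrative), flock(fd, LOCK_EX | LOCK_NB) becomes an
 * F_WRLCK request over the whole file with F_FLOCK semantics and no
 * F_WAIT, failing immediately if the lock is contested.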
3871 */ 3872 #ifndef _SYS_SYSPROTO_H_ 3873 struct flock_args { 3874 int fd; 3875 int how; 3876 }; 3877 #endif 3878 /* ARGSUSED */ 3879 int 3880 sys_flock(struct thread *td, struct flock_args *uap) 3881 { 3882 struct file *fp; 3883 struct vnode *vp; 3884 struct flock lf; 3885 int error; 3886 3887 error = fget(td, uap->fd, &cap_flock_rights, &fp); 3888 if (error != 0) 3889 return (error); 3890 error = EOPNOTSUPP; 3891 if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { 3892 goto done; 3893 } 3894 if (fp->f_ops == &path_fileops) { 3895 goto done; 3896 } 3897 3898 error = 0; 3899 vp = fp->f_vnode; 3900 lf.l_whence = SEEK_SET; 3901 lf.l_start = 0; 3902 lf.l_len = 0; 3903 if (uap->how & LOCK_UN) { 3904 lf.l_type = F_UNLCK; 3905 atomic_clear_int(&fp->f_flag, FHASLOCK); 3906 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); 3907 goto done; 3908 } 3909 if (uap->how & LOCK_EX) 3910 lf.l_type = F_WRLCK; 3911 else if (uap->how & LOCK_SH) 3912 lf.l_type = F_RDLCK; 3913 else { 3914 error = EBADF; 3915 goto done; 3916 } 3917 atomic_set_int(&fp->f_flag, FHASLOCK); 3918 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 3919 (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); 3920 done: 3921 fdrop(fp, td); 3922 return (error); 3923 } 3924 /* 3925 * Duplicate the specified descriptor to a free descriptor. 3926 */ 3927 int 3928 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, 3929 int openerror, int *indxp) 3930 { 3931 struct filedescent *newfde, *oldfde; 3932 struct file *fp; 3933 u_long *ioctls; 3934 int error, indx; 3935 3936 KASSERT(openerror == ENODEV || openerror == ENXIO, 3937 ("unexpected error %d in %s", openerror, __func__)); 3938 3939 /* 3940 * If the to-be-dup'd fd number is greater than the allowed number 3941 * of file descriptors, or the fd to be dup'd has already been 3942 * closed, then reject. 3943 */ 3944 FILEDESC_XLOCK(fdp); 3945 if ((fp = fget_noref(fdp, dfd)) == NULL) { 3946 FILEDESC_XUNLOCK(fdp); 3947 return (EBADF); 3948 } 3949 3950 error = fdalloc(td, 0, &indx); 3951 if (error != 0) { 3952 FILEDESC_XUNLOCK(fdp); 3953 return (error); 3954 } 3955 3956 /* 3957 * There are two cases of interest here. 3958 * 3959 * For ENODEV simply dup (dfd) to file descriptor (indx) and return. 3960 * 3961 * For ENXIO steal away the file structure from (dfd) and store it in 3962 * (indx). (dfd) is effectively closed by this operation. 3963 */ 3964 switch (openerror) { 3965 case ENODEV: 3966 /* 3967 * Check that the mode the file is being opened for is a 3968 * subset of the mode of the existing descriptor. 3969 */ 3970 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { 3971 fdunused(fdp, indx); 3972 FILEDESC_XUNLOCK(fdp); 3973 return (EACCES); 3974 } 3975 if (!fhold(fp)) { 3976 fdunused(fdp, indx); 3977 FILEDESC_XUNLOCK(fdp); 3978 return (EBADF); 3979 } 3980 newfde = &fdp->fd_ofiles[indx]; 3981 oldfde = &fdp->fd_ofiles[dfd]; 3982 ioctls = filecaps_copy_prep(&oldfde->fde_caps); 3983 #ifdef CAPABILITIES 3984 seqc_write_begin(&newfde->fde_seqc); 3985 #endif 3986 fde_copy(oldfde, newfde); 3987 filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, 3988 ioctls); 3989 #ifdef CAPABILITIES 3990 seqc_write_end(&newfde->fde_seqc); 3991 #endif 3992 break; 3993 case ENXIO: 3994 /* 3995 * Steal away the file pointer from dfd and stuff it into indx. 
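 * After the copy the source slot is cleared and handed back to the free
 * map via fdunused(), so the net effect for the caller is that dfd has
 * been renamed to indx.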
3996 */ 3997 newfde = &fdp->fd_ofiles[indx]; 3998 oldfde = &fdp->fd_ofiles[dfd]; 3999 #ifdef CAPABILITIES 4000 seqc_write_begin(&oldfde->fde_seqc); 4001 seqc_write_begin(&newfde->fde_seqc); 4002 #endif 4003 fde_copy(oldfde, newfde); 4004 oldfde->fde_file = NULL; 4005 fdunused(fdp, dfd); 4006 #ifdef CAPABILITIES 4007 seqc_write_end(&newfde->fde_seqc); 4008 seqc_write_end(&oldfde->fde_seqc); 4009 #endif 4010 break; 4011 } 4012 FILEDESC_XUNLOCK(fdp); 4013 *indxp = indx; 4014 return (0); 4015 } 4016 4017 /* 4018 * This sysctl determines if we will allow a process to chroot(2) if it 4019 * has a directory open: 4020 * 0: disallowed for all processes. 4021 * 1: allowed for processes that were not already chroot(2)'ed. 4022 * 2: allowed for all processes. 4023 */ 4024 4025 static int chroot_allow_open_directories = 1; 4026 4027 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, 4028 &chroot_allow_open_directories, 0, 4029 "Allow a process to chroot(2) if it has a directory open"); 4030 4031 /* 4032 * Helper function for the raised chroot(2) security levels: refuse the 4033 * operation if any file descriptors are open directories. 4034 */ 4035 static int 4036 chroot_refuse_vdir_fds(struct filedesc *fdp) 4037 { 4038 struct vnode *vp; 4039 struct file *fp; 4040 int i; 4041 4042 FILEDESC_LOCK_ASSERT(fdp); 4043 4044 FILEDESC_FOREACH_FP(fdp, i, fp) { 4045 if (fp->f_type == DTYPE_VNODE) { 4046 vp = fp->f_vnode; 4047 if (vp->v_type == VDIR) 4048 return (EPERM); 4049 } 4050 } 4051 return (0); 4052 } 4053 4054 static void 4055 pwd_fill(struct pwd *oldpwd, struct pwd *newpwd) 4056 { 4057 4058 if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) { 4059 vrefact(oldpwd->pwd_cdir); 4060 newpwd->pwd_cdir = oldpwd->pwd_cdir; 4061 } 4062 4063 if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) { 4064 vrefact(oldpwd->pwd_rdir); 4065 newpwd->pwd_rdir = oldpwd->pwd_rdir; 4066 } 4067 4068 if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) { 4069 vrefact(oldpwd->pwd_jdir); 4070 newpwd->pwd_jdir = oldpwd->pwd_jdir; 4071 } 4072 4073 if (newpwd->pwd_adir == NULL && oldpwd->pwd_adir != NULL) { 4074 vrefact(oldpwd->pwd_adir); 4075 newpwd->pwd_adir = oldpwd->pwd_adir; 4076 } 4077 } 4078 4079 struct pwd * 4080 pwd_hold_pwddesc(struct pwddesc *pdp) 4081 { 4082 struct pwd *pwd; 4083 4084 PWDDESC_ASSERT_XLOCKED(pdp); 4085 pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4086 if (pwd != NULL) 4087 refcount_acquire(&pwd->pwd_refcount); 4088 return (pwd); 4089 } 4090 4091 bool 4092 pwd_hold_smr(struct pwd *pwd) 4093 { 4094 4095 MPASS(pwd != NULL); 4096 if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) { 4097 return (true); 4098 } 4099 return (false); 4100 } 4101 4102 struct pwd * 4103 pwd_hold(struct thread *td) 4104 { 4105 struct pwddesc *pdp; 4106 struct pwd *pwd; 4107 4108 pdp = td->td_proc->p_pd; 4109 4110 vfs_smr_enter(); 4111 pwd = vfs_smr_entered_load(&pdp->pd_pwd); 4112 if (pwd_hold_smr(pwd)) { 4113 vfs_smr_exit(); 4114 return (pwd); 4115 } 4116 vfs_smr_exit(); 4117 PWDDESC_XLOCK(pdp); 4118 pwd = pwd_hold_pwddesc(pdp); 4119 MPASS(pwd != NULL); 4120 PWDDESC_XUNLOCK(pdp); 4121 return (pwd); 4122 } 4123 4124 struct pwd * 4125 pwd_hold_proc(struct proc *p) 4126 { 4127 struct pwddesc *pdp; 4128 struct pwd *pwd; 4129 4130 PROC_ASSERT_HELD(p); 4131 PROC_LOCK(p); 4132 pdp = pdhold(p); 4133 MPASS(pdp != NULL); 4134 PROC_UNLOCK(p); 4135 4136 PWDDESC_XLOCK(pdp); 4137 pwd = pwd_hold_pwddesc(pdp); 4138 MPASS(pwd != NULL); 4139 PWDDESC_XUNLOCK(pdp); 4140 pddrop(pdp); 4141 return (pwd); 4142 } 4143 4144 static
struct pwd * 4145 pwd_alloc(void) 4146 { 4147 struct pwd *pwd; 4148 4149 pwd = uma_zalloc_smr(pwd_zone, M_WAITOK); 4150 bzero(pwd, sizeof(*pwd)); 4151 refcount_init(&pwd->pwd_refcount, 1); 4152 return (pwd); 4153 } 4154 4155 void 4156 pwd_drop(struct pwd *pwd) 4157 { 4158 4159 if (!refcount_release(&pwd->pwd_refcount)) 4160 return; 4161 4162 if (pwd->pwd_cdir != NULL) 4163 vrele(pwd->pwd_cdir); 4164 if (pwd->pwd_rdir != NULL) 4165 vrele(pwd->pwd_rdir); 4166 if (pwd->pwd_jdir != NULL) 4167 vrele(pwd->pwd_jdir); 4168 if (pwd->pwd_adir != NULL) 4169 vrele(pwd->pwd_adir); 4170 uma_zfree_smr(pwd_zone, pwd); 4171 } 4172 4173 /* 4174 * The caller is responsible for invoking priv_check() and 4175 * mac_vnode_check_chroot() to authorize this operation. 4176 */ 4177 int 4178 pwd_chroot(struct thread *td, struct vnode *vp) 4179 { 4180 struct pwddesc *pdp; 4181 struct filedesc *fdp; 4182 struct pwd *newpwd, *oldpwd; 4183 int error; 4184 4185 fdp = td->td_proc->p_fd; 4186 pdp = td->td_proc->p_pd; 4187 newpwd = pwd_alloc(); 4188 FILEDESC_SLOCK(fdp); 4189 PWDDESC_XLOCK(pdp); 4190 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4191 if (chroot_allow_open_directories == 0 || 4192 (chroot_allow_open_directories == 1 && 4193 oldpwd->pwd_rdir != rootvnode)) { 4194 error = chroot_refuse_vdir_fds(fdp); 4195 FILEDESC_SUNLOCK(fdp); 4196 if (error != 0) { 4197 PWDDESC_XUNLOCK(pdp); 4198 pwd_drop(newpwd); 4199 return (error); 4200 } 4201 } else { 4202 FILEDESC_SUNLOCK(fdp); 4203 } 4204 4205 vrefact(vp); 4206 newpwd->pwd_rdir = vp; 4207 vrefact(vp); 4208 newpwd->pwd_adir = vp; 4209 if (oldpwd->pwd_jdir == NULL) { 4210 vrefact(vp); 4211 newpwd->pwd_jdir = vp; 4212 } 4213 pwd_fill(oldpwd, newpwd); 4214 pwd_set(pdp, newpwd); 4215 PWDDESC_XUNLOCK(pdp); 4216 pwd_drop(oldpwd); 4217 return (0); 4218 } 4219 4220 void 4221 pwd_chdir(struct thread *td, struct vnode *vp) 4222 { 4223 struct pwddesc *pdp; 4224 struct pwd *newpwd, *oldpwd; 4225 4226 VNPASS(vp->v_usecount > 0, vp); 4227 4228 newpwd = pwd_alloc(); 4229 pdp = td->td_proc->p_pd; 4230 PWDDESC_XLOCK(pdp); 4231 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4232 newpwd->pwd_cdir = vp; 4233 pwd_fill(oldpwd, newpwd); 4234 pwd_set(pdp, newpwd); 4235 PWDDESC_XUNLOCK(pdp); 4236 pwd_drop(oldpwd); 4237 } 4238 4239 /* 4240 * Process is transitioning to/from a non-native ABI. 4241 */ 4242 void 4243 pwd_altroot(struct thread *td, struct vnode *altroot_vp) 4244 { 4245 struct pwddesc *pdp; 4246 struct pwd *newpwd, *oldpwd; 4247 4248 newpwd = pwd_alloc(); 4249 pdp = td->td_proc->p_pd; 4250 PWDDESC_XLOCK(pdp); 4251 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4252 if (altroot_vp != NULL) { 4253 /* 4254 * Native process to a non-native ABI. 4255 */ 4256 4257 vrefact(altroot_vp); 4258 newpwd->pwd_adir = altroot_vp; 4259 } else { 4260 /* 4261 * Non-native process to the native ABI. 4262 */ 4263 4264 vrefact(oldpwd->pwd_rdir); 4265 newpwd->pwd_adir = oldpwd->pwd_rdir; 4266 } 4267 pwd_fill(oldpwd, newpwd); 4268 pwd_set(pdp, newpwd); 4269 PWDDESC_XUNLOCK(pdp); 4270 pwd_drop(oldpwd); 4271 } 4272 4273 /* 4274 * jail_attach(2) changes both root and working directories. 
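 * It behaves like a combined chroot(2) and chdir(2): unlike pwd_chroot()
 * above, the open-directory check is applied unconditionally, and the
 * current, root and alternate root directories (plus the jail directory,
 * if not yet set) all move to the new vnode.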
4275 */ 4276 int 4277 pwd_chroot_chdir(struct thread *td, struct vnode *vp) 4278 { 4279 struct pwddesc *pdp; 4280 struct filedesc *fdp; 4281 struct pwd *newpwd, *oldpwd; 4282 int error; 4283 4284 fdp = td->td_proc->p_fd; 4285 pdp = td->td_proc->p_pd; 4286 newpwd = pwd_alloc(); 4287 FILEDESC_SLOCK(fdp); 4288 PWDDESC_XLOCK(pdp); 4289 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4290 error = chroot_refuse_vdir_fds(fdp); 4291 FILEDESC_SUNLOCK(fdp); 4292 if (error != 0) { 4293 PWDDESC_XUNLOCK(pdp); 4294 pwd_drop(newpwd); 4295 return (error); 4296 } 4297 4298 vrefact(vp); 4299 newpwd->pwd_rdir = vp; 4300 vrefact(vp); 4301 newpwd->pwd_cdir = vp; 4302 if (oldpwd->pwd_jdir == NULL) { 4303 vrefact(vp); 4304 newpwd->pwd_jdir = vp; 4305 } 4306 vrefact(vp); 4307 newpwd->pwd_adir = vp; 4308 pwd_fill(oldpwd, newpwd); 4309 pwd_set(pdp, newpwd); 4310 PWDDESC_XUNLOCK(pdp); 4311 pwd_drop(oldpwd); 4312 return (0); 4313 } 4314 4315 void 4316 pwd_ensure_dirs(void) 4317 { 4318 struct pwddesc *pdp; 4319 struct pwd *oldpwd, *newpwd; 4320 4321 pdp = curproc->p_pd; 4322 PWDDESC_XLOCK(pdp); 4323 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4324 if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL && 4325 oldpwd->pwd_adir != NULL) { 4326 PWDDESC_XUNLOCK(pdp); 4327 return; 4328 } 4329 PWDDESC_XUNLOCK(pdp); 4330 4331 newpwd = pwd_alloc(); 4332 PWDDESC_XLOCK(pdp); 4333 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4334 pwd_fill(oldpwd, newpwd); 4335 if (newpwd->pwd_cdir == NULL) { 4336 vrefact(rootvnode); 4337 newpwd->pwd_cdir = rootvnode; 4338 } 4339 if (newpwd->pwd_rdir == NULL) { 4340 vrefact(rootvnode); 4341 newpwd->pwd_rdir = rootvnode; 4342 } 4343 if (newpwd->pwd_adir == NULL) { 4344 vrefact(rootvnode); 4345 newpwd->pwd_adir = rootvnode; 4346 } 4347 pwd_set(pdp, newpwd); 4348 PWDDESC_XUNLOCK(pdp); 4349 pwd_drop(oldpwd); 4350 } 4351 4352 void 4353 pwd_set_rootvnode(void) 4354 { 4355 struct pwddesc *pdp; 4356 struct pwd *oldpwd, *newpwd; 4357 4358 pdp = curproc->p_pd; 4359 4360 newpwd = pwd_alloc(); 4361 PWDDESC_XLOCK(pdp); 4362 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 4363 vrefact(rootvnode); 4364 newpwd->pwd_cdir = rootvnode; 4365 vrefact(rootvnode); 4366 newpwd->pwd_rdir = rootvnode; 4367 vrefact(rootvnode); 4368 newpwd->pwd_adir = rootvnode; 4369 pwd_fill(oldpwd, newpwd); 4370 pwd_set(pdp, newpwd); 4371 PWDDESC_XUNLOCK(pdp); 4372 pwd_drop(oldpwd); 4373 } 4374 4375 /* 4376 * Scan all active processes and prisons to see if any of them have a current 4377 * or root directory of `olddp'. If so, replace them with the new mount point. 
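 * Example (illustrative): mounting a filesystem over a directory that is
 * some shell's working directory; the shell's pwd_cdir is moved from the
 * covered vnode to the root of the new mount so that the old vnode is not
 * kept busy.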
/*
 * Scan all active processes and prisons to see if any of them have a current
 * or root directory of `olddp'.  If so, replace them with the new mount point.
 */
void
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
{
	struct pwddesc *pdp;
	struct pwd *newpwd, *oldpwd;
	struct prison *pr;
	struct proc *p;
	int nrele;

	if (vrefcnt(olddp) == 1)
		return;
	nrele = 0;
	newpwd = pwd_alloc();
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		pdp = pdhold(p);
		PROC_UNLOCK(p);
		if (pdp == NULL)
			continue;
		PWDDESC_XLOCK(pdp);
		oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
		if (oldpwd == NULL ||
		    (oldpwd->pwd_cdir != olddp &&
		    oldpwd->pwd_rdir != olddp &&
		    oldpwd->pwd_jdir != olddp &&
		    oldpwd->pwd_adir != olddp)) {
			PWDDESC_XUNLOCK(pdp);
			pddrop(pdp);
			continue;
		}
		if (oldpwd->pwd_cdir == olddp) {
			vrefact(newdp);
			newpwd->pwd_cdir = newdp;
		}
		if (oldpwd->pwd_rdir == olddp) {
			vrefact(newdp);
			newpwd->pwd_rdir = newdp;
		}
		if (oldpwd->pwd_jdir == olddp) {
			vrefact(newdp);
			newpwd->pwd_jdir = newdp;
		}
		if (oldpwd->pwd_adir == olddp) {
			vrefact(newdp);
			newpwd->pwd_adir = newdp;
		}
		pwd_fill(oldpwd, newpwd);
		pwd_set(pdp, newpwd);
		PWDDESC_XUNLOCK(pdp);
		pwd_drop(oldpwd);
		pddrop(pdp);
		newpwd = pwd_alloc();
	}
	sx_sunlock(&allproc_lock);
	pwd_drop(newpwd);
	if (rootvnode == olddp) {
		vrefact(newdp);
		rootvnode = newdp;
		nrele++;
	}
	mtx_lock(&prison0.pr_mtx);
	if (prison0.pr_root == olddp) {
		vrefact(newdp);
		prison0.pr_root = newdp;
		nrele++;
	}
	mtx_unlock(&prison0.pr_mtx);
	sx_slock(&allprison_lock);
	TAILQ_FOREACH(pr, &allprison, pr_list) {
		mtx_lock(&pr->pr_mtx);
		if (pr->pr_root == olddp) {
			vrefact(newdp);
			pr->pr_root = newdp;
			nrele++;
		}
		mtx_unlock(&pr->pr_mtx);
	}
	sx_sunlock(&allprison_lock);
	while (nrele--)
		vrele(olddp);
}

int
descrip_check_write_mp(struct filedesc *fdp, struct mount *mp)
{
	struct file *fp;
	struct vnode *vp;
	int error, i;

	error = 0;
	FILEDESC_SLOCK(fdp);
	FILEDESC_FOREACH_FP(fdp, i, fp) {
		if (fp->f_type != DTYPE_VNODE ||
		    (atomic_load_int(&fp->f_flag) & FWRITE) == 0)
			continue;
		vp = fp->f_vnode;
		if (vp->v_mount == mp) {
			error = EDEADLK;
			break;
		}
	}
	FILEDESC_SUNLOCK(fdp);
	return (error);
}

struct filedesc_to_leader *
filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp,
    struct proc *leader)
{
	struct filedesc_to_leader *fdtol;

	fdtol = malloc(sizeof(struct filedesc_to_leader),
	    M_FILEDESC_TO_LEADER, M_WAITOK);
	fdtol->fdl_refcount = 1;
	fdtol->fdl_holdcount = 0;
	fdtol->fdl_wakeup = 0;
	fdtol->fdl_leader = leader;
	if (old != NULL) {
		FILEDESC_XLOCK(fdp);
		fdtol->fdl_next = old->fdl_next;
		fdtol->fdl_prev = old;
		old->fdl_next = fdtol;
		fdtol->fdl_next->fdl_prev = fdtol;
		FILEDESC_XUNLOCK(fdp);
	} else {
		fdtol->fdl_next = fdtol;
		fdtol->fdl_prev = fdtol;
	}
	return (fdtol);
}

struct filedesc_to_leader *
filedesc_to_leader_share(struct filedesc_to_leader *fdtol, struct filedesc *fdp)
{
	FILEDESC_XLOCK(fdp);
	fdtol->fdl_refcount++;
	FILEDESC_XUNLOCK(fdp);
	return (fdtol);
}
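/*
 * The fdl_next/fdl_prev links managed above form a circular doubly
 * linked list: a lone entry points at itself, and a new entry is
 * spliced in directly after 'old'.  Illustrative two-entry state after
 * filedesc_to_leader_alloc(old, ...):
 *
 *	old->fdl_next == fdtol		fdtol->fdl_next == old
 *	old->fdl_prev == fdtol		fdtol->fdl_prev == old
 */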
static int
filedesc_nfiles(struct filedesc *fdp)
{
	NDSLOTTYPE *map;
	int count, off, minoff;

	if (fdp == NULL)
		return (0);
	count = 0;
	FILEDESC_SLOCK(fdp);
	map = fdp->fd_map;
	off = NDSLOT(fdp->fd_nfiles - 1);
	for (minoff = NDSLOT(0); off >= minoff; --off)
		count += bitcountl(map[off]);
	FILEDESC_SUNLOCK(fdp);
	return (count);
}

int
proc_nfiles(struct proc *p)
{
	struct filedesc *fdp;
	int res;

	PROC_LOCK(p);
	fdp = fdhold(p);
	PROC_UNLOCK(p);
	res = filedesc_nfiles(fdp);
	/*
	 * fdhold() returns NULL once the table is being torn down;
	 * filedesc_nfiles() tolerates that, so only drop a real hold.
	 */
	if (fdp != NULL)
		fddrop(fdp);
	return (res);
}

static int
sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
{
	u_int namelen;
	int count;

	namelen = arg2;
	if (namelen != 1)
		return (EINVAL);

	if (*(int *)arg1 != 0)
		return (EINVAL);

	count = filedesc_nfiles(curproc->p_fd);
	return (SYSCTL_OUT(req, &count, sizeof(count)));
}

static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
    CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
    "Number of open file descriptors");
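/*
 * Userland usage sketch for the node above (illustrative, untested):
 * the handler requires exactly one trailing MIB element and it must be
 * 0, per the arg1/arg2 checks.
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_NFDS, 0 };
 *	int nfds;
 *	size_t len = sizeof(nfds);
 *	if (sysctl(mib, 4, &nfds, &len, NULL, 0) == 0)
 *		printf("%d descriptors open\n", nfds);
 */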
4647 */ 4648 if (error != 0) 4649 break; 4650 } 4651 nextproc: 4652 FILEDESC_SUNLOCK(fdp); 4653 fddrop(fdp); 4654 if (error) 4655 break; 4656 } 4657 sx_sunlock(&allproc_lock); 4658 return (error); 4659 } 4660 4661 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 4662 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); 4663 4664 #ifdef KINFO_FILE_SIZE 4665 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); 4666 #endif 4667 4668 static int 4669 xlate_fflags(int fflags) 4670 { 4671 static const struct { 4672 int fflag; 4673 int kf_fflag; 4674 } fflags_table[] = { 4675 { FAPPEND, KF_FLAG_APPEND }, 4676 { FASYNC, KF_FLAG_ASYNC }, 4677 { FFSYNC, KF_FLAG_FSYNC }, 4678 { FHASLOCK, KF_FLAG_HASLOCK }, 4679 { FNONBLOCK, KF_FLAG_NONBLOCK }, 4680 { FREAD, KF_FLAG_READ }, 4681 { FWRITE, KF_FLAG_WRITE }, 4682 { O_CREAT, KF_FLAG_CREAT }, 4683 { O_DIRECT, KF_FLAG_DIRECT }, 4684 { O_EXCL, KF_FLAG_EXCL }, 4685 { O_EXEC, KF_FLAG_EXEC }, 4686 { O_EXLOCK, KF_FLAG_EXLOCK }, 4687 { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, 4688 { O_SHLOCK, KF_FLAG_SHLOCK }, 4689 { O_TRUNC, KF_FLAG_TRUNC } 4690 }; 4691 unsigned int i; 4692 int kflags; 4693 4694 kflags = 0; 4695 for (i = 0; i < nitems(fflags_table); i++) 4696 if (fflags & fflags_table[i].fflag) 4697 kflags |= fflags_table[i].kf_fflag; 4698 return (kflags); 4699 } 4700 4701 /* Trim unused data from kf_path by truncating the structure size. */ 4702 void 4703 pack_kinfo(struct kinfo_file *kif) 4704 { 4705 4706 kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + 4707 strlen(kif->kf_path) + 1; 4708 kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); 4709 } 4710 4711 static void 4712 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, 4713 struct kinfo_file *kif, struct filedesc *fdp, int flags) 4714 { 4715 int error; 4716 4717 bzero(kif, sizeof(*kif)); 4718 4719 /* Set a default type to allow for empty fill_kinfo() methods. */ 4720 kif->kf_type = KF_TYPE_UNKNOWN; 4721 kif->kf_flags = xlate_fflags(fp->f_flag); 4722 if (rightsp != NULL) 4723 kif->kf_cap_rights = *rightsp; 4724 else 4725 cap_rights_init_zero(&kif->kf_cap_rights); 4726 kif->kf_fd = fd; 4727 kif->kf_ref_count = refcount_load(&fp->f_count); 4728 kif->kf_offset = foffset_get(fp); 4729 4730 /* 4731 * This may drop the filedesc lock, so the 'fp' cannot be 4732 * accessed after this call. 
4733 */ 4734 error = fo_fill_kinfo(fp, kif, fdp); 4735 if (error == 0) 4736 kif->kf_status |= KF_ATTR_VALID; 4737 if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) 4738 pack_kinfo(kif); 4739 else 4740 kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); 4741 } 4742 4743 static void 4744 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, 4745 struct kinfo_file *kif, int flags) 4746 { 4747 int error; 4748 4749 bzero(kif, sizeof(*kif)); 4750 4751 kif->kf_type = KF_TYPE_VNODE; 4752 error = vn_fill_kinfo_vnode(vp, kif); 4753 if (error == 0) 4754 kif->kf_status |= KF_ATTR_VALID; 4755 kif->kf_flags = xlate_fflags(fflags); 4756 cap_rights_init_zero(&kif->kf_cap_rights); 4757 kif->kf_fd = fd; 4758 kif->kf_ref_count = -1; 4759 kif->kf_offset = -1; 4760 if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) 4761 pack_kinfo(kif); 4762 else 4763 kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); 4764 vrele(vp); 4765 } 4766 4767 struct export_fd_buf { 4768 struct filedesc *fdp; 4769 struct pwddesc *pdp; 4770 struct sbuf *sb; 4771 ssize_t remainder; 4772 struct kinfo_file kif; 4773 int flags; 4774 }; 4775 4776 static int 4777 export_kinfo_to_sb(struct export_fd_buf *efbuf) 4778 { 4779 struct kinfo_file *kif; 4780 4781 kif = &efbuf->kif; 4782 if (efbuf->remainder != -1) { 4783 if (efbuf->remainder < kif->kf_structsize) 4784 return (ENOMEM); 4785 efbuf->remainder -= kif->kf_structsize; 4786 } 4787 if (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) != 0) 4788 return (sbuf_error(efbuf->sb)); 4789 return (0); 4790 } 4791 4792 static int 4793 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, 4794 struct export_fd_buf *efbuf) 4795 { 4796 int error; 4797 4798 if (efbuf->remainder == 0) 4799 return (ENOMEM); 4800 export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, 4801 efbuf->flags); 4802 FILEDESC_SUNLOCK(efbuf->fdp); 4803 error = export_kinfo_to_sb(efbuf); 4804 FILEDESC_SLOCK(efbuf->fdp); 4805 return (error); 4806 } 4807 4808 static int 4809 export_vnode_to_sb(struct vnode *vp, int fd, int fflags, 4810 struct export_fd_buf *efbuf) 4811 { 4812 int error; 4813 4814 if (efbuf->remainder == 0) 4815 return (ENOMEM); 4816 if (efbuf->pdp != NULL) 4817 PWDDESC_XUNLOCK(efbuf->pdp); 4818 export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); 4819 error = export_kinfo_to_sb(efbuf); 4820 if (efbuf->pdp != NULL) 4821 PWDDESC_XLOCK(efbuf->pdp); 4822 return (error); 4823 } 4824 4825 /* 4826 * Store a process file descriptor information to sbuf. 4827 * 4828 * Takes a locked proc as argument, and returns with the proc unlocked. 4829 */ 4830 int 4831 kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, 4832 int flags) 4833 { 4834 struct file *fp; 4835 struct filedesc *fdp; 4836 struct pwddesc *pdp; 4837 struct export_fd_buf *efbuf; 4838 struct vnode *cttyvp, *textvp, *tracevp; 4839 struct pwd *pwd; 4840 int error, i; 4841 cap_rights_t rights; 4842 4843 PROC_LOCK_ASSERT(p, MA_OWNED); 4844 4845 /* ktrace vnode */ 4846 tracevp = ktr_get_tracevp(p, true); 4847 /* text vnode */ 4848 textvp = p->p_textvp; 4849 if (textvp != NULL) 4850 vrefact(textvp); 4851 /* Controlling tty. 
/*
 * Store a process's file descriptor information in an sbuf.
 *
 * Takes a locked proc as argument, and returns with the proc unlocked.
 */
int
kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen,
    int flags)
{
	struct file *fp;
	struct filedesc *fdp;
	struct pwddesc *pdp;
	struct export_fd_buf *efbuf;
	struct vnode *cttyvp, *textvp, *tracevp;
	struct pwd *pwd;
	int error, i;
	cap_rights_t rights;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	/* ktrace vnode */
	tracevp = ktr_get_tracevp(p, true);
	/* text vnode */
	textvp = p->p_textvp;
	if (textvp != NULL)
		vrefact(textvp);
	/* Controlling tty. */
	cttyvp = NULL;
	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
		if (cttyvp != NULL)
			vrefact(cttyvp);
	}
	fdp = fdhold(p);
	pdp = pdhold(p);
	PROC_UNLOCK(p);

	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
	efbuf->fdp = NULL;
	efbuf->pdp = NULL;
	efbuf->sb = sb;
	efbuf->remainder = maxlen;
	efbuf->flags = flags;

	error = 0;
	if (tracevp != NULL)
		error = export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE,
		    FREAD | FWRITE, efbuf);
	if (error == 0 && textvp != NULL)
		error = export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD,
		    efbuf);
	if (error == 0 && cttyvp != NULL)
		error = export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY,
		    FREAD | FWRITE, efbuf);
	if (error != 0 || pdp == NULL || fdp == NULL)
		goto fail;
	efbuf->fdp = fdp;
	efbuf->pdp = pdp;
	PWDDESC_XLOCK(pdp);
	pwd = pwd_hold_pwddesc(pdp);
	if (pwd != NULL) {
		/* working directory */
		if (pwd->pwd_cdir != NULL) {
			vrefact(pwd->pwd_cdir);
			error = export_vnode_to_sb(pwd->pwd_cdir,
			    KF_FD_TYPE_CWD, FREAD, efbuf);
		}
		/* root directory */
		if (error == 0 && pwd->pwd_rdir != NULL) {
			vrefact(pwd->pwd_rdir);
			error = export_vnode_to_sb(pwd->pwd_rdir,
			    KF_FD_TYPE_ROOT, FREAD, efbuf);
		}
		/* jail directory */
		if (error == 0 && pwd->pwd_jdir != NULL) {
			vrefact(pwd->pwd_jdir);
			error = export_vnode_to_sb(pwd->pwd_jdir,
			    KF_FD_TYPE_JAIL, FREAD, efbuf);
		}
	}
	PWDDESC_XUNLOCK(pdp);
	if (error != 0)
		goto fail;
	if (pwd != NULL)
		pwd_drop(pwd);
	FILEDESC_SLOCK(fdp);
	if (refcount_load(&fdp->fd_refcnt) == 0)
		goto skip;
	FILEDESC_FOREACH_FP(fdp, i, fp) {
#ifdef CAPABILITIES
		rights = *cap_rights(fdp, i);
#else /* !CAPABILITIES */
		rights = cap_no_rights;
#endif
		/*
		 * Create sysctl entry.  It is OK to drop the filedesc
		 * lock inside of export_file_to_sb() as we will
		 * re-validate and re-evaluate its properties when the
		 * loop continues.
		 */
		error = export_file_to_sb(fp, i, &rights, efbuf);
		if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0)
			break;
	}
skip:
	FILEDESC_SUNLOCK(fdp);
fail:
	if (fdp != NULL)
		fddrop(fdp);
	if (pdp != NULL)
		pddrop(pdp);
	free(efbuf, M_TEMP);
	return (error);
}

#define	FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)

/*
 * Get per-process file descriptors for use by procstat(1), et al.
 */
static int
sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sb;
	struct proc *p;
	ssize_t maxlen;
	u_int namelen;
	int error, error2, *name;

	namelen = arg2;
	if (namelen != 1)
		return (EINVAL);

	name = (int *)arg1;

	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
	if (error != 0) {
		sbuf_delete(&sb);
		return (error);
	}
	maxlen = req->oldptr != NULL ? req->oldlen : -1;
	error = kern_proc_filedesc_out(p, &sb, maxlen,
	    KERN_FILEDESC_PACK_KINFO);
	error2 = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error != 0 ? error : error2);
}
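/*
 * Consumer sketch (illustrative, untested): the records produced above
 * are variable length when packed, so a walker must advance by
 * kf_structsize rather than sizeof(struct kinfo_file).  Assuming "buf"
 * and "len" came back from the sysctl:
 *
 *	struct kinfo_file *kif;
 *	char *pos = buf, *end = buf + len;
 *	while (pos < end) {
 *		kif = (struct kinfo_file *)(void *)pos;
 *		printf("fd %d: %s\n", kif->kf_fd, kif->kf_path);
 *		pos += kif->kf_structsize;
 *	}
 */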
#ifdef COMPAT_FREEBSD7
#ifdef KINFO_OFILE_SIZE
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
#endif

static void
kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
{

	okif->kf_structsize = sizeof(*okif);
	okif->kf_type = kif->kf_type;
	okif->kf_fd = kif->kf_fd;
	okif->kf_ref_count = kif->kf_ref_count;
	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
	okif->kf_offset = kif->kf_offset;
	if (kif->kf_type == KF_TYPE_VNODE)
		okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type;
	else
		okif->kf_vnode_type = KF_VTYPE_VNON;
	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
	if (kif->kf_type == KF_TYPE_SOCKET) {
		okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0;
		okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0;
		okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0;
		okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
		okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
	} else {
		okif->kf_sa_local.ss_family = AF_UNSPEC;
		okif->kf_sa_peer.ss_family = AF_UNSPEC;
	}
}

static int
export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
    struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req)
{
	int error;

	vrefact(vp);
	PWDDESC_XUNLOCK(pdp);
	export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
	kinfo_to_okinfo(kif, okif);
	error = SYSCTL_OUT(req, okif, sizeof(*okif));
	PWDDESC_XLOCK(pdp);
	return (error);
}
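/*
 * The conversion above intentionally loses information: kinfo_ofile is
 * the fixed-size FreeBSD 7 layout, so kf_flags is masked down to the
 * bits the old ABI defined and the newer union members are dropped.
 */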
5026 */ 5027 static int 5028 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) 5029 { 5030 struct kinfo_ofile *okif; 5031 struct kinfo_file *kif; 5032 struct filedesc *fdp; 5033 struct pwddesc *pdp; 5034 struct pwd *pwd; 5035 u_int namelen; 5036 int error, i, *name; 5037 struct file *fp; 5038 struct proc *p; 5039 5040 namelen = arg2; 5041 if (namelen != 1) 5042 return (EINVAL); 5043 5044 name = (int *)arg1; 5045 error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); 5046 if (error != 0) 5047 return (error); 5048 fdp = fdhold(p); 5049 if (fdp != NULL) 5050 pdp = pdhold(p); 5051 PROC_UNLOCK(p); 5052 if (fdp == NULL || pdp == NULL) { 5053 if (fdp != NULL) 5054 fddrop(fdp); 5055 return (ENOENT); 5056 } 5057 kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); 5058 okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); 5059 PWDDESC_XLOCK(pdp); 5060 pwd = pwd_hold_pwddesc(pdp); 5061 if (pwd != NULL) { 5062 if (pwd->pwd_cdir != NULL) 5063 export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif, 5064 okif, pdp, req); 5065 if (pwd->pwd_rdir != NULL) 5066 export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif, 5067 okif, pdp, req); 5068 if (pwd->pwd_jdir != NULL) 5069 export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif, 5070 okif, pdp, req); 5071 } 5072 PWDDESC_XUNLOCK(pdp); 5073 if (pwd != NULL) 5074 pwd_drop(pwd); 5075 FILEDESC_SLOCK(fdp); 5076 if (refcount_load(&fdp->fd_refcnt) == 0) 5077 goto skip; 5078 FILEDESC_FOREACH_FP(fdp, i, fp) { 5079 export_file_to_kinfo(fp, i, NULL, kif, fdp, 5080 KERN_FILEDESC_PACK_KINFO); 5081 FILEDESC_SUNLOCK(fdp); 5082 kinfo_to_okinfo(kif, okif); 5083 error = SYSCTL_OUT(req, okif, sizeof(*okif)); 5084 FILEDESC_SLOCK(fdp); 5085 if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0) 5086 break; 5087 } 5088 skip: 5089 FILEDESC_SUNLOCK(fdp); 5090 fddrop(fdp); 5091 pddrop(pdp); 5092 free(kif, M_TEMP); 5093 free(okif, M_TEMP); 5094 return (0); 5095 } 5096 5097 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, 5098 CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, 5099 "Process ofiledesc entries"); 5100 #endif /* COMPAT_FREEBSD7 */ 5101 5102 int 5103 vntype_to_kinfo(int vtype) 5104 { 5105 struct { 5106 int vtype; 5107 int kf_vtype; 5108 } vtypes_table[] = { 5109 { VBAD, KF_VTYPE_VBAD }, 5110 { VBLK, KF_VTYPE_VBLK }, 5111 { VCHR, KF_VTYPE_VCHR }, 5112 { VDIR, KF_VTYPE_VDIR }, 5113 { VFIFO, KF_VTYPE_VFIFO }, 5114 { VLNK, KF_VTYPE_VLNK }, 5115 { VNON, KF_VTYPE_VNON }, 5116 { VREG, KF_VTYPE_VREG }, 5117 { VSOCK, KF_VTYPE_VSOCK } 5118 }; 5119 unsigned int i; 5120 5121 /* 5122 * Perform vtype translation. 5123 */ 5124 for (i = 0; i < nitems(vtypes_table); i++) 5125 if (vtypes_table[i].vtype == vtype) 5126 return (vtypes_table[i].kf_vtype); 5127 5128 return (KF_VTYPE_UNKNOWN); 5129 } 5130 5131 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, 5132 CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, 5133 "Process filedesc entries"); 5134 5135 /* 5136 * Store a process current working directory information to sbuf. 5137 * 5138 * Takes a locked proc as argument, and returns with the proc unlocked. 
5139 */ 5140 int 5141 kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) 5142 { 5143 struct pwddesc *pdp; 5144 struct pwd *pwd; 5145 struct export_fd_buf *efbuf; 5146 struct vnode *cdir; 5147 int error; 5148 5149 PROC_LOCK_ASSERT(p, MA_OWNED); 5150 5151 pdp = pdhold(p); 5152 PROC_UNLOCK(p); 5153 if (pdp == NULL) 5154 return (EINVAL); 5155 5156 efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); 5157 efbuf->fdp = NULL; 5158 efbuf->pdp = pdp; 5159 efbuf->sb = sb; 5160 efbuf->remainder = maxlen; 5161 efbuf->flags = 0; 5162 5163 PWDDESC_XLOCK(pdp); 5164 pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); 5165 cdir = pwd->pwd_cdir; 5166 if (cdir == NULL) { 5167 error = EINVAL; 5168 } else { 5169 vrefact(cdir); 5170 error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf); 5171 } 5172 PWDDESC_XUNLOCK(pdp); 5173 pddrop(pdp); 5174 free(efbuf, M_TEMP); 5175 return (error); 5176 } 5177 5178 /* 5179 * Get per-process current working directory. 5180 */ 5181 static int 5182 sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) 5183 { 5184 struct sbuf sb; 5185 struct proc *p; 5186 ssize_t maxlen; 5187 u_int namelen; 5188 int error, error2, *name; 5189 5190 namelen = arg2; 5191 if (namelen != 1) 5192 return (EINVAL); 5193 5194 name = (int *)arg1; 5195 5196 sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); 5197 sbuf_clear_flags(&sb, SBUF_INCLUDENUL); 5198 error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); 5199 if (error != 0) { 5200 sbuf_delete(&sb); 5201 return (error); 5202 } 5203 maxlen = req->oldptr != NULL ? req->oldlen : -1; 5204 error = kern_proc_cwd_out(p, &sb, maxlen); 5205 error2 = sbuf_finish(&sb); 5206 sbuf_delete(&sb); 5207 return (error != 0 ? error : error2); 5208 } 5209 5210 static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, 5211 sysctl_kern_proc_cwd, "Process current working directory"); 5212 5213 #ifdef DDB 5214 /* 5215 * For the purposes of debugging, generate a human-readable string for the 5216 * file type. 5217 */ 5218 static const char * 5219 file_type_to_name(short type) 5220 { 5221 5222 switch (type) { 5223 case 0: 5224 return ("zero"); 5225 case DTYPE_VNODE: 5226 return ("vnode"); 5227 case DTYPE_SOCKET: 5228 return ("socket"); 5229 case DTYPE_PIPE: 5230 return ("pipe"); 5231 case DTYPE_FIFO: 5232 return ("fifo"); 5233 case DTYPE_KQUEUE: 5234 return ("kqueue"); 5235 case DTYPE_CRYPTO: 5236 return ("crypto"); 5237 case DTYPE_MQUEUE: 5238 return ("mqueue"); 5239 case DTYPE_SHM: 5240 return ("shm"); 5241 case DTYPE_SEM: 5242 return ("ksem"); 5243 case DTYPE_PTS: 5244 return ("pts"); 5245 case DTYPE_DEV: 5246 return ("dev"); 5247 case DTYPE_PROCDESC: 5248 return ("proc"); 5249 case DTYPE_EVENTFD: 5250 return ("eventfd"); 5251 case DTYPE_TIMERFD: 5252 return ("timerfd"); 5253 default: 5254 return ("unkn"); 5255 } 5256 } 5257 5258 /* 5259 * For the purposes of debugging, identify a process (if any, perhaps one of 5260 * many) that references the passed file in its file descriptor array. Return 5261 * NULL if none. 
5262 */ 5263 static struct proc * 5264 file_to_first_proc(struct file *fp) 5265 { 5266 struct filedesc *fdp; 5267 struct proc *p; 5268 int n; 5269 5270 FOREACH_PROC_IN_SYSTEM(p) { 5271 if (p->p_state == PRS_NEW) 5272 continue; 5273 fdp = p->p_fd; 5274 if (fdp == NULL) 5275 continue; 5276 for (n = 0; n < fdp->fd_nfiles; n++) { 5277 if (fp == fdp->fd_ofiles[n].fde_file) 5278 return (p); 5279 } 5280 } 5281 return (NULL); 5282 } 5283 5284 static void 5285 db_print_file(struct file *fp, int header) 5286 { 5287 #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4)) 5288 struct proc *p; 5289 5290 if (header) 5291 db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n", 5292 XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag", 5293 "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID", 5294 "FCmd"); 5295 p = file_to_first_proc(fp); 5296 db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH, 5297 fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data, 5298 fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode, 5299 p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); 5300 5301 #undef XPTRWIDTH 5302 } 5303 5304 DB_SHOW_COMMAND(file, db_show_file) 5305 { 5306 struct file *fp; 5307 5308 if (!have_addr) { 5309 db_printf("usage: show file <addr>\n"); 5310 return; 5311 } 5312 fp = (struct file *)addr; 5313 db_print_file(fp, 1); 5314 } 5315 5316 DB_SHOW_COMMAND_FLAGS(files, db_show_files, DB_CMD_MEMSAFE) 5317 { 5318 struct filedesc *fdp; 5319 struct file *fp; 5320 struct proc *p; 5321 int header; 5322 int n; 5323 5324 header = 1; 5325 FOREACH_PROC_IN_SYSTEM(p) { 5326 if (p->p_state == PRS_NEW) 5327 continue; 5328 if ((fdp = p->p_fd) == NULL) 5329 continue; 5330 for (n = 0; n < fdp->fd_nfiles; ++n) { 5331 if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) 5332 continue; 5333 db_print_file(fp, header); 5334 header = 0; 5335 } 5336 } 5337 } 5338 #endif 5339 5340 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, 5341 CTLFLAG_RWTUN | CTLFLAG_NOFETCH, 5342 &maxfilesperproc, 0, "Maximum files allowed open per process"); 5343 5344 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, 5345 &maxfiles, 0, "Maximum number of files"); 5346 5347 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, 5348 &openfiles, 0, "System-wide number of open files"); 5349 5350 /* ARGSUSED*/ 5351 static void 5352 filelistinit(void *dummy) 5353 { 5354 5355 file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, 5356 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 5357 filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), 5358 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 5359 pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL, 5360 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR); 5361 /* 5362 * XXXMJG this is a temporary hack due to boot ordering issues against 5363 * the vnode zone. 
5364 */ 5365 vfs_smr = uma_zone_get_smr(pwd_zone); 5366 mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); 5367 } 5368 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); 5369 5370 /*-------------------------------------------------------------------*/ 5371 5372 static int 5373 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, 5374 int flags, struct thread *td) 5375 { 5376 5377 return (EBADF); 5378 } 5379 5380 static int 5381 badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, 5382 struct thread *td) 5383 { 5384 5385 return (EINVAL); 5386 } 5387 5388 static int 5389 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, 5390 struct thread *td) 5391 { 5392 5393 return (EBADF); 5394 } 5395 5396 static int 5397 badfo_poll(struct file *fp, int events, struct ucred *active_cred, 5398 struct thread *td) 5399 { 5400 5401 return (0); 5402 } 5403 5404 static int 5405 badfo_kqfilter(struct file *fp, struct knote *kn) 5406 { 5407 5408 return (EBADF); 5409 } 5410 5411 static int 5412 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) 5413 { 5414 5415 return (EBADF); 5416 } 5417 5418 static int 5419 badfo_close(struct file *fp, struct thread *td) 5420 { 5421 5422 return (0); 5423 } 5424 5425 static int 5426 badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 5427 struct thread *td) 5428 { 5429 5430 return (EBADF); 5431 } 5432 5433 static int 5434 badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 5435 struct thread *td) 5436 { 5437 5438 return (EBADF); 5439 } 5440 5441 static int 5442 badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, 5443 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, 5444 struct thread *td) 5445 { 5446 5447 return (EBADF); 5448 } 5449 5450 static int 5451 badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 5452 { 5453 5454 return (0); 5455 } 5456 5457 const struct fileops badfileops = { 5458 .fo_read = badfo_readwrite, 5459 .fo_write = badfo_readwrite, 5460 .fo_truncate = badfo_truncate, 5461 .fo_ioctl = badfo_ioctl, 5462 .fo_poll = badfo_poll, 5463 .fo_kqfilter = badfo_kqfilter, 5464 .fo_stat = badfo_stat, 5465 .fo_close = badfo_close, 5466 .fo_chmod = badfo_chmod, 5467 .fo_chown = badfo_chown, 5468 .fo_sendfile = badfo_sendfile, 5469 .fo_fill_kinfo = badfo_fill_kinfo, 5470 }; 5471 5472 static int 5473 path_poll(struct file *fp, int events, struct ucred *active_cred, 5474 struct thread *td) 5475 { 5476 return (POLLNVAL); 5477 } 5478 5479 static int 5480 path_close(struct file *fp, struct thread *td) 5481 { 5482 MPASS(fp->f_type == DTYPE_VNODE); 5483 fp->f_ops = &badfileops; 5484 vrele(fp->f_vnode); 5485 return (0); 5486 } 5487 5488 const struct fileops path_fileops = { 5489 .fo_read = badfo_readwrite, 5490 .fo_write = badfo_readwrite, 5491 .fo_truncate = badfo_truncate, 5492 .fo_ioctl = badfo_ioctl, 5493 .fo_poll = path_poll, 5494 .fo_kqfilter = vn_kqfilter_opath, 5495 .fo_stat = vn_statfile, 5496 .fo_close = path_close, 5497 .fo_chmod = badfo_chmod, 5498 .fo_chown = badfo_chown, 5499 .fo_sendfile = badfo_sendfile, 5500 .fo_fill_kinfo = vn_fill_kinfo, 5501 .fo_cmp = vn_cmp, 5502 .fo_flags = DFLAG_PASSABLE, 5503 }; 5504 5505 int 5506 invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, 5507 int flags, struct thread *td) 5508 { 5509 5510 return (EOPNOTSUPP); 5511 } 5512 5513 int 5514 invfo_truncate(struct file *fp, off_t length, struct ucred 
int
invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

int
invfo_ioctl(struct file *fp, u_long com, void *data,
    struct ucred *active_cred, struct thread *td)
{

	return (ENOTTY);
}

int
invfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (poll_no_poll(events));
}

int
invfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EINVAL);
}

int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

int
invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    struct thread *td)
{

	return (EINVAL);
}

/*-------------------------------------------------------------------*/

/*
 * File Descriptor pseudo-device driver (/dev/fd/).
 *
 * Opening minor device N dup()s the file (if any) connected to file
 * descriptor N belonging to the calling process.  Note that this driver
 * consists of only the ``open()'' routine, because all subsequent
 * references to this file will be direct to the other driver.
 *
 * XXX: we could give this one a cloning event handler if necessary.
 */

/* ARGSUSED */
static int
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
{

	/*
	 * XXX Kludge: set curthread->td_dupfd to contain the value of
	 * the file descriptor being sought for duplication.  The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open.  Open will detect this special error and take the
	 * actions in dupfdopen below.  Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	td->td_dupfd = dev2unit(dev);
	return (ENODEV);
}

static struct cdevsw fildesc_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	fdopen,
	.d_name =	"FD",
};

static void
fildesc_drvinit(void *unused)
{
	struct cdev *dev;

	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
	make_dev_alias(dev, "stdin");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
	make_dev_alias(dev, "stdout");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
	make_dev_alias(dev, "stderr");
}

SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
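/*
 * Userland view of the kludge above (illustrative, untested): opening
 * /dev/fd/N behaves like dup(N), provided the requested access mode is
 * compatible with how descriptor N was opened.
 *
 *	int fd = open("/dev/fd/0", O_RDONLY);	(effectively dup(0))
 *
 * The open(2) path sees the special ENODEV return, notices td_dupfd,
 * and completes the operation via dupfdopen() instead of failing.
 */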