1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_capsicum.h" 41 #include "opt_compat.h" 42 #include "opt_ddb.h" 43 #include "opt_ktrace.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 48 #include <sys/capsicum.h> 49 #include <sys/conf.h> 50 #include <sys/fcntl.h> 51 #include <sys/file.h> 52 #include <sys/filedesc.h> 53 #include <sys/filio.h> 54 #include <sys/jail.h> 55 #include <sys/kernel.h> 56 #include <sys/limits.h> 57 #include <sys/lock.h> 58 #include <sys/malloc.h> 59 #include <sys/mount.h> 60 #include <sys/mutex.h> 61 #include <sys/namei.h> 62 #include <sys/selinfo.h> 63 #include <sys/priv.h> 64 #include <sys/proc.h> 65 #include <sys/protosw.h> 66 #include <sys/racct.h> 67 #include <sys/resourcevar.h> 68 #include <sys/sbuf.h> 69 #include <sys/signalvar.h> 70 #include <sys/socketvar.h> 71 #include <sys/stat.h> 72 #include <sys/sx.h> 73 #include <sys/syscallsubr.h> 74 #include <sys/sysctl.h> 75 #include <sys/sysproto.h> 76 #include <sys/unistd.h> 77 #include <sys/user.h> 78 #include <sys/vnode.h> 79 #ifdef KTRACE 80 #include <sys/ktrace.h> 81 #endif 82 83 #include <net/vnet.h> 84 85 #include <security/audit/audit.h> 86 87 #include <vm/uma.h> 88 #include <vm/vm.h> 89 90 #include <ddb/ddb.h> 91 92 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); 93 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", 94 "file desc to leader structures"); 95 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); 96 MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); 97 98 MALLOC_DECLARE(M_FADVISE); 99 100 static uma_zone_t file_zone; 101 static uma_zone_t filedesc0_zone; 102 103 static int closefp(struct filedesc *fdp, int fd, struct file *fp, 104 struct thread *td, int holdleaders); 105 static int do_dup(struct thread *td, int flags, int old, int new); 106 static int fd_first_free(struct filedesc *fdp, int low, int size); 107 
static int fd_last_used(struct filedesc *fdp, int size); 108 static void fdgrowtable(struct filedesc *fdp, int nfd); 109 static void fdgrowtable_exp(struct filedesc *fdp, int nfd); 110 static void fdunused(struct filedesc *fdp, int fd); 111 static void fdused(struct filedesc *fdp, int fd); 112 static int getmaxfd(struct proc *p); 113 114 /* Flags for do_dup() */ 115 #define DUP_FIXED 0x1 /* Force fixed allocation. */ 116 #define DUP_FCNTL 0x2 /* fcntl()-style errors. */ 117 #define DUP_CLOEXEC 0x4 /* Atomically set FD_CLOEXEC. */ 118 119 /* 120 * Each process has: 121 * 122 * - An array of open file descriptors (fd_ofiles) 123 * - An array of file flags (fd_ofileflags) 124 * - A bitmap recording which descriptors are in use (fd_map) 125 * 126 * A process starts out with NDFILE descriptors. The value of NDFILE has 127 * been selected based the historical limit of 20 open files, and an 128 * assumption that the majority of processes, especially short-lived 129 * processes like shells, will never need more. 130 * 131 * If this initial allocation is exhausted, a larger descriptor table and 132 * map are allocated dynamically, and the pointers in the process's struct 133 * filedesc are updated to point to those. This is repeated every time 134 * the process runs out of file descriptors (provided it hasn't hit its 135 * resource limit). 136 * 137 * Since threads may hold references to individual descriptor table 138 * entries, the tables are never freed. Instead, they are placed on a 139 * linked list and freed only when the struct filedesc is released. 140 */ 141 #define NDFILE 20 142 #define NDSLOTSIZE sizeof(NDSLOTTYPE) 143 #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) 144 #define NDSLOT(x) ((x) / NDENTRIES) 145 #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) 146 #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) 147 148 /* 149 * SLIST entry used to keep track of ofiles which must be reclaimed when 150 * the process exits. 
151 */ 152 struct freetable { 153 struct fdescenttbl *ft_table; 154 SLIST_ENTRY(freetable) ft_next; 155 }; 156 157 /* 158 * Initial allocation: a filedesc structure + the head of SLIST used to 159 * keep track of old ofiles + enough space for NDFILE descriptors. 160 */ 161 162 struct fdescenttbl0 { 163 int fdt_nfiles; 164 struct filedescent fdt_ofiles[NDFILE]; 165 }; 166 167 struct filedesc0 { 168 struct filedesc fd_fd; 169 SLIST_HEAD(, freetable) fd_free; 170 struct fdescenttbl0 fd_dfiles; 171 NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; 172 }; 173 174 /* 175 * Descriptor management. 176 */ 177 volatile int openfiles; /* actual number of open files */ 178 struct mtx sigio_lock; /* mtx to protect pointers to sigio */ 179 void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); 180 181 /* A mutex to protect the association between a proc and filedesc. */ 182 static struct mtx fdesc_mtx; 183 184 /* 185 * If low >= size, just return low. Otherwise find the first zero bit in the 186 * given bitmap, starting at low and not exceeding size - 1. Return size if 187 * not found. 188 */ 189 static int 190 fd_first_free(struct filedesc *fdp, int low, int size) 191 { 192 NDSLOTTYPE *map = fdp->fd_map; 193 NDSLOTTYPE mask; 194 int off, maxoff; 195 196 if (low >= size) 197 return (low); 198 199 off = NDSLOT(low); 200 if (low % NDENTRIES) { 201 mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); 202 if ((mask &= ~map[off]) != 0UL) 203 return (off * NDENTRIES + ffsl(mask) - 1); 204 ++off; 205 } 206 for (maxoff = NDSLOTS(size); off < maxoff; ++off) 207 if (map[off] != ~0UL) 208 return (off * NDENTRIES + ffsl(~map[off]) - 1); 209 return (size); 210 } 211 212 /* 213 * Find the highest non-zero bit in the given bitmap, starting at 0 and 214 * not exceeding size - 1. Return -1 if not found. 
215 */ 216 static int 217 fd_last_used(struct filedesc *fdp, int size) 218 { 219 NDSLOTTYPE *map = fdp->fd_map; 220 NDSLOTTYPE mask; 221 int off, minoff; 222 223 off = NDSLOT(size); 224 if (size % NDENTRIES) { 225 mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); 226 if ((mask &= map[off]) != 0) 227 return (off * NDENTRIES + flsl(mask) - 1); 228 --off; 229 } 230 for (minoff = NDSLOT(0); off >= minoff; --off) 231 if (map[off] != 0) 232 return (off * NDENTRIES + flsl(map[off]) - 1); 233 return (-1); 234 } 235 236 #ifdef INVARIANTS 237 static int 238 fdisused(struct filedesc *fdp, int fd) 239 { 240 241 KASSERT(fd >= 0 && fd < fdp->fd_nfiles, 242 ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); 243 244 return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); 245 } 246 #endif 247 248 /* 249 * Mark a file descriptor as used. 250 */ 251 static void 252 fdused_init(struct filedesc *fdp, int fd) 253 { 254 255 KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); 256 257 fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); 258 } 259 260 static void 261 fdused(struct filedesc *fdp, int fd) 262 { 263 264 FILEDESC_XLOCK_ASSERT(fdp); 265 266 fdused_init(fdp, fd); 267 if (fd > fdp->fd_lastfile) 268 fdp->fd_lastfile = fd; 269 if (fd == fdp->fd_freefile) 270 fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); 271 } 272 273 /* 274 * Mark a file descriptor as unused. 275 */ 276 static void 277 fdunused(struct filedesc *fdp, int fd) 278 { 279 280 FILEDESC_XLOCK_ASSERT(fdp); 281 282 KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); 283 KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, 284 ("fd=%d is still in use", fd)); 285 286 fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); 287 if (fd < fdp->fd_freefile) 288 fdp->fd_freefile = fd; 289 if (fd == fdp->fd_lastfile) 290 fdp->fd_lastfile = fd_last_used(fdp, fd); 291 } 292 293 /* 294 * Free a file descriptor. 295 * 296 * Avoid some work if fdp is about to be destroyed. 
*/
/* Release the capability data attached to a descriptor table entry. */
static inline void
fdefree_last(struct filedescent *fde)
{

	filecaps_free(&fde->fde_caps);
}

/*
 * Release descriptor fd: free its capability data, zero the first
 * fde_change_size bytes of the entry and clear its bit in the bitmap.
 * With CAPABILITIES the update is bracketed by seq_write_begin() and
 * seq_write_end() on the entry's sequence counter.
 */
static inline void
fdfree(struct filedesc *fdp, int fd)
{
	struct filedescent *fde;

	fde = &fdp->fd_ofiles[fd];
#ifdef CAPABILITIES
	seq_write_begin(&fde->fde_seq);
#endif
	fdefree_last(fde);
	bzero(fde, fde_change_size);
	/* fdunused() asserts that the filedesc lock is held exclusively. */
	fdunused(fdp, fd);
#ifdef CAPABILITIES
	seq_write_end(&fde->fde_seq);
#endif
}

/*
 * System calls on descriptors.
 */
#ifndef _SYS_SYSPROTO_H_
struct getdtablesize_args {
	int	dummy;
};
#endif
/*
 * Return the size of the descriptor table: the smaller of the
 * RLIMIT_NOFILE soft limit and maxfilesperproc, further capped by the
 * RACCT_NOFILE limit when that is lower.
 */
/* ARGSUSED */
int
sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
{
	struct proc *p = td->td_proc;
	uint64_t lim;

	/* Both limits are sampled under the process lock. */
	PROC_LOCK(p);
	td->td_retval[0] =
	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
	PROC_UNLOCK(p);
	if (lim < td->td_retval[0])
		td->td_retval[0] = lim;
	return (0);
}

/*
 * Duplicate a file descriptor to a particular value.
 *
 * Note: keep in mind that a potential race condition exists when closing
 * descriptors from a shared descriptor table (via rfork).
 */
#ifndef _SYS_SYSPROTO_H_
struct dup2_args {
	u_int	from;
	u_int	to;
};
#endif
/* ARGSUSED */
int
sys_dup2(struct thread *td, struct dup2_args *uap)
{

	/* The unsigned arguments are converted to int; do_dup() validates. */
	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to));
}

/*
 * Duplicate a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct dup_args {
	u_int	fd;
};
#endif
/* ARGSUSED */
int
sys_dup(struct thread *td, struct dup_args *uap)
{

	/* No flags and a floor of 0 for the new descriptor. */
	return (do_dup(td, 0, (int)uap->fd, 0));
}

/*
 * The file control system call.
385 */ 386 #ifndef _SYS_SYSPROTO_H_ 387 struct fcntl_args { 388 int fd; 389 int cmd; 390 long arg; 391 }; 392 #endif 393 /* ARGSUSED */ 394 int 395 sys_fcntl(struct thread *td, struct fcntl_args *uap) 396 { 397 398 return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); 399 } 400 401 int 402 kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) 403 { 404 struct flock fl; 405 struct __oflock ofl; 406 intptr_t arg1; 407 int error; 408 409 error = 0; 410 switch (cmd) { 411 case F_OGETLK: 412 case F_OSETLK: 413 case F_OSETLKW: 414 /* 415 * Convert old flock structure to new. 416 */ 417 error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); 418 fl.l_start = ofl.l_start; 419 fl.l_len = ofl.l_len; 420 fl.l_pid = ofl.l_pid; 421 fl.l_type = ofl.l_type; 422 fl.l_whence = ofl.l_whence; 423 fl.l_sysid = 0; 424 425 switch (cmd) { 426 case F_OGETLK: 427 cmd = F_GETLK; 428 break; 429 case F_OSETLK: 430 cmd = F_SETLK; 431 break; 432 case F_OSETLKW: 433 cmd = F_SETLKW; 434 break; 435 } 436 arg1 = (intptr_t)&fl; 437 break; 438 case F_GETLK: 439 case F_SETLK: 440 case F_SETLKW: 441 case F_SETLK_REMOTE: 442 error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); 443 arg1 = (intptr_t)&fl; 444 break; 445 default: 446 arg1 = arg; 447 break; 448 } 449 if (error) 450 return (error); 451 error = kern_fcntl(td, fd, cmd, arg1); 452 if (error) 453 return (error); 454 if (cmd == F_OGETLK) { 455 ofl.l_start = fl.l_start; 456 ofl.l_len = fl.l_len; 457 ofl.l_pid = fl.l_pid; 458 ofl.l_type = fl.l_type; 459 ofl.l_whence = fl.l_whence; 460 error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); 461 } else if (cmd == F_GETLK) { 462 error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); 463 } 464 return (error); 465 } 466 467 int 468 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) 469 { 470 struct filedesc *fdp; 471 struct flock *flp; 472 struct file *fp, *fp2; 473 struct filedescent *fde; 474 struct proc *p; 475 struct vnode *vp; 476 cap_rights_t rights; 477 int 
error, flg, tmp; 478 uint64_t bsize; 479 off_t foffset; 480 481 error = 0; 482 flg = F_POSIX; 483 p = td->td_proc; 484 fdp = p->p_fd; 485 486 switch (cmd) { 487 case F_DUPFD: 488 tmp = arg; 489 error = do_dup(td, DUP_FCNTL, fd, tmp); 490 break; 491 492 case F_DUPFD_CLOEXEC: 493 tmp = arg; 494 error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp); 495 break; 496 497 case F_DUP2FD: 498 tmp = arg; 499 error = do_dup(td, DUP_FIXED, fd, tmp); 500 break; 501 502 case F_DUP2FD_CLOEXEC: 503 tmp = arg; 504 error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp); 505 break; 506 507 case F_GETFD: 508 FILEDESC_SLOCK(fdp); 509 if (fget_locked(fdp, fd) == NULL) { 510 FILEDESC_SUNLOCK(fdp); 511 error = EBADF; 512 break; 513 } 514 fde = &fdp->fd_ofiles[fd]; 515 td->td_retval[0] = 516 (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; 517 FILEDESC_SUNLOCK(fdp); 518 break; 519 520 case F_SETFD: 521 FILEDESC_XLOCK(fdp); 522 if (fget_locked(fdp, fd) == NULL) { 523 FILEDESC_XUNLOCK(fdp); 524 error = EBADF; 525 break; 526 } 527 fde = &fdp->fd_ofiles[fd]; 528 fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | 529 (arg & FD_CLOEXEC ? 
UF_EXCLOSE : 0); 530 FILEDESC_XUNLOCK(fdp); 531 break; 532 533 case F_GETFL: 534 error = fget_fcntl(td, fd, 535 cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp); 536 if (error != 0) 537 break; 538 td->td_retval[0] = OFLAGS(fp->f_flag); 539 fdrop(fp, td); 540 break; 541 542 case F_SETFL: 543 error = fget_fcntl(td, fd, 544 cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp); 545 if (error != 0) 546 break; 547 do { 548 tmp = flg = fp->f_flag; 549 tmp &= ~FCNTLFLAGS; 550 tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; 551 } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); 552 tmp = fp->f_flag & FNONBLOCK; 553 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 554 if (error != 0) { 555 fdrop(fp, td); 556 break; 557 } 558 tmp = fp->f_flag & FASYNC; 559 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); 560 if (error == 0) { 561 fdrop(fp, td); 562 break; 563 } 564 atomic_clear_int(&fp->f_flag, FNONBLOCK); 565 tmp = 0; 566 (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 567 fdrop(fp, td); 568 break; 569 570 case F_GETOWN: 571 error = fget_fcntl(td, fd, 572 cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp); 573 if (error != 0) 574 break; 575 error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); 576 if (error == 0) 577 td->td_retval[0] = tmp; 578 fdrop(fp, td); 579 break; 580 581 case F_SETOWN: 582 error = fget_fcntl(td, fd, 583 cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp); 584 if (error != 0) 585 break; 586 tmp = arg; 587 error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); 588 fdrop(fp, td); 589 break; 590 591 case F_SETLK_REMOTE: 592 error = priv_check(td, PRIV_NFS_LOCKD); 593 if (error) 594 return (error); 595 flg = F_REMOTE; 596 goto do_setlk; 597 598 case F_SETLKW: 599 flg |= F_WAIT; 600 /* FALLTHROUGH F_SETLK */ 601 602 case F_SETLK: 603 do_setlk: 604 cap_rights_init(&rights, CAP_FLOCK); 605 error = fget_unlocked(fdp, fd, &rights, &fp, NULL); 606 if (error != 0) 607 break; 608 if (fp->f_type != DTYPE_VNODE) { 609 error = EBADF; 610 
fdrop(fp, td); 611 break; 612 } 613 614 flp = (struct flock *)arg; 615 if (flp->l_whence == SEEK_CUR) { 616 foffset = foffset_get(fp); 617 if (foffset < 0 || 618 (flp->l_start > 0 && 619 foffset > OFF_MAX - flp->l_start)) { 620 FILEDESC_SUNLOCK(fdp); 621 error = EOVERFLOW; 622 fdrop(fp, td); 623 break; 624 } 625 flp->l_start += foffset; 626 } 627 628 vp = fp->f_vnode; 629 switch (flp->l_type) { 630 case F_RDLCK: 631 if ((fp->f_flag & FREAD) == 0) { 632 error = EBADF; 633 break; 634 } 635 PROC_LOCK(p->p_leader); 636 p->p_leader->p_flag |= P_ADVLOCK; 637 PROC_UNLOCK(p->p_leader); 638 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, 639 flp, flg); 640 break; 641 case F_WRLCK: 642 if ((fp->f_flag & FWRITE) == 0) { 643 error = EBADF; 644 break; 645 } 646 PROC_LOCK(p->p_leader); 647 p->p_leader->p_flag |= P_ADVLOCK; 648 PROC_UNLOCK(p->p_leader); 649 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, 650 flp, flg); 651 break; 652 case F_UNLCK: 653 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, 654 flp, flg); 655 break; 656 case F_UNLCKSYS: 657 /* 658 * Temporary api for testing remote lock 659 * infrastructure. 660 */ 661 if (flg != F_REMOTE) { 662 error = EINVAL; 663 break; 664 } 665 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, 666 F_UNLCKSYS, flp, flg); 667 break; 668 default: 669 error = EINVAL; 670 break; 671 } 672 if (error != 0 || flp->l_type == F_UNLCK || 673 flp->l_type == F_UNLCKSYS) { 674 fdrop(fp, td); 675 break; 676 } 677 678 /* 679 * Check for a race with close. 680 * 681 * The vnode is now advisory locked (or unlocked, but this case 682 * is not really important) as the caller requested. 683 * We had to drop the filedesc lock, so we need to recheck if 684 * the descriptor is still valid, because if it was closed 685 * in the meantime we need to remove advisory lock from the 686 * vnode - close on any descriptor leading to an advisory 687 * locked vnode, removes that lock. 
688 * We will return 0 on purpose in that case, as the result of 689 * successful advisory lock might have been externally visible 690 * already. This is fine - effectively we pretend to the caller 691 * that the closing thread was a bit slower and that the 692 * advisory lock succeeded before the close. 693 */ 694 error = fget_unlocked(fdp, fd, &rights, &fp2, NULL); 695 if (error != 0) { 696 fdrop(fp, td); 697 break; 698 } 699 if (fp != fp2) { 700 flp->l_whence = SEEK_SET; 701 flp->l_start = 0; 702 flp->l_len = 0; 703 flp->l_type = F_UNLCK; 704 (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, 705 F_UNLCK, flp, F_POSIX); 706 } 707 fdrop(fp, td); 708 fdrop(fp2, td); 709 break; 710 711 case F_GETLK: 712 error = fget_unlocked(fdp, fd, 713 cap_rights_init(&rights, CAP_FLOCK), &fp, NULL); 714 if (error != 0) 715 break; 716 if (fp->f_type != DTYPE_VNODE) { 717 error = EBADF; 718 fdrop(fp, td); 719 break; 720 } 721 flp = (struct flock *)arg; 722 if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && 723 flp->l_type != F_UNLCK) { 724 error = EINVAL; 725 fdrop(fp, td); 726 break; 727 } 728 if (flp->l_whence == SEEK_CUR) { 729 foffset = foffset_get(fp); 730 if ((flp->l_start > 0 && 731 foffset > OFF_MAX - flp->l_start) || 732 (flp->l_start < 0 && 733 foffset < OFF_MIN - flp->l_start)) { 734 FILEDESC_SUNLOCK(fdp); 735 error = EOVERFLOW; 736 fdrop(fp, td); 737 break; 738 } 739 flp->l_start += foffset; 740 } 741 vp = fp->f_vnode; 742 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, 743 F_POSIX); 744 fdrop(fp, td); 745 break; 746 747 case F_RDAHEAD: 748 arg = arg ? 128 * 1024: 0; 749 /* FALLTHROUGH */ 750 case F_READAHEAD: 751 error = fget_unlocked(fdp, fd, NULL, &fp, NULL); 752 if (error != 0) 753 break; 754 if (fp->f_type != DTYPE_VNODE) { 755 fdrop(fp, td); 756 error = EBADF; 757 break; 758 } 759 vp = fp->f_vnode; 760 /* 761 * Exclusive lock synchronizes against f_seqcount reads and 762 * writes in sequential_heuristic(). 
763 */ 764 error = vn_lock(vp, LK_EXCLUSIVE); 765 if (error != 0) { 766 fdrop(fp, td); 767 break; 768 } 769 if (arg >= 0) { 770 bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; 771 fp->f_seqcount = (arg + bsize - 1) / bsize; 772 atomic_set_int(&fp->f_flag, FRDAHEAD); 773 } else { 774 atomic_clear_int(&fp->f_flag, FRDAHEAD); 775 } 776 VOP_UNLOCK(vp, 0); 777 fdrop(fp, td); 778 break; 779 780 default: 781 error = EINVAL; 782 break; 783 } 784 return (error); 785 } 786 787 static int 788 getmaxfd(struct proc *p) 789 { 790 int maxfd; 791 792 PROC_LOCK(p); 793 maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); 794 PROC_UNLOCK(p); 795 796 return (maxfd); 797 } 798 799 /* 800 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). 801 */ 802 static int 803 do_dup(struct thread *td, int flags, int old, int new) 804 { 805 struct filedesc *fdp; 806 struct filedescent *oldfde, *newfde; 807 struct proc *p; 808 struct file *fp; 809 struct file *delfp; 810 int error, maxfd; 811 812 p = td->td_proc; 813 fdp = p->p_fd; 814 815 /* 816 * Verify we have a valid descriptor to dup from and possibly to 817 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should 818 * return EINVAL when the new descriptor is out of bounds. 819 */ 820 if (old < 0) 821 return (EBADF); 822 if (new < 0) 823 return (flags & DUP_FCNTL ? EINVAL : EBADF); 824 maxfd = getmaxfd(p); 825 if (new >= maxfd) 826 return (flags & DUP_FCNTL ? EINVAL : EBADF); 827 828 FILEDESC_XLOCK(fdp); 829 if (fget_locked(fdp, old) == NULL) { 830 FILEDESC_XUNLOCK(fdp); 831 return (EBADF); 832 } 833 oldfde = &fdp->fd_ofiles[old]; 834 if (flags & DUP_FIXED && old == new) { 835 td->td_retval[0] = new; 836 if (flags & DUP_CLOEXEC) 837 fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; 838 FILEDESC_XUNLOCK(fdp); 839 return (0); 840 } 841 fp = oldfde->fde_file; 842 fhold(fp); 843 844 /* 845 * If the caller specified a file descriptor, make sure the file 846 * table is large enough to hold it, and grab it. 
Otherwise, just 847 * allocate a new descriptor the usual way. 848 */ 849 if (flags & DUP_FIXED) { 850 if (new >= fdp->fd_nfiles) { 851 /* 852 * The resource limits are here instead of e.g. 853 * fdalloc(), because the file descriptor table may be 854 * shared between processes, so we can't really use 855 * racct_add()/racct_sub(). Instead of counting the 856 * number of actually allocated descriptors, just put 857 * the limit on the size of the file descriptor table. 858 */ 859 #ifdef RACCT 860 if (racct_enable) { 861 PROC_LOCK(p); 862 error = racct_set(p, RACCT_NOFILE, new + 1); 863 PROC_UNLOCK(p); 864 if (error != 0) { 865 FILEDESC_XUNLOCK(fdp); 866 fdrop(fp, td); 867 return (EMFILE); 868 } 869 } 870 #endif 871 fdgrowtable_exp(fdp, new + 1); 872 oldfde = &fdp->fd_ofiles[old]; 873 } 874 newfde = &fdp->fd_ofiles[new]; 875 if (newfde->fde_file == NULL) 876 fdused(fdp, new); 877 } else { 878 if ((error = fdalloc(td, new, &new)) != 0) { 879 FILEDESC_XUNLOCK(fdp); 880 fdrop(fp, td); 881 return (error); 882 } 883 newfde = &fdp->fd_ofiles[new]; 884 } 885 886 KASSERT(fp == oldfde->fde_file, ("old fd has been modified")); 887 KASSERT(old != new, ("new fd is same as old")); 888 889 delfp = newfde->fde_file; 890 891 /* 892 * Duplicate the source descriptor. 893 */ 894 #ifdef CAPABILITIES 895 seq_write_begin(&newfde->fde_seq); 896 #endif 897 filecaps_free(&newfde->fde_caps); 898 memcpy(newfde, oldfde, fde_change_size); 899 filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps); 900 if ((flags & DUP_CLOEXEC) != 0) 901 newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; 902 else 903 newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; 904 #ifdef CAPABILITIES 905 seq_write_end(&newfde->fde_seq); 906 #endif 907 td->td_retval[0] = new; 908 909 if (delfp != NULL) { 910 (void) closefp(fdp, new, delfp, td, 1); 911 /* closefp() drops the FILEDESC lock for us. 
*/ 912 } else { 913 FILEDESC_XUNLOCK(fdp); 914 } 915 916 return (0); 917 } 918 919 /* 920 * If sigio is on the list associated with a process or process group, 921 * disable signalling from the device, remove sigio from the list and 922 * free sigio. 923 */ 924 void 925 funsetown(struct sigio **sigiop) 926 { 927 struct sigio *sigio; 928 929 SIGIO_LOCK(); 930 sigio = *sigiop; 931 if (sigio == NULL) { 932 SIGIO_UNLOCK(); 933 return; 934 } 935 *(sigio->sio_myref) = NULL; 936 if ((sigio)->sio_pgid < 0) { 937 struct pgrp *pg = (sigio)->sio_pgrp; 938 PGRP_LOCK(pg); 939 SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, 940 sigio, sio_pgsigio); 941 PGRP_UNLOCK(pg); 942 } else { 943 struct proc *p = (sigio)->sio_proc; 944 PROC_LOCK(p); 945 SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, 946 sigio, sio_pgsigio); 947 PROC_UNLOCK(p); 948 } 949 SIGIO_UNLOCK(); 950 crfree(sigio->sio_ucred); 951 free(sigio, M_SIGIO); 952 } 953 954 /* 955 * Free a list of sigio structures. 956 * We only need to lock the SIGIO_LOCK because we have made ourselves 957 * inaccessible to callers of fsetown and therefore do not need to lock 958 * the proc or pgrp struct for the list manipulation. 959 */ 960 void 961 funsetownlst(struct sigiolst *sigiolst) 962 { 963 struct proc *p; 964 struct pgrp *pg; 965 struct sigio *sigio; 966 967 sigio = SLIST_FIRST(sigiolst); 968 if (sigio == NULL) 969 return; 970 p = NULL; 971 pg = NULL; 972 973 /* 974 * Every entry of the list should belong 975 * to a single proc or pgrp. 
976 */ 977 if (sigio->sio_pgid < 0) { 978 pg = sigio->sio_pgrp; 979 PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); 980 } else /* if (sigio->sio_pgid > 0) */ { 981 p = sigio->sio_proc; 982 PROC_LOCK_ASSERT(p, MA_NOTOWNED); 983 } 984 985 SIGIO_LOCK(); 986 while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { 987 *(sigio->sio_myref) = NULL; 988 if (pg != NULL) { 989 KASSERT(sigio->sio_pgid < 0, 990 ("Proc sigio in pgrp sigio list")); 991 KASSERT(sigio->sio_pgrp == pg, 992 ("Bogus pgrp in sigio list")); 993 PGRP_LOCK(pg); 994 SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, 995 sio_pgsigio); 996 PGRP_UNLOCK(pg); 997 } else /* if (p != NULL) */ { 998 KASSERT(sigio->sio_pgid > 0, 999 ("Pgrp sigio in proc sigio list")); 1000 KASSERT(sigio->sio_proc == p, 1001 ("Bogus proc in sigio list")); 1002 PROC_LOCK(p); 1003 SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, 1004 sio_pgsigio); 1005 PROC_UNLOCK(p); 1006 } 1007 SIGIO_UNLOCK(); 1008 crfree(sigio->sio_ucred); 1009 free(sigio, M_SIGIO); 1010 SIGIO_LOCK(); 1011 } 1012 SIGIO_UNLOCK(); 1013 } 1014 1015 /* 1016 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). 1017 * 1018 * After permission checking, add a sigio structure to the sigio list for 1019 * the process or process group. 1020 */ 1021 int 1022 fsetown(pid_t pgid, struct sigio **sigiop) 1023 { 1024 struct proc *proc; 1025 struct pgrp *pgrp; 1026 struct sigio *sigio; 1027 int ret; 1028 1029 if (pgid == 0) { 1030 funsetown(sigiop); 1031 return (0); 1032 } 1033 1034 ret = 0; 1035 1036 /* Allocate and fill in the new sigio out of locks. */ 1037 sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); 1038 sigio->sio_pgid = pgid; 1039 sigio->sio_ucred = crhold(curthread->td_ucred); 1040 sigio->sio_myref = sigiop; 1041 1042 sx_slock(&proctree_lock); 1043 if (pgid > 0) { 1044 proc = pfind(pgid); 1045 if (proc == NULL) { 1046 ret = ESRCH; 1047 goto fail; 1048 } 1049 1050 /* 1051 * Policy - Don't allow a process to FSETOWN a process 1052 * in another session. 
1053 * 1054 * Remove this test to allow maximum flexibility or 1055 * restrict FSETOWN to the current process or process 1056 * group for maximum safety. 1057 */ 1058 PROC_UNLOCK(proc); 1059 if (proc->p_session != curthread->td_proc->p_session) { 1060 ret = EPERM; 1061 goto fail; 1062 } 1063 1064 pgrp = NULL; 1065 } else /* if (pgid < 0) */ { 1066 pgrp = pgfind(-pgid); 1067 if (pgrp == NULL) { 1068 ret = ESRCH; 1069 goto fail; 1070 } 1071 PGRP_UNLOCK(pgrp); 1072 1073 /* 1074 * Policy - Don't allow a process to FSETOWN a process 1075 * in another session. 1076 * 1077 * Remove this test to allow maximum flexibility or 1078 * restrict FSETOWN to the current process or process 1079 * group for maximum safety. 1080 */ 1081 if (pgrp->pg_session != curthread->td_proc->p_session) { 1082 ret = EPERM; 1083 goto fail; 1084 } 1085 1086 proc = NULL; 1087 } 1088 funsetown(sigiop); 1089 if (pgid > 0) { 1090 PROC_LOCK(proc); 1091 /* 1092 * Since funsetownlst() is called without the proctree 1093 * locked, we need to check for P_WEXIT. 1094 * XXX: is ESRCH correct? 1095 */ 1096 if ((proc->p_flag & P_WEXIT) != 0) { 1097 PROC_UNLOCK(proc); 1098 ret = ESRCH; 1099 goto fail; 1100 } 1101 SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); 1102 sigio->sio_proc = proc; 1103 PROC_UNLOCK(proc); 1104 } else { 1105 PGRP_LOCK(pgrp); 1106 SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); 1107 sigio->sio_pgrp = pgrp; 1108 PGRP_UNLOCK(pgrp); 1109 } 1110 sx_sunlock(&proctree_lock); 1111 SIGIO_LOCK(); 1112 *sigiop = sigio; 1113 SIGIO_UNLOCK(); 1114 return (0); 1115 1116 fail: 1117 sx_sunlock(&proctree_lock); 1118 crfree(sigio->sio_ucred); 1119 free(sigio, M_SIGIO); 1120 return (ret); 1121 } 1122 1123 /* 1124 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). 1125 */ 1126 pid_t 1127 fgetown(sigiop) 1128 struct sigio **sigiop; 1129 { 1130 pid_t pgid; 1131 1132 SIGIO_LOCK(); 1133 pgid = (*sigiop != NULL) ? 
(*sigiop)->sio_pgid : 0; 1134 SIGIO_UNLOCK(); 1135 return (pgid); 1136 } 1137 1138 /* 1139 * Function drops the filedesc lock on return. 1140 */ 1141 static int 1142 closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, 1143 int holdleaders) 1144 { 1145 int error; 1146 1147 FILEDESC_XLOCK_ASSERT(fdp); 1148 1149 if (holdleaders) { 1150 if (td->td_proc->p_fdtol != NULL) { 1151 /* 1152 * Ask fdfree() to sleep to ensure that all relevant 1153 * process leaders can be traversed in closef(). 1154 */ 1155 fdp->fd_holdleaderscount++; 1156 } else { 1157 holdleaders = 0; 1158 } 1159 } 1160 1161 /* 1162 * We now hold the fp reference that used to be owned by the 1163 * descriptor array. We have to unlock the FILEDESC *AFTER* 1164 * knote_fdclose to prevent a race of the fd getting opened, a knote 1165 * added, and deleteing a knote for the new fd. 1166 */ 1167 knote_fdclose(td, fd); 1168 1169 /* 1170 * We need to notify mqueue if the object is of type mqueue. 1171 */ 1172 if (fp->f_type == DTYPE_MQUEUE) 1173 mq_fdclose(td, fd, fp); 1174 FILEDESC_XUNLOCK(fdp); 1175 1176 error = closef(fp, td); 1177 if (holdleaders) { 1178 FILEDESC_XLOCK(fdp); 1179 fdp->fd_holdleaderscount--; 1180 if (fdp->fd_holdleaderscount == 0 && 1181 fdp->fd_holdleaderswakeup != 0) { 1182 fdp->fd_holdleaderswakeup = 0; 1183 wakeup(&fdp->fd_holdleaderscount); 1184 } 1185 FILEDESC_XUNLOCK(fdp); 1186 } 1187 return (error); 1188 } 1189 1190 /* 1191 * Close a file descriptor. 
1192 */ 1193 #ifndef _SYS_SYSPROTO_H_ 1194 struct close_args { 1195 int fd; 1196 }; 1197 #endif 1198 /* ARGSUSED */ 1199 int 1200 sys_close(td, uap) 1201 struct thread *td; 1202 struct close_args *uap; 1203 { 1204 1205 return (kern_close(td, uap->fd)); 1206 } 1207 1208 int 1209 kern_close(td, fd) 1210 struct thread *td; 1211 int fd; 1212 { 1213 struct filedesc *fdp; 1214 struct file *fp; 1215 1216 fdp = td->td_proc->p_fd; 1217 1218 AUDIT_SYSCLOSE(td, fd); 1219 1220 FILEDESC_XLOCK(fdp); 1221 if ((fp = fget_locked(fdp, fd)) == NULL) { 1222 FILEDESC_XUNLOCK(fdp); 1223 return (EBADF); 1224 } 1225 fdfree(fdp, fd); 1226 1227 /* closefp() drops the FILEDESC lock for us. */ 1228 return (closefp(fdp, fd, fp, td, 1)); 1229 } 1230 1231 /* 1232 * Close open file descriptors. 1233 */ 1234 #ifndef _SYS_SYSPROTO_H_ 1235 struct closefrom_args { 1236 int lowfd; 1237 }; 1238 #endif 1239 /* ARGSUSED */ 1240 int 1241 sys_closefrom(struct thread *td, struct closefrom_args *uap) 1242 { 1243 struct filedesc *fdp; 1244 int fd; 1245 1246 fdp = td->td_proc->p_fd; 1247 AUDIT_ARG_FD(uap->lowfd); 1248 1249 /* 1250 * Treat negative starting file descriptor values identical to 1251 * closefrom(0) which closes all files. 1252 */ 1253 if (uap->lowfd < 0) 1254 uap->lowfd = 0; 1255 FILEDESC_SLOCK(fdp); 1256 for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) { 1257 if (fdp->fd_ofiles[fd].fde_file != NULL) { 1258 FILEDESC_SUNLOCK(fdp); 1259 (void)kern_close(td, fd); 1260 FILEDESC_SLOCK(fdp); 1261 } 1262 } 1263 FILEDESC_SUNLOCK(fdp); 1264 return (0); 1265 } 1266 1267 #if defined(COMPAT_43) 1268 /* 1269 * Return status information about a file descriptor. 
1270 */ 1271 #ifndef _SYS_SYSPROTO_H_ 1272 struct ofstat_args { 1273 int fd; 1274 struct ostat *sb; 1275 }; 1276 #endif 1277 /* ARGSUSED */ 1278 int 1279 ofstat(struct thread *td, struct ofstat_args *uap) 1280 { 1281 struct ostat oub; 1282 struct stat ub; 1283 int error; 1284 1285 error = kern_fstat(td, uap->fd, &ub); 1286 if (error == 0) { 1287 cvtstat(&ub, &oub); 1288 error = copyout(&oub, uap->sb, sizeof(oub)); 1289 } 1290 return (error); 1291 } 1292 #endif /* COMPAT_43 */ 1293 1294 /* 1295 * Return status information about a file descriptor. 1296 */ 1297 #ifndef _SYS_SYSPROTO_H_ 1298 struct fstat_args { 1299 int fd; 1300 struct stat *sb; 1301 }; 1302 #endif 1303 /* ARGSUSED */ 1304 int 1305 sys_fstat(struct thread *td, struct fstat_args *uap) 1306 { 1307 struct stat ub; 1308 int error; 1309 1310 error = kern_fstat(td, uap->fd, &ub); 1311 if (error == 0) 1312 error = copyout(&ub, uap->sb, sizeof(ub)); 1313 return (error); 1314 } 1315 1316 int 1317 kern_fstat(struct thread *td, int fd, struct stat *sbp) 1318 { 1319 struct file *fp; 1320 cap_rights_t rights; 1321 int error; 1322 1323 AUDIT_ARG_FD(fd); 1324 1325 error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp); 1326 if (error != 0) 1327 return (error); 1328 1329 AUDIT_ARG_FILE(td->td_proc, fp); 1330 1331 error = fo_stat(fp, sbp, td->td_ucred, td); 1332 fdrop(fp, td); 1333 #ifdef KTRACE 1334 if (error == 0 && KTRPOINT(td, KTR_STRUCT)) 1335 ktrstat(sbp); 1336 #endif 1337 return (error); 1338 } 1339 1340 /* 1341 * Return status information about a file descriptor. 
1342 */ 1343 #ifndef _SYS_SYSPROTO_H_ 1344 struct nfstat_args { 1345 int fd; 1346 struct nstat *sb; 1347 }; 1348 #endif 1349 /* ARGSUSED */ 1350 int 1351 sys_nfstat(struct thread *td, struct nfstat_args *uap) 1352 { 1353 struct nstat nub; 1354 struct stat ub; 1355 int error; 1356 1357 error = kern_fstat(td, uap->fd, &ub); 1358 if (error == 0) { 1359 cvtnstat(&ub, &nub); 1360 error = copyout(&nub, uap->sb, sizeof(nub)); 1361 } 1362 return (error); 1363 } 1364 1365 /* 1366 * Return pathconf information about a file descriptor. 1367 */ 1368 #ifndef _SYS_SYSPROTO_H_ 1369 struct fpathconf_args { 1370 int fd; 1371 int name; 1372 }; 1373 #endif 1374 /* ARGSUSED */ 1375 int 1376 sys_fpathconf(struct thread *td, struct fpathconf_args *uap) 1377 { 1378 struct file *fp; 1379 struct vnode *vp; 1380 cap_rights_t rights; 1381 int error; 1382 1383 error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp); 1384 if (error != 0) 1385 return (error); 1386 1387 /* If asynchronous I/O is available, it works for all descriptors. */ 1388 if (uap->name == _PC_ASYNC_IO) { 1389 td->td_retval[0] = async_io_version; 1390 goto out; 1391 } 1392 vp = fp->f_vnode; 1393 if (vp != NULL) { 1394 vn_lock(vp, LK_SHARED | LK_RETRY); 1395 error = VOP_PATHCONF(vp, uap->name, td->td_retval); 1396 VOP_UNLOCK(vp, 0); 1397 } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { 1398 if (uap->name != _PC_PIPE_BUF) { 1399 error = EINVAL; 1400 } else { 1401 td->td_retval[0] = PIPE_BUF; 1402 error = 0; 1403 } 1404 } else { 1405 error = EOPNOTSUPP; 1406 } 1407 out: 1408 fdrop(fp, td); 1409 return (error); 1410 } 1411 1412 /* 1413 * Initialize filecaps structure. 1414 */ 1415 void 1416 filecaps_init(struct filecaps *fcaps) 1417 { 1418 1419 bzero(fcaps, sizeof(*fcaps)); 1420 fcaps->fc_nioctls = -1; 1421 } 1422 1423 /* 1424 * Copy filecaps structure allocating memory for ioctls array if needed. 
1425 */ 1426 void 1427 filecaps_copy(const struct filecaps *src, struct filecaps *dst) 1428 { 1429 size_t size; 1430 1431 *dst = *src; 1432 if (src->fc_ioctls != NULL) { 1433 KASSERT(src->fc_nioctls > 0, 1434 ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); 1435 1436 size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; 1437 dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); 1438 bcopy(src->fc_ioctls, dst->fc_ioctls, size); 1439 } 1440 } 1441 1442 /* 1443 * Move filecaps structure to the new place and clear the old place. 1444 */ 1445 void 1446 filecaps_move(struct filecaps *src, struct filecaps *dst) 1447 { 1448 1449 *dst = *src; 1450 bzero(src, sizeof(*src)); 1451 } 1452 1453 /* 1454 * Fill the given filecaps structure with full rights. 1455 */ 1456 static void 1457 filecaps_fill(struct filecaps *fcaps) 1458 { 1459 1460 CAP_ALL(&fcaps->fc_rights); 1461 fcaps->fc_ioctls = NULL; 1462 fcaps->fc_nioctls = -1; 1463 fcaps->fc_fcntls = CAP_FCNTL_ALL; 1464 } 1465 1466 /* 1467 * Free memory allocated within filecaps structure. 1468 */ 1469 void 1470 filecaps_free(struct filecaps *fcaps) 1471 { 1472 1473 free(fcaps->fc_ioctls, M_FILECAPS); 1474 bzero(fcaps, sizeof(*fcaps)); 1475 } 1476 1477 /* 1478 * Validate the given filecaps structure. 1479 */ 1480 static void 1481 filecaps_validate(const struct filecaps *fcaps, const char *func) 1482 { 1483 1484 KASSERT(cap_rights_is_valid(&fcaps->fc_rights), 1485 ("%s: invalid rights", func)); 1486 KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, 1487 ("%s: invalid fcntls", func)); 1488 KASSERT(fcaps->fc_fcntls == 0 || 1489 cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), 1490 ("%s: fcntls without CAP_FCNTL", func)); 1491 KASSERT(fcaps->fc_ioctls != NULL ? 
fcaps->fc_nioctls > 0 : 1492 (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), 1493 ("%s: invalid ioctls", func)); 1494 KASSERT(fcaps->fc_nioctls == 0 || 1495 cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), 1496 ("%s: ioctls without CAP_IOCTL", func)); 1497 } 1498 1499 static void 1500 fdgrowtable_exp(struct filedesc *fdp, int nfd) 1501 { 1502 int nfd1; 1503 1504 FILEDESC_XLOCK_ASSERT(fdp); 1505 1506 nfd1 = fdp->fd_nfiles * 2; 1507 if (nfd1 < nfd) 1508 nfd1 = nfd; 1509 fdgrowtable(fdp, nfd1); 1510 } 1511 1512 /* 1513 * Grow the file table to accomodate (at least) nfd descriptors. 1514 */ 1515 static void 1516 fdgrowtable(struct filedesc *fdp, int nfd) 1517 { 1518 struct filedesc0 *fdp0; 1519 struct freetable *ft; 1520 struct fdescenttbl *ntable; 1521 struct fdescenttbl *otable; 1522 int nnfiles, onfiles; 1523 NDSLOTTYPE *nmap, *omap; 1524 1525 /* 1526 * If lastfile is -1 this struct filedesc was just allocated and we are 1527 * growing it to accomodate for the one we are going to copy from. There 1528 * is no need to have a lock on this one as it's not visible to anyone. 1529 */ 1530 if (fdp->fd_lastfile != -1) 1531 FILEDESC_XLOCK_ASSERT(fdp); 1532 1533 KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); 1534 1535 /* save old values */ 1536 onfiles = fdp->fd_nfiles; 1537 otable = fdp->fd_files; 1538 omap = fdp->fd_map; 1539 1540 /* compute the size of the new table */ 1541 nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ 1542 if (nnfiles <= onfiles) 1543 /* the table is already large enough */ 1544 return; 1545 1546 /* 1547 * Allocate a new table. We need enough space for the number of 1548 * entries, file entries themselves and the struct freetable we will use 1549 * when we decommission the table and place it on the freelist. 1550 * We place the struct freetable in the middle so we don't have 1551 * to worry about padding. 
1552 */ 1553 ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + 1554 nnfiles * sizeof(ntable->fdt_ofiles[0]) + 1555 sizeof(struct freetable), 1556 M_FILEDESC, M_ZERO | M_WAITOK); 1557 /* copy the old data */ 1558 ntable->fdt_nfiles = nnfiles; 1559 memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, 1560 onfiles * sizeof(ntable->fdt_ofiles[0])); 1561 1562 /* 1563 * Allocate a new map only if the old is not large enough. It will 1564 * grow at a slower rate than the table as it can map more 1565 * entries than the table can hold. 1566 */ 1567 if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { 1568 nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, 1569 M_ZERO | M_WAITOK); 1570 /* copy over the old data and update the pointer */ 1571 memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); 1572 fdp->fd_map = nmap; 1573 } 1574 1575 /* 1576 * Make sure that ntable is correctly initialized before we replace 1577 * fd_files poiner. Otherwise fget_unlocked() may see inconsistent 1578 * data. 1579 */ 1580 atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); 1581 1582 /* 1583 * Do not free the old file table, as some threads may still 1584 * reference entries within it. Instead, place it on a freelist 1585 * which will be processed when the struct filedesc is released. 1586 * 1587 * Note that if onfiles == NDFILE, we're dealing with the original 1588 * static allocation contained within (struct filedesc0 *)fdp, 1589 * which must not be freed. 1590 */ 1591 if (onfiles > NDFILE) { 1592 ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; 1593 fdp0 = (struct filedesc0 *)fdp; 1594 ft->ft_table = otable; 1595 SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); 1596 } 1597 /* 1598 * The map does not have the same possibility of threads still 1599 * holding references to it. So always free it as long as it 1600 * does not reference the original static allocation. 
1601 */ 1602 if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) 1603 free(omap, M_FILEDESC); 1604 } 1605 1606 /* 1607 * Allocate a file descriptor for the process. 1608 */ 1609 int 1610 fdalloc(struct thread *td, int minfd, int *result) 1611 { 1612 struct proc *p = td->td_proc; 1613 struct filedesc *fdp = p->p_fd; 1614 int fd, maxfd, allocfd; 1615 #ifdef RACCT 1616 int error; 1617 #endif 1618 1619 FILEDESC_XLOCK_ASSERT(fdp); 1620 1621 if (fdp->fd_freefile > minfd) 1622 minfd = fdp->fd_freefile; 1623 1624 maxfd = getmaxfd(p); 1625 1626 /* 1627 * Search the bitmap for a free descriptor starting at minfd. 1628 * If none is found, grow the file table. 1629 */ 1630 fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); 1631 if (fd >= maxfd) 1632 return (EMFILE); 1633 if (fd >= fdp->fd_nfiles) { 1634 allocfd = min(fd * 2, maxfd); 1635 #ifdef RACCT 1636 if (racct_enable) { 1637 PROC_LOCK(p); 1638 error = racct_set(p, RACCT_NOFILE, allocfd); 1639 PROC_UNLOCK(p); 1640 if (error != 0) 1641 return (EMFILE); 1642 } 1643 #endif 1644 /* 1645 * fd is already equal to first free descriptor >= minfd, so 1646 * we only need to grow the table and we are done. 1647 */ 1648 fdgrowtable_exp(fdp, allocfd); 1649 } 1650 1651 /* 1652 * Perform some sanity checks, then mark the file descriptor as 1653 * used and return it to the caller. 1654 */ 1655 KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), 1656 ("invalid descriptor %d", fd)); 1657 KASSERT(!fdisused(fdp, fd), 1658 ("fd_first_free() returned non-free descriptor")); 1659 KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, 1660 ("file descriptor isn't free")); 1661 KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set")); 1662 fdused(fdp, fd); 1663 *result = fd; 1664 return (0); 1665 } 1666 1667 /* 1668 * Allocate n file descriptors for the process. 
1669 */ 1670 int 1671 fdallocn(struct thread *td, int minfd, int *fds, int n) 1672 { 1673 struct proc *p = td->td_proc; 1674 struct filedesc *fdp = p->p_fd; 1675 int i; 1676 1677 FILEDESC_XLOCK_ASSERT(fdp); 1678 1679 for (i = 0; i < n; i++) 1680 if (fdalloc(td, 0, &fds[i]) != 0) 1681 break; 1682 1683 if (i < n) { 1684 for (i--; i >= 0; i--) 1685 fdunused(fdp, fds[i]); 1686 return (EMFILE); 1687 } 1688 1689 return (0); 1690 } 1691 1692 /* 1693 * Create a new open file structure and allocate a file decriptor for the 1694 * process that refers to it. We add one reference to the file for the 1695 * descriptor table and one reference for resultfp. This is to prevent us 1696 * being preempted and the entry in the descriptor table closed after we 1697 * release the FILEDESC lock. 1698 */ 1699 int 1700 falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags) 1701 { 1702 struct file *fp; 1703 int error, fd; 1704 1705 error = falloc_noinstall(td, &fp); 1706 if (error) 1707 return (error); /* no reference held on error */ 1708 1709 error = finstall(td, fp, &fd, flags, NULL); 1710 if (error) { 1711 fdrop(fp, td); /* one reference (fp only) */ 1712 return (error); 1713 } 1714 1715 if (resultfp != NULL) 1716 *resultfp = fp; /* copy out result */ 1717 else 1718 fdrop(fp, td); /* release local reference */ 1719 1720 if (resultfd != NULL) 1721 *resultfd = fd; 1722 1723 return (0); 1724 } 1725 1726 /* 1727 * Create a new open file structure without allocating a file descriptor. 
1728 */ 1729 int 1730 falloc_noinstall(struct thread *td, struct file **resultfp) 1731 { 1732 struct file *fp; 1733 int maxuserfiles = maxfiles - (maxfiles / 20); 1734 static struct timeval lastfail; 1735 static int curfail; 1736 1737 KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); 1738 1739 if ((openfiles >= maxuserfiles && 1740 priv_check(td, PRIV_MAXFILES) != 0) || 1741 openfiles >= maxfiles) { 1742 if (ppsratecheck(&lastfail, &curfail, 1)) { 1743 printf("kern.maxfiles limit exceeded by uid %i, " 1744 "please see tuning(7).\n", td->td_ucred->cr_ruid); 1745 } 1746 return (ENFILE); 1747 } 1748 atomic_add_int(&openfiles, 1); 1749 fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); 1750 refcount_init(&fp->f_count, 1); 1751 fp->f_cred = crhold(td->td_ucred); 1752 fp->f_ops = &badfileops; 1753 *resultfp = fp; 1754 return (0); 1755 } 1756 1757 /* 1758 * Install a file in a file descriptor table. 1759 */ 1760 int 1761 finstall(struct thread *td, struct file *fp, int *fd, int flags, 1762 struct filecaps *fcaps) 1763 { 1764 struct filedesc *fdp = td->td_proc->p_fd; 1765 struct filedescent *fde; 1766 int error; 1767 1768 KASSERT(fd != NULL, ("%s: fd == NULL", __func__)); 1769 KASSERT(fp != NULL, ("%s: fp == NULL", __func__)); 1770 if (fcaps != NULL) 1771 filecaps_validate(fcaps, __func__); 1772 1773 FILEDESC_XLOCK(fdp); 1774 if ((error = fdalloc(td, 0, fd))) { 1775 FILEDESC_XUNLOCK(fdp); 1776 return (error); 1777 } 1778 fhold(fp); 1779 fde = &fdp->fd_ofiles[*fd]; 1780 #ifdef CAPABILITIES 1781 seq_write_begin(&fde->fde_seq); 1782 #endif 1783 fde->fde_file = fp; 1784 if ((flags & O_CLOEXEC) != 0) 1785 fde->fde_flags |= UF_EXCLOSE; 1786 if (fcaps != NULL) 1787 filecaps_move(fcaps, &fde->fde_caps); 1788 else 1789 filecaps_fill(&fde->fde_caps); 1790 #ifdef CAPABILITIES 1791 seq_write_end(&fde->fde_seq); 1792 #endif 1793 FILEDESC_XUNLOCK(fdp); 1794 return (0); 1795 } 1796 1797 /* 1798 * Build a new filedesc structure from another. 
1799 * Copy the current, root, and jail root vnode references. 1800 * 1801 * If fdp is not NULL, return with it shared locked. 1802 */ 1803 struct filedesc * 1804 fdinit(struct filedesc *fdp, bool prepfiles) 1805 { 1806 struct filedesc0 *newfdp0; 1807 struct filedesc *newfdp; 1808 1809 newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); 1810 newfdp = &newfdp0->fd_fd; 1811 1812 /* Create the file descriptor table. */ 1813 FILEDESC_LOCK_INIT(newfdp); 1814 newfdp->fd_refcnt = 1; 1815 newfdp->fd_holdcnt = 1; 1816 newfdp->fd_cmask = CMASK; 1817 newfdp->fd_map = newfdp0->fd_dmap; 1818 newfdp->fd_lastfile = -1; 1819 newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; 1820 newfdp->fd_files->fdt_nfiles = NDFILE; 1821 1822 if (fdp == NULL) 1823 return (newfdp); 1824 1825 if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles) 1826 fdgrowtable(newfdp, fdp->fd_lastfile + 1); 1827 1828 FILEDESC_SLOCK(fdp); 1829 newfdp->fd_cdir = fdp->fd_cdir; 1830 if (newfdp->fd_cdir) 1831 VREF(newfdp->fd_cdir); 1832 newfdp->fd_rdir = fdp->fd_rdir; 1833 if (newfdp->fd_rdir) 1834 VREF(newfdp->fd_rdir); 1835 newfdp->fd_jdir = fdp->fd_jdir; 1836 if (newfdp->fd_jdir) 1837 VREF(newfdp->fd_jdir); 1838 1839 if (!prepfiles) { 1840 FILEDESC_SUNLOCK(fdp); 1841 } else { 1842 while (fdp->fd_lastfile >= newfdp->fd_nfiles) { 1843 FILEDESC_SUNLOCK(fdp); 1844 fdgrowtable(newfdp, fdp->fd_lastfile + 1); 1845 FILEDESC_SLOCK(fdp); 1846 } 1847 } 1848 1849 return (newfdp); 1850 } 1851 1852 static struct filedesc * 1853 fdhold(struct proc *p) 1854 { 1855 struct filedesc *fdp; 1856 1857 mtx_lock(&fdesc_mtx); 1858 fdp = p->p_fd; 1859 if (fdp != NULL) 1860 fdp->fd_holdcnt++; 1861 mtx_unlock(&fdesc_mtx); 1862 return (fdp); 1863 } 1864 1865 static void 1866 fddrop(struct filedesc *fdp) 1867 { 1868 int i; 1869 1870 if (fdp->fd_holdcnt > 1) { 1871 mtx_lock(&fdesc_mtx); 1872 i = --fdp->fd_holdcnt; 1873 mtx_unlock(&fdesc_mtx); 1874 if (i > 0) 1875 return; 1876 } 1877 1878 FILEDESC_LOCK_DESTROY(fdp); 1879 
uma_zfree(filedesc0_zone, fdp); 1880 } 1881 1882 /* 1883 * Share a filedesc structure. 1884 */ 1885 struct filedesc * 1886 fdshare(struct filedesc *fdp) 1887 { 1888 1889 FILEDESC_XLOCK(fdp); 1890 fdp->fd_refcnt++; 1891 FILEDESC_XUNLOCK(fdp); 1892 return (fdp); 1893 } 1894 1895 /* 1896 * Unshare a filedesc structure, if necessary by making a copy 1897 */ 1898 void 1899 fdunshare(struct thread *td) 1900 { 1901 struct filedesc *tmp; 1902 struct proc *p = td->td_proc; 1903 1904 if (p->p_fd->fd_refcnt == 1) 1905 return; 1906 1907 tmp = fdcopy(p->p_fd); 1908 fdescfree(td); 1909 p->p_fd = tmp; 1910 } 1911 1912 /* 1913 * Copy a filedesc structure. A NULL pointer in returns a NULL reference, 1914 * this is to ease callers, not catch errors. 1915 */ 1916 struct filedesc * 1917 fdcopy(struct filedesc *fdp) 1918 { 1919 struct filedesc *newfdp; 1920 struct filedescent *nfde, *ofde; 1921 int i; 1922 1923 MPASS(fdp != NULL); 1924 1925 newfdp = fdinit(fdp, true); 1926 /* copy all passable descriptors (i.e. not kqueue) */ 1927 newfdp->fd_freefile = -1; 1928 for (i = 0; i <= fdp->fd_lastfile; ++i) { 1929 ofde = &fdp->fd_ofiles[i]; 1930 if (ofde->fde_file == NULL || 1931 (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) { 1932 if (newfdp->fd_freefile == -1) 1933 newfdp->fd_freefile = i; 1934 continue; 1935 } 1936 nfde = &newfdp->fd_ofiles[i]; 1937 *nfde = *ofde; 1938 filecaps_copy(&ofde->fde_caps, &nfde->fde_caps); 1939 fhold(nfde->fde_file); 1940 fdused_init(newfdp, i); 1941 newfdp->fd_lastfile = i; 1942 } 1943 if (newfdp->fd_freefile == -1) 1944 newfdp->fd_freefile = i; 1945 newfdp->fd_cmask = fdp->fd_cmask; 1946 FILEDESC_SUNLOCK(fdp); 1947 return (newfdp); 1948 } 1949 1950 /* 1951 * Clear POSIX style locks. This is only used when fdp looses a reference (i.e. 1952 * one of processes using it exits) and the table used to be shared. 
1953 */ 1954 static void 1955 fdclearlocks(struct thread *td) 1956 { 1957 struct filedesc *fdp; 1958 struct filedesc_to_leader *fdtol; 1959 struct flock lf; 1960 struct file *fp; 1961 struct proc *p; 1962 struct vnode *vp; 1963 int i; 1964 1965 p = td->td_proc; 1966 fdp = p->p_fd; 1967 fdtol = p->p_fdtol; 1968 MPASS(fdtol != NULL); 1969 1970 FILEDESC_XLOCK(fdp); 1971 KASSERT(fdtol->fdl_refcount > 0, 1972 ("filedesc_to_refcount botch: fdl_refcount=%d", 1973 fdtol->fdl_refcount)); 1974 if (fdtol->fdl_refcount == 1 && 1975 (p->p_leader->p_flag & P_ADVLOCK) != 0) { 1976 for (i = 0; i <= fdp->fd_lastfile; i++) { 1977 fp = fdp->fd_ofiles[i].fde_file; 1978 if (fp == NULL || fp->f_type != DTYPE_VNODE) 1979 continue; 1980 fhold(fp); 1981 FILEDESC_XUNLOCK(fdp); 1982 lf.l_whence = SEEK_SET; 1983 lf.l_start = 0; 1984 lf.l_len = 0; 1985 lf.l_type = F_UNLCK; 1986 vp = fp->f_vnode; 1987 (void) VOP_ADVLOCK(vp, 1988 (caddr_t)p->p_leader, F_UNLCK, 1989 &lf, F_POSIX); 1990 FILEDESC_XLOCK(fdp); 1991 fdrop(fp, td); 1992 } 1993 } 1994 retry: 1995 if (fdtol->fdl_refcount == 1) { 1996 if (fdp->fd_holdleaderscount > 0 && 1997 (p->p_leader->p_flag & P_ADVLOCK) != 0) { 1998 /* 1999 * close() or do_dup() has cleared a reference 2000 * in a shared file descriptor table. 2001 */ 2002 fdp->fd_holdleaderswakeup = 1; 2003 sx_sleep(&fdp->fd_holdleaderscount, 2004 FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); 2005 goto retry; 2006 } 2007 if (fdtol->fdl_holdcount > 0) { 2008 /* 2009 * Ensure that fdtol->fdl_leader remains 2010 * valid in closef(). 
2011 */ 2012 fdtol->fdl_wakeup = 1; 2013 sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, 2014 "fdlhold", 0); 2015 goto retry; 2016 } 2017 } 2018 fdtol->fdl_refcount--; 2019 if (fdtol->fdl_refcount == 0 && 2020 fdtol->fdl_holdcount == 0) { 2021 fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; 2022 fdtol->fdl_prev->fdl_next = fdtol->fdl_next; 2023 } else 2024 fdtol = NULL; 2025 p->p_fdtol = NULL; 2026 FILEDESC_XUNLOCK(fdp); 2027 if (fdtol != NULL) 2028 free(fdtol, M_FILEDESC_TO_LEADER); 2029 } 2030 2031 /* 2032 * Release a filedesc structure. 2033 */ 2034 void 2035 fdescfree(struct thread *td) 2036 { 2037 struct filedesc0 *fdp0; 2038 struct filedesc *fdp; 2039 struct freetable *ft, *tft; 2040 struct filedescent *fde; 2041 struct file *fp; 2042 struct vnode *cdir, *jdir, *rdir; 2043 int i; 2044 2045 fdp = td->td_proc->p_fd; 2046 MPASS(fdp != NULL); 2047 2048 #ifdef RACCT 2049 if (racct_enable) { 2050 PROC_LOCK(td->td_proc); 2051 racct_set(td->td_proc, RACCT_NOFILE, 0); 2052 PROC_UNLOCK(td->td_proc); 2053 } 2054 #endif 2055 2056 if (td->td_proc->p_fdtol != NULL) 2057 fdclearlocks(td); 2058 2059 mtx_lock(&fdesc_mtx); 2060 td->td_proc->p_fd = NULL; 2061 mtx_unlock(&fdesc_mtx); 2062 2063 FILEDESC_XLOCK(fdp); 2064 i = --fdp->fd_refcnt; 2065 if (i > 0) { 2066 FILEDESC_XUNLOCK(fdp); 2067 return; 2068 } 2069 2070 cdir = fdp->fd_cdir; 2071 fdp->fd_cdir = NULL; 2072 rdir = fdp->fd_rdir; 2073 fdp->fd_rdir = NULL; 2074 jdir = fdp->fd_jdir; 2075 fdp->fd_jdir = NULL; 2076 FILEDESC_XUNLOCK(fdp); 2077 2078 for (i = 0; i <= fdp->fd_lastfile; i++) { 2079 fde = &fdp->fd_ofiles[i]; 2080 fp = fde->fde_file; 2081 if (fp != NULL) { 2082 fdefree_last(fde); 2083 (void) closef(fp, td); 2084 } 2085 } 2086 2087 if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) 2088 free(fdp->fd_map, M_FILEDESC); 2089 if (fdp->fd_nfiles > NDFILE) 2090 free(fdp->fd_files, M_FILEDESC); 2091 2092 fdp0 = (struct filedesc0 *)fdp; 2093 SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) 2094 free(ft->ft_table, M_FILEDESC); 2095 
2096 if (cdir != NULL) 2097 vrele(cdir); 2098 if (rdir != NULL) 2099 vrele(rdir); 2100 if (jdir != NULL) 2101 vrele(jdir); 2102 2103 fddrop(fdp); 2104 } 2105 2106 /* 2107 * For setugid programs, we don't want to people to use that setugidness 2108 * to generate error messages which write to a file which otherwise would 2109 * otherwise be off-limits to the process. We check for filesystems where 2110 * the vnode can change out from under us after execve (like [lin]procfs). 2111 * 2112 * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is 2113 * sufficient. We also don't check for setugidness since we know we are. 2114 */ 2115 static bool 2116 is_unsafe(struct file *fp) 2117 { 2118 struct vnode *vp; 2119 2120 if (fp->f_type != DTYPE_VNODE) 2121 return (false); 2122 2123 vp = fp->f_vnode; 2124 return ((vp->v_vflag & VV_PROCDEP) != 0); 2125 } 2126 2127 /* 2128 * Make this setguid thing safe, if at all possible. 2129 */ 2130 void 2131 fdsetugidsafety(struct thread *td) 2132 { 2133 struct filedesc *fdp; 2134 struct file *fp; 2135 int i; 2136 2137 fdp = td->td_proc->p_fd; 2138 KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); 2139 MPASS(fdp->fd_nfiles >= 3); 2140 for (i = 0; i <= 2; i++) { 2141 fp = fdp->fd_ofiles[i].fde_file; 2142 if (fp != NULL && is_unsafe(fp)) { 2143 FILEDESC_XLOCK(fdp); 2144 knote_fdclose(td, i); 2145 /* 2146 * NULL-out descriptor prior to close to avoid 2147 * a race while close blocks. 2148 */ 2149 fdfree(fdp, i); 2150 FILEDESC_XUNLOCK(fdp); 2151 (void) closef(fp, td); 2152 } 2153 } 2154 } 2155 2156 /* 2157 * If a specific file object occupies a specific file descriptor, close the 2158 * file descriptor entry and drop a reference on the file object. This is a 2159 * convenience function to handle a subsequent error in a function that calls 2160 * falloc() that handles the race that another thread might have closed the 2161 * file descriptor out from under the thread creating the file object. 
2162 */ 2163 void 2164 fdclose(struct thread *td, struct file *fp, int idx) 2165 { 2166 struct filedesc *fdp = td->td_proc->p_fd; 2167 2168 FILEDESC_XLOCK(fdp); 2169 if (fdp->fd_ofiles[idx].fde_file == fp) { 2170 fdfree(fdp, idx); 2171 FILEDESC_XUNLOCK(fdp); 2172 fdrop(fp, td); 2173 } else 2174 FILEDESC_XUNLOCK(fdp); 2175 } 2176 2177 /* 2178 * Close any files on exec? 2179 */ 2180 void 2181 fdcloseexec(struct thread *td) 2182 { 2183 struct filedesc *fdp; 2184 struct filedescent *fde; 2185 struct file *fp; 2186 int i; 2187 2188 fdp = td->td_proc->p_fd; 2189 KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); 2190 for (i = 0; i <= fdp->fd_lastfile; i++) { 2191 fde = &fdp->fd_ofiles[i]; 2192 fp = fde->fde_file; 2193 if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || 2194 (fde->fde_flags & UF_EXCLOSE))) { 2195 FILEDESC_XLOCK(fdp); 2196 fdfree(fdp, i); 2197 (void) closefp(fdp, i, fp, td, 0); 2198 /* closefp() drops the FILEDESC lock. */ 2199 } 2200 } 2201 } 2202 2203 /* 2204 * It is unsafe for set[ug]id processes to be started with file 2205 * descriptors 0..2 closed, as these descriptors are given implicit 2206 * significance in the Standard C library. fdcheckstd() will create a 2207 * descriptor referencing /dev/null for each of stdin, stdout, and 2208 * stderr that is not already open. 
2209 */ 2210 int 2211 fdcheckstd(struct thread *td) 2212 { 2213 struct filedesc *fdp; 2214 register_t save; 2215 int i, error, devnull; 2216 2217 fdp = td->td_proc->p_fd; 2218 KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); 2219 MPASS(fdp->fd_nfiles >= 3); 2220 devnull = -1; 2221 for (i = 0; i <= 2; i++) { 2222 if (fdp->fd_ofiles[i].fde_file != NULL) 2223 continue; 2224 2225 save = td->td_retval[0]; 2226 if (devnull != -1) { 2227 error = do_dup(td, DUP_FIXED, devnull, i); 2228 } else { 2229 error = kern_openat(td, AT_FDCWD, "/dev/null", 2230 UIO_SYSSPACE, O_RDWR, 0); 2231 if (error == 0) { 2232 devnull = td->td_retval[0]; 2233 KASSERT(devnull == i, ("we didn't get our fd")); 2234 } 2235 } 2236 td->td_retval[0] = save; 2237 if (error != 0) 2238 return (error); 2239 } 2240 return (0); 2241 } 2242 2243 /* 2244 * Internal form of close. Decrement reference count on file structure. 2245 * Note: td may be NULL when closing a file that was being passed in a 2246 * message. 2247 * 2248 * XXXRW: Giant is not required for the caller, but often will be held; this 2249 * makes it moderately likely the Giant will be recursed in the VFS case. 2250 */ 2251 int 2252 closef(struct file *fp, struct thread *td) 2253 { 2254 struct vnode *vp; 2255 struct flock lf; 2256 struct filedesc_to_leader *fdtol; 2257 struct filedesc *fdp; 2258 2259 /* 2260 * POSIX record locking dictates that any close releases ALL 2261 * locks owned by this process. This is handled by setting 2262 * a flag in the unlock to free ONLY locks obeying POSIX 2263 * semantics, and not to free BSD-style file locks. 2264 * If the descriptor was in a message, POSIX-style locks 2265 * aren't passed with the descriptor, and the thread pointer 2266 * will be NULL. Callers should be careful only to pass a 2267 * NULL thread pointer when there really is no owning 2268 * context that might have locks, or the locks will be 2269 * leaked. 
2270 */ 2271 if (fp->f_type == DTYPE_VNODE && td != NULL) { 2272 vp = fp->f_vnode; 2273 if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { 2274 lf.l_whence = SEEK_SET; 2275 lf.l_start = 0; 2276 lf.l_len = 0; 2277 lf.l_type = F_UNLCK; 2278 (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, 2279 F_UNLCK, &lf, F_POSIX); 2280 } 2281 fdtol = td->td_proc->p_fdtol; 2282 if (fdtol != NULL) { 2283 /* 2284 * Handle special case where file descriptor table is 2285 * shared between multiple process leaders. 2286 */ 2287 fdp = td->td_proc->p_fd; 2288 FILEDESC_XLOCK(fdp); 2289 for (fdtol = fdtol->fdl_next; 2290 fdtol != td->td_proc->p_fdtol; 2291 fdtol = fdtol->fdl_next) { 2292 if ((fdtol->fdl_leader->p_flag & 2293 P_ADVLOCK) == 0) 2294 continue; 2295 fdtol->fdl_holdcount++; 2296 FILEDESC_XUNLOCK(fdp); 2297 lf.l_whence = SEEK_SET; 2298 lf.l_start = 0; 2299 lf.l_len = 0; 2300 lf.l_type = F_UNLCK; 2301 vp = fp->f_vnode; 2302 (void) VOP_ADVLOCK(vp, 2303 (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, 2304 F_POSIX); 2305 FILEDESC_XLOCK(fdp); 2306 fdtol->fdl_holdcount--; 2307 if (fdtol->fdl_holdcount == 0 && 2308 fdtol->fdl_wakeup != 0) { 2309 fdtol->fdl_wakeup = 0; 2310 wakeup(fdtol); 2311 } 2312 } 2313 FILEDESC_XUNLOCK(fdp); 2314 } 2315 } 2316 return (fdrop(fp, td)); 2317 } 2318 2319 /* 2320 * Initialize the file pointer with the specified properties. 2321 * 2322 * The ops are set with release semantics to be certain that the flags, type, 2323 * and data are visible when ops is. This is to prevent ops methods from being 2324 * called with bad data. 
2325 */ 2326 void 2327 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) 2328 { 2329 fp->f_data = data; 2330 fp->f_flag = flag; 2331 fp->f_type = type; 2332 atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); 2333 } 2334 2335 int 2336 fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, 2337 struct file **fpp, seq_t *seqp) 2338 { 2339 #ifdef CAPABILITIES 2340 struct filedescent *fde; 2341 #endif 2342 struct fdescenttbl *fdt; 2343 struct file *fp; 2344 u_int count; 2345 #ifdef CAPABILITIES 2346 seq_t seq; 2347 cap_rights_t haverights; 2348 int error; 2349 #endif 2350 2351 fdt = fdp->fd_files; 2352 if ((u_int)fd >= fdt->fdt_nfiles) 2353 return (EBADF); 2354 /* 2355 * Fetch the descriptor locklessly. We avoid fdrop() races by 2356 * never raising a refcount above 0. To accomplish this we have 2357 * to use a cmpset loop rather than an atomic_add. The descriptor 2358 * must be re-verified once we acquire a reference to be certain 2359 * that the identity is still correct and we did not lose a race 2360 * due to preemption. 2361 */ 2362 for (;;) { 2363 #ifdef CAPABILITIES 2364 seq = seq_read(fd_seq(fdt, fd)); 2365 fde = &fdt->fdt_ofiles[fd]; 2366 haverights = *cap_rights_fde(fde); 2367 fp = fde->fde_file; 2368 if (!seq_consistent(fd_seq(fdt, fd), seq)) { 2369 cpu_spinwait(); 2370 continue; 2371 } 2372 #else 2373 fp = fdt->fdt_ofiles[fd].fde_file; 2374 #endif 2375 if (fp == NULL) 2376 return (EBADF); 2377 #ifdef CAPABILITIES 2378 if (needrightsp != NULL) { 2379 error = cap_check(&haverights, needrightsp); 2380 if (error != 0) 2381 return (error); 2382 } 2383 #endif 2384 retry: 2385 count = fp->f_count; 2386 if (count == 0) { 2387 /* 2388 * Force a reload. Other thread could reallocate the 2389 * table before this fd was closed, so it possible that 2390 * there is a stale fp pointer in cached version. 
2391 */ 2392 fdt = *(struct fdescenttbl * volatile *)&(fdp->fd_files); 2393 continue; 2394 } 2395 /* 2396 * Use an acquire barrier to force re-reading of fdt so it is 2397 * refreshed for verification. 2398 */ 2399 if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) == 0) 2400 goto retry; 2401 fdt = fdp->fd_files; 2402 #ifdef CAPABILITIES 2403 if (seq_consistent_nomb(fd_seq(fdt, fd), seq)) 2404 #else 2405 if (fp == fdt->fdt_ofiles[fd].fde_file) 2406 #endif 2407 break; 2408 fdrop(fp, curthread); 2409 } 2410 *fpp = fp; 2411 if (seqp != NULL) { 2412 #ifdef CAPABILITIES 2413 *seqp = seq; 2414 #endif 2415 } 2416 return (0); 2417 } 2418 2419 /* 2420 * Extract the file pointer associated with the specified descriptor for the 2421 * current user process. 2422 * 2423 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is 2424 * returned. 2425 * 2426 * File's rights will be checked against the capability rights mask. 2427 * 2428 * If an error occured the non-zero error is returned and *fpp is set to 2429 * NULL. Otherwise *fpp is held and set and zero is returned. Caller is 2430 * responsible for fdrop(). 2431 */ 2432 static __inline int 2433 _fget(struct thread *td, int fd, struct file **fpp, int flags, 2434 cap_rights_t *needrightsp, seq_t *seqp) 2435 { 2436 struct filedesc *fdp; 2437 struct file *fp; 2438 cap_rights_t needrights; 2439 int error; 2440 2441 *fpp = NULL; 2442 fdp = td->td_proc->p_fd; 2443 if (needrightsp != NULL) 2444 needrights = *needrightsp; 2445 else 2446 cap_rights_init(&needrights); 2447 error = fget_unlocked(fdp, fd, &needrights, &fp, seqp); 2448 if (error != 0) 2449 return (error); 2450 if (fp->f_ops == &badfileops) { 2451 fdrop(fp, td); 2452 return (EBADF); 2453 } 2454 2455 /* 2456 * FREAD and FWRITE failure return EBADF as per POSIX. 
2457 */ 2458 error = 0; 2459 switch (flags) { 2460 case FREAD: 2461 case FWRITE: 2462 if ((fp->f_flag & flags) == 0) 2463 error = EBADF; 2464 break; 2465 case FEXEC: 2466 if ((fp->f_flag & (FREAD | FEXEC)) == 0 || 2467 ((fp->f_flag & FWRITE) != 0)) 2468 error = EBADF; 2469 break; 2470 case 0: 2471 break; 2472 default: 2473 KASSERT(0, ("wrong flags")); 2474 } 2475 2476 if (error != 0) { 2477 fdrop(fp, td); 2478 return (error); 2479 } 2480 2481 *fpp = fp; 2482 return (0); 2483 } 2484 2485 int 2486 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 2487 { 2488 2489 return (_fget(td, fd, fpp, 0, rightsp, NULL)); 2490 } 2491 2492 int 2493 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp, 2494 struct file **fpp) 2495 { 2496 int error; 2497 #ifndef CAPABILITIES 2498 error = _fget(td, fd, fpp, 0, rightsp, NULL); 2499 if (maxprotp != NULL) 2500 *maxprotp = VM_PROT_ALL; 2501 #else 2502 struct filedesc *fdp = td->td_proc->p_fd; 2503 seq_t seq; 2504 2505 MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); 2506 for (;;) { 2507 error = _fget(td, fd, fpp, 0, rightsp, &seq); 2508 if (error != 0) 2509 return (error); 2510 /* 2511 * If requested, convert capability rights to access flags. 
2512 */ 2513 if (maxprotp != NULL) 2514 *maxprotp = cap_rights_to_vmprot(cap_rights(fdp, fd)); 2515 if (!fd_modified(fdp, fd, seq)) 2516 break; 2517 fdrop(*fpp, td); 2518 } 2519 #endif 2520 return (error); 2521 } 2522 2523 int 2524 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 2525 { 2526 2527 return (_fget(td, fd, fpp, FREAD, rightsp, NULL)); 2528 } 2529 2530 int 2531 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) 2532 { 2533 2534 return (_fget(td, fd, fpp, FWRITE, rightsp, NULL)); 2535 } 2536 2537 int 2538 fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, 2539 struct file **fpp) 2540 { 2541 struct filedesc *fdp = td->td_proc->p_fd; 2542 #ifndef CAPABILITIES 2543 return (fget_unlocked(fdp, fd, rightsp, fpp, NULL)); 2544 #else 2545 int error; 2546 seq_t seq; 2547 2548 MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); 2549 for (;;) { 2550 error = fget_unlocked(fdp, fd, rightsp, fpp, &seq); 2551 if (error != 0) 2552 return (error); 2553 error = cap_fcntl_check(fdp, fd, needfcntl); 2554 if (!fd_modified(fdp, fd, seq)) 2555 break; 2556 fdrop(*fpp, td); 2557 } 2558 if (error != 0) { 2559 fdrop(*fpp, td); 2560 *fpp = NULL; 2561 } 2562 return (error); 2563 #endif 2564 } 2565 2566 /* 2567 * Like fget() but loads the underlying vnode, or returns an error if the 2568 * descriptor does not represent a vnode. Note that pipes use vnodes but 2569 * never have VM objects. The returned vnode will be vref()'d. 2570 * 2571 * XXX: what about the unused flags ? 
2572 */ 2573 static __inline int 2574 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, 2575 struct vnode **vpp) 2576 { 2577 struct file *fp; 2578 int error; 2579 2580 *vpp = NULL; 2581 error = _fget(td, fd, &fp, flags, needrightsp, NULL); 2582 if (error != 0) 2583 return (error); 2584 if (fp->f_vnode == NULL) { 2585 error = EINVAL; 2586 } else { 2587 *vpp = fp->f_vnode; 2588 vref(*vpp); 2589 } 2590 fdrop(fp, td); 2591 2592 return (error); 2593 } 2594 2595 int 2596 fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) 2597 { 2598 2599 return (_fgetvp(td, fd, 0, rightsp, vpp)); 2600 } 2601 2602 int 2603 fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, 2604 struct filecaps *havecaps, struct vnode **vpp) 2605 { 2606 struct filedesc *fdp; 2607 struct file *fp; 2608 #ifdef CAPABILITIES 2609 int error; 2610 #endif 2611 2612 fdp = td->td_proc->p_fd; 2613 fp = fget_locked(fdp, fd); 2614 if (fp == NULL || fp->f_ops == &badfileops) 2615 return (EBADF); 2616 2617 #ifdef CAPABILITIES 2618 if (needrightsp != NULL) { 2619 error = cap_check(cap_rights(fdp, fd), needrightsp); 2620 if (error != 0) 2621 return (error); 2622 } 2623 #endif 2624 2625 if (fp->f_vnode == NULL) 2626 return (EINVAL); 2627 2628 *vpp = fp->f_vnode; 2629 vref(*vpp); 2630 filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps); 2631 2632 return (0); 2633 } 2634 2635 int 2636 fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) 2637 { 2638 2639 return (_fgetvp(td, fd, FREAD, rightsp, vpp)); 2640 } 2641 2642 int 2643 fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) 2644 { 2645 2646 return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); 2647 } 2648 2649 #ifdef notyet 2650 int 2651 fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, 2652 struct vnode **vpp) 2653 { 2654 2655 return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); 2656 } 2657 #endif 2658 2659 /* 2660 * Like fget() but loads the 
underlying socket, or returns an error if the 2661 * descriptor does not represent a socket. 2662 * 2663 * We bump the ref count on the returned socket. XXX Also obtain the SX lock 2664 * in the future. 2665 * 2666 * Note: fgetsock() and fputsock() are deprecated, as consumers should rely 2667 * on their file descriptor reference to prevent the socket from being free'd 2668 * during use. 2669 */ 2670 int 2671 fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp, 2672 u_int *fflagp) 2673 { 2674 struct file *fp; 2675 int error; 2676 2677 *spp = NULL; 2678 if (fflagp != NULL) 2679 *fflagp = 0; 2680 if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0) 2681 return (error); 2682 if (fp->f_type != DTYPE_SOCKET) { 2683 error = ENOTSOCK; 2684 } else { 2685 *spp = fp->f_data; 2686 if (fflagp) 2687 *fflagp = fp->f_flag; 2688 SOCK_LOCK(*spp); 2689 soref(*spp); 2690 SOCK_UNLOCK(*spp); 2691 } 2692 fdrop(fp, td); 2693 2694 return (error); 2695 } 2696 2697 /* 2698 * Drop the reference count on the socket and XXX release the SX lock in the 2699 * future. The last reference closes the socket. 2700 * 2701 * Note: fputsock() is deprecated, see comment for fgetsock(). 2702 */ 2703 void 2704 fputsock(struct socket *so) 2705 { 2706 2707 ACCEPT_LOCK(); 2708 SOCK_LOCK(so); 2709 CURVNET_SET(so->so_vnet); 2710 sorele(so); 2711 CURVNET_RESTORE(); 2712 } 2713 2714 /* 2715 * Handle the last reference to a file being closed. 2716 */ 2717 int 2718 _fdrop(struct file *fp, struct thread *td) 2719 { 2720 int error; 2721 2722 if (fp->f_count != 0) 2723 panic("fdrop: count %d", fp->f_count); 2724 error = fo_close(fp, td); 2725 atomic_subtract_int(&openfiles, 1); 2726 crfree(fp->f_cred); 2727 free(fp->f_advice, M_FADVISE); 2728 uma_zfree(file_zone, fp); 2729 2730 return (error); 2731 } 2732 2733 /* 2734 * Apply an advisory lock on a file descriptor. 
2735 * 2736 * Just attempt to get a record lock of the requested type on the entire file 2737 * (l_whence = SEEK_SET, l_start = 0, l_len = 0). 2738 */ 2739 #ifndef _SYS_SYSPROTO_H_ 2740 struct flock_args { 2741 int fd; 2742 int how; 2743 }; 2744 #endif 2745 /* ARGSUSED */ 2746 int 2747 sys_flock(struct thread *td, struct flock_args *uap) 2748 { 2749 struct file *fp; 2750 struct vnode *vp; 2751 struct flock lf; 2752 cap_rights_t rights; 2753 int error; 2754 2755 error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp); 2756 if (error != 0) 2757 return (error); 2758 if (fp->f_type != DTYPE_VNODE) { 2759 fdrop(fp, td); 2760 return (EOPNOTSUPP); 2761 } 2762 2763 vp = fp->f_vnode; 2764 lf.l_whence = SEEK_SET; 2765 lf.l_start = 0; 2766 lf.l_len = 0; 2767 if (uap->how & LOCK_UN) { 2768 lf.l_type = F_UNLCK; 2769 atomic_clear_int(&fp->f_flag, FHASLOCK); 2770 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); 2771 goto done2; 2772 } 2773 if (uap->how & LOCK_EX) 2774 lf.l_type = F_WRLCK; 2775 else if (uap->how & LOCK_SH) 2776 lf.l_type = F_RDLCK; 2777 else { 2778 error = EBADF; 2779 goto done2; 2780 } 2781 atomic_set_int(&fp->f_flag, FHASLOCK); 2782 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 2783 (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); 2784 done2: 2785 fdrop(fp, td); 2786 return (error); 2787 } 2788 /* 2789 * Duplicate the specified descriptor to a free descriptor. 2790 */ 2791 int 2792 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, 2793 int openerror, int *indxp) 2794 { 2795 struct filedescent *newfde, *oldfde; 2796 struct file *fp; 2797 int error, indx; 2798 2799 KASSERT(openerror == ENODEV || openerror == ENXIO, 2800 ("unexpected error %d in %s", openerror, __func__)); 2801 2802 /* 2803 * If the to-be-dup'd fd number is greater than the allowed number 2804 * of file descriptors, or the fd to be dup'd has already been 2805 * closed, then reject. 
2806 */ 2807 FILEDESC_XLOCK(fdp); 2808 if ((fp = fget_locked(fdp, dfd)) == NULL) { 2809 FILEDESC_XUNLOCK(fdp); 2810 return (EBADF); 2811 } 2812 2813 error = fdalloc(td, 0, &indx); 2814 if (error != 0) { 2815 FILEDESC_XUNLOCK(fdp); 2816 return (error); 2817 } 2818 2819 /* 2820 * There are two cases of interest here. 2821 * 2822 * For ENODEV simply dup (dfd) to file descriptor (indx) and return. 2823 * 2824 * For ENXIO steal away the file structure from (dfd) and store it in 2825 * (indx). (dfd) is effectively closed by this operation. 2826 */ 2827 switch (openerror) { 2828 case ENODEV: 2829 /* 2830 * Check that the mode the file is being opened for is a 2831 * subset of the mode of the existing descriptor. 2832 */ 2833 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { 2834 fdunused(fdp, indx); 2835 FILEDESC_XUNLOCK(fdp); 2836 return (EACCES); 2837 } 2838 fhold(fp); 2839 newfde = &fdp->fd_ofiles[indx]; 2840 oldfde = &fdp->fd_ofiles[dfd]; 2841 #ifdef CAPABILITIES 2842 seq_write_begin(&newfde->fde_seq); 2843 #endif 2844 memcpy(newfde, oldfde, fde_change_size); 2845 filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps); 2846 #ifdef CAPABILITIES 2847 seq_write_end(&newfde->fde_seq); 2848 #endif 2849 break; 2850 case ENXIO: 2851 /* 2852 * Steal away the file pointer from dfd and stuff it into indx. 2853 */ 2854 newfde = &fdp->fd_ofiles[indx]; 2855 oldfde = &fdp->fd_ofiles[dfd]; 2856 #ifdef CAPABILITIES 2857 seq_write_begin(&newfde->fde_seq); 2858 #endif 2859 memcpy(newfde, oldfde, fde_change_size); 2860 bzero(oldfde, fde_change_size); 2861 fdunused(fdp, dfd); 2862 #ifdef CAPABILITIES 2863 seq_write_end(&newfde->fde_seq); 2864 #endif 2865 break; 2866 } 2867 FILEDESC_XUNLOCK(fdp); 2868 *indxp = indx; 2869 return (0); 2870 } 2871 2872 /* 2873 * Scan all active processes and prisons to see if any of them have a current 2874 * or root directory of `olddp'. If so, replace them with the new mount point. 
2875 */ 2876 void 2877 mountcheckdirs(struct vnode *olddp, struct vnode *newdp) 2878 { 2879 struct filedesc *fdp; 2880 struct prison *pr; 2881 struct proc *p; 2882 int nrele; 2883 2884 if (vrefcnt(olddp) == 1) 2885 return; 2886 nrele = 0; 2887 sx_slock(&allproc_lock); 2888 FOREACH_PROC_IN_SYSTEM(p) { 2889 fdp = fdhold(p); 2890 if (fdp == NULL) 2891 continue; 2892 FILEDESC_XLOCK(fdp); 2893 if (fdp->fd_cdir == olddp) { 2894 vref(newdp); 2895 fdp->fd_cdir = newdp; 2896 nrele++; 2897 } 2898 if (fdp->fd_rdir == olddp) { 2899 vref(newdp); 2900 fdp->fd_rdir = newdp; 2901 nrele++; 2902 } 2903 if (fdp->fd_jdir == olddp) { 2904 vref(newdp); 2905 fdp->fd_jdir = newdp; 2906 nrele++; 2907 } 2908 FILEDESC_XUNLOCK(fdp); 2909 fddrop(fdp); 2910 } 2911 sx_sunlock(&allproc_lock); 2912 if (rootvnode == olddp) { 2913 vref(newdp); 2914 rootvnode = newdp; 2915 nrele++; 2916 } 2917 mtx_lock(&prison0.pr_mtx); 2918 if (prison0.pr_root == olddp) { 2919 vref(newdp); 2920 prison0.pr_root = newdp; 2921 nrele++; 2922 } 2923 mtx_unlock(&prison0.pr_mtx); 2924 sx_slock(&allprison_lock); 2925 TAILQ_FOREACH(pr, &allprison, pr_list) { 2926 mtx_lock(&pr->pr_mtx); 2927 if (pr->pr_root == olddp) { 2928 vref(newdp); 2929 pr->pr_root = newdp; 2930 nrele++; 2931 } 2932 mtx_unlock(&pr->pr_mtx); 2933 } 2934 sx_sunlock(&allprison_lock); 2935 while (nrele--) 2936 vrele(olddp); 2937 } 2938 2939 struct filedesc_to_leader * 2940 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) 2941 { 2942 struct filedesc_to_leader *fdtol; 2943 2944 fdtol = malloc(sizeof(struct filedesc_to_leader), 2945 M_FILEDESC_TO_LEADER, 2946 M_WAITOK); 2947 fdtol->fdl_refcount = 1; 2948 fdtol->fdl_holdcount = 0; 2949 fdtol->fdl_wakeup = 0; 2950 fdtol->fdl_leader = leader; 2951 if (old != NULL) { 2952 FILEDESC_XLOCK(fdp); 2953 fdtol->fdl_next = old->fdl_next; 2954 fdtol->fdl_prev = old; 2955 old->fdl_next = fdtol; 2956 fdtol->fdl_next->fdl_prev = fdtol; 2957 FILEDESC_XUNLOCK(fdp); 2958 } else { 
2959 fdtol->fdl_next = fdtol; 2960 fdtol->fdl_prev = fdtol; 2961 } 2962 return (fdtol); 2963 } 2964 2965 /* 2966 * Get file structures globally. 2967 */ 2968 static int 2969 sysctl_kern_file(SYSCTL_HANDLER_ARGS) 2970 { 2971 struct xfile xf; 2972 struct filedesc *fdp; 2973 struct file *fp; 2974 struct proc *p; 2975 int error, n; 2976 2977 error = sysctl_wire_old_buffer(req, 0); 2978 if (error != 0) 2979 return (error); 2980 if (req->oldptr == NULL) { 2981 n = 0; 2982 sx_slock(&allproc_lock); 2983 FOREACH_PROC_IN_SYSTEM(p) { 2984 if (p->p_state == PRS_NEW) 2985 continue; 2986 fdp = fdhold(p); 2987 if (fdp == NULL) 2988 continue; 2989 /* overestimates sparse tables. */ 2990 if (fdp->fd_lastfile > 0) 2991 n += fdp->fd_lastfile; 2992 fddrop(fdp); 2993 } 2994 sx_sunlock(&allproc_lock); 2995 return (SYSCTL_OUT(req, 0, n * sizeof(xf))); 2996 } 2997 error = 0; 2998 bzero(&xf, sizeof(xf)); 2999 xf.xf_size = sizeof(xf); 3000 sx_slock(&allproc_lock); 3001 FOREACH_PROC_IN_SYSTEM(p) { 3002 PROC_LOCK(p); 3003 if (p->p_state == PRS_NEW) { 3004 PROC_UNLOCK(p); 3005 continue; 3006 } 3007 if (p_cansee(req->td, p) != 0) { 3008 PROC_UNLOCK(p); 3009 continue; 3010 } 3011 xf.xf_pid = p->p_pid; 3012 xf.xf_uid = p->p_ucred->cr_uid; 3013 PROC_UNLOCK(p); 3014 fdp = fdhold(p); 3015 if (fdp == NULL) 3016 continue; 3017 FILEDESC_SLOCK(fdp); 3018 for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) { 3019 if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) 3020 continue; 3021 xf.xf_fd = n; 3022 xf.xf_file = fp; 3023 xf.xf_data = fp->f_data; 3024 xf.xf_vnode = fp->f_vnode; 3025 xf.xf_type = fp->f_type; 3026 xf.xf_count = fp->f_count; 3027 xf.xf_msgcount = 0; 3028 xf.xf_offset = foffset_get(fp); 3029 xf.xf_flag = fp->f_flag; 3030 error = SYSCTL_OUT(req, &xf, sizeof(xf)); 3031 if (error) 3032 break; 3033 } 3034 FILEDESC_SUNLOCK(fdp); 3035 fddrop(fdp); 3036 if (error) 3037 break; 3038 } 3039 sx_sunlock(&allproc_lock); 3040 return (error); 3041 } 3042 3043 SYSCTL_PROC(_kern, KERN_FILE, file, 
CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 3044 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); 3045 3046 #ifdef KINFO_FILE_SIZE 3047 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); 3048 #endif 3049 3050 static int 3051 xlate_fflags(int fflags) 3052 { 3053 static const struct { 3054 int fflag; 3055 int kf_fflag; 3056 } fflags_table[] = { 3057 { FAPPEND, KF_FLAG_APPEND }, 3058 { FASYNC, KF_FLAG_ASYNC }, 3059 { FFSYNC, KF_FLAG_FSYNC }, 3060 { FHASLOCK, KF_FLAG_HASLOCK }, 3061 { FNONBLOCK, KF_FLAG_NONBLOCK }, 3062 { FREAD, KF_FLAG_READ }, 3063 { FWRITE, KF_FLAG_WRITE }, 3064 { O_CREAT, KF_FLAG_CREAT }, 3065 { O_DIRECT, KF_FLAG_DIRECT }, 3066 { O_EXCL, KF_FLAG_EXCL }, 3067 { O_EXEC, KF_FLAG_EXEC }, 3068 { O_EXLOCK, KF_FLAG_EXLOCK }, 3069 { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, 3070 { O_SHLOCK, KF_FLAG_SHLOCK }, 3071 { O_TRUNC, KF_FLAG_TRUNC } 3072 }; 3073 unsigned int i; 3074 int kflags; 3075 3076 kflags = 0; 3077 for (i = 0; i < nitems(fflags_table); i++) 3078 if (fflags & fflags_table[i].fflag) 3079 kflags |= fflags_table[i].kf_fflag; 3080 return (kflags); 3081 } 3082 3083 /* Trim unused data from kf_path by truncating the structure size. */ 3084 static void 3085 pack_kinfo(struct kinfo_file *kif) 3086 { 3087 3088 kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + 3089 strlen(kif->kf_path) + 1; 3090 kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); 3091 } 3092 3093 static void 3094 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, 3095 struct kinfo_file *kif, struct filedesc *fdp) 3096 { 3097 int error; 3098 3099 bzero(kif, sizeof(*kif)); 3100 3101 /* Set a default type to allow for empty fill_kinfo() methods. 
*/ 3102 kif->kf_type = KF_TYPE_UNKNOWN; 3103 kif->kf_flags = xlate_fflags(fp->f_flag); 3104 if (rightsp != NULL) 3105 kif->kf_cap_rights = *rightsp; 3106 else 3107 cap_rights_init(&kif->kf_cap_rights); 3108 kif->kf_fd = fd; 3109 kif->kf_ref_count = fp->f_count; 3110 kif->kf_offset = foffset_get(fp); 3111 3112 /* 3113 * This may drop the filedesc lock, so the 'fp' cannot be 3114 * accessed after this call. 3115 */ 3116 error = fo_fill_kinfo(fp, kif, fdp); 3117 if (error == 0) 3118 kif->kf_status |= KF_ATTR_VALID; 3119 pack_kinfo(kif); 3120 } 3121 3122 static void 3123 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, 3124 struct kinfo_file *kif) 3125 { 3126 int error; 3127 3128 bzero(kif, sizeof(*kif)); 3129 3130 kif->kf_type = KF_TYPE_VNODE; 3131 error = vn_fill_kinfo_vnode(vp, kif); 3132 if (error == 0) 3133 kif->kf_status |= KF_ATTR_VALID; 3134 kif->kf_flags = xlate_fflags(fflags); 3135 cap_rights_init(&kif->kf_cap_rights); 3136 kif->kf_fd = fd; 3137 kif->kf_ref_count = -1; 3138 kif->kf_offset = -1; 3139 pack_kinfo(kif); 3140 vrele(vp); 3141 } 3142 3143 struct export_fd_buf { 3144 struct filedesc *fdp; 3145 struct sbuf *sb; 3146 ssize_t remainder; 3147 struct kinfo_file kif; 3148 }; 3149 3150 static int 3151 export_kinfo_to_sb(struct export_fd_buf *efbuf) 3152 { 3153 struct kinfo_file *kif; 3154 3155 kif = &efbuf->kif; 3156 if (efbuf->remainder != -1) { 3157 if (efbuf->remainder < kif->kf_structsize) { 3158 /* Terminate export. */ 3159 efbuf->remainder = 0; 3160 return (0); 3161 } 3162 efbuf->remainder -= kif->kf_structsize; 3163 } 3164 return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 
0 : ENOMEM); 3165 } 3166 3167 static int 3168 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, 3169 struct export_fd_buf *efbuf) 3170 { 3171 int error; 3172 3173 if (efbuf->remainder == 0) 3174 return (0); 3175 export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp); 3176 FILEDESC_SUNLOCK(efbuf->fdp); 3177 error = export_kinfo_to_sb(efbuf); 3178 FILEDESC_SLOCK(efbuf->fdp); 3179 return (error); 3180 } 3181 3182 static int 3183 export_vnode_to_sb(struct vnode *vp, int fd, int fflags, 3184 struct export_fd_buf *efbuf) 3185 { 3186 int error; 3187 3188 if (efbuf->remainder == 0) 3189 return (0); 3190 if (efbuf->fdp != NULL) 3191 FILEDESC_SUNLOCK(efbuf->fdp); 3192 export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif); 3193 error = export_kinfo_to_sb(efbuf); 3194 if (efbuf->fdp != NULL) 3195 FILEDESC_SLOCK(efbuf->fdp); 3196 return (error); 3197 } 3198 3199 /* 3200 * Store a process file descriptor information to sbuf. 3201 * 3202 * Takes a locked proc as argument, and returns with the proc unlocked. 3203 */ 3204 int 3205 kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) 3206 { 3207 struct file *fp; 3208 struct filedesc *fdp; 3209 struct export_fd_buf *efbuf; 3210 struct vnode *cttyvp, *textvp, *tracevp; 3211 int error, i; 3212 cap_rights_t rights; 3213 3214 PROC_LOCK_ASSERT(p, MA_OWNED); 3215 3216 /* ktrace vnode */ 3217 tracevp = p->p_tracevp; 3218 if (tracevp != NULL) 3219 vref(tracevp); 3220 /* text vnode */ 3221 textvp = p->p_textvp; 3222 if (textvp != NULL) 3223 vref(textvp); 3224 /* Controlling tty. 
*/ 3225 cttyvp = NULL; 3226 if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { 3227 cttyvp = p->p_pgrp->pg_session->s_ttyvp; 3228 if (cttyvp != NULL) 3229 vref(cttyvp); 3230 } 3231 fdp = fdhold(p); 3232 PROC_UNLOCK(p); 3233 efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); 3234 efbuf->fdp = NULL; 3235 efbuf->sb = sb; 3236 efbuf->remainder = maxlen; 3237 if (tracevp != NULL) 3238 export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, 3239 efbuf); 3240 if (textvp != NULL) 3241 export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); 3242 if (cttyvp != NULL) 3243 export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, 3244 efbuf); 3245 error = 0; 3246 if (fdp == NULL) 3247 goto fail; 3248 efbuf->fdp = fdp; 3249 FILEDESC_SLOCK(fdp); 3250 /* working directory */ 3251 if (fdp->fd_cdir != NULL) { 3252 vref(fdp->fd_cdir); 3253 export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); 3254 } 3255 /* root directory */ 3256 if (fdp->fd_rdir != NULL) { 3257 vref(fdp->fd_rdir); 3258 export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); 3259 } 3260 /* jail directory */ 3261 if (fdp->fd_jdir != NULL) { 3262 vref(fdp->fd_jdir); 3263 export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); 3264 } 3265 for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { 3266 if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) 3267 continue; 3268 #ifdef CAPABILITIES 3269 rights = *cap_rights(fdp, i); 3270 #else /* !CAPABILITIES */ 3271 cap_rights_init(&rights); 3272 #endif 3273 /* 3274 * Create sysctl entry. It is OK to drop the filedesc 3275 * lock inside of export_file_to_sb() as we will 3276 * re-validate and re-evaluate its properties when the 3277 * loop continues. 
3278 */ 3279 error = export_file_to_sb(fp, i, &rights, efbuf); 3280 if (error != 0 || efbuf->remainder == 0) 3281 break; 3282 } 3283 FILEDESC_SUNLOCK(fdp); 3284 fddrop(fdp); 3285 fail: 3286 free(efbuf, M_TEMP); 3287 return (error); 3288 } 3289 3290 #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) 3291 3292 /* 3293 * Get per-process file descriptors for use by procstat(1), et al. 3294 */ 3295 static int 3296 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) 3297 { 3298 struct sbuf sb; 3299 struct proc *p; 3300 ssize_t maxlen; 3301 int error, error2, *name; 3302 3303 name = (int *)arg1; 3304 3305 sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); 3306 sbuf_clear_flags(&sb, SBUF_INCLUDENUL); 3307 error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); 3308 if (error != 0) { 3309 sbuf_delete(&sb); 3310 return (error); 3311 } 3312 maxlen = req->oldptr != NULL ? req->oldlen : -1; 3313 error = kern_proc_filedesc_out(p, &sb, maxlen); 3314 error2 = sbuf_finish(&sb); 3315 sbuf_delete(&sb); 3316 return (error != 0 ? 
error : error2); 3317 } 3318 3319 #ifdef KINFO_OFILE_SIZE 3320 CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); 3321 #endif 3322 3323 #ifdef COMPAT_FREEBSD7 3324 static void 3325 kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) 3326 { 3327 3328 okif->kf_structsize = sizeof(*okif); 3329 okif->kf_type = kif->kf_type; 3330 okif->kf_fd = kif->kf_fd; 3331 okif->kf_ref_count = kif->kf_ref_count; 3332 okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | 3333 KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | 3334 KF_FLAG_DIRECT | KF_FLAG_HASLOCK); 3335 okif->kf_offset = kif->kf_offset; 3336 okif->kf_vnode_type = kif->kf_vnode_type; 3337 okif->kf_sock_domain = kif->kf_sock_domain; 3338 okif->kf_sock_type = kif->kf_sock_type; 3339 okif->kf_sock_protocol = kif->kf_sock_protocol; 3340 strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); 3341 okif->kf_sa_local = kif->kf_sa_local; 3342 okif->kf_sa_peer = kif->kf_sa_peer; 3343 } 3344 3345 static int 3346 export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, 3347 struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req) 3348 { 3349 int error; 3350 3351 vref(vp); 3352 FILEDESC_SUNLOCK(fdp); 3353 export_vnode_to_kinfo(vp, type, 0, kif); 3354 kinfo_to_okinfo(kif, okif); 3355 error = SYSCTL_OUT(req, okif, sizeof(*okif)); 3356 FILEDESC_SLOCK(fdp); 3357 return (error); 3358 } 3359 3360 /* 3361 * Get per-process file descriptors for use by procstat(1), et al. 
3362 */ 3363 static int 3364 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) 3365 { 3366 struct kinfo_ofile *okif; 3367 struct kinfo_file *kif; 3368 struct filedesc *fdp; 3369 int error, i, *name; 3370 struct file *fp; 3371 struct proc *p; 3372 3373 name = (int *)arg1; 3374 error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); 3375 if (error != 0) 3376 return (error); 3377 fdp = fdhold(p); 3378 PROC_UNLOCK(p); 3379 if (fdp == NULL) 3380 return (ENOENT); 3381 kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); 3382 okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); 3383 FILEDESC_SLOCK(fdp); 3384 if (fdp->fd_cdir != NULL) 3385 export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, 3386 okif, fdp, req); 3387 if (fdp->fd_rdir != NULL) 3388 export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, 3389 okif, fdp, req); 3390 if (fdp->fd_jdir != NULL) 3391 export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, 3392 okif, fdp, req); 3393 for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { 3394 if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) 3395 continue; 3396 export_file_to_kinfo(fp, i, NULL, kif, fdp); 3397 FILEDESC_SUNLOCK(fdp); 3398 kinfo_to_okinfo(kif, okif); 3399 error = SYSCTL_OUT(req, okif, sizeof(*okif)); 3400 FILEDESC_SLOCK(fdp); 3401 if (error) 3402 break; 3403 } 3404 FILEDESC_SUNLOCK(fdp); 3405 fddrop(fdp); 3406 free(kif, M_TEMP); 3407 free(okif, M_TEMP); 3408 return (0); 3409 } 3410 3411 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, 3412 CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, 3413 "Process ofiledesc entries"); 3414 #endif /* COMPAT_FREEBSD7 */ 3415 3416 int 3417 vntype_to_kinfo(int vtype) 3418 { 3419 struct { 3420 int vtype; 3421 int kf_vtype; 3422 } vtypes_table[] = { 3423 { VBAD, KF_VTYPE_VBAD }, 3424 { VBLK, KF_VTYPE_VBLK }, 3425 { VCHR, KF_VTYPE_VCHR }, 3426 { VDIR, KF_VTYPE_VDIR }, 3427 { VFIFO, KF_VTYPE_VFIFO }, 3428 { VLNK, KF_VTYPE_VLNK }, 3429 { VNON, KF_VTYPE_VNON }, 3430 { VREG, 
KF_VTYPE_VREG }, 3431 { VSOCK, KF_VTYPE_VSOCK } 3432 }; 3433 unsigned int i; 3434 3435 /* 3436 * Perform vtype translation. 3437 */ 3438 for (i = 0; i < nitems(vtypes_table); i++) 3439 if (vtypes_table[i].vtype == vtype) 3440 return (vtypes_table[i].kf_vtype); 3441 3442 return (KF_VTYPE_UNKNOWN); 3443 } 3444 3445 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, 3446 CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, 3447 "Process filedesc entries"); 3448 3449 /* 3450 * Store a process current working directory information to sbuf. 3451 * 3452 * Takes a locked proc as argument, and returns with the proc unlocked. 3453 */ 3454 int 3455 kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) 3456 { 3457 struct filedesc *fdp; 3458 struct export_fd_buf *efbuf; 3459 int error; 3460 3461 PROC_LOCK_ASSERT(p, MA_OWNED); 3462 3463 fdp = fdhold(p); 3464 PROC_UNLOCK(p); 3465 if (fdp == NULL) 3466 return (EINVAL); 3467 3468 efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); 3469 efbuf->fdp = fdp; 3470 efbuf->sb = sb; 3471 efbuf->remainder = maxlen; 3472 3473 FILEDESC_SLOCK(fdp); 3474 if (fdp->fd_cdir == NULL) 3475 error = EINVAL; 3476 else { 3477 vref(fdp->fd_cdir); 3478 error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, 3479 FREAD, efbuf); 3480 } 3481 FILEDESC_SUNLOCK(fdp); 3482 fddrop(fdp); 3483 free(efbuf, M_TEMP); 3484 return (error); 3485 } 3486 3487 /* 3488 * Get per-process current working directory. 3489 */ 3490 static int 3491 sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) 3492 { 3493 struct sbuf sb; 3494 struct proc *p; 3495 ssize_t maxlen; 3496 int error, error2, *name; 3497 3498 name = (int *)arg1; 3499 3500 sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); 3501 sbuf_clear_flags(&sb, SBUF_INCLUDENUL); 3502 error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); 3503 if (error != 0) { 3504 sbuf_delete(&sb); 3505 return (error); 3506 } 3507 maxlen = req->oldptr != NULL ? 
req->oldlen : -1; 3508 error = kern_proc_cwd_out(p, &sb, maxlen); 3509 error2 = sbuf_finish(&sb); 3510 sbuf_delete(&sb); 3511 return (error != 0 ? error : error2); 3512 } 3513 3514 static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, 3515 sysctl_kern_proc_cwd, "Process current working directory"); 3516 3517 #ifdef DDB 3518 /* 3519 * For the purposes of debugging, generate a human-readable string for the 3520 * file type. 3521 */ 3522 static const char * 3523 file_type_to_name(short type) 3524 { 3525 3526 switch (type) { 3527 case 0: 3528 return ("zero"); 3529 case DTYPE_VNODE: 3530 return ("vnod"); 3531 case DTYPE_SOCKET: 3532 return ("sock"); 3533 case DTYPE_PIPE: 3534 return ("pipe"); 3535 case DTYPE_FIFO: 3536 return ("fifo"); 3537 case DTYPE_KQUEUE: 3538 return ("kque"); 3539 case DTYPE_CRYPTO: 3540 return ("crpt"); 3541 case DTYPE_MQUEUE: 3542 return ("mque"); 3543 case DTYPE_SHM: 3544 return ("shm"); 3545 case DTYPE_SEM: 3546 return ("ksem"); 3547 default: 3548 return ("unkn"); 3549 } 3550 } 3551 3552 /* 3553 * For the purposes of debugging, identify a process (if any, perhaps one of 3554 * many) that references the passed file in its file descriptor array. Return 3555 * NULL if none. 
3556 */ 3557 static struct proc * 3558 file_to_first_proc(struct file *fp) 3559 { 3560 struct filedesc *fdp; 3561 struct proc *p; 3562 int n; 3563 3564 FOREACH_PROC_IN_SYSTEM(p) { 3565 if (p->p_state == PRS_NEW) 3566 continue; 3567 fdp = p->p_fd; 3568 if (fdp == NULL) 3569 continue; 3570 for (n = 0; n <= fdp->fd_lastfile; n++) { 3571 if (fp == fdp->fd_ofiles[n].fde_file) 3572 return (p); 3573 } 3574 } 3575 return (NULL); 3576 } 3577 3578 static void 3579 db_print_file(struct file *fp, int header) 3580 { 3581 struct proc *p; 3582 3583 if (header) 3584 db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", 3585 "File", "Type", "Data", "Flag", "GCFl", "Count", 3586 "MCount", "Vnode", "FPID", "FCmd"); 3587 p = file_to_first_proc(fp); 3588 db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, 3589 file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, 3590 0, fp->f_count, 0, fp->f_vnode, 3591 p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); 3592 } 3593 3594 DB_SHOW_COMMAND(file, db_show_file) 3595 { 3596 struct file *fp; 3597 3598 if (!have_addr) { 3599 db_printf("usage: show file <addr>\n"); 3600 return; 3601 } 3602 fp = (struct file *)addr; 3603 db_print_file(fp, 1); 3604 } 3605 3606 DB_SHOW_COMMAND(files, db_show_files) 3607 { 3608 struct filedesc *fdp; 3609 struct file *fp; 3610 struct proc *p; 3611 int header; 3612 int n; 3613 3614 header = 1; 3615 FOREACH_PROC_IN_SYSTEM(p) { 3616 if (p->p_state == PRS_NEW) 3617 continue; 3618 if ((fdp = p->p_fd) == NULL) 3619 continue; 3620 for (n = 0; n <= fdp->fd_lastfile; ++n) { 3621 if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) 3622 continue; 3623 db_print_file(fp, header); 3624 header = 0; 3625 } 3626 } 3627 } 3628 #endif 3629 3630 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, 3631 &maxfilesperproc, 0, "Maximum files allowed open per process"); 3632 3633 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, 3634 &maxfiles, 0, "Maximum number of files"); 3635 3636 SYSCTL_INT(_kern, 
OID_AUTO, openfiles, CTLFLAG_RD, 3637 __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); 3638 3639 /* ARGSUSED*/ 3640 static void 3641 filelistinit(void *dummy) 3642 { 3643 3644 file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, 3645 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 3646 filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), 3647 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 3648 mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); 3649 mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); 3650 } 3651 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); 3652 3653 /*-------------------------------------------------------------------*/ 3654 3655 static int 3656 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, 3657 int flags, struct thread *td) 3658 { 3659 3660 return (EBADF); 3661 } 3662 3663 static int 3664 badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, 3665 struct thread *td) 3666 { 3667 3668 return (EINVAL); 3669 } 3670 3671 static int 3672 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, 3673 struct thread *td) 3674 { 3675 3676 return (EBADF); 3677 } 3678 3679 static int 3680 badfo_poll(struct file *fp, int events, struct ucred *active_cred, 3681 struct thread *td) 3682 { 3683 3684 return (0); 3685 } 3686 3687 static int 3688 badfo_kqfilter(struct file *fp, struct knote *kn) 3689 { 3690 3691 return (EBADF); 3692 } 3693 3694 static int 3695 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, 3696 struct thread *td) 3697 { 3698 3699 return (EBADF); 3700 } 3701 3702 static int 3703 badfo_close(struct file *fp, struct thread *td) 3704 { 3705 3706 return (0); 3707 } 3708 3709 static int 3710 badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 3711 struct thread *td) 3712 { 3713 3714 return (EBADF); 3715 } 3716 3717 static int 3718 badfo_chown(struct file *fp, uid_t uid, gid_t gid, 
struct ucred *active_cred, 3719 struct thread *td) 3720 { 3721 3722 return (EBADF); 3723 } 3724 3725 static int 3726 badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, 3727 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, 3728 int kflags, struct thread *td) 3729 { 3730 3731 return (EBADF); 3732 } 3733 3734 static int 3735 badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 3736 { 3737 3738 return (0); 3739 } 3740 3741 struct fileops badfileops = { 3742 .fo_read = badfo_readwrite, 3743 .fo_write = badfo_readwrite, 3744 .fo_truncate = badfo_truncate, 3745 .fo_ioctl = badfo_ioctl, 3746 .fo_poll = badfo_poll, 3747 .fo_kqfilter = badfo_kqfilter, 3748 .fo_stat = badfo_stat, 3749 .fo_close = badfo_close, 3750 .fo_chmod = badfo_chmod, 3751 .fo_chown = badfo_chown, 3752 .fo_sendfile = badfo_sendfile, 3753 .fo_fill_kinfo = badfo_fill_kinfo, 3754 }; 3755 3756 int 3757 invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, 3758 int flags, struct thread *td) 3759 { 3760 3761 return (EOPNOTSUPP); 3762 } 3763 3764 int 3765 invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, 3766 struct thread *td) 3767 { 3768 3769 return (EINVAL); 3770 } 3771 3772 int 3773 invfo_ioctl(struct file *fp, u_long com, void *data, 3774 struct ucred *active_cred, struct thread *td) 3775 { 3776 3777 return (ENOTTY); 3778 } 3779 3780 int 3781 invfo_poll(struct file *fp, int events, struct ucred *active_cred, 3782 struct thread *td) 3783 { 3784 3785 return (poll_no_poll(events)); 3786 } 3787 3788 int 3789 invfo_kqfilter(struct file *fp, struct knote *kn) 3790 { 3791 3792 return (EINVAL); 3793 } 3794 3795 int 3796 invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 3797 struct thread *td) 3798 { 3799 3800 return (EINVAL); 3801 } 3802 3803 int 3804 invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 3805 struct thread *td) 3806 { 3807 3808 return (EINVAL); 3809 } 
3810 3811 int 3812 invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, 3813 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, 3814 int kflags, struct thread *td) 3815 { 3816 3817 return (EINVAL); 3818 } 3819 3820 /*-------------------------------------------------------------------*/ 3821 3822 /* 3823 * File Descriptor pseudo-device driver (/dev/fd/). 3824 * 3825 * Opening minor device N dup()s the file (if any) connected to file 3826 * descriptor N belonging to the calling process. Note that this driver 3827 * consists of only the ``open()'' routine, because all subsequent 3828 * references to this file will be direct to the other driver. 3829 * 3830 * XXX: we could give this one a cloning event handler if necessary. 3831 */ 3832 3833 /* ARGSUSED */ 3834 static int 3835 fdopen(struct cdev *dev, int mode, int type, struct thread *td) 3836 { 3837 3838 /* 3839 * XXX Kludge: set curthread->td_dupfd to contain the value of the 3840 * the file descriptor being sought for duplication. The error 3841 * return ensures that the vnode for this device will be released 3842 * by vn_open. Open will detect this special error and take the 3843 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN 3844 * will simply report the error. 
3845 */ 3846 td->td_dupfd = dev2unit(dev); 3847 return (ENODEV); 3848 } 3849 3850 static struct cdevsw fildesc_cdevsw = { 3851 .d_version = D_VERSION, 3852 .d_open = fdopen, 3853 .d_name = "FD", 3854 }; 3855 3856 static void 3857 fildesc_drvinit(void *unused) 3858 { 3859 struct cdev *dev; 3860 3861 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, 3862 UID_ROOT, GID_WHEEL, 0666, "fd/0"); 3863 make_dev_alias(dev, "stdin"); 3864 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, 3865 UID_ROOT, GID_WHEEL, 0666, "fd/1"); 3866 make_dev_alias(dev, "stdout"); 3867 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, 3868 UID_ROOT, GID_WHEEL, 0666, "fd/2"); 3869 make_dev_alias(dev, "stderr"); 3870 } 3871 3872 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); 3873