1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_capsicum.h" 41 #include "opt_compat.h" 42 #include "opt_ddb.h" 43 #include "opt_ktrace.h" 44 #include "opt_procdesc.h" 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 49 #include <sys/capability.h> 50 #include <sys/conf.h> 51 #include <sys/domain.h> 52 #include <sys/fcntl.h> 53 #include <sys/file.h> 54 #include <sys/filedesc.h> 55 #include <sys/filio.h> 56 #include <sys/jail.h> 57 #include <sys/kernel.h> 58 #include <sys/limits.h> 59 #include <sys/lock.h> 60 #include <sys/malloc.h> 61 #include <sys/mman.h> 62 #include <sys/mount.h> 63 #include <sys/mqueue.h> 64 #include <sys/mutex.h> 65 #include <sys/namei.h> 66 #include <sys/selinfo.h> 67 #include <sys/pipe.h> 68 #include <sys/priv.h> 69 #include <sys/proc.h> 70 #include <sys/procdesc.h> 71 #include <sys/protosw.h> 72 #include <sys/racct.h> 73 #include <sys/resourcevar.h> 74 #include <sys/signalvar.h> 75 #include <sys/socketvar.h> 76 #include <sys/stat.h> 77 #include <sys/sx.h> 78 #include <sys/syscallsubr.h> 79 #include <sys/sysctl.h> 80 #include <sys/sysproto.h> 81 #include <sys/tty.h> 82 #include <sys/unistd.h> 83 #include <sys/un.h> 84 #include <sys/unpcb.h> 85 #include <sys/user.h> 86 #include <sys/vnode.h> 87 #ifdef KTRACE 88 #include <sys/ktrace.h> 89 #endif 90 91 #include <net/vnet.h> 92 93 #include <netinet/in.h> 94 #include <netinet/in_pcb.h> 95 96 #include <security/audit/audit.h> 97 98 #include <vm/uma.h> 99 #include <vm/vm.h> 100 101 #include <ddb/ddb.h> 102 103 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); 104 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", 105 "file desc to leader structures"); 106 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); 107 MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); 108 109 MALLOC_DECLARE(M_FADVISE); 110 111 static uma_zone_t file_zone; 112 113 114 
/*
 * Forward declarations for the file-local helpers defined (or used before
 * their definition) in this file.
 */
static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
		    struct thread *td, int holdleaders);
static int	fd_first_free(struct filedesc *fdp, int low, int size);
static int	fd_last_used(struct filedesc *fdp, int size);
static void	fdgrowtable(struct filedesc *fdp, int nfd);
static void	fdunused(struct filedesc *fdp, int fd);
static void	fdused(struct filedesc *fdp, int fd);
static int	fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
static int	fill_procdesc_info(struct procdesc *pdp,
		    struct kinfo_file *kif);
static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
static int	fill_shm_info(struct file *fp, struct kinfo_file *kif);
static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);

/*
 * Each process has:
 *
 * - An array of open file descriptor entries (fd_ofiles); each entry
 *   carries the file pointer (fde_file), the capability rights
 *   (fde_caps) and the per-descriptor flags (fde_flags)
 * - A bitmap recording which descriptors are in use (fd_map)
 *
 * A process starts out with NDFILE descriptors.  The value of NDFILE has
 * been selected based on the historical limit of 20 open files, and an
 * assumption that the majority of processes, especially short-lived
 * processes like shells, will never need more.
 *
 * If this initial allocation is exhausted, a larger descriptor table and
 * map are allocated dynamically, and the pointers in the process's struct
 * filedesc are updated to point to those.  This is repeated every time
 * the process runs out of file descriptors (provided it hasn't hit its
 * resource limit).
 *
 * Since threads may hold references to individual descriptor table
 * entries, the tables are never freed.  Instead, they are placed on a
 * linked list and freed only when the struct filedesc is released.
*/
/* Number of descriptors in the initial, embedded descriptor table. */
#define NDFILE		20
/* Size in bytes of one word of the in-use bitmap. */
#define NDSLOTSIZE	sizeof(NDSLOTTYPE)
/* Number of descriptors tracked by one bitmap word. */
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
/* Index of the bitmap word that holds the bit for descriptor x. */
#define NDSLOT(x)	((x) / NDENTRIES)
/* Mask selecting descriptor x's bit within its bitmap word. */
#define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
/* Number of bitmap words needed to track x descriptors (rounded up). */
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)

/*
 * SLIST entry used to keep track of ofiles which must be reclaimed when
 * the process exits.
 */
struct freetable {
	struct filedescent *ft_table;	/* superseded descriptor table */
	SLIST_ENTRY(freetable) ft_next;	/* linkage on the fd_free list */
};

/*
 * Initial allocation: a filedesc structure + the head of SLIST used to
 * keep track of old ofiles + enough space for NDFILE descriptors.
 */
struct filedesc0 {
	struct filedesc fd_fd;		/* the descriptor table header */
	SLIST_HEAD(, freetable) fd_free; /* old tables awaiting release */
	struct	filedescent fd_dfiles[NDFILE];	/* initial descriptor entries */
	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];	/* initial in-use bitmap */
};

/*
 * Descriptor management.
 */
volatile int openfiles;			/* actual number of open files */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
/* Hook invoked by closefp() when a DTYPE_MQUEUE descriptor is closed. */
void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);

/* A mutex to protect the association between a proc and filedesc. */
static struct mtx fdesc_mtx;

/*
 * If low >= size, just return low.  Otherwise find the first zero bit in the
 * given bitmap, starting at low and not exceeding size - 1.  Return size if
 * not found.
192 */ 193 static int 194 fd_first_free(struct filedesc *fdp, int low, int size) 195 { 196 NDSLOTTYPE *map = fdp->fd_map; 197 NDSLOTTYPE mask; 198 int off, maxoff; 199 200 if (low >= size) 201 return (low); 202 203 off = NDSLOT(low); 204 if (low % NDENTRIES) { 205 mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); 206 if ((mask &= ~map[off]) != 0UL) 207 return (off * NDENTRIES + ffsl(mask) - 1); 208 ++off; 209 } 210 for (maxoff = NDSLOTS(size); off < maxoff; ++off) 211 if (map[off] != ~0UL) 212 return (off * NDENTRIES + ffsl(~map[off]) - 1); 213 return (size); 214 } 215 216 /* 217 * Find the highest non-zero bit in the given bitmap, starting at 0 and 218 * not exceeding size - 1. Return -1 if not found. 219 */ 220 static int 221 fd_last_used(struct filedesc *fdp, int size) 222 { 223 NDSLOTTYPE *map = fdp->fd_map; 224 NDSLOTTYPE mask; 225 int off, minoff; 226 227 off = NDSLOT(size); 228 if (size % NDENTRIES) { 229 mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); 230 if ((mask &= map[off]) != 0) 231 return (off * NDENTRIES + flsl(mask) - 1); 232 --off; 233 } 234 for (minoff = NDSLOT(0); off >= minoff; --off) 235 if (map[off] != 0) 236 return (off * NDENTRIES + flsl(map[off]) - 1); 237 return (-1); 238 } 239 240 static int 241 fdisused(struct filedesc *fdp, int fd) 242 { 243 244 FILEDESC_LOCK_ASSERT(fdp); 245 246 KASSERT(fd >= 0 && fd < fdp->fd_nfiles, 247 ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); 248 249 return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); 250 } 251 252 /* 253 * Mark a file descriptor as used. 254 */ 255 static void 256 fdused(struct filedesc *fdp, int fd) 257 { 258 259 FILEDESC_XLOCK_ASSERT(fdp); 260 261 KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); 262 263 fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); 264 if (fd > fdp->fd_lastfile) 265 fdp->fd_lastfile = fd; 266 if (fd == fdp->fd_freefile) 267 fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); 268 } 269 270 /* 271 * Mark a file descriptor as unused. 
272 */ 273 static void 274 fdunused(struct filedesc *fdp, int fd) 275 { 276 277 FILEDESC_XLOCK_ASSERT(fdp); 278 279 KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); 280 KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, 281 ("fd=%d is still in use", fd)); 282 283 fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); 284 if (fd < fdp->fd_freefile) 285 fdp->fd_freefile = fd; 286 if (fd == fdp->fd_lastfile) 287 fdp->fd_lastfile = fd_last_used(fdp, fd); 288 } 289 290 /* 291 * Free a file descriptor. 292 */ 293 static inline void 294 fdfree(struct filedesc *fdp, int fd) 295 { 296 struct filedescent *fde; 297 298 fde = &fdp->fd_ofiles[fd]; 299 filecaps_free(&fde->fde_caps); 300 bzero(fde, sizeof(*fde)); 301 fdunused(fdp, fd); 302 } 303 304 /* 305 * System calls on descriptors. 306 */ 307 #ifndef _SYS_SYSPROTO_H_ 308 struct getdtablesize_args { 309 int dummy; 310 }; 311 #endif 312 /* ARGSUSED */ 313 int 314 sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) 315 { 316 struct proc *p = td->td_proc; 317 uint64_t lim; 318 319 PROC_LOCK(p); 320 td->td_retval[0] = 321 min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); 322 lim = racct_get_limit(td->td_proc, RACCT_NOFILE); 323 PROC_UNLOCK(p); 324 if (lim < td->td_retval[0]) 325 td->td_retval[0] = lim; 326 return (0); 327 } 328 329 /* 330 * Duplicate a file descriptor to a particular value. 331 * 332 * Note: keep in mind that a potential race condition exists when closing 333 * descriptors from a shared descriptor table (via rfork). 334 */ 335 #ifndef _SYS_SYSPROTO_H_ 336 struct dup2_args { 337 u_int from; 338 u_int to; 339 }; 340 #endif 341 /* ARGSUSED */ 342 int 343 sys_dup2(struct thread *td, struct dup2_args *uap) 344 { 345 346 return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to, 347 td->td_retval)); 348 } 349 350 /* 351 * Duplicate a file descriptor. 
352 */ 353 #ifndef _SYS_SYSPROTO_H_ 354 struct dup_args { 355 u_int fd; 356 }; 357 #endif 358 /* ARGSUSED */ 359 int 360 sys_dup(struct thread *td, struct dup_args *uap) 361 { 362 363 return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval)); 364 } 365 366 /* 367 * The file control system call. 368 */ 369 #ifndef _SYS_SYSPROTO_H_ 370 struct fcntl_args { 371 int fd; 372 int cmd; 373 long arg; 374 }; 375 #endif 376 /* ARGSUSED */ 377 int 378 sys_fcntl(struct thread *td, struct fcntl_args *uap) 379 { 380 struct flock fl; 381 struct __oflock ofl; 382 intptr_t arg; 383 int error; 384 int cmd; 385 386 error = 0; 387 cmd = uap->cmd; 388 switch (uap->cmd) { 389 case F_OGETLK: 390 case F_OSETLK: 391 case F_OSETLKW: 392 /* 393 * Convert old flock structure to new. 394 */ 395 error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl)); 396 fl.l_start = ofl.l_start; 397 fl.l_len = ofl.l_len; 398 fl.l_pid = ofl.l_pid; 399 fl.l_type = ofl.l_type; 400 fl.l_whence = ofl.l_whence; 401 fl.l_sysid = 0; 402 403 switch (uap->cmd) { 404 case F_OGETLK: 405 cmd = F_GETLK; 406 break; 407 case F_OSETLK: 408 cmd = F_SETLK; 409 break; 410 case F_OSETLKW: 411 cmd = F_SETLKW; 412 break; 413 } 414 arg = (intptr_t)&fl; 415 break; 416 case F_GETLK: 417 case F_SETLK: 418 case F_SETLKW: 419 case F_SETLK_REMOTE: 420 error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); 421 arg = (intptr_t)&fl; 422 break; 423 default: 424 arg = uap->arg; 425 break; 426 } 427 if (error) 428 return (error); 429 error = kern_fcntl(td, uap->fd, cmd, arg); 430 if (error) 431 return (error); 432 if (uap->cmd == F_OGETLK) { 433 ofl.l_start = fl.l_start; 434 ofl.l_len = fl.l_len; 435 ofl.l_pid = fl.l_pid; 436 ofl.l_type = fl.l_type; 437 ofl.l_whence = fl.l_whence; 438 error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl)); 439 } else if (uap->cmd == F_GETLK) { 440 error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); 441 } 442 return (error); 443 } 444 445 int 446 kern_fcntl(struct thread *td, int fd, 
int cmd, intptr_t arg) 447 { 448 struct filedesc *fdp; 449 struct flock *flp; 450 struct file *fp, *fp2; 451 struct filedescent *fde; 452 struct proc *p; 453 struct vnode *vp; 454 int error, flg, tmp; 455 u_int old, new; 456 uint64_t bsize; 457 off_t foffset; 458 459 error = 0; 460 flg = F_POSIX; 461 p = td->td_proc; 462 fdp = p->p_fd; 463 464 switch (cmd) { 465 case F_DUPFD: 466 tmp = arg; 467 error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval); 468 break; 469 470 case F_DUPFD_CLOEXEC: 471 tmp = arg; 472 error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp, 473 td->td_retval); 474 break; 475 476 case F_DUP2FD: 477 tmp = arg; 478 error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval); 479 break; 480 481 case F_DUP2FD_CLOEXEC: 482 tmp = arg; 483 error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp, 484 td->td_retval); 485 break; 486 487 case F_GETFD: 488 FILEDESC_SLOCK(fdp); 489 if ((fp = fget_locked(fdp, fd)) == NULL) { 490 FILEDESC_SUNLOCK(fdp); 491 error = EBADF; 492 break; 493 } 494 fde = &fdp->fd_ofiles[fd]; 495 td->td_retval[0] = 496 (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; 497 FILEDESC_SUNLOCK(fdp); 498 break; 499 500 case F_SETFD: 501 FILEDESC_XLOCK(fdp); 502 if ((fp = fget_locked(fdp, fd)) == NULL) { 503 FILEDESC_XUNLOCK(fdp); 504 error = EBADF; 505 break; 506 } 507 fde = &fdp->fd_ofiles[fd]; 508 fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | 509 (arg & FD_CLOEXEC ? 
UF_EXCLOSE : 0); 510 FILEDESC_XUNLOCK(fdp); 511 break; 512 513 case F_GETFL: 514 error = fget_unlocked(fdp, fd, CAP_FCNTL, F_GETFL, &fp, NULL); 515 if (error != 0) 516 break; 517 td->td_retval[0] = OFLAGS(fp->f_flag); 518 fdrop(fp, td); 519 break; 520 521 case F_SETFL: 522 error = fget_unlocked(fdp, fd, CAP_FCNTL, F_SETFL, &fp, NULL); 523 if (error != 0) 524 break; 525 do { 526 tmp = flg = fp->f_flag; 527 tmp &= ~FCNTLFLAGS; 528 tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; 529 } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); 530 tmp = fp->f_flag & FNONBLOCK; 531 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 532 if (error != 0) { 533 fdrop(fp, td); 534 break; 535 } 536 tmp = fp->f_flag & FASYNC; 537 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); 538 if (error == 0) { 539 fdrop(fp, td); 540 break; 541 } 542 atomic_clear_int(&fp->f_flag, FNONBLOCK); 543 tmp = 0; 544 (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 545 fdrop(fp, td); 546 break; 547 548 case F_GETOWN: 549 error = fget_unlocked(fdp, fd, CAP_FCNTL, F_GETOWN, &fp, NULL); 550 if (error != 0) 551 break; 552 error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); 553 if (error == 0) 554 td->td_retval[0] = tmp; 555 fdrop(fp, td); 556 break; 557 558 case F_SETOWN: 559 error = fget_unlocked(fdp, fd, CAP_FCNTL, F_SETOWN, &fp, NULL); 560 if (error != 0) 561 break; 562 tmp = arg; 563 error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); 564 fdrop(fp, td); 565 break; 566 567 case F_SETLK_REMOTE: 568 error = priv_check(td, PRIV_NFS_LOCKD); 569 if (error) 570 return (error); 571 flg = F_REMOTE; 572 goto do_setlk; 573 574 case F_SETLKW: 575 flg |= F_WAIT; 576 /* FALLTHROUGH F_SETLK */ 577 578 case F_SETLK: 579 do_setlk: 580 error = fget_unlocked(fdp, fd, CAP_FLOCK, 0, &fp, NULL); 581 if (error != 0) 582 break; 583 if (fp->f_type != DTYPE_VNODE) { 584 error = EBADF; 585 fdrop(fp, td); 586 break; 587 } 588 589 flp = (struct flock *)arg; 590 if (flp->l_whence == SEEK_CUR) { 591 foffset = 
foffset_get(fp); 592 if (foffset < 0 || 593 (flp->l_start > 0 && 594 foffset > OFF_MAX - flp->l_start)) { 595 FILEDESC_SUNLOCK(fdp); 596 error = EOVERFLOW; 597 fdrop(fp, td); 598 break; 599 } 600 flp->l_start += foffset; 601 } 602 603 vp = fp->f_vnode; 604 switch (flp->l_type) { 605 case F_RDLCK: 606 if ((fp->f_flag & FREAD) == 0) { 607 error = EBADF; 608 break; 609 } 610 PROC_LOCK(p->p_leader); 611 p->p_leader->p_flag |= P_ADVLOCK; 612 PROC_UNLOCK(p->p_leader); 613 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, 614 flp, flg); 615 break; 616 case F_WRLCK: 617 if ((fp->f_flag & FWRITE) == 0) { 618 error = EBADF; 619 break; 620 } 621 PROC_LOCK(p->p_leader); 622 p->p_leader->p_flag |= P_ADVLOCK; 623 PROC_UNLOCK(p->p_leader); 624 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, 625 flp, flg); 626 break; 627 case F_UNLCK: 628 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, 629 flp, flg); 630 break; 631 case F_UNLCKSYS: 632 /* 633 * Temporary api for testing remote lock 634 * infrastructure. 635 */ 636 if (flg != F_REMOTE) { 637 error = EINVAL; 638 break; 639 } 640 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, 641 F_UNLCKSYS, flp, flg); 642 break; 643 default: 644 error = EINVAL; 645 break; 646 } 647 if (error != 0 || flp->l_type == F_UNLCK || 648 flp->l_type == F_UNLCKSYS) { 649 fdrop(fp, td); 650 break; 651 } 652 653 /* 654 * Check for a race with close. 655 * 656 * The vnode is now advisory locked (or unlocked, but this case 657 * is not really important) as the caller requested. 658 * We had to drop the filedesc lock, so we need to recheck if 659 * the descriptor is still valid, because if it was closed 660 * in the meantime we need to remove advisory lock from the 661 * vnode - close on any descriptor leading to an advisory 662 * locked vnode, removes that lock. 663 * We will return 0 on purpose in that case, as the result of 664 * successful advisory lock might have been externally visible 665 * already. 
This is fine - effectively we pretend to the caller 666 * that the closing thread was a bit slower and that the 667 * advisory lock succeeded before the close. 668 */ 669 error = fget_unlocked(fdp, fd, 0, 0, &fp2, NULL); 670 if (error != 0) { 671 fdrop(fp, td); 672 break; 673 } 674 if (fp != fp2) { 675 flp->l_whence = SEEK_SET; 676 flp->l_start = 0; 677 flp->l_len = 0; 678 flp->l_type = F_UNLCK; 679 (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, 680 F_UNLCK, flp, F_POSIX); 681 } 682 fdrop(fp, td); 683 fdrop(fp2, td); 684 break; 685 686 case F_GETLK: 687 error = fget_unlocked(fdp, fd, CAP_FLOCK, 0, &fp, NULL); 688 if (error != 0) 689 break; 690 if (fp->f_type != DTYPE_VNODE) { 691 error = EBADF; 692 fdrop(fp, td); 693 break; 694 } 695 flp = (struct flock *)arg; 696 if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && 697 flp->l_type != F_UNLCK) { 698 error = EINVAL; 699 fdrop(fp, td); 700 break; 701 } 702 if (flp->l_whence == SEEK_CUR) { 703 foffset = foffset_get(fp); 704 if ((flp->l_start > 0 && 705 foffset > OFF_MAX - flp->l_start) || 706 (flp->l_start < 0 && 707 foffset < OFF_MIN - flp->l_start)) { 708 FILEDESC_SUNLOCK(fdp); 709 error = EOVERFLOW; 710 fdrop(fp, td); 711 break; 712 } 713 flp->l_start += foffset; 714 } 715 vp = fp->f_vnode; 716 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, 717 F_POSIX); 718 fdrop(fp, td); 719 break; 720 721 case F_RDAHEAD: 722 arg = arg ? 
128 * 1024: 0; 723 /* FALLTHROUGH */ 724 case F_READAHEAD: 725 error = fget_unlocked(fdp, fd, 0, 0, &fp, NULL); 726 if (error != 0) 727 break; 728 if (fp->f_type != DTYPE_VNODE) { 729 fdrop(fp, td); 730 error = EBADF; 731 break; 732 } 733 if (arg >= 0) { 734 vp = fp->f_vnode; 735 error = vn_lock(vp, LK_SHARED); 736 if (error != 0) { 737 fdrop(fp, td); 738 break; 739 } 740 bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; 741 VOP_UNLOCK(vp, 0); 742 fp->f_seqcount = (arg + bsize - 1) / bsize; 743 do { 744 new = old = fp->f_flag; 745 new |= FRDAHEAD; 746 } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new)); 747 } else { 748 do { 749 new = old = fp->f_flag; 750 new &= ~FRDAHEAD; 751 } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new)); 752 } 753 fdrop(fp, td); 754 break; 755 756 default: 757 error = EINVAL; 758 break; 759 } 760 return (error); 761 } 762 763 /* 764 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). 765 */ 766 int 767 do_dup(struct thread *td, int flags, int old, int new, 768 register_t *retval) 769 { 770 struct filedesc *fdp; 771 struct filedescent *oldfde, *newfde; 772 struct proc *p; 773 struct file *fp; 774 struct file *delfp; 775 int error, maxfd; 776 777 p = td->td_proc; 778 fdp = p->p_fd; 779 780 /* 781 * Verify we have a valid descriptor to dup from and possibly to 782 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should 783 * return EINVAL when the new descriptor is out of bounds. 784 */ 785 if (old < 0) 786 return (EBADF); 787 if (new < 0) 788 return (flags & DUP_FCNTL ? EINVAL : EBADF); 789 PROC_LOCK(p); 790 maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); 791 PROC_UNLOCK(p); 792 if (new >= maxfd) 793 return (flags & DUP_FCNTL ? 
EINVAL : EBADF); 794 795 FILEDESC_XLOCK(fdp); 796 if (fget_locked(fdp, old) == NULL) { 797 FILEDESC_XUNLOCK(fdp); 798 return (EBADF); 799 } 800 oldfde = &fdp->fd_ofiles[old]; 801 if (flags & DUP_FIXED && old == new) { 802 *retval = new; 803 if (flags & DUP_CLOEXEC) 804 fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; 805 FILEDESC_XUNLOCK(fdp); 806 return (0); 807 } 808 fp = oldfde->fde_file; 809 fhold(fp); 810 811 /* 812 * If the caller specified a file descriptor, make sure the file 813 * table is large enough to hold it, and grab it. Otherwise, just 814 * allocate a new descriptor the usual way. 815 */ 816 if (flags & DUP_FIXED) { 817 if (new >= fdp->fd_nfiles) { 818 /* 819 * The resource limits are here instead of e.g. 820 * fdalloc(), because the file descriptor table may be 821 * shared between processes, so we can't really use 822 * racct_add()/racct_sub(). Instead of counting the 823 * number of actually allocated descriptors, just put 824 * the limit on the size of the file descriptor table. 825 */ 826 #ifdef RACCT 827 PROC_LOCK(p); 828 error = racct_set(p, RACCT_NOFILE, new + 1); 829 PROC_UNLOCK(p); 830 if (error != 0) { 831 FILEDESC_XUNLOCK(fdp); 832 fdrop(fp, td); 833 return (EMFILE); 834 } 835 #endif 836 fdgrowtable(fdp, new + 1); 837 oldfde = &fdp->fd_ofiles[old]; 838 } 839 newfde = &fdp->fd_ofiles[new]; 840 if (newfde->fde_file == NULL) 841 fdused(fdp, new); 842 } else { 843 if ((error = fdalloc(td, new, &new)) != 0) { 844 FILEDESC_XUNLOCK(fdp); 845 fdrop(fp, td); 846 return (error); 847 } 848 newfde = &fdp->fd_ofiles[new]; 849 } 850 851 KASSERT(fp == oldfde->fde_file, ("old fd has been modified")); 852 KASSERT(old != new, ("new fd is same as old")); 853 854 delfp = newfde->fde_file; 855 856 /* 857 * Duplicate the source descriptor. 
858 */ 859 *newfde = *oldfde; 860 filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps); 861 if ((flags & DUP_CLOEXEC) != 0) 862 newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; 863 else 864 newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; 865 if (new > fdp->fd_lastfile) 866 fdp->fd_lastfile = new; 867 *retval = new; 868 869 if (delfp != NULL) { 870 (void) closefp(fdp, new, delfp, td, 1); 871 /* closefp() drops the FILEDESC lock for us. */ 872 } else { 873 FILEDESC_XUNLOCK(fdp); 874 } 875 876 return (0); 877 } 878 879 /* 880 * If sigio is on the list associated with a process or process group, 881 * disable signalling from the device, remove sigio from the list and 882 * free sigio. 883 */ 884 void 885 funsetown(struct sigio **sigiop) 886 { 887 struct sigio *sigio; 888 889 SIGIO_LOCK(); 890 sigio = *sigiop; 891 if (sigio == NULL) { 892 SIGIO_UNLOCK(); 893 return; 894 } 895 *(sigio->sio_myref) = NULL; 896 if ((sigio)->sio_pgid < 0) { 897 struct pgrp *pg = (sigio)->sio_pgrp; 898 PGRP_LOCK(pg); 899 SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, 900 sigio, sio_pgsigio); 901 PGRP_UNLOCK(pg); 902 } else { 903 struct proc *p = (sigio)->sio_proc; 904 PROC_LOCK(p); 905 SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, 906 sigio, sio_pgsigio); 907 PROC_UNLOCK(p); 908 } 909 SIGIO_UNLOCK(); 910 crfree(sigio->sio_ucred); 911 free(sigio, M_SIGIO); 912 } 913 914 /* 915 * Free a list of sigio structures. 916 * We only need to lock the SIGIO_LOCK because we have made ourselves 917 * inaccessible to callers of fsetown and therefore do not need to lock 918 * the proc or pgrp struct for the list manipulation. 919 */ 920 void 921 funsetownlst(struct sigiolst *sigiolst) 922 { 923 struct proc *p; 924 struct pgrp *pg; 925 struct sigio *sigio; 926 927 sigio = SLIST_FIRST(sigiolst); 928 if (sigio == NULL) 929 return; 930 p = NULL; 931 pg = NULL; 932 933 /* 934 * Every entry of the list should belong 935 * to a single proc or pgrp. 
936 */ 937 if (sigio->sio_pgid < 0) { 938 pg = sigio->sio_pgrp; 939 PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); 940 } else /* if (sigio->sio_pgid > 0) */ { 941 p = sigio->sio_proc; 942 PROC_LOCK_ASSERT(p, MA_NOTOWNED); 943 } 944 945 SIGIO_LOCK(); 946 while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { 947 *(sigio->sio_myref) = NULL; 948 if (pg != NULL) { 949 KASSERT(sigio->sio_pgid < 0, 950 ("Proc sigio in pgrp sigio list")); 951 KASSERT(sigio->sio_pgrp == pg, 952 ("Bogus pgrp in sigio list")); 953 PGRP_LOCK(pg); 954 SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, 955 sio_pgsigio); 956 PGRP_UNLOCK(pg); 957 } else /* if (p != NULL) */ { 958 KASSERT(sigio->sio_pgid > 0, 959 ("Pgrp sigio in proc sigio list")); 960 KASSERT(sigio->sio_proc == p, 961 ("Bogus proc in sigio list")); 962 PROC_LOCK(p); 963 SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, 964 sio_pgsigio); 965 PROC_UNLOCK(p); 966 } 967 SIGIO_UNLOCK(); 968 crfree(sigio->sio_ucred); 969 free(sigio, M_SIGIO); 970 SIGIO_LOCK(); 971 } 972 SIGIO_UNLOCK(); 973 } 974 975 /* 976 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). 977 * 978 * After permission checking, add a sigio structure to the sigio list for 979 * the process or process group. 980 */ 981 int 982 fsetown(pid_t pgid, struct sigio **sigiop) 983 { 984 struct proc *proc; 985 struct pgrp *pgrp; 986 struct sigio *sigio; 987 int ret; 988 989 if (pgid == 0) { 990 funsetown(sigiop); 991 return (0); 992 } 993 994 ret = 0; 995 996 /* Allocate and fill in the new sigio out of locks. */ 997 sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); 998 sigio->sio_pgid = pgid; 999 sigio->sio_ucred = crhold(curthread->td_ucred); 1000 sigio->sio_myref = sigiop; 1001 1002 sx_slock(&proctree_lock); 1003 if (pgid > 0) { 1004 proc = pfind(pgid); 1005 if (proc == NULL) { 1006 ret = ESRCH; 1007 goto fail; 1008 } 1009 1010 /* 1011 * Policy - Don't allow a process to FSETOWN a process 1012 * in another session. 
1013 * 1014 * Remove this test to allow maximum flexibility or 1015 * restrict FSETOWN to the current process or process 1016 * group for maximum safety. 1017 */ 1018 PROC_UNLOCK(proc); 1019 if (proc->p_session != curthread->td_proc->p_session) { 1020 ret = EPERM; 1021 goto fail; 1022 } 1023 1024 pgrp = NULL; 1025 } else /* if (pgid < 0) */ { 1026 pgrp = pgfind(-pgid); 1027 if (pgrp == NULL) { 1028 ret = ESRCH; 1029 goto fail; 1030 } 1031 PGRP_UNLOCK(pgrp); 1032 1033 /* 1034 * Policy - Don't allow a process to FSETOWN a process 1035 * in another session. 1036 * 1037 * Remove this test to allow maximum flexibility or 1038 * restrict FSETOWN to the current process or process 1039 * group for maximum safety. 1040 */ 1041 if (pgrp->pg_session != curthread->td_proc->p_session) { 1042 ret = EPERM; 1043 goto fail; 1044 } 1045 1046 proc = NULL; 1047 } 1048 funsetown(sigiop); 1049 if (pgid > 0) { 1050 PROC_LOCK(proc); 1051 /* 1052 * Since funsetownlst() is called without the proctree 1053 * locked, we need to check for P_WEXIT. 1054 * XXX: is ESRCH correct? 1055 */ 1056 if ((proc->p_flag & P_WEXIT) != 0) { 1057 PROC_UNLOCK(proc); 1058 ret = ESRCH; 1059 goto fail; 1060 } 1061 SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); 1062 sigio->sio_proc = proc; 1063 PROC_UNLOCK(proc); 1064 } else { 1065 PGRP_LOCK(pgrp); 1066 SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); 1067 sigio->sio_pgrp = pgrp; 1068 PGRP_UNLOCK(pgrp); 1069 } 1070 sx_sunlock(&proctree_lock); 1071 SIGIO_LOCK(); 1072 *sigiop = sigio; 1073 SIGIO_UNLOCK(); 1074 return (0); 1075 1076 fail: 1077 sx_sunlock(&proctree_lock); 1078 crfree(sigio->sio_ucred); 1079 free(sigio, M_SIGIO); 1080 return (ret); 1081 } 1082 1083 /* 1084 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). 1085 */ 1086 pid_t 1087 fgetown(sigiop) 1088 struct sigio **sigiop; 1089 { 1090 pid_t pgid; 1091 1092 SIGIO_LOCK(); 1093 pgid = (*sigiop != NULL) ? 
(*sigiop)->sio_pgid : 0; 1094 SIGIO_UNLOCK(); 1095 return (pgid); 1096 } 1097 1098 /* 1099 * Function drops the filedesc lock on return. 1100 */ 1101 static int 1102 closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, 1103 int holdleaders) 1104 { 1105 int error; 1106 1107 FILEDESC_XLOCK_ASSERT(fdp); 1108 1109 if (holdleaders) { 1110 if (td->td_proc->p_fdtol != NULL) { 1111 /* 1112 * Ask fdfree() to sleep to ensure that all relevant 1113 * process leaders can be traversed in closef(). 1114 */ 1115 fdp->fd_holdleaderscount++; 1116 } else { 1117 holdleaders = 0; 1118 } 1119 } 1120 1121 /* 1122 * We now hold the fp reference that used to be owned by the 1123 * descriptor array. We have to unlock the FILEDESC *AFTER* 1124 * knote_fdclose to prevent a race of the fd getting opened, a knote 1125 * added, and deleteing a knote for the new fd. 1126 */ 1127 knote_fdclose(td, fd); 1128 1129 /* 1130 * We need to notify mqueue if the object is of type mqueue. 1131 */ 1132 if (fp->f_type == DTYPE_MQUEUE) 1133 mq_fdclose(td, fd, fp); 1134 FILEDESC_XUNLOCK(fdp); 1135 1136 error = closef(fp, td); 1137 if (holdleaders) { 1138 FILEDESC_XLOCK(fdp); 1139 fdp->fd_holdleaderscount--; 1140 if (fdp->fd_holdleaderscount == 0 && 1141 fdp->fd_holdleaderswakeup != 0) { 1142 fdp->fd_holdleaderswakeup = 0; 1143 wakeup(&fdp->fd_holdleaderscount); 1144 } 1145 FILEDESC_XUNLOCK(fdp); 1146 } 1147 return (error); 1148 } 1149 1150 /* 1151 * Close a file descriptor. 
1152 */ 1153 #ifndef _SYS_SYSPROTO_H_ 1154 struct close_args { 1155 int fd; 1156 }; 1157 #endif 1158 /* ARGSUSED */ 1159 int 1160 sys_close(td, uap) 1161 struct thread *td; 1162 struct close_args *uap; 1163 { 1164 1165 return (kern_close(td, uap->fd)); 1166 } 1167 1168 int 1169 kern_close(td, fd) 1170 struct thread *td; 1171 int fd; 1172 { 1173 struct filedesc *fdp; 1174 struct file *fp; 1175 1176 fdp = td->td_proc->p_fd; 1177 1178 AUDIT_SYSCLOSE(td, fd); 1179 1180 FILEDESC_XLOCK(fdp); 1181 if ((fp = fget_locked(fdp, fd)) == NULL) { 1182 FILEDESC_XUNLOCK(fdp); 1183 return (EBADF); 1184 } 1185 fdfree(fdp, fd); 1186 1187 /* closefp() drops the FILEDESC lock for us. */ 1188 return (closefp(fdp, fd, fp, td, 1)); 1189 } 1190 1191 /* 1192 * Close open file descriptors. 1193 */ 1194 #ifndef _SYS_SYSPROTO_H_ 1195 struct closefrom_args { 1196 int lowfd; 1197 }; 1198 #endif 1199 /* ARGSUSED */ 1200 int 1201 sys_closefrom(struct thread *td, struct closefrom_args *uap) 1202 { 1203 struct filedesc *fdp; 1204 int fd; 1205 1206 fdp = td->td_proc->p_fd; 1207 AUDIT_ARG_FD(uap->lowfd); 1208 1209 /* 1210 * Treat negative starting file descriptor values identical to 1211 * closefrom(0) which closes all files. 1212 */ 1213 if (uap->lowfd < 0) 1214 uap->lowfd = 0; 1215 FILEDESC_SLOCK(fdp); 1216 for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) { 1217 if (fdp->fd_ofiles[fd].fde_file != NULL) { 1218 FILEDESC_SUNLOCK(fdp); 1219 (void)kern_close(td, fd); 1220 FILEDESC_SLOCK(fdp); 1221 } 1222 } 1223 FILEDESC_SUNLOCK(fdp); 1224 return (0); 1225 } 1226 1227 #if defined(COMPAT_43) 1228 /* 1229 * Return status information about a file descriptor. 
1230 */ 1231 #ifndef _SYS_SYSPROTO_H_ 1232 struct ofstat_args { 1233 int fd; 1234 struct ostat *sb; 1235 }; 1236 #endif 1237 /* ARGSUSED */ 1238 int 1239 ofstat(struct thread *td, struct ofstat_args *uap) 1240 { 1241 struct ostat oub; 1242 struct stat ub; 1243 int error; 1244 1245 error = kern_fstat(td, uap->fd, &ub); 1246 if (error == 0) { 1247 cvtstat(&ub, &oub); 1248 error = copyout(&oub, uap->sb, sizeof(oub)); 1249 } 1250 return (error); 1251 } 1252 #endif /* COMPAT_43 */ 1253 1254 /* 1255 * Return status information about a file descriptor. 1256 */ 1257 #ifndef _SYS_SYSPROTO_H_ 1258 struct fstat_args { 1259 int fd; 1260 struct stat *sb; 1261 }; 1262 #endif 1263 /* ARGSUSED */ 1264 int 1265 sys_fstat(struct thread *td, struct fstat_args *uap) 1266 { 1267 struct stat ub; 1268 int error; 1269 1270 error = kern_fstat(td, uap->fd, &ub); 1271 if (error == 0) 1272 error = copyout(&ub, uap->sb, sizeof(ub)); 1273 return (error); 1274 } 1275 1276 int 1277 kern_fstat(struct thread *td, int fd, struct stat *sbp) 1278 { 1279 struct file *fp; 1280 int error; 1281 1282 AUDIT_ARG_FD(fd); 1283 1284 if ((error = fget(td, fd, CAP_FSTAT, &fp)) != 0) 1285 return (error); 1286 1287 AUDIT_ARG_FILE(td->td_proc, fp); 1288 1289 error = fo_stat(fp, sbp, td->td_ucred, td); 1290 fdrop(fp, td); 1291 #ifdef KTRACE 1292 if (error == 0 && KTRPOINT(td, KTR_STRUCT)) 1293 ktrstat(sbp); 1294 #endif 1295 return (error); 1296 } 1297 1298 /* 1299 * Return status information about a file descriptor. 
1300 */ 1301 #ifndef _SYS_SYSPROTO_H_ 1302 struct nfstat_args { 1303 int fd; 1304 struct nstat *sb; 1305 }; 1306 #endif 1307 /* ARGSUSED */ 1308 int 1309 sys_nfstat(struct thread *td, struct nfstat_args *uap) 1310 { 1311 struct nstat nub; 1312 struct stat ub; 1313 int error; 1314 1315 error = kern_fstat(td, uap->fd, &ub); 1316 if (error == 0) { 1317 cvtnstat(&ub, &nub); 1318 error = copyout(&nub, uap->sb, sizeof(nub)); 1319 } 1320 return (error); 1321 } 1322 1323 /* 1324 * Return pathconf information about a file descriptor. 1325 */ 1326 #ifndef _SYS_SYSPROTO_H_ 1327 struct fpathconf_args { 1328 int fd; 1329 int name; 1330 }; 1331 #endif 1332 /* ARGSUSED */ 1333 int 1334 sys_fpathconf(struct thread *td, struct fpathconf_args *uap) 1335 { 1336 struct file *fp; 1337 struct vnode *vp; 1338 int error; 1339 1340 if ((error = fget(td, uap->fd, CAP_FPATHCONF, &fp)) != 0) 1341 return (error); 1342 1343 /* If asynchronous I/O is available, it works for all descriptors. */ 1344 if (uap->name == _PC_ASYNC_IO) { 1345 td->td_retval[0] = async_io_version; 1346 goto out; 1347 } 1348 vp = fp->f_vnode; 1349 if (vp != NULL) { 1350 vn_lock(vp, LK_SHARED | LK_RETRY); 1351 error = VOP_PATHCONF(vp, uap->name, td->td_retval); 1352 VOP_UNLOCK(vp, 0); 1353 } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { 1354 if (uap->name != _PC_PIPE_BUF) { 1355 error = EINVAL; 1356 } else { 1357 td->td_retval[0] = PIPE_BUF; 1358 error = 0; 1359 } 1360 } else { 1361 error = EOPNOTSUPP; 1362 } 1363 out: 1364 fdrop(fp, td); 1365 return (error); 1366 } 1367 1368 /* 1369 * Initialize filecaps structure. 1370 */ 1371 void 1372 filecaps_init(struct filecaps *fcaps) 1373 { 1374 1375 bzero(fcaps, sizeof(*fcaps)); 1376 fcaps->fc_nioctls = -1; 1377 } 1378 1379 /* 1380 * Copy filecaps structure allocating memory for ioctls array if needed. 
1381 */ 1382 void 1383 filecaps_copy(const struct filecaps *src, struct filecaps *dst) 1384 { 1385 size_t size; 1386 1387 *dst = *src; 1388 if (src->fc_ioctls != NULL) { 1389 KASSERT(src->fc_nioctls > 0, 1390 ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); 1391 1392 size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; 1393 dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); 1394 bcopy(src->fc_ioctls, dst->fc_ioctls, size); 1395 } 1396 } 1397 1398 /* 1399 * Move filecaps structure to the new place and clear the old place. 1400 */ 1401 void 1402 filecaps_move(struct filecaps *src, struct filecaps *dst) 1403 { 1404 1405 *dst = *src; 1406 bzero(src, sizeof(*src)); 1407 } 1408 1409 /* 1410 * Fill the given filecaps structure with full rights. 1411 */ 1412 static void 1413 filecaps_fill(struct filecaps *fcaps) 1414 { 1415 1416 fcaps->fc_rights = CAP_ALL; 1417 fcaps->fc_ioctls = NULL; 1418 fcaps->fc_nioctls = -1; 1419 fcaps->fc_fcntls = CAP_FCNTL_ALL; 1420 } 1421 1422 /* 1423 * Free memory allocated within filecaps structure. 1424 */ 1425 void 1426 filecaps_free(struct filecaps *fcaps) 1427 { 1428 1429 free(fcaps->fc_ioctls, M_FILECAPS); 1430 bzero(fcaps, sizeof(*fcaps)); 1431 } 1432 1433 /* 1434 * Validate the given filecaps structure. 1435 */ 1436 static void 1437 filecaps_validate(const struct filecaps *fcaps, const char *func) 1438 { 1439 1440 KASSERT((fcaps->fc_rights & ~CAP_MASK_VALID) == 0, 1441 ("%s: invalid rights", func)); 1442 KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, 1443 ("%s: invalid fcntls", func)); 1444 KASSERT(fcaps->fc_fcntls == 0 || (fcaps->fc_rights & CAP_FCNTL) != 0, 1445 ("%s: fcntls without CAP_FCNTL", func)); 1446 KASSERT(fcaps->fc_ioctls != NULL ? 
fcaps->fc_nioctls > 0 : 1447 (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), 1448 ("%s: invalid ioctls", func)); 1449 KASSERT(fcaps->fc_nioctls == 0 || (fcaps->fc_rights & CAP_IOCTL) != 0, 1450 ("%s: ioctls without CAP_IOCTL", func)); 1451 } 1452 1453 /* 1454 * Grow the file table to accomodate (at least) nfd descriptors. 1455 */ 1456 static void 1457 fdgrowtable(struct filedesc *fdp, int nfd) 1458 { 1459 struct filedesc0 *fdp0; 1460 struct freetable *ft; 1461 struct filedescent *ntable; 1462 struct filedescent *otable; 1463 int nnfiles, onfiles; 1464 NDSLOTTYPE *nmap, *omap; 1465 1466 FILEDESC_XLOCK_ASSERT(fdp); 1467 1468 KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); 1469 1470 /* save old values */ 1471 onfiles = fdp->fd_nfiles; 1472 otable = fdp->fd_ofiles; 1473 omap = fdp->fd_map; 1474 1475 /* compute the size of the new table */ 1476 nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ 1477 if (nnfiles <= onfiles) 1478 /* the table is already large enough */ 1479 return; 1480 1481 /* 1482 * Allocate a new table and map. We need enough space for the 1483 * file entries themselves and the struct freetable we will use 1484 * when we decommission the table and place it on the freelist. 1485 * We place the struct freetable in the middle so we don't have 1486 * to worry about padding. 
1487 */ 1488 ntable = malloc(nnfiles * sizeof(ntable[0]) + sizeof(struct freetable), 1489 M_FILEDESC, M_ZERO | M_WAITOK); 1490 nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, 1491 M_ZERO | M_WAITOK); 1492 1493 /* copy the old data over and point at the new tables */ 1494 memcpy(ntable, otable, onfiles * sizeof(*otable)); 1495 memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); 1496 1497 /* update the pointers and counters */ 1498 fdp->fd_nfiles = nnfiles; 1499 memcpy(ntable, otable, onfiles * sizeof(ntable[0])); 1500 fdp->fd_ofiles = ntable; 1501 fdp->fd_map = nmap; 1502 1503 /* 1504 * Do not free the old file table, as some threads may still 1505 * reference entries within it. Instead, place it on a freelist 1506 * which will be processed when the struct filedesc is released. 1507 * 1508 * Do, however, free the old map. 1509 * 1510 * Note that if onfiles == NDFILE, we're dealing with the original 1511 * static allocation contained within (struct filedesc0 *)fdp, 1512 * which must not be freed. 1513 */ 1514 if (onfiles > NDFILE) { 1515 ft = (struct freetable *)&otable[onfiles]; 1516 fdp0 = (struct filedesc0 *)fdp; 1517 ft->ft_table = otable; 1518 SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); 1519 free(omap, M_FILEDESC); 1520 } 1521 } 1522 1523 /* 1524 * Allocate a file descriptor for the process. 1525 */ 1526 int 1527 fdalloc(struct thread *td, int minfd, int *result) 1528 { 1529 struct proc *p = td->td_proc; 1530 struct filedesc *fdp = p->p_fd; 1531 int fd = -1, maxfd, allocfd; 1532 #ifdef RACCT 1533 int error; 1534 #endif 1535 1536 FILEDESC_XLOCK_ASSERT(fdp); 1537 1538 if (fdp->fd_freefile > minfd) 1539 minfd = fdp->fd_freefile; 1540 1541 PROC_LOCK(p); 1542 maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); 1543 PROC_UNLOCK(p); 1544 1545 /* 1546 * Search the bitmap for a free descriptor starting at minfd. 1547 * If none is found, grow the file table. 
1548 */ 1549 fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); 1550 if (fd >= maxfd) 1551 return (EMFILE); 1552 if (fd >= fdp->fd_nfiles) { 1553 allocfd = min(fd * 2, maxfd); 1554 #ifdef RACCT 1555 PROC_LOCK(p); 1556 error = racct_set(p, RACCT_NOFILE, allocfd); 1557 PROC_UNLOCK(p); 1558 if (error != 0) 1559 return (EMFILE); 1560 #endif 1561 /* 1562 * fd is already equal to first free descriptor >= minfd, so 1563 * we only need to grow the table and we are done. 1564 */ 1565 fdgrowtable(fdp, allocfd); 1566 } 1567 1568 /* 1569 * Perform some sanity checks, then mark the file descriptor as 1570 * used and return it to the caller. 1571 */ 1572 KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), 1573 ("invalid descriptor %d", fd)); 1574 KASSERT(!fdisused(fdp, fd), 1575 ("fd_first_free() returned non-free descriptor")); 1576 KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, 1577 ("file descriptor isn't free")); 1578 KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set")); 1579 fdused(fdp, fd); 1580 *result = fd; 1581 return (0); 1582 } 1583 1584 /* 1585 * Check to see whether n user file descriptors are available to the process 1586 * p. 1587 */ 1588 int 1589 fdavail(struct thread *td, int n) 1590 { 1591 struct proc *p = td->td_proc; 1592 struct filedesc *fdp = td->td_proc->p_fd; 1593 int i, lim, last; 1594 1595 FILEDESC_LOCK_ASSERT(fdp); 1596 1597 /* 1598 * XXX: This is only called from uipc_usrreq.c:unp_externalize(); 1599 * call racct_add() from there instead of dealing with containers 1600 * here. 
1601 */ 1602 PROC_LOCK(p); 1603 lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); 1604 PROC_UNLOCK(p); 1605 if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) 1606 return (1); 1607 last = min(fdp->fd_nfiles, lim); 1608 for (i = fdp->fd_freefile; i < last; i++) { 1609 if (fdp->fd_ofiles[i].fde_file == NULL && --n <= 0) 1610 return (1); 1611 } 1612 return (0); 1613 } 1614 1615 /* 1616 * Create a new open file structure and allocate a file decriptor for the 1617 * process that refers to it. We add one reference to the file for the 1618 * descriptor table and one reference for resultfp. This is to prevent us 1619 * being preempted and the entry in the descriptor table closed after we 1620 * release the FILEDESC lock. 1621 */ 1622 int 1623 falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags) 1624 { 1625 struct file *fp; 1626 int error, fd; 1627 1628 error = falloc_noinstall(td, &fp); 1629 if (error) 1630 return (error); /* no reference held on error */ 1631 1632 error = finstall(td, fp, &fd, flags, NULL); 1633 if (error) { 1634 fdrop(fp, td); /* one reference (fp only) */ 1635 return (error); 1636 } 1637 1638 if (resultfp != NULL) 1639 *resultfp = fp; /* copy out result */ 1640 else 1641 fdrop(fp, td); /* release local reference */ 1642 1643 if (resultfd != NULL) 1644 *resultfd = fd; 1645 1646 return (0); 1647 } 1648 1649 /* 1650 * Create a new open file structure without allocating a file descriptor. 
1651 */ 1652 int 1653 falloc_noinstall(struct thread *td, struct file **resultfp) 1654 { 1655 struct file *fp; 1656 int maxuserfiles = maxfiles - (maxfiles / 20); 1657 static struct timeval lastfail; 1658 static int curfail; 1659 1660 KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); 1661 1662 if ((openfiles >= maxuserfiles && 1663 priv_check(td, PRIV_MAXFILES) != 0) || 1664 openfiles >= maxfiles) { 1665 if (ppsratecheck(&lastfail, &curfail, 1)) { 1666 printf("kern.maxfiles limit exceeded by uid %i, " 1667 "please see tuning(7).\n", td->td_ucred->cr_ruid); 1668 } 1669 return (ENFILE); 1670 } 1671 atomic_add_int(&openfiles, 1); 1672 fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); 1673 refcount_init(&fp->f_count, 1); 1674 fp->f_cred = crhold(td->td_ucred); 1675 fp->f_ops = &badfileops; 1676 fp->f_data = NULL; 1677 fp->f_vnode = NULL; 1678 *resultfp = fp; 1679 return (0); 1680 } 1681 1682 /* 1683 * Install a file in a file descriptor table. 1684 */ 1685 int 1686 finstall(struct thread *td, struct file *fp, int *fd, int flags, 1687 struct filecaps *fcaps) 1688 { 1689 struct filedesc *fdp = td->td_proc->p_fd; 1690 struct filedescent *fde; 1691 int error; 1692 1693 KASSERT(fd != NULL, ("%s: fd == NULL", __func__)); 1694 KASSERT(fp != NULL, ("%s: fp == NULL", __func__)); 1695 if (fcaps != NULL) 1696 filecaps_validate(fcaps, __func__); 1697 1698 FILEDESC_XLOCK(fdp); 1699 if ((error = fdalloc(td, 0, fd))) { 1700 FILEDESC_XUNLOCK(fdp); 1701 return (error); 1702 } 1703 fhold(fp); 1704 fde = &fdp->fd_ofiles[*fd]; 1705 fde->fde_file = fp; 1706 if ((flags & O_CLOEXEC) != 0) 1707 fde->fde_flags |= UF_EXCLOSE; 1708 if (fcaps != NULL) 1709 filecaps_move(fcaps, &fde->fde_caps); 1710 else 1711 filecaps_fill(&fde->fde_caps); 1712 FILEDESC_XUNLOCK(fdp); 1713 return (0); 1714 } 1715 1716 /* 1717 * Build a new filedesc structure from another. 1718 * Copy the current, root, and jail root vnode references. 
1719 */ 1720 struct filedesc * 1721 fdinit(struct filedesc *fdp) 1722 { 1723 struct filedesc0 *newfdp; 1724 1725 newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO); 1726 FILEDESC_LOCK_INIT(&newfdp->fd_fd); 1727 if (fdp != NULL) { 1728 FILEDESC_XLOCK(fdp); 1729 newfdp->fd_fd.fd_cdir = fdp->fd_cdir; 1730 if (newfdp->fd_fd.fd_cdir) 1731 VREF(newfdp->fd_fd.fd_cdir); 1732 newfdp->fd_fd.fd_rdir = fdp->fd_rdir; 1733 if (newfdp->fd_fd.fd_rdir) 1734 VREF(newfdp->fd_fd.fd_rdir); 1735 newfdp->fd_fd.fd_jdir = fdp->fd_jdir; 1736 if (newfdp->fd_fd.fd_jdir) 1737 VREF(newfdp->fd_fd.fd_jdir); 1738 FILEDESC_XUNLOCK(fdp); 1739 } 1740 1741 /* Create the file descriptor table. */ 1742 newfdp->fd_fd.fd_refcnt = 1; 1743 newfdp->fd_fd.fd_holdcnt = 1; 1744 newfdp->fd_fd.fd_cmask = CMASK; 1745 newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; 1746 newfdp->fd_fd.fd_nfiles = NDFILE; 1747 newfdp->fd_fd.fd_map = newfdp->fd_dmap; 1748 newfdp->fd_fd.fd_lastfile = -1; 1749 return (&newfdp->fd_fd); 1750 } 1751 1752 static struct filedesc * 1753 fdhold(struct proc *p) 1754 { 1755 struct filedesc *fdp; 1756 1757 mtx_lock(&fdesc_mtx); 1758 fdp = p->p_fd; 1759 if (fdp != NULL) 1760 fdp->fd_holdcnt++; 1761 mtx_unlock(&fdesc_mtx); 1762 return (fdp); 1763 } 1764 1765 static void 1766 fddrop(struct filedesc *fdp) 1767 { 1768 struct filedesc0 *fdp0; 1769 struct freetable *ft; 1770 int i; 1771 1772 mtx_lock(&fdesc_mtx); 1773 i = --fdp->fd_holdcnt; 1774 mtx_unlock(&fdesc_mtx); 1775 if (i > 0) 1776 return; 1777 1778 FILEDESC_LOCK_DESTROY(fdp); 1779 fdp0 = (struct filedesc0 *)fdp; 1780 while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) { 1781 SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next); 1782 free(ft->ft_table, M_FILEDESC); 1783 } 1784 free(fdp, M_FILEDESC); 1785 } 1786 1787 /* 1788 * Share a filedesc structure. 
 */
struct filedesc *
fdshare(struct filedesc *fdp)
{

	/* The reference count is bumped under the exclusive lock. */
	FILEDESC_XLOCK(fdp);
	fdp->fd_refcnt++;
	FILEDESC_XUNLOCK(fdp);
	return (fdp);
}

/*
 * Unshare a filedesc structure, if necessary by making a copy
 *
 * When the table has more than one reference, a copy is made with
 * fdcopy(), the thread's reference on the shared table is released with
 * fdescfree() and the copy is installed as p->p_fd.
 */
void
fdunshare(struct proc *p, struct thread *td)
{

	FILEDESC_XLOCK(p->p_fd);
	if (p->p_fd->fd_refcnt > 1) {
		struct filedesc *tmp;

		FILEDESC_XUNLOCK(p->p_fd);
		tmp = fdcopy(p->p_fd);
		fdescfree(td);
		p->p_fd = tmp;
	} else
		FILEDESC_XUNLOCK(p->p_fd);
}

/*
 * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
 * this is to ease callers, not catch errors.
 */
struct filedesc *
fdcopy(struct filedesc *fdp)
{
	struct filedesc *newfdp;
	struct filedescent *nfde, *ofde;
	int i;

	/* Certain daemons might not have file descriptors. */
	if (fdp == NULL)
		return (NULL);

	newfdp = fdinit(fdp);
	FILEDESC_SLOCK(fdp);
	/*
	 * Grow the new table until it can hold fd_lastfile.  The source
	 * lock is dropped around each grow, so the condition is re-checked
	 * after it has been re-taken.
	 */
	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
		FILEDESC_SUNLOCK(fdp);
		FILEDESC_XLOCK(newfdp);
		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
		FILEDESC_XUNLOCK(newfdp);
		FILEDESC_SLOCK(fdp);
	}
	/* copy all passable descriptors (i.e. not kqueue) */
	newfdp->fd_freefile = -1;
	for (i = 0; i <= fdp->fd_lastfile; ++i) {
		ofde = &fdp->fd_ofiles[i];
		if (fdisused(fdp, i) &&
		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) &&
		    ofde->fde_file->f_ops != &badfileops) {
			nfde = &newfdp->fd_ofiles[i];
			*nfde = *ofde;
			/* The struct copy shared fc_ioctls; duplicate it. */
			filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
			fhold(nfde->fde_file);
			newfdp->fd_lastfile = i;
		} else {
			/* Remember the first slot that was not copied. */
			if (newfdp->fd_freefile == -1)
				newfdp->fd_freefile = i;
		}
	}
	newfdp->fd_cmask = fdp->fd_cmask;
	FILEDESC_SUNLOCK(fdp);
	FILEDESC_XLOCK(newfdp);
	/* Mark every copied descriptor as used in the new table. */
	for (i = 0; i <= newfdp->fd_lastfile; ++i) {
		if (newfdp->fd_ofiles[i].fde_file != NULL)
			fdused(newfdp, i);
	}
	/* No hole found: the first free slot follows the last copied one. */
	if (newfdp->fd_freefile == -1)
		newfdp->fd_freefile = i;
	FILEDESC_XUNLOCK(newfdp);
	return (newfdp);
}

/*
 * Release a filedesc structure.
 *
 * Drops the calling thread's process reference on its descriptor table.
 * When that was the last reference, every open file is closed, the
 * table and map are freed if they were grown, the directory vnodes are
 * released and the hold on the structure is dropped with fddrop().
 */
void
fdescfree(struct thread *td)
{
	struct filedesc *fdp;
	int i;
	struct filedesc_to_leader *fdtol;
	struct file *fp;
	struct vnode *cdir, *jdir, *rdir, *vp;
	struct flock lf;

	/* Certain daemons might not have file descriptors. */
	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return;

#ifdef RACCT
	PROC_LOCK(td->td_proc);
	racct_set(td->td_proc, RACCT_NOFILE, 0);
	PROC_UNLOCK(td->td_proc);
#endif

	/* Check for special need to clear POSIX style locks */
	fdtol = td->td_proc->p_fdtol;
	if (fdtol != NULL) {
		FILEDESC_XLOCK(fdp);
		KASSERT(fdtol->fdl_refcount > 0,
		    ("filedesc_to_refcount botch: fdl_refcount=%d",
		    fdtol->fdl_refcount));
		if (fdtol->fdl_refcount == 1 &&
		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
			for (i = 0; i <= fdp->fd_lastfile; i++) {
				fp = fdp->fd_ofiles[i].fde_file;
				if (fp == NULL || fp->f_type != DTYPE_VNODE)
					continue;
				/*
				 * Hold the file so it stays valid while the
				 * lock is dropped around VOP_ADVLOCK().
				 */
				fhold(fp);
				FILEDESC_XUNLOCK(fdp);
				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				vp = fp->f_vnode;
				(void) VOP_ADVLOCK(vp,
				    (caddr_t)td->td_proc->p_leader, F_UNLCK,
				    &lf, F_POSIX);
				FILEDESC_XLOCK(fdp);
				fdrop(fp, td);
			}
		}
	retry:
		if (fdtol->fdl_refcount == 1) {
			if (fdp->fd_holdleaderscount > 0 &&
			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
				/*
				 * close() or do_dup() has cleared a reference
				 * in a shared file descriptor table.
				 */
				fdp->fd_holdleaderswakeup = 1;
				sx_sleep(&fdp->fd_holdleaderscount,
				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
				goto retry;
			}
			if (fdtol->fdl_holdcount > 0) {
				/*
				 * Ensure that fdtol->fdl_leader remains
				 * valid in closef().
				 */
				fdtol->fdl_wakeup = 1;
				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
				    "fdlhold", 0);
				goto retry;
			}
		}
		fdtol->fdl_refcount--;
		if (fdtol->fdl_refcount == 0 &&
		    fdtol->fdl_holdcount == 0) {
			/* Last user: unlink the record; it is freed below. */
			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
		} else
			fdtol = NULL;
		td->td_proc->p_fdtol = NULL;
		FILEDESC_XUNLOCK(fdp);
		if (fdtol != NULL)
			free(fdtol, M_FILEDESC_TO_LEADER);
	}
	/* Drop our reference; only the last one tears the table down. */
	FILEDESC_XLOCK(fdp);
	i = --fdp->fd_refcnt;
	FILEDESC_XUNLOCK(fdp);
	if (i > 0)
		return;

	/* Close every file that is still open. */
	for (i = 0; i <= fdp->fd_lastfile; i++) {
		fp = fdp->fd_ofiles[i].fde_file;
		if (fp != NULL) {
			FILEDESC_XLOCK(fdp);
			fdfree(fdp, i);
			FILEDESC_XUNLOCK(fdp);
			(void) closef(fp, td);
		}
	}
	FILEDESC_XLOCK(fdp);

	/* XXX This should happen earlier. */
	mtx_lock(&fdesc_mtx);
	td->td_proc->p_fd = NULL;
	mtx_unlock(&fdesc_mtx);

	/*
	 * Free the table and the map, but only when the respective size
	 * check says they are larger than the NDFILE-sized originals.
	 */
	if (fdp->fd_nfiles > NDFILE)
		free(fdp->fd_ofiles, M_FILEDESC);
	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
		free(fdp->fd_map, M_FILEDESC);

	fdp->fd_nfiles = 0;

	/* Detach the directory vnodes; they are released after unlocking. */
	cdir = fdp->fd_cdir;
	fdp->fd_cdir = NULL;
	rdir = fdp->fd_rdir;
	fdp->fd_rdir = NULL;
	jdir = fdp->fd_jdir;
	fdp->fd_jdir = NULL;
	FILEDESC_XUNLOCK(fdp);

	if (cdir != NULL)
		vrele(cdir);
	if (rdir != NULL)
		vrele(rdir);
	if (jdir != NULL)
		vrele(jdir);

	fddrop(fdp);
}

/*
 * For setugid programs, we don't want people to use that setugidness
 * to generate error messages which write to a file which would
 * otherwise be off-limits to the process.  We check for filesystems where
 * the vnode can change out from under us after execve (like [lin]procfs).
 *
 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
 * sufficient.
We also don't check for setugidness since we know we are. 2015 */ 2016 static int 2017 is_unsafe(struct file *fp) 2018 { 2019 if (fp->f_type == DTYPE_VNODE) { 2020 struct vnode *vp = fp->f_vnode; 2021 2022 if ((vp->v_vflag & VV_PROCDEP) != 0) 2023 return (1); 2024 } 2025 return (0); 2026 } 2027 2028 /* 2029 * Make this setguid thing safe, if at all possible. 2030 */ 2031 void 2032 setugidsafety(struct thread *td) 2033 { 2034 struct filedesc *fdp; 2035 struct file *fp; 2036 int i; 2037 2038 /* Certain daemons might not have file descriptors. */ 2039 fdp = td->td_proc->p_fd; 2040 if (fdp == NULL) 2041 return; 2042 2043 /* 2044 * Note: fdp->fd_ofiles may be reallocated out from under us while 2045 * we are blocked in a close. Be careful! 2046 */ 2047 FILEDESC_XLOCK(fdp); 2048 for (i = 0; i <= fdp->fd_lastfile; i++) { 2049 if (i > 2) 2050 break; 2051 fp = fdp->fd_ofiles[i].fde_file; 2052 if (fp != NULL && is_unsafe(fp)) { 2053 knote_fdclose(td, i); 2054 /* 2055 * NULL-out descriptor prior to close to avoid 2056 * a race while close blocks. 2057 */ 2058 fdfree(fdp, i); 2059 FILEDESC_XUNLOCK(fdp); 2060 (void) closef(fp, td); 2061 FILEDESC_XLOCK(fdp); 2062 } 2063 } 2064 FILEDESC_XUNLOCK(fdp); 2065 } 2066 2067 /* 2068 * If a specific file object occupies a specific file descriptor, close the 2069 * file descriptor entry and drop a reference on the file object. This is a 2070 * convenience function to handle a subsequent error in a function that calls 2071 * falloc() that handles the race that another thread might have closed the 2072 * file descriptor out from under the thread creating the file object. 2073 */ 2074 void 2075 fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td) 2076 { 2077 2078 FILEDESC_XLOCK(fdp); 2079 if (fdp->fd_ofiles[idx].fde_file == fp) { 2080 fdfree(fdp, idx); 2081 FILEDESC_XUNLOCK(fdp); 2082 fdrop(fp, td); 2083 } else 2084 FILEDESC_XUNLOCK(fdp); 2085 } 2086 2087 /* 2088 * Close any files on exec? 
2089 */ 2090 void 2091 fdcloseexec(struct thread *td) 2092 { 2093 struct filedesc *fdp; 2094 struct filedescent *fde; 2095 struct file *fp; 2096 int i; 2097 2098 /* Certain daemons might not have file descriptors. */ 2099 fdp = td->td_proc->p_fd; 2100 if (fdp == NULL) 2101 return; 2102 2103 /* 2104 * We cannot cache fd_ofiles since operations 2105 * may block and rip them out from under us. 2106 */ 2107 FILEDESC_XLOCK(fdp); 2108 for (i = 0; i <= fdp->fd_lastfile; i++) { 2109 fde = &fdp->fd_ofiles[i]; 2110 fp = fde->fde_file; 2111 if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || 2112 (fde->fde_flags & UF_EXCLOSE))) { 2113 fdfree(fdp, i); 2114 (void) closefp(fdp, i, fp, td, 0); 2115 /* closefp() drops the FILEDESC lock. */ 2116 FILEDESC_XLOCK(fdp); 2117 } 2118 } 2119 FILEDESC_XUNLOCK(fdp); 2120 } 2121 2122 /* 2123 * It is unsafe for set[ug]id processes to be started with file 2124 * descriptors 0..2 closed, as these descriptors are given implicit 2125 * significance in the Standard C library. fdcheckstd() will create a 2126 * descriptor referencing /dev/null for each of stdin, stdout, and 2127 * stderr that is not already open. 
 */
int
fdcheckstd(struct thread *td)
{
	struct filedesc *fdp;
	register_t retval, save;
	int i, error, devnull;

	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return (0);
	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
	devnull = -1;
	error = 0;
	for (i = 0; i < 3; i++) {
		if (fdp->fd_ofiles[i].fde_file != NULL)
			continue;
		if (devnull < 0) {
			/*
			 * First missing descriptor: open /dev/null.  The
			 * new descriptor number comes back in td_retval[0],
			 * so that value is saved and restored around the
			 * call.
			 */
			save = td->td_retval[0];
			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
			    O_RDWR, 0);
			devnull = td->td_retval[0];
			td->td_retval[0] = save;
			if (error)
				break;
			KASSERT(devnull == i, ("oof, we didn't get our fd"));
		} else {
			/* Later ones are duplicates of the first. */
			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
			if (error != 0)
				break;
		}
	}
	return (error);
}

/*
 * Internal form of close.  Decrement reference count on file structure.
 * Note: td may be NULL when closing a file that was being passed in a
 * message.
 *
 * XXXRW: Giant is not required for the caller, but often will be held; this
 * makes it moderately likely the Giant will be recursed in the VFS case.
 */
int
closef(struct file *fp, struct thread *td)
{
	struct vnode *vp;
	struct flock lf;
	struct filedesc_to_leader *fdtol;
	struct filedesc *fdp;

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor, and the thread pointer
	 * will be NULL.  Callers should be careful only to pass a
	 * NULL thread pointer when there really is no owning
	 * context that might have locks, or the locks will be
	 * leaked.
	 */
	if (fp->f_type == DTYPE_VNODE && td != NULL) {
		vp = fp->f_vnode;
		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
			lf.l_whence = SEEK_SET;
			lf.l_start = 0;
			lf.l_len = 0;
			lf.l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
			    F_UNLCK, &lf, F_POSIX);
		}
		fdtol = td->td_proc->p_fdtol;
		if (fdtol != NULL) {
			/*
			 * Handle special case where file descriptor table is
			 * shared between multiple process leaders.
			 */
			fdp = td->td_proc->p_fd;
			FILEDESC_XLOCK(fdp);
			/* Walk the ring of the other leaders' records. */
			for (fdtol = fdtol->fdl_next;
			     fdtol != td->td_proc->p_fdtol;
			     fdtol = fdtol->fdl_next) {
				if ((fdtol->fdl_leader->p_flag &
				     P_ADVLOCK) == 0)
					continue;
				/*
				 * Take a hold on the record while the lock
				 * is dropped; fdescfree() sleeps while
				 * fdl_holdcount is non-zero.
				 */
				fdtol->fdl_holdcount++;
				FILEDESC_XUNLOCK(fdp);
				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				vp = fp->f_vnode;
				(void) VOP_ADVLOCK(vp,
				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
				    F_POSIX);
				FILEDESC_XLOCK(fdp);
				fdtol->fdl_holdcount--;
				if (fdtol->fdl_holdcount == 0 &&
				    fdtol->fdl_wakeup != 0) {
					fdtol->fdl_wakeup = 0;
					wakeup(fdtol);
				}
			}
			FILEDESC_XUNLOCK(fdp);
		}
	}
	return (fdrop(fp, td));
}

/*
 * Initialize the file pointer with the specified properties.
 *
 * The ops are set with release semantics to be certain that the flags, type,
 * and data are visible when ops is.  This is to prevent ops methods from being
 * called with bad data.
 */
void
finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
{
	fp->f_data = data;
	fp->f_flag = flag;
	fp->f_type = type;
	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
}

/*
 * Look up descriptor fd in fdp and return a referenced file in *fpp.
 *
 * Returns EBADF when fd is out of range or the slot is empty.  With
 * CAPABILITIES the descriptor's rights are checked against needrights
 * (and against needfcntl when CAP_FCNTL is requested) and any error from
 * those checks is returned.  When haverightsp is not NULL it receives
 * the descriptor's rights, or CAP_ALL without CAPABILITIES.
 */
int
fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t needrights,
    int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
{
	struct file *fp;
	u_int count;
#ifdef CAPABILITIES
	cap_rights_t haverights;
	int error;
#endif

	if (fd < 0 || fd >= fdp->fd_nfiles)
		return (EBADF);
	/*
	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
	 * never raising a refcount above 0.  To accomplish this we have
	 * to use a cmpset loop rather than an atomic_add.  The descriptor
	 * must be re-verified once we acquire a reference to be certain
	 * that the identity is still correct and we did not lose a race
	 * due to preemption.
	 */
	for (;;) {
		fp = fdp->fd_ofiles[fd].fde_file;
		if (fp == NULL)
			return (EBADF);
#ifdef CAPABILITIES
		haverights = cap_rights(fdp, fd);
		error = cap_check(haverights, needrights);
		if (error != 0)
			return (error);
		if ((needrights & CAP_FCNTL) != 0) {
			error = cap_fcntl_check(fdp, fd, needfcntl);
			if (error != 0)
				return (error);
		}
#endif
		count = fp->f_count;
		if (count == 0)
			continue;
		/*
		 * Use an acquire barrier to prevent caching of fd_ofiles
		 * so it is refreshed for verification.
		 */
		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
			continue;
		/* Still the same file in the slot: the reference is good. */
		if (fp == fdp->fd_ofiles[fd].fde_file)
			break;
		/* Lost a race: give the reference back and retry. */
		fdrop(fp, curthread);
	}
	*fpp = fp;
	if (haverightsp != NULL) {
#ifdef CAPABILITIES
		*haverightsp = haverights;
#else
		*haverightsp = CAP_ALL;
#endif
	}
	return (0);
}

/*
 * Extract the file pointer associated with the specified descriptor for the
 * current user process.
 *
 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
 * returned.
 *
 * File's rights will be checked against the capability rights mask.
 *
 * If an error occurred the non-zero error is returned and *fpp is set to
 * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
 * responsible for fdrop().
 *
 * When maxprotp is not NULL, CAP_MMAP is added to the required rights and
 * *maxprotp receives the protection derived from the descriptor's rights
 * (VM_PROT_ALL without CAPABILITIES).
 */
static __inline int
_fget(struct thread *td, int fd, struct file **fpp, int flags,
    cap_rights_t needrights, u_char *maxprotp)
{
	struct filedesc *fdp;
	struct file *fp;
	cap_rights_t haverights;
	int error;

	*fpp = NULL;
	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
		return (EBADF);
	if (maxprotp != NULL)
		needrights |= CAP_MMAP;
	error = fget_unlocked(fdp, fd, needrights, 0, &fp, &haverights);
	if (error != 0)
		return (error);
	/* A file still using badfileops is treated as a bad descriptor. */
	if (fp->f_ops == &badfileops) {
		fdrop(fp, td);
		return (EBADF);
	}

#ifdef CAPABILITIES
	/*
	 * If requested, convert capability rights to access flags.
	 */
	if (maxprotp != NULL)
		*maxprotp = cap_rights_to_vmprot(haverights);
#else /* !CAPABILITIES */
	if (maxprotp != NULL)
		*maxprotp = VM_PROT_ALL;
#endif /* CAPABILITIES */

	/*
	 * FREAD and FWRITE failure return EBADF as per POSIX.
	 */
	error = 0;
	switch (flags) {
	case FREAD:
	case FWRITE:
		if ((fp->f_flag & flags) == 0)
			error = EBADF;
		break;
	case FEXEC:
		/* Needs FREAD or FEXEC, and must not be open for writing. */
		if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
		    ((fp->f_flag & FWRITE) != 0))
			error = EBADF;
		break;
	case 0:
		break;
	default:
		KASSERT(0, ("wrong flags"));
	}

	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}

	*fpp = fp;
	return (0);
}

/*
 * _fget() with no access-mode check.
 */
int
fget(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
{

	return(_fget(td, fd, fpp, 0, rights, NULL));
}

/*
 * _fget() with no access-mode check that also reports the maximum
 * protection through maxprotp.
 */
int
fget_mmap(struct thread *td, int fd, cap_rights_t rights, u_char *maxprotp,
    struct file **fpp)
{

	return (_fget(td, fd, fpp, 0, rights, maxprotp));
}

/*
 * _fget() requiring the file to be open for reading (FREAD).
 */
int
fget_read(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
{

	return(_fget(td, fd, fpp, FREAD, rights, NULL));
}

/*
 * _fget() requiring the file to be open for writing (FWRITE).
 */
int
fget_write(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
{

	return (_fget(td, fd, fpp, FWRITE, rights, NULL));
}

/*
 * Like fget() but loads the underlying vnode, or returns an error if the
 * descriptor does not represent a vnode.  Note that pipes use vnodes but
 * never have VM objects.  The returned vnode will be vref()'d.
 *
 * XXX: what about the unused flags ?
2426 */ 2427 static __inline int 2428 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t needrights, 2429 struct vnode **vpp) 2430 { 2431 struct file *fp; 2432 int error; 2433 2434 *vpp = NULL; 2435 error = _fget(td, fd, &fp, flags, needrights, NULL); 2436 if (error) 2437 return (error); 2438 if (fp->f_vnode == NULL) { 2439 error = EINVAL; 2440 } else { 2441 *vpp = fp->f_vnode; 2442 vref(*vpp); 2443 } 2444 fdrop(fp, td); 2445 2446 return (error); 2447 } 2448 2449 int 2450 fgetvp(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp) 2451 { 2452 2453 return (_fgetvp(td, fd, 0, rights, vpp)); 2454 } 2455 2456 int 2457 fgetvp_rights(struct thread *td, int fd, cap_rights_t need, 2458 struct filecaps *havecaps, struct vnode **vpp) 2459 { 2460 struct filedesc *fdp; 2461 struct file *fp; 2462 #ifdef CAPABILITIES 2463 int error; 2464 #endif 2465 2466 if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) 2467 return (EBADF); 2468 2469 fp = fget_locked(fdp, fd); 2470 if (fp == NULL || fp->f_ops == &badfileops) 2471 return (EBADF); 2472 2473 #ifdef CAPABILITIES 2474 error = cap_check(cap_rights(fdp, fd), need); 2475 if (error != 0) 2476 return (error); 2477 #endif 2478 2479 if (fp->f_vnode == NULL) 2480 return (EINVAL); 2481 2482 *vpp = fp->f_vnode; 2483 vref(*vpp); 2484 filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps); 2485 2486 return (0); 2487 } 2488 2489 int 2490 fgetvp_read(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp) 2491 { 2492 2493 return (_fgetvp(td, fd, FREAD, rights, vpp)); 2494 } 2495 2496 int 2497 fgetvp_exec(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp) 2498 { 2499 2500 return (_fgetvp(td, fd, FEXEC, rights, vpp)); 2501 } 2502 2503 #ifdef notyet 2504 int 2505 fgetvp_write(struct thread *td, int fd, cap_rights_t rights, 2506 struct vnode **vpp) 2507 { 2508 2509 return (_fgetvp(td, fd, FWRITE, rights, vpp)); 2510 } 2511 #endif 2512 2513 /* 2514 * Like fget() but loads the underlying socket, or 
returns an error if the 2515 * descriptor does not represent a socket. 2516 * 2517 * We bump the ref count on the returned socket. XXX Also obtain the SX lock 2518 * in the future. 2519 * 2520 * Note: fgetsock() and fputsock() are deprecated, as consumers should rely 2521 * on their file descriptor reference to prevent the socket from being free'd 2522 * during use. 2523 */ 2524 int 2525 fgetsock(struct thread *td, int fd, cap_rights_t rights, struct socket **spp, 2526 u_int *fflagp) 2527 { 2528 struct file *fp; 2529 int error; 2530 2531 *spp = NULL; 2532 if (fflagp != NULL) 2533 *fflagp = 0; 2534 if ((error = _fget(td, fd, &fp, 0, rights, NULL)) != 0) 2535 return (error); 2536 if (fp->f_type != DTYPE_SOCKET) { 2537 error = ENOTSOCK; 2538 } else { 2539 *spp = fp->f_data; 2540 if (fflagp) 2541 *fflagp = fp->f_flag; 2542 SOCK_LOCK(*spp); 2543 soref(*spp); 2544 SOCK_UNLOCK(*spp); 2545 } 2546 fdrop(fp, td); 2547 2548 return (error); 2549 } 2550 2551 /* 2552 * Drop the reference count on the socket and XXX release the SX lock in the 2553 * future. The last reference closes the socket. 2554 * 2555 * Note: fputsock() is deprecated, see comment for fgetsock(). 2556 */ 2557 void 2558 fputsock(struct socket *so) 2559 { 2560 2561 ACCEPT_LOCK(); 2562 SOCK_LOCK(so); 2563 CURVNET_SET(so->so_vnet); 2564 sorele(so); 2565 CURVNET_RESTORE(); 2566 } 2567 2568 /* 2569 * Handle the last reference to a file being closed. 2570 */ 2571 int 2572 _fdrop(struct file *fp, struct thread *td) 2573 { 2574 int error; 2575 2576 error = 0; 2577 if (fp->f_count != 0) 2578 panic("fdrop: count %d", fp->f_count); 2579 if (fp->f_ops != &badfileops) 2580 error = fo_close(fp, td); 2581 atomic_subtract_int(&openfiles, 1); 2582 crfree(fp->f_cred); 2583 free(fp->f_advice, M_FADVISE); 2584 uma_zfree(file_zone, fp); 2585 2586 return (error); 2587 } 2588 2589 /* 2590 * Apply an advisory lock on a file descriptor. 
2591 * 2592 * Just attempt to get a record lock of the requested type on the entire file 2593 * (l_whence = SEEK_SET, l_start = 0, l_len = 0). 2594 */ 2595 #ifndef _SYS_SYSPROTO_H_ 2596 struct flock_args { 2597 int fd; 2598 int how; 2599 }; 2600 #endif 2601 /* ARGSUSED */ 2602 int 2603 sys_flock(struct thread *td, struct flock_args *uap) 2604 { 2605 struct file *fp; 2606 struct vnode *vp; 2607 struct flock lf; 2608 int error; 2609 2610 if ((error = fget(td, uap->fd, CAP_FLOCK, &fp)) != 0) 2611 return (error); 2612 if (fp->f_type != DTYPE_VNODE) { 2613 fdrop(fp, td); 2614 return (EOPNOTSUPP); 2615 } 2616 2617 vp = fp->f_vnode; 2618 lf.l_whence = SEEK_SET; 2619 lf.l_start = 0; 2620 lf.l_len = 0; 2621 if (uap->how & LOCK_UN) { 2622 lf.l_type = F_UNLCK; 2623 atomic_clear_int(&fp->f_flag, FHASLOCK); 2624 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); 2625 goto done2; 2626 } 2627 if (uap->how & LOCK_EX) 2628 lf.l_type = F_WRLCK; 2629 else if (uap->how & LOCK_SH) 2630 lf.l_type = F_RDLCK; 2631 else { 2632 error = EBADF; 2633 goto done2; 2634 } 2635 atomic_set_int(&fp->f_flag, FHASLOCK); 2636 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 2637 (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); 2638 done2: 2639 fdrop(fp, td); 2640 return (error); 2641 } 2642 /* 2643 * Duplicate the specified descriptor to a free descriptor. 2644 */ 2645 int 2646 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, 2647 int openerror, int *indxp) 2648 { 2649 struct file *fp; 2650 int error, indx; 2651 2652 KASSERT(openerror == ENODEV || openerror == ENXIO, 2653 ("unexpected error %d in %s", openerror, __func__)); 2654 2655 /* 2656 * If the to-be-dup'd fd number is greater than the allowed number 2657 * of file descriptors, or the fd to be dup'd has already been 2658 * closed, then reject. 
2659 */ 2660 FILEDESC_XLOCK(fdp); 2661 if ((fp = fget_locked(fdp, dfd)) == NULL) { 2662 FILEDESC_XUNLOCK(fdp); 2663 return (EBADF); 2664 } 2665 2666 error = fdalloc(td, 0, &indx); 2667 if (error != 0) { 2668 FILEDESC_XUNLOCK(fdp); 2669 return (error); 2670 } 2671 2672 /* 2673 * There are two cases of interest here. 2674 * 2675 * For ENODEV simply dup (dfd) to file descriptor (indx) and return. 2676 * 2677 * For ENXIO steal away the file structure from (dfd) and store it in 2678 * (indx). (dfd) is effectively closed by this operation. 2679 */ 2680 switch (openerror) { 2681 case ENODEV: 2682 /* 2683 * Check that the mode the file is being opened for is a 2684 * subset of the mode of the existing descriptor. 2685 */ 2686 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { 2687 fdunused(fdp, indx); 2688 FILEDESC_XUNLOCK(fdp); 2689 return (EACCES); 2690 } 2691 fhold(fp); 2692 fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; 2693 filecaps_copy(&fdp->fd_ofiles[dfd].fde_caps, 2694 &fdp->fd_ofiles[indx].fde_caps); 2695 break; 2696 case ENXIO: 2697 /* 2698 * Steal away the file pointer from dfd and stuff it into indx. 2699 */ 2700 fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; 2701 bzero(&fdp->fd_ofiles[dfd], sizeof(fdp->fd_ofiles[dfd])); 2702 fdunused(fdp, dfd); 2703 break; 2704 } 2705 FILEDESC_XUNLOCK(fdp); 2706 *indxp = indx; 2707 return (0); 2708 } 2709 2710 /* 2711 * Scan all active processes and prisons to see if any of them have a current 2712 * or root directory of `olddp'. If so, replace them with the new mount point. 
*/
void
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
{
	struct filedesc *fdp;
	struct prison *pr;
	struct proc *p;
	int nrele;

	/*
	 * A reference count of one means there is nothing to replace;
	 * presumably that single reference belongs to the caller.
	 */
	if (vrefcnt(olddp) == 1)
		return;
	/* Counts replaced references; they are released at the very end. */
	nrele = 0;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		fdp = fdhold(p);
		if (fdp == NULL)
			continue;
		FILEDESC_XLOCK(fdp);
		/* Current, root and jail directories are checked separately. */
		if (fdp->fd_cdir == olddp) {
			vref(newdp);
			fdp->fd_cdir = newdp;
			nrele++;
		}
		if (fdp->fd_rdir == olddp) {
			vref(newdp);
			fdp->fd_rdir = newdp;
			nrele++;
		}
		if (fdp->fd_jdir == olddp) {
			vref(newdp);
			fdp->fd_jdir = newdp;
			nrele++;
		}
		FILEDESC_XUNLOCK(fdp);
		fddrop(fdp);
	}
	sx_sunlock(&allproc_lock);
	/* The global root vnode. */
	if (rootvnode == olddp) {
		vref(newdp);
		rootvnode = newdp;
		nrele++;
	}
	/* prison0 is handled explicitly, then every prison on allprison. */
	mtx_lock(&prison0.pr_mtx);
	if (prison0.pr_root == olddp) {
		vref(newdp);
		prison0.pr_root = newdp;
		nrele++;
	}
	mtx_unlock(&prison0.pr_mtx);
	sx_slock(&allprison_lock);
	TAILQ_FOREACH(pr, &allprison, pr_list) {
		mtx_lock(&pr->pr_mtx);
		if (pr->pr_root == olddp) {
			vref(newdp);
			pr->pr_root = newdp;
			nrele++;
		}
		mtx_unlock(&pr->pr_mtx);
	}
	sx_sunlock(&allprison_lock);
	/* Drop one olddp reference per replacement, with no locks held. */
	while (nrele--)
		vrele(olddp);
}

/*
 * Allocate and initialize a filedesc_to_leader structure for 'leader'.
 *
 * When 'old' is not NULL the new entry is linked into old's circular list
 * directly after 'old', under the exclusive lock of 'fdp'; otherwise the
 * new entry forms a one-element list pointing at itself.
 */
struct filedesc_to_leader *
filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
{
	struct filedesc_to_leader *fdtol;

	fdtol = malloc(sizeof(struct filedesc_to_leader),
	       M_FILEDESC_TO_LEADER,
	       M_WAITOK);
	fdtol->fdl_refcount = 1;
	fdtol->fdl_holdcount = 0;
	fdtol->fdl_wakeup = 0;
	fdtol->fdl_leader = leader;
	if (old != NULL) {
		FILEDESC_XLOCK(fdp);
		fdtol->fdl_next = old->fdl_next;
		fdtol->fdl_prev = old;
		old->fdl_next = fdtol;
		fdtol->fdl_next->fdl_prev = fdtol;
		FILEDESC_XUNLOCK(fdp);
	} else {
		fdtol->fdl_next = fdtol;
		fdtol->fdl_prev = fdtol;
	}
	return (fdtol);
}

/*
 * Get file structures globally.
 *
 * With no output buffer (req->oldptr == NULL) only a size estimate is
 * reported.  Otherwise one struct xfile is copied out for every open
 * descriptor of every process that the requesting thread may see.
 */
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
	struct xfile xf;
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int error, n;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	if (req->oldptr == NULL) {
		/* Size probe: sum fd_lastfile over all processes. */
		n = 0;
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			if (p->p_state == PRS_NEW)
				continue;
			fdp = fdhold(p);
			if (fdp == NULL)
				continue;
			/* overestimates sparse tables. */
			if (fdp->fd_lastfile > 0)
				n += fdp->fd_lastfile;
			fddrop(fdp);
		}
		sx_sunlock(&allproc_lock);
		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
	}
	error = 0;
	bzero(&xf, sizeof(xf));
	xf.xf_size = sizeof(xf);
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		if (p->p_state == PRS_NEW) {
			PROC_UNLOCK(p);
			continue;
		}
		/* Skip processes the requester is not allowed to see. */
		if (p_cansee(req->td, p) != 0) {
			PROC_UNLOCK(p);
			continue;
		}
		/* pid and uid are sampled while the process lock is held. */
		xf.xf_pid = p->p_pid;
		xf.xf_uid = p->p_ucred->cr_uid;
		PROC_UNLOCK(p);
		fdp = fdhold(p);
		if (fdp == NULL)
			continue;
		FILEDESC_SLOCK(fdp);
		/* The loop also stops once fd_refcnt is no longer positive. */
		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
				continue;
			xf.xf_fd = n;
			xf.xf_file = fp;
			xf.xf_data = fp->f_data;
			xf.xf_vnode = fp->f_vnode;
			xf.xf_type = fp->f_type;
			xf.xf_count = fp->f_count;
			xf.xf_msgcount = 0;
			xf.xf_offset = foffset_get(fp);
			xf.xf_flag = fp->f_flag;
			error = SYSCTL_OUT(req, &xf, sizeof(xf));
			if (error)
				break;
		}
		FILEDESC_SUNLOCK(fdp);
		fddrop(fdp);
		if (error)
			break;
	}
	sx_sunlock(&allproc_lock);
	return (error);
}

SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");

#ifdef KINFO_OFILE_SIZE
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
#endif

#ifdef COMPAT_FREEBSD7
/*
 * Copy out one kinfo_ofile record describing the directory vnode 'vp';
 * 'type' is stored in kf_fd.  Returns ENOTDIR when 'vp' is not a directory,
 * otherwise the SYSCTL_OUT() result.
 *
 * The shared filedesc lock is released here for the path lookup and the
 * copy-out, and taken again before returning on that path, so the code
 * expects to be called with it held.
 */
static int
export_vnode_for_osysctl(struct vnode *vp, int type,
    struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
{
	int error;
	char *fullpath, *freepath;

	bzero(kif, sizeof(*kif));
	kif->kf_structsize = sizeof(*kif);

	vref(vp);
	kif->kf_fd = type;
	kif->kf_type = KF_TYPE_VNODE;
	/* This function only handles directories. */
	if (vp->v_type != VDIR) {
		vrele(vp);
		return (ENOTDIR);
	}
	kif->kf_vnode_type = KF_VTYPE_VDIR;

	/*
	 * This is not a true file descriptor, so we set a bogus refcount
	 * and offset to indicate these fields should be ignored.
	 */
	kif->kf_ref_count = -1;
	kif->kf_offset = -1;

	freepath = NULL;
	/* Path reported if vn_fullpath() leaves 'fullpath' untouched. */
	fullpath = "-";
	FILEDESC_SUNLOCK(fdp);
	vn_fullpath(curthread, vp, &fullpath, &freepath);
	vrele(vp);
	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
	if (freepath != NULL)
		free(freepath, M_TEMP);
	error = SYSCTL_OUT(req, kif, sizeof(*kif));
	FILEDESC_SLOCK(fdp);
	return (error);
}

/*
 * Get per-process file descriptors for use by procstat(1), et al.
2931 */ 2932 static int 2933 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) 2934 { 2935 char *fullpath, *freepath; 2936 struct kinfo_ofile *kif; 2937 struct filedesc *fdp; 2938 int error, i, *name; 2939 struct shmfd *shmfd; 2940 struct socket *so; 2941 struct vnode *vp; 2942 struct file *fp; 2943 struct proc *p; 2944 struct tty *tp; 2945 2946 name = (int *)arg1; 2947 error = pget((pid_t)name[0], PGET_CANDEBUG, &p); 2948 if (error != 0) 2949 return (error); 2950 fdp = fdhold(p); 2951 PROC_UNLOCK(p); 2952 if (fdp == NULL) 2953 return (ENOENT); 2954 kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); 2955 FILEDESC_SLOCK(fdp); 2956 if (fdp->fd_cdir != NULL) 2957 export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, 2958 fdp, req); 2959 if (fdp->fd_rdir != NULL) 2960 export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, 2961 fdp, req); 2962 if (fdp->fd_jdir != NULL) 2963 export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, 2964 fdp, req); 2965 for (i = 0; i < fdp->fd_nfiles; i++) { 2966 if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) 2967 continue; 2968 bzero(kif, sizeof(*kif)); 2969 kif->kf_structsize = sizeof(*kif); 2970 vp = NULL; 2971 so = NULL; 2972 tp = NULL; 2973 shmfd = NULL; 2974 kif->kf_fd = i; 2975 2976 switch (fp->f_type) { 2977 case DTYPE_VNODE: 2978 kif->kf_type = KF_TYPE_VNODE; 2979 vp = fp->f_vnode; 2980 break; 2981 2982 case DTYPE_SOCKET: 2983 kif->kf_type = KF_TYPE_SOCKET; 2984 so = fp->f_data; 2985 break; 2986 2987 case DTYPE_PIPE: 2988 kif->kf_type = KF_TYPE_PIPE; 2989 break; 2990 2991 case DTYPE_FIFO: 2992 kif->kf_type = KF_TYPE_FIFO; 2993 vp = fp->f_vnode; 2994 break; 2995 2996 case DTYPE_KQUEUE: 2997 kif->kf_type = KF_TYPE_KQUEUE; 2998 break; 2999 3000 case DTYPE_CRYPTO: 3001 kif->kf_type = KF_TYPE_CRYPTO; 3002 break; 3003 3004 case DTYPE_MQUEUE: 3005 kif->kf_type = KF_TYPE_MQUEUE; 3006 break; 3007 3008 case DTYPE_SHM: 3009 kif->kf_type = KF_TYPE_SHM; 3010 shmfd = fp->f_data; 3011 break; 3012 3013 case DTYPE_SEM: 3014 kif->kf_type = 
KF_TYPE_SEM; 3015 break; 3016 3017 case DTYPE_PTS: 3018 kif->kf_type = KF_TYPE_PTS; 3019 tp = fp->f_data; 3020 break; 3021 3022 #ifdef PROCDESC 3023 case DTYPE_PROCDESC: 3024 kif->kf_type = KF_TYPE_PROCDESC; 3025 break; 3026 #endif 3027 3028 default: 3029 kif->kf_type = KF_TYPE_UNKNOWN; 3030 break; 3031 } 3032 kif->kf_ref_count = fp->f_count; 3033 if (fp->f_flag & FREAD) 3034 kif->kf_flags |= KF_FLAG_READ; 3035 if (fp->f_flag & FWRITE) 3036 kif->kf_flags |= KF_FLAG_WRITE; 3037 if (fp->f_flag & FAPPEND) 3038 kif->kf_flags |= KF_FLAG_APPEND; 3039 if (fp->f_flag & FASYNC) 3040 kif->kf_flags |= KF_FLAG_ASYNC; 3041 if (fp->f_flag & FFSYNC) 3042 kif->kf_flags |= KF_FLAG_FSYNC; 3043 if (fp->f_flag & FNONBLOCK) 3044 kif->kf_flags |= KF_FLAG_NONBLOCK; 3045 if (fp->f_flag & O_DIRECT) 3046 kif->kf_flags |= KF_FLAG_DIRECT; 3047 if (fp->f_flag & FHASLOCK) 3048 kif->kf_flags |= KF_FLAG_HASLOCK; 3049 kif->kf_offset = foffset_get(fp); 3050 if (vp != NULL) { 3051 vref(vp); 3052 switch (vp->v_type) { 3053 case VNON: 3054 kif->kf_vnode_type = KF_VTYPE_VNON; 3055 break; 3056 case VREG: 3057 kif->kf_vnode_type = KF_VTYPE_VREG; 3058 break; 3059 case VDIR: 3060 kif->kf_vnode_type = KF_VTYPE_VDIR; 3061 break; 3062 case VBLK: 3063 kif->kf_vnode_type = KF_VTYPE_VBLK; 3064 break; 3065 case VCHR: 3066 kif->kf_vnode_type = KF_VTYPE_VCHR; 3067 break; 3068 case VLNK: 3069 kif->kf_vnode_type = KF_VTYPE_VLNK; 3070 break; 3071 case VSOCK: 3072 kif->kf_vnode_type = KF_VTYPE_VSOCK; 3073 break; 3074 case VFIFO: 3075 kif->kf_vnode_type = KF_VTYPE_VFIFO; 3076 break; 3077 case VBAD: 3078 kif->kf_vnode_type = KF_VTYPE_VBAD; 3079 break; 3080 default: 3081 kif->kf_vnode_type = KF_VTYPE_UNKNOWN; 3082 break; 3083 } 3084 /* 3085 * It is OK to drop the filedesc lock here as we will 3086 * re-validate and re-evaluate its properties when 3087 * the loop continues. 
3088 */ 3089 freepath = NULL; 3090 fullpath = "-"; 3091 FILEDESC_SUNLOCK(fdp); 3092 vn_fullpath(curthread, vp, &fullpath, &freepath); 3093 vrele(vp); 3094 strlcpy(kif->kf_path, fullpath, 3095 sizeof(kif->kf_path)); 3096 if (freepath != NULL) 3097 free(freepath, M_TEMP); 3098 FILEDESC_SLOCK(fdp); 3099 } 3100 if (so != NULL) { 3101 struct sockaddr *sa; 3102 3103 if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa) 3104 == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { 3105 bcopy(sa, &kif->kf_sa_local, sa->sa_len); 3106 free(sa, M_SONAME); 3107 } 3108 if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa) 3109 == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { 3110 bcopy(sa, &kif->kf_sa_peer, sa->sa_len); 3111 free(sa, M_SONAME); 3112 } 3113 kif->kf_sock_domain = 3114 so->so_proto->pr_domain->dom_family; 3115 kif->kf_sock_type = so->so_type; 3116 kif->kf_sock_protocol = so->so_proto->pr_protocol; 3117 } 3118 if (tp != NULL) { 3119 strlcpy(kif->kf_path, tty_devname(tp), 3120 sizeof(kif->kf_path)); 3121 } 3122 if (shmfd != NULL) 3123 shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path)); 3124 error = SYSCTL_OUT(req, kif, sizeof(*kif)); 3125 if (error) 3126 break; 3127 } 3128 FILEDESC_SUNLOCK(fdp); 3129 fddrop(fdp); 3130 free(kif, M_TEMP); 3131 return (0); 3132 } 3133 3134 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD, 3135 sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); 3136 #endif /* COMPAT_FREEBSD7 */ 3137 3138 #ifdef KINFO_FILE_SIZE 3139 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); 3140 #endif 3141 3142 static int 3143 export_fd_for_sysctl(void *data, int type, int fd, int fflags, int refcnt, 3144 int64_t offset, cap_rights_t fd_cap_rights, struct kinfo_file *kif, 3145 struct sysctl_req *req) 3146 { 3147 struct { 3148 int fflag; 3149 int kf_fflag; 3150 } fflags_table[] = { 3151 { FAPPEND, KF_FLAG_APPEND }, 3152 { FASYNC, KF_FLAG_ASYNC }, 3153 { FFSYNC, KF_FLAG_FSYNC }, 3154 { FHASLOCK, KF_FLAG_HASLOCK }, 3155 { FNONBLOCK, 
KF_FLAG_NONBLOCK }, 3156 { FREAD, KF_FLAG_READ }, 3157 { FWRITE, KF_FLAG_WRITE }, 3158 { O_CREAT, KF_FLAG_CREAT }, 3159 { O_DIRECT, KF_FLAG_DIRECT }, 3160 { O_EXCL, KF_FLAG_EXCL }, 3161 { O_EXEC, KF_FLAG_EXEC }, 3162 { O_EXLOCK, KF_FLAG_EXLOCK }, 3163 { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, 3164 { O_SHLOCK, KF_FLAG_SHLOCK }, 3165 { O_TRUNC, KF_FLAG_TRUNC } 3166 }; 3167 #define NFFLAGS (sizeof(fflags_table) / sizeof(*fflags_table)) 3168 struct vnode *vp; 3169 int error; 3170 unsigned int i; 3171 3172 bzero(kif, sizeof(*kif)); 3173 switch (type) { 3174 case KF_TYPE_FIFO: 3175 case KF_TYPE_VNODE: 3176 vp = (struct vnode *)data; 3177 error = fill_vnode_info(vp, kif); 3178 vrele(vp); 3179 break; 3180 case KF_TYPE_SOCKET: 3181 error = fill_socket_info((struct socket *)data, kif); 3182 break; 3183 case KF_TYPE_PIPE: 3184 error = fill_pipe_info((struct pipe *)data, kif); 3185 break; 3186 case KF_TYPE_PTS: 3187 error = fill_pts_info((struct tty *)data, kif); 3188 break; 3189 case KF_TYPE_PROCDESC: 3190 error = fill_procdesc_info((struct procdesc *)data, kif); 3191 break; 3192 case KF_TYPE_SHM: 3193 error = fill_shm_info((struct file *)data, kif); 3194 break; 3195 default: 3196 error = 0; 3197 } 3198 if (error == 0) 3199 kif->kf_status |= KF_ATTR_VALID; 3200 3201 /* 3202 * Translate file access flags. 3203 */ 3204 for (i = 0; i < NFFLAGS; i++) 3205 if (fflags & fflags_table[i].fflag) 3206 kif->kf_flags |= fflags_table[i].kf_fflag; 3207 kif->kf_cap_rights = fd_cap_rights; 3208 kif->kf_fd = fd; 3209 kif->kf_type = type; 3210 kif->kf_ref_count = refcnt; 3211 kif->kf_offset = offset; 3212 /* Pack record size down */ 3213 kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + 3214 strlen(kif->kf_path) + 1; 3215 kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); 3216 error = SYSCTL_OUT(req, kif, kif->kf_structsize); 3217 return (error); 3218 } 3219 3220 /* 3221 * Get per-process file descriptors for use by procstat(1), et al. 
3222 */ 3223 static int 3224 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) 3225 { 3226 struct file *fp; 3227 struct filedesc *fdp; 3228 struct kinfo_file *kif; 3229 struct proc *p; 3230 struct vnode *cttyvp, *textvp, *tracevp; 3231 size_t oldidx; 3232 int64_t offset; 3233 void *data; 3234 int error, i, *name; 3235 int type, refcnt, fflags; 3236 cap_rights_t fd_cap_rights; 3237 3238 name = (int *)arg1; 3239 error = pget((pid_t)name[0], PGET_CANDEBUG, &p); 3240 if (error != 0) 3241 return (error); 3242 /* ktrace vnode */ 3243 tracevp = p->p_tracevp; 3244 if (tracevp != NULL) 3245 vref(tracevp); 3246 /* text vnode */ 3247 textvp = p->p_textvp; 3248 if (textvp != NULL) 3249 vref(textvp); 3250 /* Controlling tty. */ 3251 cttyvp = NULL; 3252 if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { 3253 cttyvp = p->p_pgrp->pg_session->s_ttyvp; 3254 if (cttyvp != NULL) 3255 vref(cttyvp); 3256 } 3257 fdp = fdhold(p); 3258 PROC_UNLOCK(p); 3259 kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); 3260 if (tracevp != NULL) 3261 export_fd_for_sysctl(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE, 3262 FREAD | FWRITE, -1, -1, 0, kif, req); 3263 if (textvp != NULL) 3264 export_fd_for_sysctl(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT, 3265 FREAD, -1, -1, 0, kif, req); 3266 if (cttyvp != NULL) 3267 export_fd_for_sysctl(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY, 3268 FREAD | FWRITE, -1, -1, 0, kif, req); 3269 if (fdp == NULL) 3270 goto fail; 3271 FILEDESC_SLOCK(fdp); 3272 /* working directory */ 3273 if (fdp->fd_cdir != NULL) { 3274 vref(fdp->fd_cdir); 3275 data = fdp->fd_cdir; 3276 FILEDESC_SUNLOCK(fdp); 3277 export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD, 3278 FREAD, -1, -1, 0, kif, req); 3279 FILEDESC_SLOCK(fdp); 3280 } 3281 /* root directory */ 3282 if (fdp->fd_rdir != NULL) { 3283 vref(fdp->fd_rdir); 3284 data = fdp->fd_rdir; 3285 FILEDESC_SUNLOCK(fdp); 3286 export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT, 3287 FREAD, -1, -1, 0, kif, req); 3288 FILEDESC_SLOCK(fdp); 3289 } 
3290 /* jail directory */ 3291 if (fdp->fd_jdir != NULL) { 3292 vref(fdp->fd_jdir); 3293 data = fdp->fd_jdir; 3294 FILEDESC_SUNLOCK(fdp); 3295 export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL, 3296 FREAD, -1, -1, 0, kif, req); 3297 FILEDESC_SLOCK(fdp); 3298 } 3299 for (i = 0; i < fdp->fd_nfiles; i++) { 3300 if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) 3301 continue; 3302 data = NULL; 3303 #ifdef CAPABILITIES 3304 fd_cap_rights = cap_rights(fdp, i); 3305 #else /* !CAPABILITIES */ 3306 fd_cap_rights = 0; 3307 #endif 3308 switch (fp->f_type) { 3309 case DTYPE_VNODE: 3310 type = KF_TYPE_VNODE; 3311 vref(fp->f_vnode); 3312 data = fp->f_vnode; 3313 break; 3314 3315 case DTYPE_SOCKET: 3316 type = KF_TYPE_SOCKET; 3317 data = fp->f_data; 3318 break; 3319 3320 case DTYPE_PIPE: 3321 type = KF_TYPE_PIPE; 3322 data = fp->f_data; 3323 break; 3324 3325 case DTYPE_FIFO: 3326 type = KF_TYPE_FIFO; 3327 vref(fp->f_vnode); 3328 data = fp->f_vnode; 3329 break; 3330 3331 case DTYPE_KQUEUE: 3332 type = KF_TYPE_KQUEUE; 3333 break; 3334 3335 case DTYPE_CRYPTO: 3336 type = KF_TYPE_CRYPTO; 3337 break; 3338 3339 case DTYPE_MQUEUE: 3340 type = KF_TYPE_MQUEUE; 3341 break; 3342 3343 case DTYPE_SHM: 3344 type = KF_TYPE_SHM; 3345 data = fp; 3346 break; 3347 3348 case DTYPE_SEM: 3349 type = KF_TYPE_SEM; 3350 break; 3351 3352 case DTYPE_PTS: 3353 type = KF_TYPE_PTS; 3354 data = fp->f_data; 3355 break; 3356 3357 #ifdef PROCDESC 3358 case DTYPE_PROCDESC: 3359 type = KF_TYPE_PROCDESC; 3360 data = fp->f_data; 3361 break; 3362 #endif 3363 3364 default: 3365 type = KF_TYPE_UNKNOWN; 3366 break; 3367 } 3368 refcnt = fp->f_count; 3369 fflags = fp->f_flag; 3370 offset = foffset_get(fp); 3371 3372 /* 3373 * Create sysctl entry. 3374 * It is OK to drop the filedesc lock here as we will 3375 * re-validate and re-evaluate its properties when 3376 * the loop continues. 
3377 */ 3378 oldidx = req->oldidx; 3379 if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO) 3380 FILEDESC_SUNLOCK(fdp); 3381 error = export_fd_for_sysctl(data, type, i, fflags, refcnt, 3382 offset, fd_cap_rights, kif, req); 3383 if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO) 3384 FILEDESC_SLOCK(fdp); 3385 if (error) { 3386 if (error == ENOMEM) { 3387 /* 3388 * The hack to keep the ABI of sysctl 3389 * kern.proc.filedesc intact, but not 3390 * to account a partially copied 3391 * kinfo_file into the oldidx. 3392 */ 3393 req->oldidx = oldidx; 3394 error = 0; 3395 } 3396 break; 3397 } 3398 } 3399 FILEDESC_SUNLOCK(fdp); 3400 fail: 3401 if (fdp != NULL) 3402 fddrop(fdp); 3403 free(kif, M_TEMP); 3404 return (error); 3405 } 3406 3407 int 3408 vntype_to_kinfo(int vtype) 3409 { 3410 struct { 3411 int vtype; 3412 int kf_vtype; 3413 } vtypes_table[] = { 3414 { VBAD, KF_VTYPE_VBAD }, 3415 { VBLK, KF_VTYPE_VBLK }, 3416 { VCHR, KF_VTYPE_VCHR }, 3417 { VDIR, KF_VTYPE_VDIR }, 3418 { VFIFO, KF_VTYPE_VFIFO }, 3419 { VLNK, KF_VTYPE_VLNK }, 3420 { VNON, KF_VTYPE_VNON }, 3421 { VREG, KF_VTYPE_VREG }, 3422 { VSOCK, KF_VTYPE_VSOCK } 3423 }; 3424 #define NVTYPES (sizeof(vtypes_table) / sizeof(*vtypes_table)) 3425 unsigned int i; 3426 3427 /* 3428 * Perform vtype translation. 
3429 */ 3430 for (i = 0; i < NVTYPES; i++) 3431 if (vtypes_table[i].vtype == vtype) 3432 break; 3433 if (i < NVTYPES) 3434 return (vtypes_table[i].kf_vtype); 3435 3436 return (KF_VTYPE_UNKNOWN); 3437 } 3438 3439 static int 3440 fill_vnode_info(struct vnode *vp, struct kinfo_file *kif) 3441 { 3442 struct vattr va; 3443 char *fullpath, *freepath; 3444 int error; 3445 3446 if (vp == NULL) 3447 return (1); 3448 kif->kf_vnode_type = vntype_to_kinfo(vp->v_type); 3449 freepath = NULL; 3450 fullpath = "-"; 3451 error = vn_fullpath(curthread, vp, &fullpath, &freepath); 3452 if (error == 0) { 3453 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); 3454 } 3455 if (freepath != NULL) 3456 free(freepath, M_TEMP); 3457 3458 /* 3459 * Retrieve vnode attributes. 3460 */ 3461 va.va_fsid = VNOVAL; 3462 va.va_rdev = NODEV; 3463 vn_lock(vp, LK_SHARED | LK_RETRY); 3464 error = VOP_GETATTR(vp, &va, curthread->td_ucred); 3465 VOP_UNLOCK(vp, 0); 3466 if (error != 0) 3467 return (error); 3468 if (va.va_fsid != VNOVAL) 3469 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; 3470 else 3471 kif->kf_un.kf_file.kf_file_fsid = 3472 vp->v_mount->mnt_stat.f_fsid.val[0]; 3473 kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; 3474 kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); 3475 kif->kf_un.kf_file.kf_file_size = va.va_size; 3476 kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; 3477 return (0); 3478 } 3479 3480 static int 3481 fill_socket_info(struct socket *so, struct kinfo_file *kif) 3482 { 3483 struct sockaddr *sa; 3484 struct inpcb *inpcb; 3485 struct unpcb *unpcb; 3486 int error; 3487 3488 if (so == NULL) 3489 return (1); 3490 kif->kf_sock_domain = so->so_proto->pr_domain->dom_family; 3491 kif->kf_sock_type = so->so_type; 3492 kif->kf_sock_protocol = so->so_proto->pr_protocol; 3493 kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb; 3494 switch(kif->kf_sock_domain) { 3495 case AF_INET: 3496 case AF_INET6: 3497 if (kif->kf_sock_protocol == IPPROTO_TCP) { 3498 if 
(so->so_pcb != NULL) { 3499 inpcb = (struct inpcb *)(so->so_pcb); 3500 kif->kf_un.kf_sock.kf_sock_inpcb = 3501 (uintptr_t)inpcb->inp_ppcb; 3502 } 3503 } 3504 break; 3505 case AF_UNIX: 3506 if (so->so_pcb != NULL) { 3507 unpcb = (struct unpcb *)(so->so_pcb); 3508 if (unpcb->unp_conn) { 3509 kif->kf_un.kf_sock.kf_sock_unpconn = 3510 (uintptr_t)unpcb->unp_conn; 3511 kif->kf_un.kf_sock.kf_sock_rcv_sb_state = 3512 so->so_rcv.sb_state; 3513 kif->kf_un.kf_sock.kf_sock_snd_sb_state = 3514 so->so_snd.sb_state; 3515 } 3516 } 3517 break; 3518 } 3519 error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); 3520 if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { 3521 bcopy(sa, &kif->kf_sa_local, sa->sa_len); 3522 free(sa, M_SONAME); 3523 } 3524 error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); 3525 if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { 3526 bcopy(sa, &kif->kf_sa_peer, sa->sa_len); 3527 free(sa, M_SONAME); 3528 } 3529 strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name, 3530 sizeof(kif->kf_path)); 3531 return (0); 3532 } 3533 3534 static int 3535 fill_pts_info(struct tty *tp, struct kinfo_file *kif) 3536 { 3537 3538 if (tp == NULL) 3539 return (1); 3540 kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp); 3541 strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path)); 3542 return (0); 3543 } 3544 3545 static int 3546 fill_pipe_info(struct pipe *pi, struct kinfo_file *kif) 3547 { 3548 3549 if (pi == NULL) 3550 return (1); 3551 kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi; 3552 kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer; 3553 kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt; 3554 return (0); 3555 } 3556 3557 static int 3558 fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif) 3559 { 3560 3561 if (pdp == NULL) 3562 return (1); 3563 kif->kf_un.kf_proc.kf_pid = pdp->pd_pid; 3564 return (0); 3565 } 3566 3567 static int 3568 fill_shm_info(struct file *fp, struct kinfo_file *kif) 3569 { 3570 struct thread *td; 
3571 struct stat sb; 3572 3573 td = curthread; 3574 if (fp->f_data == NULL) 3575 return (1); 3576 if (fo_stat(fp, &sb, td->td_ucred, td) != 0) 3577 return (1); 3578 shm_path(fp->f_data, kif->kf_path, sizeof(kif->kf_path)); 3579 kif->kf_un.kf_file.kf_file_mode = sb.st_mode; 3580 kif->kf_un.kf_file.kf_file_size = sb.st_size; 3581 return (0); 3582 } 3583 3584 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD, 3585 sysctl_kern_proc_filedesc, "Process filedesc entries"); 3586 3587 #ifdef DDB 3588 /* 3589 * For the purposes of debugging, generate a human-readable string for the 3590 * file type. 3591 */ 3592 static const char * 3593 file_type_to_name(short type) 3594 { 3595 3596 switch (type) { 3597 case 0: 3598 return ("zero"); 3599 case DTYPE_VNODE: 3600 return ("vnod"); 3601 case DTYPE_SOCKET: 3602 return ("sock"); 3603 case DTYPE_PIPE: 3604 return ("pipe"); 3605 case DTYPE_FIFO: 3606 return ("fifo"); 3607 case DTYPE_KQUEUE: 3608 return ("kque"); 3609 case DTYPE_CRYPTO: 3610 return ("crpt"); 3611 case DTYPE_MQUEUE: 3612 return ("mque"); 3613 case DTYPE_SHM: 3614 return ("shm"); 3615 case DTYPE_SEM: 3616 return ("ksem"); 3617 default: 3618 return ("unkn"); 3619 } 3620 } 3621 3622 /* 3623 * For the purposes of debugging, identify a process (if any, perhaps one of 3624 * many) that references the passed file in its file descriptor array. Return 3625 * NULL if none. 
*/
static struct proc *
file_to_first_proc(struct file *fp)
{
	struct filedesc *fdp;
	struct proc *p;
	int n;

	/* The process list and descriptor tables are walked without locks. */
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_state == PRS_NEW)
			continue;
		fdp = p->p_fd;
		if (fdp == NULL)
			continue;
		for (n = 0; n < fdp->fd_nfiles; n++) {
			if (fp == fdp->fd_ofiles[n].fde_file)
				return (p);
		}
	}
	return (NULL);
}

/*
 * Print one line describing 'fp', preceded by a column header line when
 * 'header' is non-zero.  The GCFl and MCount columns are always printed
 * as 0.
 */
static void
db_print_file(struct file *fp, int header)
{
	struct proc *p;

	if (header)
		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
		    "File", "Type", "Data", "Flag", "GCFl", "Count",
		    "MCount", "Vnode", "FPID", "FCmd");
	p = file_to_first_proc(fp);
	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
	    0, fp->f_count, 0, fp->f_vnode,
	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
}

/*
 * "show file <addr>": print the file structure at the given address.
 */
DB_SHOW_COMMAND(file, db_show_file)
{
	struct file *fp;

	if (!have_addr) {
		db_printf("usage: show file <addr>\n");
		return;
	}
	fp = (struct file *)addr;
	db_print_file(fp, 1);
}

/*
 * "show files": print every open file of every process; the column header
 * is printed once, before the first file.
 */
DB_SHOW_COMMAND(files, db_show_files)
{
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int header;
	int n;

	header = 1;
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_state == PRS_NEW)
			continue;
		if ((fdp = p->p_fd) == NULL)
			continue;
		for (n = 0; n < fdp->fd_nfiles; ++n) {
			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
				continue;
			db_print_file(fp, header);
			header = 0;
		}
	}
}
#endif

SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
    &maxfilesperproc, 0, "Maximum files allowed open per process");

SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
    &maxfiles, 0, "Maximum number of files");

SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
    __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");

/*
 * Boot-time initialization: create the UMA zone that struct file is
 * allocated from and initialize the sigio and fdesc mutexes.
 */
/* ARGSUSED*/
static void
filelistinit(void *dummy)
{

	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);

/*-------------------------------------------------------------------*/

/*
 * File operations installed in badfileops.  Each one ignores its arguments
 * and returns a fixed value: EBADF, except truncate (EINVAL) and poll (0).
 */

/* Shared by fo_read and fo_write: always EBADF. */
static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EBADF);
}

/* Always EINVAL. */
static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

/* Always EBADF. */
static int
badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

/* Always 0, i.e. no events are reported. */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}

/* Always EBADF. */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EBADF);
}

/* Always EBADF. */
static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

/* Always EBADF. */
static int
badfo_close(struct file *fp, struct thread *td)
{

	return (EBADF);
}

/* Always EBADF. */
static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

/* Always EBADF. */
static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

/* Operations vector whose entries all fail as described above. */
struct fileops badfileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_truncate = badfo_truncate,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = badfo_poll,
	.fo_kqfilter = badfo_kqfilter,
	.fo_stat = badfo_stat,
	.fo_close = badfo_close,
	.fo_chmod = badfo_chmod,
	.fo_chown = badfo_chown,
};

/*
 * chmod stub that always fails with EINVAL; not static, so available to
 * fileops tables outside this file.
 */
int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

/*
 * chown stub that always fails with EINVAL; not static, so available to
 * fileops tables outside this file.
 */
int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

/*-------------------------------------------------------------------*/

/*
 * File Descriptor pseudo-device driver (/dev/fd/).
 *
 * Opening minor device N dup()s the file (if any) connected to file
 * descriptor N belonging to the calling process.  Note that this driver
 * consists of only the ``open()'' routine, because all subsequent
 * references to this file will be direct to the other driver.
 *
 * XXX: we could give this one a cloning event handler if necessary.
 */

/* ARGSUSED */
static int
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
{

	/*
	 * XXX Kludge: set curthread->td_dupfd to contain the value of
	 * the file descriptor being sought for duplication.  The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open.  Open will detect this special error and take the
	 * actions in dupfdopen().  Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
3847 */ 3848 td->td_dupfd = dev2unit(dev); 3849 return (ENODEV); 3850 } 3851 3852 static struct cdevsw fildesc_cdevsw = { 3853 .d_version = D_VERSION, 3854 .d_open = fdopen, 3855 .d_name = "FD", 3856 }; 3857 3858 static void 3859 fildesc_drvinit(void *unused) 3860 { 3861 struct cdev *dev; 3862 3863 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, 3864 UID_ROOT, GID_WHEEL, 0666, "fd/0"); 3865 make_dev_alias(dev, "stdin"); 3866 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, 3867 UID_ROOT, GID_WHEEL, 0666, "fd/1"); 3868 make_dev_alias(dev, "stdout"); 3869 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, 3870 UID_ROOT, GID_WHEEL, 0666, "fd/2"); 3871 make_dev_alias(dev, "stderr"); 3872 } 3873 3874 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); 3875