1 /*- 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
*
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>

#include <machine/stdarg.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/* Forward declarations of file-local helpers. */
static void	delmntque(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	v_init_counters(struct vnode *);
static void	v_incr_usecount(struct vnode *);
static void	v_incr_usecount_locked(struct vnode *);
static void	v_incr_devcount(struct vnode *);
static void	v_decr_devcount(struct vnode *);
static void	vgonel(struct vnode *);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static void	vfs_knl_assert_locked(void *arg);
static void	vfs_knl_assert_unlocked(void *arg);
static void	destroy_vpollinfo(struct vpollinfo *vi);

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
 * Exported read-only as vfs.numvnodes.
 */
static unsigned long	numvnodes;

SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

/* Exported read-only as vfs.vnodes_created. */
static u_long vnodes_created;
SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    0, "Number of vnodes created by getnewvnode");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * "Free" vnode target.  Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle.  Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them.  This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files.  The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive.  The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.
* In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states.  The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher.  These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
 */
static u_long wantfreevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
/* Exported read-only as vfs.freevnodes. */
static u_long freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
    &freevnodes, 0, "Number of \"free\" vnodes");

/* Exported read-only as vfs.recycles. */
static u_long recycles_count;
SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
    "Number of vnodes recycled to meet vnode cache targets");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
*/
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
    "Number of calls to reassignbuf");

static u_long free_owe_inact;
SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0,
    "Number of times free vnodes kept on active list due to VFS "
    "owing inactivation");

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx vnode_free_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone that buf_trie_alloc()/buf_trie_free() allocate pctrie nodes from. */
static uma_zone_t buf_trie_zone;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
/* Zone created in vntblinit() with items of sizeof(struct vpollinfo). */
static uma_zone_t vnodepoll_zone;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, mounted on block devices
 * are delayed only about a half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed.  Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
* Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;
static struct cv sync_wakeup;

/*
 * Requested number of syncer queues.  vntblinit() passes syncer_maxdelay to
 * hashinit() and then resets it to syncer_mask + 1.
 */
#define	SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
    "Time to delay syncing files (in seconds)");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
    "Time to delay syncing directories (in seconds)");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
    "Time to delay syncing metadata (in seconds)");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
    "Number of times I/O speeded up (rush requests)");

/*
 * When shutting down the syncer, run it at four times normal speed.
277 */ 278 #define SYNCER_SHUTDOWN_SPEEDUP 4 279 static int sync_vnode_count; 280 static int syncer_worklist_len; 281 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 282 syncer_state; 283 284 /* Target for maximum number of vnodes. */ 285 int desiredvnodes; 286 static int gapvnodes; /* gap between wanted and desired */ 287 static int vhiwat; /* enough extras after expansion */ 288 static int vlowat; /* minimal extras before expansion */ 289 static int vstir; /* nonzero to stir non-free vnodes */ 290 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 291 292 static int 293 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) 294 { 295 int error, old_desiredvnodes; 296 297 old_desiredvnodes = desiredvnodes; 298 if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) 299 return (error); 300 if (old_desiredvnodes != desiredvnodes) { 301 wantfreevnodes = desiredvnodes / 4; 302 /* XXX locking seems to be incomplete. */ 303 vfs_hash_changesize(desiredvnodes); 304 cache_changesize(desiredvnodes); 305 } 306 return (0); 307 } 308 309 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 310 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, 311 sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes"); 312 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 313 &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); 314 static int vnlru_nowhere; 315 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, 316 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 317 318 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 319 static int vnsz2log; 320 321 /* 322 * Support for the bufobj clean & dirty pctrie. 
323 */ 324 static void * 325 buf_trie_alloc(struct pctrie *ptree) 326 { 327 328 return uma_zalloc(buf_trie_zone, M_NOWAIT); 329 } 330 331 static void 332 buf_trie_free(struct pctrie *ptree, void *node) 333 { 334 335 uma_zfree(buf_trie_zone, node); 336 } 337 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); 338 339 /* 340 * Initialize the vnode management data structures. 341 * 342 * Reevaluate the following cap on the number of vnodes after the physical 343 * memory size exceeds 512GB. In the limit, as the physical memory size 344 * grows, the ratio of the memory size in KB to to vnodes approaches 64:1. 345 */ 346 #ifndef MAXVNODES_MAX 347 #define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */ 348 #endif 349 350 /* 351 * Initialize a vnode as it first enters the zone. 352 */ 353 static int 354 vnode_init(void *mem, int size, int flags) 355 { 356 struct vnode *vp; 357 struct bufobj *bo; 358 359 vp = mem; 360 bzero(vp, size); 361 /* 362 * Setup locks. 363 */ 364 vp->v_vnlock = &vp->v_lock; 365 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 366 /* 367 * By default, don't allow shared locks unless filesystems opt-in. 368 */ 369 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 370 LK_NOSHARE | LK_IS_VNODE); 371 /* 372 * Initialize bufobj. 373 */ 374 bo = &vp->v_bufobj; 375 bo->__bo_vnode = vp; 376 rw_init(BO_LOCKPTR(bo), "bufobj interlock"); 377 bo->bo_private = vp; 378 TAILQ_INIT(&bo->bo_clean.bv_hd); 379 TAILQ_INIT(&bo->bo_dirty.bv_hd); 380 /* 381 * Initialize namecache. 382 */ 383 LIST_INIT(&vp->v_cache_src); 384 TAILQ_INIT(&vp->v_cache_dst); 385 /* 386 * Initialize rangelocks. 387 */ 388 rangelock_init(&vp->v_rl); 389 return (0); 390 } 391 392 /* 393 * Free a vnode when it is cleared from the zone. 
*/
/*
 * Zone fini hook for vnode_zone: destroys the rangelock, the vnode lock,
 * the interlock and the bufobj lock that vnode_init() created.
 */
static void
vnode_fini(void *mem, int size)
{
	struct vnode *vp;
	struct bufobj *bo;

	vp = mem;
	rangelock_destroy(&vp->v_rl);
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	bo = &vp->v_bufobj;
	rw_destroy(BO_LOCKPTR(bo));
}

/*
 * Provide the size of NFS nclnode and NFS fh for calculation of the
 * vnode memory consumption.  The size is specified directly to
 * eliminate dependency on NFS-private header.
 *
 * Other filesystems may use bigger or smaller (like UFS and ZFS)
 * private inode data, but the NFS-based estimation is ample enough.
 * Still, we care about differences in the size between 64- and 32-bit
 * platforms.
 *
 * Namecache structure size is heuristically
 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
 */
#ifdef _LP64
#define	NFS_NCLNODE_SZ	(528 + 64)
#define	NC_SZ		148
#else
#define	NFS_NCLNODE_SZ	(360 + 32)
#define	NC_SZ		92
#endif

/*
 * Boot-time setup of the vnode subsystem, run through the SYSINIT below.
 * Sizes desiredvnodes and wantfreevnodes, creates the mutexes and UMA zones
 * declared in this file, preallocates buf trie nodes, sets up the syncer
 * work queues and computes vnsz2log.
 */
static void
vntblinit(void *dummy __unused)
{
	u_int i;
	int physvnodes, virtvnodes;

	/*
	 * Desiredvnodes is a function of the physical memory size and the
	 * kernel's heap size.  Generally speaking, it scales with the
	 * physical memory size.  The ratio of desiredvnodes to the physical
	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
	 * Thereafter, the
	 * marginal ratio of desiredvnodes to the physical memory size is
	 * 1:64.  However, desiredvnodes is limited by the kernel's heap
	 * size.  The memory required by desiredvnodes vnodes and vm objects
	 * must not exceed 1/10th of the kernel's heap size.
	 */
	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
	    3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
	virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
	    sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
	desiredvnodes = min(physvnodes, virtvnodes);
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %d -> %d\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	/* Same quarter ratio that sysctl_update_desiredvnodes() applies. */
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * Preallocate enough nodes to support one-per buf so that
	 * we can not fail an insert.  reassignbuf() callers can not
	 * tolerate the insertion failure.
	 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
	uma_prealloc(buf_trie_zone, nbuf);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	/* Adopt the queue count implied by the mask hashinit() stored. */
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");
	/*
	 * Count the powers of two not exceeding sizeof(struct vnode), then
	 * subtract one: vnsz2log ends up as floor(log2(sizeof(struct vnode))).
	 */
	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
		vnsz2log++;
	vnsz2log--;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);


/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Eventually, mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock, it can block the caller.
* vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
 * vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 * root fs				var fs
 * / vnode lock		A	/ vnode lock (/var)		D
 * /var vnode lock	B	/log vnode lock(/var/log)	E
 * vfs_busy lock	C	vfs_busy lock			F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *        C->A->B
 *              |
 *              +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates the process:
 *  VOP_LOOKUP() obtains B while A is held
 *  vfs_busy() obtains a shared lock on F while A and B are held
 *  vput() releases lock on B
 *  vput() releases lock on A
 *  VFS_ROOT() obtains lock on D while shared lock on F is held
 *  vfs_unbusy() releases shared lock on F
 *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *    Attempt to lock A (instead of vp_crossmp) while D is held would
 *    violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained.
 *
 * Flags handled here:
 *  MBF_NOWAIT		fail with ENOENT instead of sleeping while
 *			MNTK_UNMOUNT is set.
 *  MBF_MNTLSTLOCK	mountlist_mtx is unlocked on the success path and
 *			around each sleep; it is held again when ENOENT is
 *			returned.
 *
 * Returns 0 with a mount reference taken and mnt_lockref incremented, or
 * ENOENT with the reference released.
 */
int
vfs_busy(struct mount *mp, int flags)
{

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	MNT_ILOCK(mp);
	/* Taken up front; kept on success, released on the ENOENT path. */
	MNT_REF(mp);
	/*
	 * If mount point is currently being unmounted, sleep until the
	 * mount point fate is decided.  If thread doing the unmounting fails,
	 * it will clear MNTK_UNMOUNT flag before waking us up, indicating
	 * that this mount point has survived the unmount attempt and vfs_busy
	 * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
	 * flag in addition to MNTK_UNMOUNT, indicating that mount point is
	 * about to be really destroyed.  vfs_busy needs to release its
	 * reference on the mount point in this case and return with ENOENT,
	 * telling the caller that the mount it tried to busy is no longer
	 * valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Sleep with PDROP; mountlist_mtx (if requested) and then
		 * the mount interlock are taken again below, in that order.
		 */
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
 *
 * Undoes a successful vfs_busy(): releases the mount reference and
 * decrements mnt_lockref.  When the count reaches zero and MNTK_DRAINING
 * is set, the flag is cleared and sleepers on &mp->mnt_lockref are woken.
 */
void
vfs_unbusy(struct mount *mp)
{

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	MNT_ILOCK(mp);
	MNT_REL(mp);
	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
	mp->mnt_lockref--;
	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}

/*
 * Lookup a mount point by filesystem identifier.
*/
/*
 * Walks mountlist under mountlist_mtx comparing both words of the fsid.
 * On a match the mount is returned with a reference taken by vfs_ref();
 * NULL is returned when no mount matches.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}

/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 *
 * To avoid congestion on mountlist_mtx, implement simple direct-mapped
 * cache for popular filesystem identifiers.  The cache is lockless, using
 * the fact that struct mount's are never freed.  In worst case we may
 * get pointer to unmounted or even different filesystem, so we have to
 * check what we got, and go slow way if so.
 *
 * Returns the busied mount, or NULL if no mount matches or the matching
 * mount could not be busied.
 */
struct mount *
vfs_busyfs(fsid_t *fsid)
{
#define	FSID_CACHE_SIZE	256
	typedef struct mount * volatile vmp_t;
	static vmp_t cache[FSID_CACHE_SIZE];
	struct mount *mp;
	int error;
	uint32_t hash;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	/* Fold both fsid words into an index below FSID_CACHE_SIZE. */
	hash = fsid->val[0] ^ fsid->val[1];
	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
	mp = cache[hash];
	if (mp == NULL ||
	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
		goto slow;
	if (vfs_busy(mp, 0) != 0) {
		cache[hash] = NULL;
		goto slow;
	}
	/* Compare the fsid again now that the mount is busied. */
	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
		return (mp);
	else
	    vfs_unbusy(mp);

slow:
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			/*
			 * With MBF_MNTLSTLOCK, vfs_busy() drops mountlist_mtx
			 * itself on success; on failure it is still held and
			 * must be released here.
			 */
			error = vfs_busy(mp, MBF_MNTLSTLOCK);
			if (error) {
				cache[hash] = NULL;
				mtx_unlock(&mountlist_mtx);
				return (NULL);
			}
			cache[hash] = mp;
			return (mp);
		}
	}
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 *
 * Returns 0 when access is allowed, EPERM for the jail-related denials,
 * or the error from priv_check().
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	/*
	 * If the thread is jailed, but this is not a jail-friendly file
	 * system, deny immediately.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
		return (EPERM);

	/*
	 * If the file system was mounted outside the jail of the calling
	 * thread, deny immediately.
	 */
	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
		return (EPERM);

	/*
	 * If file system supports delegated administration, we don't check
	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
	 * by the file system itself.
	 * If this is not the user that did original mount, we check for
	 * the PRIV_VFS_MOUNT_OWNER privilege.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
720 */ 721 void 722 vfs_getnewfsid(struct mount *mp) 723 { 724 static uint16_t mntid_base; 725 struct mount *nmp; 726 fsid_t tfsid; 727 int mtype; 728 729 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 730 mtx_lock(&mntid_mtx); 731 mtype = mp->mnt_vfc->vfc_typenum; 732 tfsid.val[1] = mtype; 733 mtype = (mtype & 0xFF) << 24; 734 for (;;) { 735 tfsid.val[0] = makedev(255, 736 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 737 mntid_base++; 738 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 739 break; 740 vfs_rel(nmp); 741 } 742 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 743 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 744 mtx_unlock(&mntid_mtx); 745 } 746 747 /* 748 * Knob to control the precision of file timestamps: 749 * 750 * 0 = seconds only; nanoseconds zeroed. 751 * 1 = seconds and nanoseconds, accurate within 1/HZ. 752 * 2 = seconds and nanoseconds, truncated to microseconds. 753 * >=3 = seconds and nanoseconds, maximum precision. 754 */ 755 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 756 757 static int timestamp_precision = TSP_USEC; 758 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 759 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 760 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, " 761 "3+: sec + ns (max. precision))"); 762 763 /* 764 * Get a current timestamp. 
765 */ 766 void 767 vfs_timestamp(struct timespec *tsp) 768 { 769 struct timeval tv; 770 771 switch (timestamp_precision) { 772 case TSP_SEC: 773 tsp->tv_sec = time_second; 774 tsp->tv_nsec = 0; 775 break; 776 case TSP_HZ: 777 getnanotime(tsp); 778 break; 779 case TSP_USEC: 780 microtime(&tv); 781 TIMEVAL_TO_TIMESPEC(&tv, tsp); 782 break; 783 case TSP_NSEC: 784 default: 785 nanotime(tsp); 786 break; 787 } 788 } 789 790 /* 791 * Set vnode attributes to VNOVAL 792 */ 793 void 794 vattr_null(struct vattr *vap) 795 { 796 797 vap->va_type = VNON; 798 vap->va_size = VNOVAL; 799 vap->va_bytes = VNOVAL; 800 vap->va_mode = VNOVAL; 801 vap->va_nlink = VNOVAL; 802 vap->va_uid = VNOVAL; 803 vap->va_gid = VNOVAL; 804 vap->va_fsid = VNOVAL; 805 vap->va_fileid = VNOVAL; 806 vap->va_blocksize = VNOVAL; 807 vap->va_rdev = VNOVAL; 808 vap->va_atime.tv_sec = VNOVAL; 809 vap->va_atime.tv_nsec = VNOVAL; 810 vap->va_mtime.tv_sec = VNOVAL; 811 vap->va_mtime.tv_nsec = VNOVAL; 812 vap->va_ctime.tv_sec = VNOVAL; 813 vap->va_ctime.tv_nsec = VNOVAL; 814 vap->va_birthtime.tv_sec = VNOVAL; 815 vap->va_birthtime.tv_nsec = VNOVAL; 816 vap->va_flags = VNOVAL; 817 vap->va_gen = VNOVAL; 818 vap->va_vaflags = 0; 819 } 820 821 /* 822 * This routine is called when we have too many vnodes. It attempts 823 * to free <count> vnodes and will potentially free vnodes that still 824 * have VM backing store (VM backing store is typically the cause 825 * of a vnode blowout so we want to do this). Therefore, this operation 826 * is not considered cheap. 827 * 828 * A number of conditions may prevent a vnode from being reclaimed. 829 * the buffer cache may have references on the vnode, a directory 830 * vnode may still have references due to the namei cache representing 831 * underlying files, or the vnode may be in active use. It is not 832 * desirable to reuse such vnodes. These conditions may cause the 833 * number of vnodes to reach some minimum value regardless of what 834 * you set kern.maxvnodes to. 
* Do not set kern.maxvnodes too low.
 *
 * Parameters as used below:
 *  mp			mount whose mnt_nvnodelist is scanned.
 *  reclaim_nc_src	when zero, vnodes with a non-empty v_cache_src list
 *			are skipped.
 *  trigger		vnodes whose VM object has more than this many
 *			resident pages are skipped.
 *
 * Returns the number of vnodes passed to vgonel().
 */
static int
vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
{
	struct vnode *vp;
	int count, done, target;

	done = 0;
	vn_start_write(NULL, &mp, V_WAIT);
	MNT_ILOCK(mp);
	count = mp->mnt_nvnodelistsize;
	/*
	 * Aim for a tenth of this mount's share of gapvnodes, plus one.
	 * The multiplication is widened to 64 bits.
	 */
	target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
	target = target / 10 + 1;
	while (count != 0 && done < target) {
		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
		while (vp != NULL && vp->v_type == VMARKER)
			vp = TAILQ_NEXT(vp, v_nmntvnodes);
		if (vp == NULL)
			break;
		/*
		 * XXX LRU is completely broken for non-free vnodes.  First
		 * by calling here in mountpoint order, then by moving
		 * unselected vnodes to the end here, and most grossly by
		 * removing the vlruvp() function that was supposed to
		 * maintain the order.  (This function was born broken
		 * since syncer problems prevented it doing anything.)  The
		 * order is closer to LRC (C = Created).
		 *
		 * LRU reclaiming of vnodes seems to have last worked in
		 * FreeBSD-3 where LRU wasn't mentioned under any spelling.
		 * Then there was no hold count, and inactive vnodes were
		 * simply put on the free list in LRU order.  The separate
		 * lists also break LRU.  We prefer to reclaim from the
		 * free list for technical reasons.  This tends to thrash
		 * the free list to keep very unrecently used held vnodes.
		 * The problem is mitigated by keeping the free list large.
		 */
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		--count;
		if (!VI_TRYLOCK(vp))
			goto next_iter;
		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes.  We are trying to make space
		 * to expand the free list, not reduce it.
		 */
		if (vp->v_usecount ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    ((vp->v_iflag & VI_FREE) != 0) ||
		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		MNT_IUNLOCK(mp);
		vholdl(vp);
		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
			vdrop(vp);
			goto next_iter_mntunlocked;
		}
		VI_LOCK(vp);
		/*
		 * v_usecount may have been bumped after VOP_LOCK() dropped
		 * the vnode interlock and before it was locked again.
		 *
		 * It is not necessary to recheck VI_DOOMED because it can
		 * only be set by another thread that holds both the vnode
		 * lock and vnode interlock.  If another thread has the
		 * vnode lock before we get to VOP_LOCK() and obtains the
		 * vnode interlock after VOP_LOCK() drops the vnode
		 * interlock, the other thread will be unable to drop the
		 * vnode lock before our VOP_LOCK() call fails.
		 */
		if (vp->v_usecount ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    (vp->v_iflag & VI_FREE) != 0 ||
		    (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp, LK_INTERLOCK);
			vdrop(vp);
			goto next_iter_mntunlocked;
		}
		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
		atomic_add_long(&recycles_count, 1);
		vgonel(vp);
		VOP_UNLOCK(vp, 0);
		vdropl(vp);
		done++;
		/*
		 * Loop tails.  next_iter_mntunlocked is entered with the mount
		 * interlock already dropped and next_iter with it still held.
		 * Either way the interlock is held again at the loop test.
		 */
next_iter_mntunlocked:
		if (!should_yield())
			goto relock_mnt;
		goto yield;
next_iter:
		if (!should_yield())
			continue;
		MNT_IUNLOCK(mp);
yield:
		kern_yield(PRI_USER);
relock_mnt:
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vn_finished_write(mp);
	return done;
}

static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
    0,
    "limit on vnode free requests per call to the vnlru_free routine");

/*
 * Attempt to reduce the free list by the requested amount.
 *
 * Must be called with vnode_free_list_mtx held; the mutex is dropped and
 * retaken around each recycle attempt.  count is capped at max_vnlru_free
 * and is consumed by every vnode examined, including ones that are only
 * moved to the tail of the list.  When mnt_op is not NULL, vnodes whose
 * mount uses a different mnt_op are not recycled.
 */
static void
vnlru_free_locked(int count, struct vfsops *mnt_op)
{
	struct vnode *vp;
	struct mount *mp;

	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	if (count > max_vnlru_free)
		count = max_vnlru_free;
	for (; count > 0; count--) {
		vp = TAILQ_FIRST(&vnode_free_list);
		/*
		 * The list can be modified while the free_list_mtx
		 * has been dropped and vp could be NULL here.
		 */
		if (!vp)
			break;
		VNASSERT(vp->v_op != NULL, vp,
		    ("vnlru_free: vnode already reclaimed."));
		KASSERT((vp->v_iflag & VI_FREE) != 0,
		    ("Removing vnode not on freelist"));
		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
		    ("Mangling active vnode"));
		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);

		/*
		 * Don't recycle if our vnode is from different type
		 * of mount point.  Note that mp is type-safe, the
		 * check does not reach unmapped address even if
		 * vnode is reclaimed.
		 * Don't recycle if we can't get the interlock without
		 * blocking.
		 */
		if ((mnt_op != NULL && (mp = vp->v_mount) != NULL &&
		    mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
			continue;
		}
		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
		    vp, ("vp inconsistent on freelist"));

		/*
		 * The clear of VI_FREE prevents activation of the
		 * vnode.  There is no sense in putting the vnode on
		 * the mount point active list, only to remove it
		 * later during recycling.  Inline the relevant part
		 * of vholdl(), to avoid triggering assertions or
		 * activating.
		 */
		freevnodes--;
		vp->v_iflag &= ~VI_FREE;
		refcount_acquire(&vp->v_holdcnt);

		mtx_unlock(&vnode_free_list_mtx);
		VI_UNLOCK(vp);
		vtryrecycle(vp);
		/*
		 * If the recycled succeeded this vdrop will actually free
		 * the vnode.  If not it will simply place it back on
		 * the free list.
		 */
		vdrop(vp);
		mtx_lock(&vnode_free_list_mtx);
	}
}

/* Takes vnode_free_list_mtx around a call to vnlru_free_locked(). */
void
vnlru_free(int count, struct vfsops *mnt_op)
{

	mtx_lock(&vnode_free_list_mtx);
	vnlru_free_locked(count, mnt_op);
	mtx_unlock(&vnode_free_list_mtx);
}


/* XXX some names and initialization are bad for limits and watermarks. */
/*
 * Recomputes gapvnodes, vhiwat and vlowat as a side effect, then returns
 * 0 when numvnodes exceeds desiredvnodes.  Otherwise it returns
 * desiredvnodes - numvnodes, plus the number of free vnodes above
 * wantfreevnodes if there are any.
 */
static int
vspace(void)
{
	int space;

	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
	vlowat = vhiwat / 2;
	if (numvnodes > desiredvnodes)
		return (0);
	space = desiredvnodes - numvnodes;
	if (freevnodes > wantfreevnodes)
		space += freevnodes - wantfreevnodes;
	return (space);
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrurecycle() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;

static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	unsigned long ofreevnodes, onumvnodes;
	int done, force, reclaim_nc_src, trigger, usevnodes;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
	    SHUTDOWN_PRI_FIRST);

	force = 0;
	for (;;) {
		kproc_suspend_check(vnlruproc);
		mtx_lock(&vnode_free_list_mtx);
		/*
		 * If numvnodes is too large (due to desiredvnodes being
		 * adjusted using its sysctl, or emergency growth), first
		 * try to reduce it by discarding from the free list.
1071 */ 1072 if (numvnodes > desiredvnodes && freevnodes > 0) 1073 vnlru_free_locked(ulmin(numvnodes - desiredvnodes, 1074 freevnodes), NULL); 1075 /* 1076 * Sleep if the vnode cache is in a good state. This is 1077 * when it is not over-full and has space for about a 4% 1078 * or 9% expansion (by growing its size or inexcessively 1079 * reducing its free list). Otherwise, try to reclaim 1080 * space for a 10% expansion. 1081 */ 1082 if (vstir && force == 0) { 1083 force = 1; 1084 vstir = 0; 1085 } 1086 if (vspace() >= vlowat && force == 0) { 1087 vnlruproc_sig = 0; 1088 wakeup(&vnlruproc_sig); 1089 msleep(vnlruproc, &vnode_free_list_mtx, 1090 PVFS|PDROP, "vlruwt", hz); 1091 continue; 1092 } 1093 mtx_unlock(&vnode_free_list_mtx); 1094 done = 0; 1095 ofreevnodes = freevnodes; 1096 onumvnodes = numvnodes; 1097 /* 1098 * Calculate parameters for recycling. These are the same 1099 * throughout the loop to give some semblance of fairness. 1100 * The trigger point is to avoid recycling vnodes with lots 1101 * of resident pages. We aren't trying to free memory; we 1102 * are trying to recycle or at least free vnodes. 1103 */ 1104 if (numvnodes <= desiredvnodes) 1105 usevnodes = numvnodes - freevnodes; 1106 else 1107 usevnodes = numvnodes; 1108 if (usevnodes <= 0) 1109 usevnodes = 1; 1110 /* 1111 * The trigger value is is chosen to give a conservatively 1112 * large value to ensure that it alone doesn't prevent 1113 * making progress. The value can easily be so large that 1114 * it is effectively infinite in some congested and 1115 * misconfigured cases, and this is necessary. Normally 1116 * it is about 8 to 100 (pages), which is quite large. 
1117 */ 1118 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1119 if (force < 2) 1120 trigger = vsmalltrigger; 1121 reclaim_nc_src = force >= 3; 1122 mtx_lock(&mountlist_mtx); 1123 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 1124 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { 1125 nmp = TAILQ_NEXT(mp, mnt_list); 1126 continue; 1127 } 1128 done += vlrureclaim(mp, reclaim_nc_src, trigger); 1129 mtx_lock(&mountlist_mtx); 1130 nmp = TAILQ_NEXT(mp, mnt_list); 1131 vfs_unbusy(mp); 1132 } 1133 mtx_unlock(&mountlist_mtx); 1134 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) 1135 uma_reclaim(); 1136 if (done == 0) { 1137 if (force == 0 || force == 1) { 1138 force = 2; 1139 continue; 1140 } 1141 if (force == 2) { 1142 force = 3; 1143 continue; 1144 } 1145 force = 0; 1146 vnlru_nowhere++; 1147 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1148 } else 1149 kern_yield(PRI_USER); 1150 /* 1151 * After becoming active to expand above low water, keep 1152 * active until above high water. 1153 */ 1154 force = vspace() < vhiwat; 1155 } 1156 } 1157 1158 static struct kproc_desc vnlru_kp = { 1159 "vnlru", 1160 vnlru_proc, 1161 &vnlruproc 1162 }; 1163 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1164 &vnlru_kp); 1165 1166 /* 1167 * Routines having to do with the management of the vnode table. 1168 */ 1169 1170 /* 1171 * Try to recycle a freed vnode. We abort if anyone picks up a reference 1172 * before we actually vgone(). This function must be called with the vnode 1173 * held to prevent the vnode from being returned to the free list midway 1174 * through vgone(). 1175 */ 1176 static int 1177 vtryrecycle(struct vnode *vp) 1178 { 1179 struct mount *vnmp; 1180 1181 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1182 VNASSERT(vp->v_holdcnt, vp, 1183 ("vtryrecycle: Recycling vp %p without a reference.", vp)); 1184 /* 1185 * This vnode may found and locked via some other list, if so we 1186 * can't recycle it yet. 
1187 */ 1188 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1189 CTR2(KTR_VFS, 1190 "%s: impossible to recycle, vp %p lock is already held", 1191 __func__, vp); 1192 return (EWOULDBLOCK); 1193 } 1194 /* 1195 * Don't recycle if its filesystem is being suspended. 1196 */ 1197 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1198 VOP_UNLOCK(vp, 0); 1199 CTR2(KTR_VFS, 1200 "%s: impossible to recycle, cannot start the write for %p", 1201 __func__, vp); 1202 return (EBUSY); 1203 } 1204 /* 1205 * If we got this far, we need to acquire the interlock and see if 1206 * anyone picked up this vnode from another list. If not, we will 1207 * mark it with DOOMED via vgonel() so that anyone who does find it 1208 * will skip over it. 1209 */ 1210 VI_LOCK(vp); 1211 if (vp->v_usecount) { 1212 VOP_UNLOCK(vp, LK_INTERLOCK); 1213 vn_finished_write(vnmp); 1214 CTR2(KTR_VFS, 1215 "%s: impossible to recycle, %p is already referenced", 1216 __func__, vp); 1217 return (EBUSY); 1218 } 1219 if ((vp->v_iflag & VI_DOOMED) == 0) { 1220 atomic_add_long(&recycles_count, 1); 1221 vgonel(vp); 1222 } 1223 VOP_UNLOCK(vp, LK_INTERLOCK); 1224 vn_finished_write(vnmp); 1225 return (0); 1226 } 1227 1228 static void 1229 vcheckspace(void) 1230 { 1231 1232 if (vspace() < vlowat && vnlruproc_sig == 0) { 1233 vnlruproc_sig = 1; 1234 wakeup(vnlruproc); 1235 } 1236 } 1237 1238 /* 1239 * Wait if necessary for space for a new vnode. 1240 */ 1241 static int 1242 getnewvnode_wait(int suspended) 1243 { 1244 1245 mtx_assert(&vnode_free_list_mtx, MA_OWNED); 1246 if (numvnodes >= desiredvnodes) { 1247 if (suspended) { 1248 /* 1249 * The file system is being suspended. We cannot 1250 * risk a deadlock here, so allow allocation of 1251 * another vnode even if this would give too many. 
1252 */ 1253 return (0); 1254 } 1255 if (vnlruproc_sig == 0) { 1256 vnlruproc_sig = 1; /* avoid unnecessary wakeups */ 1257 wakeup(vnlruproc); 1258 } 1259 msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, 1260 "vlruwk", hz); 1261 } 1262 /* Post-adjust like the pre-adjust in getnewvnode(). */ 1263 if (numvnodes + 1 > desiredvnodes && freevnodes > 1) 1264 vnlru_free_locked(1, NULL); 1265 return (numvnodes >= desiredvnodes ? ENFILE : 0); 1266 } 1267 1268 /* 1269 * This hack is fragile, and probably not needed any more now that the 1270 * watermark handling works. 1271 */ 1272 void 1273 getnewvnode_reserve(u_int count) 1274 { 1275 struct thread *td; 1276 1277 /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */ 1278 /* XXX no longer so quick, but this part is not racy. */ 1279 mtx_lock(&vnode_free_list_mtx); 1280 if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes) 1281 vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes, 1282 freevnodes - wantfreevnodes), NULL); 1283 mtx_unlock(&vnode_free_list_mtx); 1284 1285 td = curthread; 1286 /* First try to be quick and racy. */ 1287 if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { 1288 td->td_vp_reserv += count; 1289 vcheckspace(); /* XXX no longer so quick, but more racy */ 1290 return; 1291 } else 1292 atomic_subtract_long(&numvnodes, count); 1293 1294 mtx_lock(&vnode_free_list_mtx); 1295 while (count > 0) { 1296 if (getnewvnode_wait(0) == 0) { 1297 count--; 1298 td->td_vp_reserv++; 1299 atomic_add_long(&numvnodes, 1); 1300 } 1301 } 1302 vcheckspace(); 1303 mtx_unlock(&vnode_free_list_mtx); 1304 } 1305 1306 /* 1307 * This hack is fragile, especially if desiredvnodes or wantvnodes are 1308 * misconfgured or changed significantly. 
Reducing desiredvnodes below 1309 * the reserved amount should cause bizarre behaviour like reducing it 1310 * below the number of active vnodes -- the system will try to reduce 1311 * numvnodes to match, but should fail, so the subtraction below should 1312 * not overflow. 1313 */ 1314 void 1315 getnewvnode_drop_reserve(void) 1316 { 1317 struct thread *td; 1318 1319 td = curthread; 1320 atomic_subtract_long(&numvnodes, td->td_vp_reserv); 1321 td->td_vp_reserv = 0; 1322 } 1323 1324 /* 1325 * Return the next vnode from the free list. 1326 */ 1327 int 1328 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 1329 struct vnode **vpp) 1330 { 1331 struct vnode *vp; 1332 struct thread *td; 1333 struct lock_object *lo; 1334 static int cyclecount; 1335 int error; 1336 1337 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 1338 vp = NULL; 1339 td = curthread; 1340 if (td->td_vp_reserv > 0) { 1341 td->td_vp_reserv -= 1; 1342 goto alloc; 1343 } 1344 mtx_lock(&vnode_free_list_mtx); 1345 if (numvnodes < desiredvnodes) 1346 cyclecount = 0; 1347 else if (cyclecount++ >= freevnodes) { 1348 cyclecount = 0; 1349 vstir = 1; 1350 } 1351 /* 1352 * Grow the vnode cache if it will not be above its target max 1353 * after growing. Otherwise, if the free list is nonempty, try 1354 * to reclaim 1 item from it before growing the cache (possibly 1355 * above its target max if the reclamation failed or is delayed). 1356 * Otherwise, wait for some space. In all cases, schedule 1357 * vnlru_proc() if we are getting short of space. The watermarks 1358 * should be chosen so that we never wait or even reclaim from 1359 * the free list to below its target minimum. 1360 */ 1361 if (numvnodes + 1 <= desiredvnodes) 1362 ; 1363 else if (freevnodes > 0) 1364 vnlru_free_locked(1, NULL); 1365 else { 1366 error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & 1367 MNTK_SUSPEND)); 1368 #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. 
*/ 1369 if (error != 0) { 1370 mtx_unlock(&vnode_free_list_mtx); 1371 return (error); 1372 } 1373 #endif 1374 } 1375 vcheckspace(); 1376 atomic_add_long(&numvnodes, 1); 1377 mtx_unlock(&vnode_free_list_mtx); 1378 alloc: 1379 atomic_add_long(&vnodes_created, 1); 1380 vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK); 1381 /* 1382 * Locks are given the generic name "vnode" when created. 1383 * Follow the historic practice of using the filesystem 1384 * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. 1385 * 1386 * Locks live in a witness group keyed on their name. Thus, 1387 * when a lock is renamed, it must also move from the witness 1388 * group of its old name to the witness group of its new name. 1389 * 1390 * The change only needs to be made when the vnode moves 1391 * from one filesystem type to another. We ensure that each 1392 * filesystem use a single static name pointer for its tag so 1393 * that we can compare pointers rather than doing a strcmp(). 1394 */ 1395 lo = &vp->v_vnlock->lock_object; 1396 if (lo->lo_name != tag) { 1397 lo->lo_name = tag; 1398 WITNESS_DESTROY(lo); 1399 WITNESS_INIT(lo, tag); 1400 } 1401 /* 1402 * By default, don't allow shared locks unless filesystems opt-in. 1403 */ 1404 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 1405 /* 1406 * Finalize various vnode identity bits. 
1407 */ 1408 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 1409 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 1410 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 1411 vp->v_type = VNON; 1412 vp->v_tag = tag; 1413 vp->v_op = vops; 1414 v_init_counters(vp); 1415 vp->v_bufobj.bo_ops = &buf_ops_bio; 1416 #ifdef MAC 1417 mac_vnode_init(vp); 1418 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 1419 mac_vnode_associate_singlelabel(mp, vp); 1420 else if (mp == NULL && vops != &dead_vnodeops) 1421 printf("NULL mp in getnewvnode()\n"); 1422 #endif 1423 if (mp != NULL) { 1424 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 1425 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) 1426 vp->v_vflag |= VV_NOKNOTE; 1427 } 1428 1429 /* 1430 * For the filesystems which do not use vfs_hash_insert(), 1431 * still initialize v_hash to have vfs_hash_index() useful. 1432 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 1433 * its own hashing. 1434 */ 1435 vp->v_hash = (uintptr_t)vp >> vnsz2log; 1436 1437 *vpp = vp; 1438 return (0); 1439 } 1440 1441 /* 1442 * Delete from old mount point vnode list, if on one. 
1443 */ 1444 static void 1445 delmntque(struct vnode *vp) 1446 { 1447 struct mount *mp; 1448 int active; 1449 1450 mp = vp->v_mount; 1451 if (mp == NULL) 1452 return; 1453 MNT_ILOCK(mp); 1454 VI_LOCK(vp); 1455 KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, 1456 ("Active vnode list size %d > Vnode list size %d", 1457 mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); 1458 active = vp->v_iflag & VI_ACTIVE; 1459 vp->v_iflag &= ~VI_ACTIVE; 1460 if (active) { 1461 mtx_lock(&vnode_free_list_mtx); 1462 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); 1463 mp->mnt_activevnodelistsize--; 1464 mtx_unlock(&vnode_free_list_mtx); 1465 } 1466 vp->v_mount = NULL; 1467 VI_UNLOCK(vp); 1468 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1469 ("bad mount point vnode list size")); 1470 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1471 mp->mnt_nvnodelistsize--; 1472 MNT_REL(mp); 1473 MNT_IUNLOCK(mp); 1474 } 1475 1476 static void 1477 insmntque_stddtr(struct vnode *vp, void *dtr_arg) 1478 { 1479 1480 vp->v_data = NULL; 1481 vp->v_op = &dead_vnodeops; 1482 vgone(vp); 1483 vput(vp); 1484 } 1485 1486 /* 1487 * Insert into list of vnodes for the new mount point, if available. 1488 */ 1489 int 1490 insmntque1(struct vnode *vp, struct mount *mp, 1491 void (*dtr)(struct vnode *, void *), void *dtr_arg) 1492 { 1493 1494 KASSERT(vp->v_mount == NULL, 1495 ("insmntque: vnode already on per mount vnode list")); 1496 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1497 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 1498 1499 /* 1500 * We acquire the vnode interlock early to ensure that the 1501 * vnode cannot be recycled by another process releasing a 1502 * holdcnt on it before we get it on both the vnode list 1503 * and the active vnode list. The mount mutex protects only 1504 * manipulation of the vnode list and the vnode freelist 1505 * mutex protects only manipulation of the active vnode list. 
1506 * Hence the need to hold the vnode interlock throughout. 1507 */ 1508 MNT_ILOCK(mp); 1509 VI_LOCK(vp); 1510 if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && 1511 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 1512 mp->mnt_nvnodelistsize == 0)) && 1513 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 1514 VI_UNLOCK(vp); 1515 MNT_IUNLOCK(mp); 1516 if (dtr != NULL) 1517 dtr(vp, dtr_arg); 1518 return (EBUSY); 1519 } 1520 vp->v_mount = mp; 1521 MNT_REF(mp); 1522 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1523 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 1524 ("neg mount point vnode list size")); 1525 mp->mnt_nvnodelistsize++; 1526 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 1527 ("Activating already active vnode")); 1528 vp->v_iflag |= VI_ACTIVE; 1529 mtx_lock(&vnode_free_list_mtx); 1530 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 1531 mp->mnt_activevnodelistsize++; 1532 mtx_unlock(&vnode_free_list_mtx); 1533 VI_UNLOCK(vp); 1534 MNT_IUNLOCK(mp); 1535 return (0); 1536 } 1537 1538 int 1539 insmntque(struct vnode *vp, struct mount *mp) 1540 { 1541 1542 return (insmntque1(vp, mp, insmntque_stddtr, NULL)); 1543 } 1544 1545 /* 1546 * Flush out and invalidate all buffers associated with a bufobj 1547 * Called with the underlying object locked. 
1548 */ 1549 int 1550 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 1551 { 1552 int error; 1553 1554 BO_LOCK(bo); 1555 if (flags & V_SAVE) { 1556 error = bufobj_wwait(bo, slpflag, slptimeo); 1557 if (error) { 1558 BO_UNLOCK(bo); 1559 return (error); 1560 } 1561 if (bo->bo_dirty.bv_cnt > 0) { 1562 BO_UNLOCK(bo); 1563 if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) 1564 return (error); 1565 /* 1566 * XXX We could save a lock/unlock if this was only 1567 * enabled under INVARIANTS 1568 */ 1569 BO_LOCK(bo); 1570 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 1571 panic("vinvalbuf: dirty bufs"); 1572 } 1573 } 1574 /* 1575 * If you alter this loop please notice that interlock is dropped and 1576 * reacquired in flushbuflist. Special care is needed to ensure that 1577 * no race conditions occur from this. 1578 */ 1579 do { 1580 error = flushbuflist(&bo->bo_clean, 1581 flags, bo, slpflag, slptimeo); 1582 if (error == 0 && !(flags & V_CLEANONLY)) 1583 error = flushbuflist(&bo->bo_dirty, 1584 flags, bo, slpflag, slptimeo); 1585 if (error != 0 && error != EAGAIN) { 1586 BO_UNLOCK(bo); 1587 return (error); 1588 } 1589 } while (error != 0); 1590 1591 /* 1592 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 1593 * have write I/O in-progress but if there is a VM object then the 1594 * VM object can also have read-I/O in-progress. 1595 */ 1596 do { 1597 bufobj_wwait(bo, 0, 0); 1598 BO_UNLOCK(bo); 1599 if (bo->bo_object != NULL) { 1600 VM_OBJECT_WLOCK(bo->bo_object); 1601 vm_object_pip_wait(bo->bo_object, "bovlbx"); 1602 VM_OBJECT_WUNLOCK(bo->bo_object); 1603 } 1604 BO_LOCK(bo); 1605 } while (bo->bo_numoutput > 0); 1606 BO_UNLOCK(bo); 1607 1608 /* 1609 * Destroy the copy in the VM cache, too. 1610 */ 1611 if (bo->bo_object != NULL && 1612 (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) { 1613 VM_OBJECT_WLOCK(bo->bo_object); 1614 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 
1615 OBJPR_CLEANONLY : 0); 1616 VM_OBJECT_WUNLOCK(bo->bo_object); 1617 } 1618 1619 #ifdef INVARIANTS 1620 BO_LOCK(bo); 1621 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 && 1622 (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) 1623 panic("vinvalbuf: flush failed"); 1624 BO_UNLOCK(bo); 1625 #endif 1626 return (0); 1627 } 1628 1629 /* 1630 * Flush out and invalidate all buffers associated with a vnode. 1631 * Called with the underlying object locked. 1632 */ 1633 int 1634 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 1635 { 1636 1637 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 1638 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 1639 if (vp->v_object != NULL && vp->v_object->handle != vp) 1640 return (0); 1641 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 1642 } 1643 1644 /* 1645 * Flush out buffers on the specified list. 1646 * 1647 */ 1648 static int 1649 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 1650 int slptimeo) 1651 { 1652 struct buf *bp, *nbp; 1653 int retval, error; 1654 daddr_t lblkno; 1655 b_xflags_t xflags; 1656 1657 ASSERT_BO_WLOCKED(bo); 1658 1659 retval = 0; 1660 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 1661 if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || 1662 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { 1663 continue; 1664 } 1665 lblkno = 0; 1666 xflags = 0; 1667 if (nbp != NULL) { 1668 lblkno = nbp->b_lblkno; 1669 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 1670 } 1671 retval = EAGAIN; 1672 error = BUF_TIMELOCK(bp, 1673 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 1674 "flushbuf", slpflag, slptimeo); 1675 if (error) { 1676 BO_LOCK(bo); 1677 return (error != ENOLCK ? 
error : EAGAIN); 1678 } 1679 KASSERT(bp->b_bufobj == bo, 1680 ("bp %p wrong b_bufobj %p should be %p", 1681 bp, bp->b_bufobj, bo)); 1682 /* 1683 * XXX Since there are no node locks for NFS, I 1684 * believe there is a slight chance that a delayed 1685 * write will occur while sleeping just above, so 1686 * check for it. 1687 */ 1688 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 1689 (flags & V_SAVE)) { 1690 bremfree(bp); 1691 bp->b_flags |= B_ASYNC; 1692 bwrite(bp); 1693 BO_LOCK(bo); 1694 return (EAGAIN); /* XXX: why not loop ? */ 1695 } 1696 bremfree(bp); 1697 bp->b_flags |= (B_INVAL | B_RELBUF); 1698 bp->b_flags &= ~B_ASYNC; 1699 brelse(bp); 1700 BO_LOCK(bo); 1701 nbp = gbincore(bo, lblkno); 1702 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 1703 != xflags) 1704 break; /* nbp invalid */ 1705 } 1706 return (retval); 1707 } 1708 1709 int 1710 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 1711 { 1712 struct buf *bp; 1713 int error; 1714 daddr_t lblkno; 1715 1716 ASSERT_BO_LOCKED(bo); 1717 1718 for (lblkno = startn;;) { 1719 again: 1720 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 1721 if (bp == NULL || bp->b_lblkno >= endn || 1722 bp->b_lblkno < startn) 1723 break; 1724 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 1725 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 1726 if (error != 0) { 1727 BO_RLOCK(bo); 1728 if (error == ENOLCK) 1729 goto again; 1730 return (error); 1731 } 1732 KASSERT(bp->b_bufobj == bo, 1733 ("bp %p wrong b_bufobj %p should be %p", 1734 bp, bp->b_bufobj, bo)); 1735 lblkno = bp->b_lblkno + 1; 1736 if ((bp->b_flags & B_MANAGED) == 0) 1737 bremfree(bp); 1738 bp->b_flags |= B_RELBUF; 1739 /* 1740 * In the VMIO case, use the B_NOREUSE flag to hint that the 1741 * pages backing each buffer in the range are unlikely to be 1742 * reused. Dirty buffers will have the hint applied once 1743 * they've been written. 
1744 */ 1745 if (bp->b_vp->v_object != NULL) 1746 bp->b_flags |= B_NOREUSE; 1747 brelse(bp); 1748 BO_RLOCK(bo); 1749 } 1750 return (0); 1751 } 1752 1753 /* 1754 * Truncate a file's buffer and pages to a specified length. This 1755 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 1756 * sync activity. 1757 */ 1758 int 1759 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize) 1760 { 1761 struct buf *bp, *nbp; 1762 int anyfreed; 1763 int trunclbn; 1764 struct bufobj *bo; 1765 1766 CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__, 1767 vp, cred, blksize, (uintmax_t)length); 1768 1769 /* 1770 * Round up to the *next* lbn. 1771 */ 1772 trunclbn = howmany(length, blksize); 1773 1774 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 1775 restart: 1776 bo = &vp->v_bufobj; 1777 BO_LOCK(bo); 1778 anyfreed = 1; 1779 for (;anyfreed;) { 1780 anyfreed = 0; 1781 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 1782 if (bp->b_lblkno < trunclbn) 1783 continue; 1784 if (BUF_LOCK(bp, 1785 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1786 BO_LOCKPTR(bo)) == ENOLCK) 1787 goto restart; 1788 1789 bremfree(bp); 1790 bp->b_flags |= (B_INVAL | B_RELBUF); 1791 bp->b_flags &= ~B_ASYNC; 1792 brelse(bp); 1793 anyfreed = 1; 1794 1795 BO_LOCK(bo); 1796 if (nbp != NULL && 1797 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 1798 (nbp->b_vp != vp) || 1799 (nbp->b_flags & B_DELWRI))) { 1800 BO_UNLOCK(bo); 1801 goto restart; 1802 } 1803 } 1804 1805 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1806 if (bp->b_lblkno < trunclbn) 1807 continue; 1808 if (BUF_LOCK(bp, 1809 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1810 BO_LOCKPTR(bo)) == ENOLCK) 1811 goto restart; 1812 bremfree(bp); 1813 bp->b_flags |= (B_INVAL | B_RELBUF); 1814 bp->b_flags &= ~B_ASYNC; 1815 brelse(bp); 1816 anyfreed = 1; 1817 1818 BO_LOCK(bo); 1819 if (nbp != NULL && 1820 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 1821 (nbp->b_vp != vp) || 1822 (nbp->b_flags & B_DELWRI) == 0)) 
{ 1823 BO_UNLOCK(bo); 1824 goto restart; 1825 } 1826 } 1827 } 1828 1829 if (length > 0) { 1830 restartsync: 1831 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1832 if (bp->b_lblkno > 0) 1833 continue; 1834 /* 1835 * Since we hold the vnode lock this should only 1836 * fail if we're racing with the buf daemon. 1837 */ 1838 if (BUF_LOCK(bp, 1839 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1840 BO_LOCKPTR(bo)) == ENOLCK) { 1841 goto restart; 1842 } 1843 VNASSERT((bp->b_flags & B_DELWRI), vp, 1844 ("buf(%p) on dirty queue without DELWRI", bp)); 1845 1846 bremfree(bp); 1847 bawrite(bp); 1848 BO_LOCK(bo); 1849 goto restartsync; 1850 } 1851 } 1852 1853 bufobj_wwait(bo, 0, 0); 1854 BO_UNLOCK(bo); 1855 vnode_pager_setsize(vp, length); 1856 1857 return (0); 1858 } 1859 1860 static void 1861 buf_vlist_remove(struct buf *bp) 1862 { 1863 struct bufv *bv; 1864 1865 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 1866 ASSERT_BO_WLOCKED(bp->b_bufobj); 1867 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != 1868 (BX_VNDIRTY|BX_VNCLEAN), 1869 ("buf_vlist_remove: Buf %p is on two lists", bp)); 1870 if (bp->b_xflags & BX_VNDIRTY) 1871 bv = &bp->b_bufobj->bo_dirty; 1872 else 1873 bv = &bp->b_bufobj->bo_clean; 1874 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 1875 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 1876 bv->bv_cnt--; 1877 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1878 } 1879 1880 /* 1881 * Add the buffer to the sorted clean or dirty block list. 1882 * 1883 * NOTE: xflags is passed as a constant, optimizing this inline function! 
1884 */ 1885 static void 1886 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 1887 { 1888 struct bufv *bv; 1889 struct buf *n; 1890 int error; 1891 1892 ASSERT_BO_WLOCKED(bo); 1893 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 1894 ("dead bo %p", bo)); 1895 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 1896 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 1897 bp->b_xflags |= xflags; 1898 if (xflags & BX_VNDIRTY) 1899 bv = &bo->bo_dirty; 1900 else 1901 bv = &bo->bo_clean; 1902 1903 /* 1904 * Keep the list ordered. Optimize empty list insertion. Assume 1905 * we tend to grow at the tail so lookup_le should usually be cheaper 1906 * than _ge. 1907 */ 1908 if (bv->bv_cnt == 0 || 1909 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 1910 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 1911 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 1912 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 1913 else 1914 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 1915 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 1916 if (error) 1917 panic("buf_vlist_add: Preallocated nodes insufficient."); 1918 bv->bv_cnt++; 1919 } 1920 1921 /* 1922 * Look up a buffer using the buffer tries. 1923 */ 1924 struct buf * 1925 gbincore(struct bufobj *bo, daddr_t lblkno) 1926 { 1927 struct buf *bp; 1928 1929 ASSERT_BO_LOCKED(bo); 1930 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 1931 if (bp != NULL) 1932 return (bp); 1933 return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); 1934 } 1935 1936 /* 1937 * Associate a buffer with a vnode. 
1938 */ 1939 void 1940 bgetvp(struct vnode *vp, struct buf *bp) 1941 { 1942 struct bufobj *bo; 1943 1944 bo = &vp->v_bufobj; 1945 ASSERT_BO_WLOCKED(bo); 1946 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 1947 1948 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 1949 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 1950 ("bgetvp: bp already attached! %p", bp)); 1951 1952 vhold(vp); 1953 bp->b_vp = vp; 1954 bp->b_bufobj = bo; 1955 /* 1956 * Insert onto list for new vnode. 1957 */ 1958 buf_vlist_add(bp, bo, BX_VNCLEAN); 1959 } 1960 1961 /* 1962 * Disassociate a buffer from a vnode. 1963 */ 1964 void 1965 brelvp(struct buf *bp) 1966 { 1967 struct bufobj *bo; 1968 struct vnode *vp; 1969 1970 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1971 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 1972 1973 /* 1974 * Delete from old vnode list, if on one. 1975 */ 1976 vp = bp->b_vp; /* XXX */ 1977 bo = bp->b_bufobj; 1978 BO_LOCK(bo); 1979 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 1980 buf_vlist_remove(bp); 1981 else 1982 panic("brelvp: Buffer %p not on queue.", bp); 1983 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 1984 bo->bo_flag &= ~BO_ONWORKLST; 1985 mtx_lock(&sync_mtx); 1986 LIST_REMOVE(bo, bo_synclist); 1987 syncer_worklist_len--; 1988 mtx_unlock(&sync_mtx); 1989 } 1990 bp->b_vp = NULL; 1991 bp->b_bufobj = NULL; 1992 BO_UNLOCK(bo); 1993 vdrop(vp); 1994 } 1995 1996 /* 1997 * Add an item to the syncer work queue. 
1998 */ 1999 static void 2000 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2001 { 2002 int slot; 2003 2004 ASSERT_BO_WLOCKED(bo); 2005 2006 mtx_lock(&sync_mtx); 2007 if (bo->bo_flag & BO_ONWORKLST) 2008 LIST_REMOVE(bo, bo_synclist); 2009 else { 2010 bo->bo_flag |= BO_ONWORKLST; 2011 syncer_worklist_len++; 2012 } 2013 2014 if (delay > syncer_maxdelay - 2) 2015 delay = syncer_maxdelay - 2; 2016 slot = (syncer_delayno + delay) & syncer_mask; 2017 2018 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2019 mtx_unlock(&sync_mtx); 2020 } 2021 2022 static int 2023 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2024 { 2025 int error, len; 2026 2027 mtx_lock(&sync_mtx); 2028 len = syncer_worklist_len - sync_vnode_count; 2029 mtx_unlock(&sync_mtx); 2030 error = SYSCTL_OUT(req, &len, sizeof(len)); 2031 return (error); 2032 } 2033 2034 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, 2035 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2036 2037 static struct proc *updateproc; 2038 static void sched_sync(void); 2039 static struct kproc_desc up_kp = { 2040 "syncer", 2041 sched_sync, 2042 &updateproc 2043 }; 2044 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2045 2046 static int 2047 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2048 { 2049 struct vnode *vp; 2050 struct mount *mp; 2051 2052 *bo = LIST_FIRST(slp); 2053 if (*bo == NULL) 2054 return (0); 2055 vp = (*bo)->__bo_vnode; /* XXX */ 2056 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2057 return (1); 2058 /* 2059 * We use vhold in case the vnode does not 2060 * successfully sync. vhold prevents the vnode from 2061 * going away when we unlock the sync_mtx so that 2062 * we can acquire the vnode interlock. 
2063 */ 2064 vholdl(vp); 2065 mtx_unlock(&sync_mtx); 2066 VI_UNLOCK(vp); 2067 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2068 vdrop(vp); 2069 mtx_lock(&sync_mtx); 2070 return (*bo == LIST_FIRST(slp)); 2071 } 2072 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2073 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2074 VOP_UNLOCK(vp, 0); 2075 vn_finished_write(mp); 2076 BO_LOCK(*bo); 2077 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2078 /* 2079 * Put us back on the worklist. The worklist 2080 * routine will remove us from our current 2081 * position and then add us back in at a later 2082 * position. 2083 */ 2084 vn_syncer_add_to_worklist(*bo, syncdelay); 2085 } 2086 BO_UNLOCK(*bo); 2087 vdrop(vp); 2088 mtx_lock(&sync_mtx); 2089 return (0); 2090 } 2091 2092 static int first_printf = 1; 2093 2094 /* 2095 * System filesystem synchronizer daemon. 2096 */ 2097 static void 2098 sched_sync(void) 2099 { 2100 struct synclist *next, *slp; 2101 struct bufobj *bo; 2102 long starttime; 2103 struct thread *td = curthread; 2104 int last_work_seen; 2105 int net_worklist_len; 2106 int syncer_final_iter; 2107 int error; 2108 2109 last_work_seen = 0; 2110 syncer_final_iter = 0; 2111 syncer_state = SYNCER_RUNNING; 2112 starttime = time_uptime; 2113 td->td_pflags |= TDP_NORUNNINGBUF; 2114 2115 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2116 SHUTDOWN_PRI_LAST); 2117 2118 mtx_lock(&sync_mtx); 2119 for (;;) { 2120 if (syncer_state == SYNCER_FINAL_DELAY && 2121 syncer_final_iter == 0) { 2122 mtx_unlock(&sync_mtx); 2123 kproc_suspend_check(td->td_proc); 2124 mtx_lock(&sync_mtx); 2125 } 2126 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2127 if (syncer_state != SYNCER_RUNNING && 2128 starttime != time_uptime) { 2129 if (first_printf) { 2130 printf("\nSyncing disks, vnodes remaining... "); 2131 first_printf = 0; 2132 } 2133 printf("%d ", net_worklist_len); 2134 } 2135 starttime = time_uptime; 2136 2137 /* 2138 * Push files whose dirty time has expired. 
Be careful 2139 * of interrupt race on slp queue. 2140 * 2141 * Skip over empty worklist slots when shutting down. 2142 */ 2143 do { 2144 slp = &syncer_workitem_pending[syncer_delayno]; 2145 syncer_delayno += 1; 2146 if (syncer_delayno == syncer_maxdelay) 2147 syncer_delayno = 0; 2148 next = &syncer_workitem_pending[syncer_delayno]; 2149 /* 2150 * If the worklist has wrapped since the 2151 * it was emptied of all but syncer vnodes, 2152 * switch to the FINAL_DELAY state and run 2153 * for one more second. 2154 */ 2155 if (syncer_state == SYNCER_SHUTTING_DOWN && 2156 net_worklist_len == 0 && 2157 last_work_seen == syncer_delayno) { 2158 syncer_state = SYNCER_FINAL_DELAY; 2159 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2160 } 2161 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2162 syncer_worklist_len > 0); 2163 2164 /* 2165 * Keep track of the last time there was anything 2166 * on the worklist other than syncer vnodes. 2167 * Return to the SHUTTING_DOWN state if any 2168 * new work appears. 2169 */ 2170 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 2171 last_work_seen = syncer_delayno; 2172 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 2173 syncer_state = SYNCER_SHUTTING_DOWN; 2174 while (!LIST_EMPTY(slp)) { 2175 error = sync_vnode(slp, &bo, td); 2176 if (error == 1) { 2177 LIST_REMOVE(bo, bo_synclist); 2178 LIST_INSERT_HEAD(next, bo, bo_synclist); 2179 continue; 2180 } 2181 2182 if (first_printf == 0) { 2183 /* 2184 * Drop the sync mutex, because some watchdog 2185 * drivers need to sleep while patting 2186 */ 2187 mtx_unlock(&sync_mtx); 2188 wdog_kern_pat(WD_LASTVAL); 2189 mtx_lock(&sync_mtx); 2190 } 2191 2192 } 2193 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 2194 syncer_final_iter--; 2195 /* 2196 * The variable rushjob allows the kernel to speed up the 2197 * processing of the filesystem syncer process. 
A rushjob 2198 * value of N tells the filesystem syncer to process the next 2199 * N seconds worth of work on its queue ASAP. Currently rushjob 2200 * is used by the soft update code to speed up the filesystem 2201 * syncer process when the incore state is getting so far 2202 * ahead of the disk that the kernel memory pool is being 2203 * threatened with exhaustion. 2204 */ 2205 if (rushjob > 0) { 2206 rushjob -= 1; 2207 continue; 2208 } 2209 /* 2210 * Just sleep for a short period of time between 2211 * iterations when shutting down to allow some I/O 2212 * to happen. 2213 * 2214 * If it has taken us less than a second to process the 2215 * current work, then wait. Otherwise start right over 2216 * again. We can still lose time if any single round 2217 * takes more than two seconds, but it does not really 2218 * matter as we are just trying to generally pace the 2219 * filesystem activity. 2220 */ 2221 if (syncer_state != SYNCER_RUNNING || 2222 time_uptime == starttime) { 2223 thread_lock(td); 2224 sched_prio(td, PPAUSE); 2225 thread_unlock(td); 2226 } 2227 if (syncer_state != SYNCER_RUNNING) 2228 cv_timedwait(&sync_wakeup, &sync_mtx, 2229 hz / SYNCER_SHUTDOWN_SPEEDUP); 2230 else if (time_uptime == starttime) 2231 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 2232 } 2233 } 2234 2235 /* 2236 * Request the syncer daemon to speed up its work. 2237 * We never push it to speed up more than half of its 2238 * normal turn time, otherwise it could take over the cpu. 2239 */ 2240 int 2241 speedup_syncer(void) 2242 { 2243 int ret = 0; 2244 2245 mtx_lock(&sync_mtx); 2246 if (rushjob < syncdelay / 2) { 2247 rushjob += 1; 2248 stat_rush_requests += 1; 2249 ret = 1; 2250 } 2251 mtx_unlock(&sync_mtx); 2252 cv_broadcast(&sync_wakeup); 2253 return (ret); 2254 } 2255 2256 /* 2257 * Tell the syncer to speed up its work and run though its work 2258 * list several times, then tell it to shut down. 
2259 */ 2260 static void 2261 syncer_shutdown(void *arg, int howto) 2262 { 2263 2264 if (howto & RB_NOSYNC) 2265 return; 2266 mtx_lock(&sync_mtx); 2267 syncer_state = SYNCER_SHUTTING_DOWN; 2268 rushjob = 0; 2269 mtx_unlock(&sync_mtx); 2270 cv_broadcast(&sync_wakeup); 2271 kproc_shutdown(arg, howto); 2272 } 2273 2274 void 2275 syncer_suspend(void) 2276 { 2277 2278 syncer_shutdown(updateproc, 0); 2279 } 2280 2281 void 2282 syncer_resume(void) 2283 { 2284 2285 mtx_lock(&sync_mtx); 2286 first_printf = 1; 2287 syncer_state = SYNCER_RUNNING; 2288 mtx_unlock(&sync_mtx); 2289 cv_broadcast(&sync_wakeup); 2290 kproc_resume(updateproc); 2291 } 2292 2293 /* 2294 * Reassign a buffer from one vnode to another. 2295 * Used to assign file specific control information 2296 * (indirect blocks) to the vnode to which they belong. 2297 */ 2298 void 2299 reassignbuf(struct buf *bp) 2300 { 2301 struct vnode *vp; 2302 struct bufobj *bo; 2303 int delay; 2304 #ifdef INVARIANTS 2305 struct bufv *bv; 2306 #endif 2307 2308 vp = bp->b_vp; 2309 bo = bp->b_bufobj; 2310 ++reassignbufcalls; 2311 2312 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2313 bp, bp->b_vp, bp->b_flags); 2314 /* 2315 * B_PAGING flagged buffers cannot be reassigned because their vp 2316 * is not fully linked in. 2317 */ 2318 if (bp->b_flags & B_PAGING) 2319 panic("cannot reassign paging buffer"); 2320 2321 /* 2322 * Delete from old vnode list, if on one. 2323 */ 2324 BO_LOCK(bo); 2325 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2326 buf_vlist_remove(bp); 2327 else 2328 panic("reassignbuf: Buffer %p not on queue.", bp); 2329 /* 2330 * If dirty, put on list of dirty buffers; otherwise insert onto list 2331 * of clean buffers. 
2332 */ 2333 if (bp->b_flags & B_DELWRI) { 2334 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2335 switch (vp->v_type) { 2336 case VDIR: 2337 delay = dirdelay; 2338 break; 2339 case VCHR: 2340 delay = metadelay; 2341 break; 2342 default: 2343 delay = filedelay; 2344 } 2345 vn_syncer_add_to_worklist(bo, delay); 2346 } 2347 buf_vlist_add(bp, bo, BX_VNDIRTY); 2348 } else { 2349 buf_vlist_add(bp, bo, BX_VNCLEAN); 2350 2351 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2352 mtx_lock(&sync_mtx); 2353 LIST_REMOVE(bo, bo_synclist); 2354 syncer_worklist_len--; 2355 mtx_unlock(&sync_mtx); 2356 bo->bo_flag &= ~BO_ONWORKLST; 2357 } 2358 } 2359 #ifdef INVARIANTS 2360 bv = &bo->bo_clean; 2361 bp = TAILQ_FIRST(&bv->bv_hd); 2362 KASSERT(bp == NULL || bp->b_bufobj == bo, 2363 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2364 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2365 KASSERT(bp == NULL || bp->b_bufobj == bo, 2366 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2367 bv = &bo->bo_dirty; 2368 bp = TAILQ_FIRST(&bv->bv_hd); 2369 KASSERT(bp == NULL || bp->b_bufobj == bo, 2370 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2371 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2372 KASSERT(bp == NULL || bp->b_bufobj == bo, 2373 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2374 #endif 2375 BO_UNLOCK(bo); 2376 } 2377 2378 /* 2379 * A temporary hack until refcount_* APIs are sorted out. 
2380 */ 2381 static __inline int 2382 vfs_refcount_acquire_if_not_zero(volatile u_int *count) 2383 { 2384 u_int old; 2385 2386 for (;;) { 2387 old = *count; 2388 if (old == 0) 2389 return (0); 2390 if (atomic_cmpset_int(count, old, old + 1)) 2391 return (1); 2392 } 2393 } 2394 2395 static __inline int 2396 vfs_refcount_release_if_not_last(volatile u_int *count) 2397 { 2398 u_int old; 2399 2400 for (;;) { 2401 old = *count; 2402 if (old == 1) 2403 return (0); 2404 if (atomic_cmpset_int(count, old, old - 1)) 2405 return (1); 2406 } 2407 } 2408 2409 static void 2410 v_init_counters(struct vnode *vp) 2411 { 2412 2413 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2414 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2415 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2416 2417 refcount_init(&vp->v_holdcnt, 1); 2418 refcount_init(&vp->v_usecount, 1); 2419 } 2420 2421 static void 2422 v_incr_usecount_locked(struct vnode *vp) 2423 { 2424 2425 ASSERT_VI_LOCKED(vp, __func__); 2426 if ((vp->v_iflag & VI_OWEINACT) != 0) { 2427 VNASSERT(vp->v_usecount == 0, vp, 2428 ("vnode with usecount and VI_OWEINACT set")); 2429 vp->v_iflag &= ~VI_OWEINACT; 2430 } 2431 refcount_acquire(&vp->v_usecount); 2432 v_incr_devcount(vp); 2433 } 2434 2435 /* 2436 * Increment the use and hold counts on the vnode, taking care to reference 2437 * the driver's usecount if this is a chardev. The _vhold() will remove 2438 * the vnode from the free list if it is presently free. 
2439 */ 2440 static void 2441 v_incr_usecount(struct vnode *vp) 2442 { 2443 2444 ASSERT_VI_UNLOCKED(vp, __func__); 2445 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2446 2447 if (vp->v_type != VCHR && 2448 vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) { 2449 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2450 ("vnode with usecount and VI_OWEINACT set")); 2451 } else { 2452 VI_LOCK(vp); 2453 v_incr_usecount_locked(vp); 2454 VI_UNLOCK(vp); 2455 } 2456 } 2457 2458 /* 2459 * Increment si_usecount of the associated device, if any. 2460 */ 2461 static void 2462 v_incr_devcount(struct vnode *vp) 2463 { 2464 2465 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2466 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2467 dev_lock(); 2468 vp->v_rdev->si_usecount++; 2469 dev_unlock(); 2470 } 2471 } 2472 2473 /* 2474 * Decrement si_usecount of the associated device, if any. 2475 */ 2476 static void 2477 v_decr_devcount(struct vnode *vp) 2478 { 2479 2480 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2481 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2482 dev_lock(); 2483 vp->v_rdev->si_usecount--; 2484 dev_unlock(); 2485 } 2486 } 2487 2488 /* 2489 * Grab a particular vnode from the free list, increment its 2490 * reference count and lock it. VI_DOOMED is set if the vnode 2491 * is being destroyed. Only callers who specify LK_RETRY will 2492 * see doomed vnodes. If inactive processing was delayed in 2493 * vput try to do it here. 2494 * 2495 * Notes on lockless counter manipulation: 2496 * _vhold, vputx and other routines make various decisions based 2497 * on either holdcnt or usecount being 0. As long as either counter 2498 * is not transitioning 0->1 nor 1->0, the manipulation can be done 2499 * with atomic operations. Otherwise the interlock is taken covering 2500 * both the atomic and additional actions. 
2501 */ 2502 int 2503 vget(struct vnode *vp, int flags, struct thread *td) 2504 { 2505 int error, oweinact; 2506 2507 VNASSERT((flags & LK_TYPE_MASK) != 0, vp, 2508 ("vget: invalid lock operation")); 2509 2510 if ((flags & LK_INTERLOCK) != 0) 2511 ASSERT_VI_LOCKED(vp, __func__); 2512 else 2513 ASSERT_VI_UNLOCKED(vp, __func__); 2514 if ((flags & LK_VNHELD) != 0) 2515 VNASSERT((vp->v_holdcnt > 0), vp, 2516 ("vget: LK_VNHELD passed but vnode not held")); 2517 2518 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2519 2520 if ((flags & LK_VNHELD) == 0) 2521 _vhold(vp, (flags & LK_INTERLOCK) != 0); 2522 2523 if ((error = vn_lock(vp, flags)) != 0) { 2524 vdrop(vp); 2525 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 2526 vp); 2527 return (error); 2528 } 2529 if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) 2530 panic("vget: vn_lock failed to return ENOENT\n"); 2531 /* 2532 * We don't guarantee that any particular close will 2533 * trigger inactive processing so just make a best effort 2534 * here at preventing a reference to a removed file. If 2535 * we don't succeed no harm is done. 2536 * 2537 * Upgrade our holdcnt to a usecount. 2538 */ 2539 if (vp->v_type != VCHR && 2540 vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) { 2541 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2542 ("vnode with usecount and VI_OWEINACT set")); 2543 } else { 2544 VI_LOCK(vp); 2545 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2546 oweinact = 0; 2547 } else { 2548 oweinact = 1; 2549 vp->v_iflag &= ~VI_OWEINACT; 2550 } 2551 refcount_acquire(&vp->v_usecount); 2552 v_incr_devcount(vp); 2553 if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE && 2554 (flags & LK_NOWAIT) == 0) 2555 vinactive(vp, td); 2556 VI_UNLOCK(vp); 2557 } 2558 return (0); 2559 } 2560 2561 /* 2562 * Increase the reference count of a vnode. 
2563 */ 2564 void 2565 vref(struct vnode *vp) 2566 { 2567 2568 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2569 _vhold(vp, false); 2570 v_incr_usecount(vp); 2571 } 2572 2573 void 2574 vrefl(struct vnode *vp) 2575 { 2576 2577 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2578 _vhold(vp, true); 2579 v_incr_usecount_locked(vp); 2580 } 2581 2582 /* 2583 * Return reference count of a vnode. 2584 * 2585 * The results of this call are only guaranteed when some mechanism is used to 2586 * stop other processes from gaining references to the vnode. This may be the 2587 * case if the caller holds the only reference. This is also useful when stale 2588 * data is acceptable as race conditions may be accounted for by some other 2589 * means. 2590 */ 2591 int 2592 vrefcnt(struct vnode *vp) 2593 { 2594 2595 return (vp->v_usecount); 2596 } 2597 2598 #define VPUTX_VRELE 1 2599 #define VPUTX_VPUT 2 2600 #define VPUTX_VUNREF 3 2601 2602 /* 2603 * Decrement the use and hold counts for a vnode. 2604 * 2605 * See an explanation near vget() as to why atomic operation is safe. 2606 */ 2607 static void 2608 vputx(struct vnode *vp, int func) 2609 { 2610 int error; 2611 2612 KASSERT(vp != NULL, ("vputx: null vp")); 2613 if (func == VPUTX_VUNREF) 2614 ASSERT_VOP_LOCKED(vp, "vunref"); 2615 else if (func == VPUTX_VPUT) 2616 ASSERT_VOP_LOCKED(vp, "vput"); 2617 else 2618 KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); 2619 ASSERT_VI_UNLOCKED(vp, __func__); 2620 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2621 2622 if (vp->v_type != VCHR && 2623 vfs_refcount_release_if_not_last(&vp->v_usecount)) { 2624 if (func == VPUTX_VPUT) 2625 VOP_UNLOCK(vp, 0); 2626 vdrop(vp); 2627 return; 2628 } 2629 2630 VI_LOCK(vp); 2631 2632 /* 2633 * We want to hold the vnode until the inactive finishes to 2634 * prevent vgone() races. We drop the use count here and the 2635 * hold count below when we're done. 
2636 */ 2637 if (!refcount_release(&vp->v_usecount) || 2638 (vp->v_iflag & VI_DOINGINACT)) { 2639 if (func == VPUTX_VPUT) 2640 VOP_UNLOCK(vp, 0); 2641 v_decr_devcount(vp); 2642 vdropl(vp); 2643 return; 2644 } 2645 2646 v_decr_devcount(vp); 2647 2648 error = 0; 2649 2650 if (vp->v_usecount != 0) { 2651 vprint("vputx: usecount not zero", vp); 2652 panic("vputx: usecount not zero"); 2653 } 2654 2655 CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); 2656 2657 /* 2658 * We must call VOP_INACTIVE with the node locked. Mark 2659 * as VI_DOINGINACT to avoid recursion. 2660 */ 2661 vp->v_iflag |= VI_OWEINACT; 2662 switch (func) { 2663 case VPUTX_VRELE: 2664 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 2665 VI_LOCK(vp); 2666 break; 2667 case VPUTX_VPUT: 2668 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2669 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 2670 LK_NOWAIT); 2671 VI_LOCK(vp); 2672 } 2673 break; 2674 case VPUTX_VUNREF: 2675 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2676 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 2677 VI_LOCK(vp); 2678 } 2679 break; 2680 } 2681 VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp, 2682 ("vnode with usecount and VI_OWEINACT set")); 2683 if (error == 0) { 2684 if (vp->v_iflag & VI_OWEINACT) 2685 vinactive(vp, curthread); 2686 if (func != VPUTX_VUNREF) 2687 VOP_UNLOCK(vp, 0); 2688 } 2689 vdropl(vp); 2690 } 2691 2692 /* 2693 * Vnode put/release. 2694 * If count drops to zero, call inactive routine and return to freelist. 2695 */ 2696 void 2697 vrele(struct vnode *vp) 2698 { 2699 2700 vputx(vp, VPUTX_VRELE); 2701 } 2702 2703 /* 2704 * Release an already locked vnode. This give the same effects as 2705 * unlock+vrele(), but takes less time and avoids releasing and 2706 * re-aquiring the lock (as vrele() acquires the lock internally.) 2707 */ 2708 void 2709 vput(struct vnode *vp) 2710 { 2711 2712 vputx(vp, VPUTX_VPUT); 2713 } 2714 2715 /* 2716 * Release an exclusively locked vnode. 
Do not unlock the vnode lock. 2717 */ 2718 void 2719 vunref(struct vnode *vp) 2720 { 2721 2722 vputx(vp, VPUTX_VUNREF); 2723 } 2724 2725 /* 2726 * Increase the hold count and activate if this is the first reference. 2727 */ 2728 void 2729 _vhold(struct vnode *vp, bool locked) 2730 { 2731 struct mount *mp; 2732 2733 if (locked) 2734 ASSERT_VI_LOCKED(vp, __func__); 2735 else 2736 ASSERT_VI_UNLOCKED(vp, __func__); 2737 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2738 if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) { 2739 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2740 ("_vhold: vnode with holdcnt is free")); 2741 return; 2742 } 2743 2744 if (!locked) 2745 VI_LOCK(vp); 2746 if ((vp->v_iflag & VI_FREE) == 0) { 2747 refcount_acquire(&vp->v_holdcnt); 2748 if (!locked) 2749 VI_UNLOCK(vp); 2750 return; 2751 } 2752 VNASSERT(vp->v_holdcnt == 0, vp, 2753 ("%s: wrong hold count", __func__)); 2754 VNASSERT(vp->v_op != NULL, vp, 2755 ("%s: vnode already reclaimed.", __func__)); 2756 /* 2757 * Remove a vnode from the free list, mark it as in use, 2758 * and put it on the active list. 2759 */ 2760 mtx_lock(&vnode_free_list_mtx); 2761 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 2762 freevnodes--; 2763 vp->v_iflag &= ~VI_FREE; 2764 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 2765 ("Activating already active vnode")); 2766 vp->v_iflag |= VI_ACTIVE; 2767 mp = vp->v_mount; 2768 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 2769 mp->mnt_activevnodelistsize++; 2770 mtx_unlock(&vnode_free_list_mtx); 2771 refcount_acquire(&vp->v_holdcnt); 2772 if (!locked) 2773 VI_UNLOCK(vp); 2774 } 2775 2776 /* 2777 * Drop the hold count of the vnode. If this is the last reference to 2778 * the vnode we place it on the free list unless it has been vgone'd 2779 * (marked VI_DOOMED) in which case we will free it. 
2780 * 2781 * Because the vnode vm object keeps a hold reference on the vnode if 2782 * there is at least one resident non-cached page, the vnode cannot 2783 * leave the active list without the page cleanup done. 2784 */ 2785 void 2786 _vdrop(struct vnode *vp, bool locked) 2787 { 2788 struct bufobj *bo; 2789 struct mount *mp; 2790 int active; 2791 2792 if (locked) 2793 ASSERT_VI_LOCKED(vp, __func__); 2794 else 2795 ASSERT_VI_UNLOCKED(vp, __func__); 2796 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2797 if ((int)vp->v_holdcnt <= 0) 2798 panic("vdrop: holdcnt %d", vp->v_holdcnt); 2799 if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) { 2800 if (locked) 2801 VI_UNLOCK(vp); 2802 return; 2803 } 2804 2805 if (!locked) 2806 VI_LOCK(vp); 2807 if (refcount_release(&vp->v_holdcnt) == 0) { 2808 VI_UNLOCK(vp); 2809 return; 2810 } 2811 if ((vp->v_iflag & VI_DOOMED) == 0) { 2812 /* 2813 * Mark a vnode as free: remove it from its active list 2814 * and put it up for recycling on the freelist. 2815 */ 2816 VNASSERT(vp->v_op != NULL, vp, 2817 ("vdropl: vnode already reclaimed.")); 2818 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2819 ("vnode already free")); 2820 VNASSERT(vp->v_holdcnt == 0, vp, 2821 ("vdropl: freeing when we shouldn't")); 2822 active = vp->v_iflag & VI_ACTIVE; 2823 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2824 vp->v_iflag &= ~VI_ACTIVE; 2825 mp = vp->v_mount; 2826 mtx_lock(&vnode_free_list_mtx); 2827 if (active) { 2828 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, 2829 v_actfreelist); 2830 mp->mnt_activevnodelistsize--; 2831 } 2832 TAILQ_INSERT_TAIL(&vnode_free_list, vp, 2833 v_actfreelist); 2834 freevnodes++; 2835 vp->v_iflag |= VI_FREE; 2836 mtx_unlock(&vnode_free_list_mtx); 2837 } else { 2838 atomic_add_long(&free_owe_inact, 1); 2839 } 2840 VI_UNLOCK(vp); 2841 return; 2842 } 2843 /* 2844 * The vnode has been marked for destruction, so free it. 
2845 * 2846 * The vnode will be returned to the zone where it will 2847 * normally remain until it is needed for another vnode. We 2848 * need to cleanup (or verify that the cleanup has already 2849 * been done) any residual data left from its current use 2850 * so as not to contaminate the freshly allocated vnode. 2851 */ 2852 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2853 atomic_subtract_long(&numvnodes, 1); 2854 bo = &vp->v_bufobj; 2855 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2856 ("cleaned vnode still on the free list.")); 2857 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2858 VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); 2859 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2860 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2861 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2862 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2863 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 2864 ("clean blk trie not empty")); 2865 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2866 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 2867 ("dirty blk trie not empty")); 2868 VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); 2869 VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); 2870 VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); 2871 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, 2872 ("Dangling rangelock waiters")); 2873 VI_UNLOCK(vp); 2874 #ifdef MAC 2875 mac_vnode_destroy(vp); 2876 #endif 2877 if (vp->v_pollinfo != NULL) { 2878 destroy_vpollinfo(vp->v_pollinfo); 2879 vp->v_pollinfo = NULL; 2880 } 2881 #ifdef INVARIANTS 2882 /* XXX Elsewhere we detect an already freed vnode via NULL v_op. 
*/ 2883 vp->v_op = NULL; 2884 #endif 2885 bzero(&vp->v_un, sizeof(vp->v_un)); 2886 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 2887 vp->v_iflag = 0; 2888 vp->v_vflag = 0; 2889 bo->bo_flag = 0; 2890 uma_zfree(vnode_zone, vp); 2891 } 2892 2893 /* 2894 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 2895 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 2896 * OWEINACT tracks whether a vnode missed a call to inactive due to a 2897 * failed lock upgrade. 2898 */ 2899 void 2900 vinactive(struct vnode *vp, struct thread *td) 2901 { 2902 struct vm_object *obj; 2903 2904 ASSERT_VOP_ELOCKED(vp, "vinactive"); 2905 ASSERT_VI_LOCKED(vp, "vinactive"); 2906 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 2907 ("vinactive: recursed on VI_DOINGINACT")); 2908 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2909 vp->v_iflag |= VI_DOINGINACT; 2910 vp->v_iflag &= ~VI_OWEINACT; 2911 VI_UNLOCK(vp); 2912 /* 2913 * Before moving off the active list, we must be sure that any 2914 * modified pages are converted into the vnode's dirty 2915 * buffers, since these will no longer be checked once the 2916 * vnode is on the inactive list. 2917 * 2918 * The write-out of the dirty pages is asynchronous. At the 2919 * point that VOP_INACTIVE() is called, there could still be 2920 * pending I/O and dirty pages in the object. 2921 */ 2922 obj = vp->v_object; 2923 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { 2924 VM_OBJECT_WLOCK(obj); 2925 vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC); 2926 VM_OBJECT_WUNLOCK(obj); 2927 } 2928 VOP_INACTIVE(vp, td); 2929 VI_LOCK(vp); 2930 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 2931 ("vinactive: lost VI_DOINGINACT")); 2932 vp->v_iflag &= ~VI_DOINGINACT; 2933 } 2934 2935 /* 2936 * Remove any vnodes in the vnode table belonging to mount point mp. 
2937 * 2938 * If FORCECLOSE is not specified, there should not be any active ones, 2939 * return error if any are found (nb: this is a user error, not a 2940 * system error). If FORCECLOSE is specified, detach any active vnodes 2941 * that are found. 2942 * 2943 * If WRITECLOSE is set, only flush out regular file vnodes open for 2944 * writing. 2945 * 2946 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 2947 * 2948 * `rootrefs' specifies the base reference count for the root vnode 2949 * of this filesystem. The root vnode is considered busy if its 2950 * v_usecount exceeds this value. On a successful return, vflush(, td) 2951 * will call vrele() on the root vnode exactly rootrefs times. 2952 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 2953 * be zero. 2954 */ 2955 #ifdef DIAGNOSTIC 2956 static int busyprt = 0; /* print out busy vnodes */ 2957 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 2958 #endif 2959 2960 int 2961 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 2962 { 2963 struct vnode *vp, *mvp, *rootvp = NULL; 2964 struct vattr vattr; 2965 int busy = 0, error; 2966 2967 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 2968 rootrefs, flags); 2969 if (rootrefs > 0) { 2970 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 2971 ("vflush: bad args")); 2972 /* 2973 * Get the filesystem root vnode. We can vput() it 2974 * immediately, since with rootrefs > 0, it won't go away. 
2975 */ 2976 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 2977 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 2978 __func__, error); 2979 return (error); 2980 } 2981 vput(rootvp); 2982 } 2983 loop: 2984 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2985 vholdl(vp); 2986 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 2987 if (error) { 2988 vdrop(vp); 2989 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2990 goto loop; 2991 } 2992 /* 2993 * Skip over a vnodes marked VV_SYSTEM. 2994 */ 2995 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 2996 VOP_UNLOCK(vp, 0); 2997 vdrop(vp); 2998 continue; 2999 } 3000 /* 3001 * If WRITECLOSE is set, flush out unlinked but still open 3002 * files (even if open only for reading) and regular file 3003 * vnodes open for writing. 3004 */ 3005 if (flags & WRITECLOSE) { 3006 if (vp->v_object != NULL) { 3007 VM_OBJECT_WLOCK(vp->v_object); 3008 vm_object_page_clean(vp->v_object, 0, 0, 0); 3009 VM_OBJECT_WUNLOCK(vp->v_object); 3010 } 3011 error = VOP_FSYNC(vp, MNT_WAIT, td); 3012 if (error != 0) { 3013 VOP_UNLOCK(vp, 0); 3014 vdrop(vp); 3015 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3016 return (error); 3017 } 3018 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 3019 VI_LOCK(vp); 3020 3021 if ((vp->v_type == VNON || 3022 (error == 0 && vattr.va_nlink > 0)) && 3023 (vp->v_writecount == 0 || vp->v_type != VREG)) { 3024 VOP_UNLOCK(vp, 0); 3025 vdropl(vp); 3026 continue; 3027 } 3028 } else 3029 VI_LOCK(vp); 3030 /* 3031 * With v_usecount == 0, all we need to do is clear out the 3032 * vnode data structures and we are done. 3033 * 3034 * If FORCECLOSE is set, forcibly close the vnode. 
3035 */ 3036 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 3037 vgonel(vp); 3038 } else { 3039 busy++; 3040 #ifdef DIAGNOSTIC 3041 if (busyprt) 3042 vprint("vflush: busy vnode", vp); 3043 #endif 3044 } 3045 VOP_UNLOCK(vp, 0); 3046 vdropl(vp); 3047 } 3048 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 3049 /* 3050 * If just the root vnode is busy, and if its refcount 3051 * is equal to `rootrefs', then go ahead and kill it. 3052 */ 3053 VI_LOCK(rootvp); 3054 KASSERT(busy > 0, ("vflush: not busy")); 3055 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3056 ("vflush: usecount %d < rootrefs %d", 3057 rootvp->v_usecount, rootrefs)); 3058 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3059 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3060 vgone(rootvp); 3061 VOP_UNLOCK(rootvp, 0); 3062 busy = 0; 3063 } else 3064 VI_UNLOCK(rootvp); 3065 } 3066 if (busy) { 3067 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3068 busy); 3069 return (EBUSY); 3070 } 3071 for (; rootrefs > 0; rootrefs--) 3072 vrele(rootvp); 3073 return (0); 3074 } 3075 3076 /* 3077 * Recycle an unused vnode to the front of the free list. 3078 */ 3079 int 3080 vrecycle(struct vnode *vp) 3081 { 3082 int recycled; 3083 3084 ASSERT_VOP_ELOCKED(vp, "vrecycle"); 3085 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3086 recycled = 0; 3087 VI_LOCK(vp); 3088 if (vp->v_usecount == 0) { 3089 recycled = 1; 3090 vgonel(vp); 3091 } 3092 VI_UNLOCK(vp); 3093 return (recycled); 3094 } 3095 3096 /* 3097 * Eliminate all activity associated with a vnode 3098 * in preparation for reuse. 3099 */ 3100 void 3101 vgone(struct vnode *vp) 3102 { 3103 VI_LOCK(vp); 3104 vgonel(vp); 3105 VI_UNLOCK(vp); 3106 } 3107 3108 static void 3109 notify_lowervp_vfs_dummy(struct mount *mp __unused, 3110 struct vnode *lowervp __unused) 3111 { 3112 } 3113 3114 /* 3115 * Notify upper mounts about reclaimed or unlinked vnode. 
3116 */ 3117 void 3118 vfs_notify_upper(struct vnode *vp, int event) 3119 { 3120 static struct vfsops vgonel_vfsops = { 3121 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, 3122 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, 3123 }; 3124 struct mount *mp, *ump, *mmp; 3125 3126 mp = vp->v_mount; 3127 if (mp == NULL) 3128 return; 3129 3130 MNT_ILOCK(mp); 3131 if (TAILQ_EMPTY(&mp->mnt_uppers)) 3132 goto unlock; 3133 MNT_IUNLOCK(mp); 3134 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); 3135 mmp->mnt_op = &vgonel_vfsops; 3136 mmp->mnt_kern_flag |= MNTK_MARKER; 3137 MNT_ILOCK(mp); 3138 mp->mnt_kern_flag |= MNTK_VGONE_UPPER; 3139 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { 3140 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { 3141 ump = TAILQ_NEXT(ump, mnt_upper_link); 3142 continue; 3143 } 3144 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); 3145 MNT_IUNLOCK(mp); 3146 switch (event) { 3147 case VFS_NOTIFY_UPPER_RECLAIM: 3148 VFS_RECLAIM_LOWERVP(ump, vp); 3149 break; 3150 case VFS_NOTIFY_UPPER_UNLINK: 3151 VFS_UNLINK_LOWERVP(ump, vp); 3152 break; 3153 default: 3154 KASSERT(0, ("invalid event %d", event)); 3155 break; 3156 } 3157 MNT_ILOCK(mp); 3158 ump = TAILQ_NEXT(mmp, mnt_upper_link); 3159 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); 3160 } 3161 free(mmp, M_TEMP); 3162 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; 3163 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { 3164 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; 3165 wakeup(&mp->mnt_uppers); 3166 } 3167 unlock: 3168 MNT_IUNLOCK(mp); 3169 } 3170 3171 /* 3172 * vgone, with the vp interlock held. 
3173 */ 3174 static void 3175 vgonel(struct vnode *vp) 3176 { 3177 struct thread *td; 3178 int oweinact; 3179 int active; 3180 struct mount *mp; 3181 3182 ASSERT_VOP_ELOCKED(vp, "vgonel"); 3183 ASSERT_VI_LOCKED(vp, "vgonel"); 3184 VNASSERT(vp->v_holdcnt, vp, 3185 ("vgonel: vp %p has no reference.", vp)); 3186 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3187 td = curthread; 3188 3189 /* 3190 * Don't vgonel if we're already doomed. 3191 */ 3192 if (vp->v_iflag & VI_DOOMED) 3193 return; 3194 vp->v_iflag |= VI_DOOMED; 3195 3196 /* 3197 * Check to see if the vnode is in use. If so, we have to call 3198 * VOP_CLOSE() and VOP_INACTIVE(). 3199 */ 3200 active = vp->v_usecount; 3201 oweinact = (vp->v_iflag & VI_OWEINACT); 3202 VI_UNLOCK(vp); 3203 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 3204 3205 /* 3206 * If purging an active vnode, it must be closed and 3207 * deactivated before being reclaimed. 3208 */ 3209 if (active) 3210 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 3211 if (oweinact || active) { 3212 VI_LOCK(vp); 3213 if ((vp->v_iflag & VI_DOINGINACT) == 0) 3214 vinactive(vp, td); 3215 VI_UNLOCK(vp); 3216 } 3217 if (vp->v_type == VSOCK) 3218 vfs_unp_reclaim(vp); 3219 3220 /* 3221 * Clean out any buffers associated with the vnode. 3222 * If the flush fails, just toss the buffers. 3223 */ 3224 mp = NULL; 3225 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 3226 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 3227 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 3228 while (vinvalbuf(vp, 0, 0, 0) != 0) 3229 ; 3230 } 3231 3232 BO_LOCK(&vp->v_bufobj); 3233 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 3234 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 3235 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 3236 vp->v_bufobj.bo_clean.bv_cnt == 0, 3237 ("vp %p bufobj not invalidated", vp)); 3238 vp->v_bufobj.bo_flag |= BO_DEAD; 3239 BO_UNLOCK(&vp->v_bufobj); 3240 3241 /* 3242 * Reclaim the vnode. 
3243 */ 3244 if (VOP_RECLAIM(vp, td)) 3245 panic("vgone: cannot reclaim"); 3246 if (mp != NULL) 3247 vn_finished_secondary_write(mp); 3248 VNASSERT(vp->v_object == NULL, vp, 3249 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); 3250 /* 3251 * Clear the advisory locks and wake up waiting threads. 3252 */ 3253 (void)VOP_ADVLOCKPURGE(vp); 3254 vp->v_lockf = NULL; 3255 /* 3256 * Delete from old mount point vnode list. 3257 */ 3258 delmntque(vp); 3259 cache_purge(vp); 3260 /* 3261 * Done with purge, reset to the standard lock and invalidate 3262 * the vnode. 3263 */ 3264 VI_LOCK(vp); 3265 vp->v_vnlock = &vp->v_lock; 3266 vp->v_op = &dead_vnodeops; 3267 vp->v_tag = "none"; 3268 vp->v_type = VBAD; 3269 } 3270 3271 /* 3272 * Calculate the total number of references to a special device. 3273 */ 3274 int 3275 vcount(struct vnode *vp) 3276 { 3277 int count; 3278 3279 dev_lock(); 3280 count = vp->v_rdev->si_usecount; 3281 dev_unlock(); 3282 return (count); 3283 } 3284 3285 /* 3286 * Same as above, but using the struct cdev *as argument 3287 */ 3288 int 3289 count_dev(struct cdev *dev) 3290 { 3291 int count; 3292 3293 dev_lock(); 3294 count = dev->si_usecount; 3295 dev_unlock(); 3296 return(count); 3297 } 3298 3299 /* 3300 * Print out a description of a vnode. 3301 */ 3302 static char *typename[] = 3303 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", 3304 "VMARKER"}; 3305 3306 void 3307 vn_printf(struct vnode *vp, const char *fmt, ...) 
3308 { 3309 va_list ap; 3310 char buf[256], buf2[16]; 3311 u_long flags; 3312 3313 va_start(ap, fmt); 3314 vprintf(fmt, ap); 3315 va_end(ap); 3316 printf("%p: ", (void *)vp); 3317 printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); 3318 printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", 3319 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); 3320 buf[0] = '\0'; 3321 buf[1] = '\0'; 3322 if (vp->v_vflag & VV_ROOT) 3323 strlcat(buf, "|VV_ROOT", sizeof(buf)); 3324 if (vp->v_vflag & VV_ISTTY) 3325 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 3326 if (vp->v_vflag & VV_NOSYNC) 3327 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 3328 if (vp->v_vflag & VV_ETERNALDEV) 3329 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 3330 if (vp->v_vflag & VV_CACHEDLABEL) 3331 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 3332 if (vp->v_vflag & VV_TEXT) 3333 strlcat(buf, "|VV_TEXT", sizeof(buf)); 3334 if (vp->v_vflag & VV_COPYONWRITE) 3335 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 3336 if (vp->v_vflag & VV_SYSTEM) 3337 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 3338 if (vp->v_vflag & VV_PROCDEP) 3339 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 3340 if (vp->v_vflag & VV_NOKNOTE) 3341 strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); 3342 if (vp->v_vflag & VV_DELETED) 3343 strlcat(buf, "|VV_DELETED", sizeof(buf)); 3344 if (vp->v_vflag & VV_MD) 3345 strlcat(buf, "|VV_MD", sizeof(buf)); 3346 if (vp->v_vflag & VV_FORCEINSMQ) 3347 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 3348 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 3349 VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | 3350 VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); 3351 if (flags != 0) { 3352 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 3353 strlcat(buf, buf2, sizeof(buf)); 3354 } 3355 if (vp->v_iflag & VI_MOUNT) 3356 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 3357 if (vp->v_iflag & VI_DOOMED) 3358 strlcat(buf, "|VI_DOOMED", sizeof(buf)); 3359 if 
(vp->v_iflag & VI_FREE) 3360 strlcat(buf, "|VI_FREE", sizeof(buf)); 3361 if (vp->v_iflag & VI_ACTIVE) 3362 strlcat(buf, "|VI_ACTIVE", sizeof(buf)); 3363 if (vp->v_iflag & VI_DOINGINACT) 3364 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 3365 if (vp->v_iflag & VI_OWEINACT) 3366 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 3367 flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE | 3368 VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT); 3369 if (flags != 0) { 3370 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 3371 strlcat(buf, buf2, sizeof(buf)); 3372 } 3373 printf(" flags (%s)\n", buf + 1); 3374 if (mtx_owned(VI_MTX(vp))) 3375 printf(" VI_LOCKed"); 3376 if (vp->v_object != NULL) 3377 printf(" v_object %p ref %d pages %d " 3378 "cleanbuf %d dirtybuf %d\n", 3379 vp->v_object, vp->v_object->ref_count, 3380 vp->v_object->resident_page_count, 3381 vp->v_bufobj.bo_clean.bv_cnt, 3382 vp->v_bufobj.bo_dirty.bv_cnt); 3383 printf(" "); 3384 lockmgr_printinfo(vp->v_vnlock); 3385 if (vp->v_data != NULL) 3386 VOP_PRINT(vp); 3387 } 3388 3389 #ifdef DDB 3390 /* 3391 * List all of the locked vnodes in the system. 3392 * Called when debugging the kernel. 3393 */ 3394 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 3395 { 3396 struct mount *mp; 3397 struct vnode *vp; 3398 3399 /* 3400 * Note: because this is DDB, we can't obey the locking semantics 3401 * for these structures, which means we could catch an inconsistent 3402 * state and dereference a nasty pointer. Not much to be done 3403 * about that. 3404 */ 3405 db_printf("Locked vnodes\n"); 3406 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3407 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3408 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 3409 vprint("", vp); 3410 } 3411 } 3412 } 3413 3414 /* 3415 * Show details about the given vnode. 
3416 */ 3417 DB_SHOW_COMMAND(vnode, db_show_vnode) 3418 { 3419 struct vnode *vp; 3420 3421 if (!have_addr) 3422 return; 3423 vp = (struct vnode *)addr; 3424 vn_printf(vp, "vnode "); 3425 } 3426 3427 /* 3428 * Show details about the given mount point. 3429 */ 3430 DB_SHOW_COMMAND(mount, db_show_mount) 3431 { 3432 struct mount *mp; 3433 struct vfsopt *opt; 3434 struct statfs *sp; 3435 struct vnode *vp; 3436 char buf[512]; 3437 uint64_t mflags; 3438 u_int flags; 3439 3440 if (!have_addr) { 3441 /* No address given, print short info about all mount points. */ 3442 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3443 db_printf("%p %s on %s (%s)\n", mp, 3444 mp->mnt_stat.f_mntfromname, 3445 mp->mnt_stat.f_mntonname, 3446 mp->mnt_stat.f_fstypename); 3447 if (db_pager_quit) 3448 break; 3449 } 3450 db_printf("\nMore info: show mount <addr>\n"); 3451 return; 3452 } 3453 3454 mp = (struct mount *)addr; 3455 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 3456 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 3457 3458 buf[0] = '\0'; 3459 mflags = mp->mnt_flag; 3460 #define MNT_FLAG(flag) do { \ 3461 if (mflags & (flag)) { \ 3462 if (buf[0] != '\0') \ 3463 strlcat(buf, ", ", sizeof(buf)); \ 3464 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 3465 mflags &= ~(flag); \ 3466 } \ 3467 } while (0) 3468 MNT_FLAG(MNT_RDONLY); 3469 MNT_FLAG(MNT_SYNCHRONOUS); 3470 MNT_FLAG(MNT_NOEXEC); 3471 MNT_FLAG(MNT_NOSUID); 3472 MNT_FLAG(MNT_NFS4ACLS); 3473 MNT_FLAG(MNT_UNION); 3474 MNT_FLAG(MNT_ASYNC); 3475 MNT_FLAG(MNT_SUIDDIR); 3476 MNT_FLAG(MNT_SOFTDEP); 3477 MNT_FLAG(MNT_NOSYMFOLLOW); 3478 MNT_FLAG(MNT_GJOURNAL); 3479 MNT_FLAG(MNT_MULTILABEL); 3480 MNT_FLAG(MNT_ACLS); 3481 MNT_FLAG(MNT_NOATIME); 3482 MNT_FLAG(MNT_NOCLUSTERR); 3483 MNT_FLAG(MNT_NOCLUSTERW); 3484 MNT_FLAG(MNT_SUJ); 3485 MNT_FLAG(MNT_EXRDONLY); 3486 MNT_FLAG(MNT_EXPORTED); 3487 MNT_FLAG(MNT_DEFEXPORTED); 3488 MNT_FLAG(MNT_EXPORTANON); 3489 MNT_FLAG(MNT_EXKERB); 3490 MNT_FLAG(MNT_EXPUBLIC); 3491 MNT_FLAG(MNT_LOCAL); 3492 
MNT_FLAG(MNT_QUOTA); 3493 MNT_FLAG(MNT_ROOTFS); 3494 MNT_FLAG(MNT_USER); 3495 MNT_FLAG(MNT_IGNORE); 3496 MNT_FLAG(MNT_UPDATE); 3497 MNT_FLAG(MNT_DELEXPORT); 3498 MNT_FLAG(MNT_RELOAD); 3499 MNT_FLAG(MNT_FORCE); 3500 MNT_FLAG(MNT_SNAPSHOT); 3501 MNT_FLAG(MNT_BYFSID); 3502 #undef MNT_FLAG 3503 if (mflags != 0) { 3504 if (buf[0] != '\0') 3505 strlcat(buf, ", ", sizeof(buf)); 3506 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3507 "0x%016jx", mflags); 3508 } 3509 db_printf(" mnt_flag = %s\n", buf); 3510 3511 buf[0] = '\0'; 3512 flags = mp->mnt_kern_flag; 3513 #define MNT_KERN_FLAG(flag) do { \ 3514 if (flags & (flag)) { \ 3515 if (buf[0] != '\0') \ 3516 strlcat(buf, ", ", sizeof(buf)); \ 3517 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 3518 flags &= ~(flag); \ 3519 } \ 3520 } while (0) 3521 MNT_KERN_FLAG(MNTK_UNMOUNTF); 3522 MNT_KERN_FLAG(MNTK_ASYNC); 3523 MNT_KERN_FLAG(MNTK_SOFTDEP); 3524 MNT_KERN_FLAG(MNTK_NOINSMNTQ); 3525 MNT_KERN_FLAG(MNTK_DRAINING); 3526 MNT_KERN_FLAG(MNTK_REFEXPIRE); 3527 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 3528 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 3529 MNT_KERN_FLAG(MNTK_NO_IOPF); 3530 MNT_KERN_FLAG(MNTK_VGONE_UPPER); 3531 MNT_KERN_FLAG(MNTK_VGONE_WAITER); 3532 MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); 3533 MNT_KERN_FLAG(MNTK_MARKER); 3534 MNT_KERN_FLAG(MNTK_USES_BCACHE); 3535 MNT_KERN_FLAG(MNTK_NOASYNC); 3536 MNT_KERN_FLAG(MNTK_UNMOUNT); 3537 MNT_KERN_FLAG(MNTK_MWAIT); 3538 MNT_KERN_FLAG(MNTK_SUSPEND); 3539 MNT_KERN_FLAG(MNTK_SUSPEND2); 3540 MNT_KERN_FLAG(MNTK_SUSPENDED); 3541 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 3542 MNT_KERN_FLAG(MNTK_NOKNOTE); 3543 #undef MNT_KERN_FLAG 3544 if (flags != 0) { 3545 if (buf[0] != '\0') 3546 strlcat(buf, ", ", sizeof(buf)); 3547 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3548 "0x%08x", flags); 3549 } 3550 db_printf(" mnt_kern_flag = %s\n", buf); 3551 3552 db_printf(" mnt_opt = "); 3553 opt = TAILQ_FIRST(mp->mnt_opt); 3554 if (opt != NULL) { 3555 db_printf("%s", opt->name); 3556 opt = TAILQ_NEXT(opt, 
link); 3557 while (opt != NULL) { 3558 db_printf(", %s", opt->name); 3559 opt = TAILQ_NEXT(opt, link); 3560 } 3561 } 3562 db_printf("\n"); 3563 3564 sp = &mp->mnt_stat; 3565 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 3566 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 3567 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 3568 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 3569 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 3570 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 3571 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 3572 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 3573 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 3574 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 3575 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 3576 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 3577 3578 db_printf(" mnt_cred = { uid=%u ruid=%u", 3579 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 3580 if (jailed(mp->mnt_cred)) 3581 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 3582 db_printf(" }\n"); 3583 db_printf(" mnt_ref = %d\n", mp->mnt_ref); 3584 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 3585 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 3586 db_printf(" mnt_activevnodelistsize = %d\n", 3587 mp->mnt_activevnodelistsize); 3588 db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); 3589 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); 3590 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 3591 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 3592 db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); 3593 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 3594 db_printf(" mnt_secondary_accwrites = %d\n", 3595 mp->mnt_secondary_accwrites); 3596 db_printf(" mnt_gjprovider = %s\n", 3597 mp->mnt_gjprovider != NULL ? 
mp->mnt_gjprovider : "NULL"); 3598 3599 db_printf("\n\nList of active vnodes\n"); 3600 TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { 3601 if (vp->v_type != VMARKER) { 3602 vn_printf(vp, "vnode "); 3603 if (db_pager_quit) 3604 break; 3605 } 3606 } 3607 db_printf("\n\nList of inactive vnodes\n"); 3608 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3609 if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { 3610 vn_printf(vp, "vnode "); 3611 if (db_pager_quit) 3612 break; 3613 } 3614 } 3615 } 3616 #endif /* DDB */ 3617 3618 /* 3619 * Fill in a struct xvfsconf based on a struct vfsconf. 3620 */ 3621 static int 3622 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 3623 { 3624 struct xvfsconf xvfsp; 3625 3626 bzero(&xvfsp, sizeof(xvfsp)); 3627 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3628 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3629 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3630 xvfsp.vfc_flags = vfsp->vfc_flags; 3631 /* 3632 * These are unused in userland, we keep them 3633 * to not break binary compatibility. 3634 */ 3635 xvfsp.vfc_vfsops = NULL; 3636 xvfsp.vfc_next = NULL; 3637 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3638 } 3639 3640 #ifdef COMPAT_FREEBSD32 3641 struct xvfsconf32 { 3642 uint32_t vfc_vfsops; 3643 char vfc_name[MFSNAMELEN]; 3644 int32_t vfc_typenum; 3645 int32_t vfc_refcount; 3646 int32_t vfc_flags; 3647 uint32_t vfc_next; 3648 }; 3649 3650 static int 3651 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 3652 { 3653 struct xvfsconf32 xvfsp; 3654 3655 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3656 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3657 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3658 xvfsp.vfc_flags = vfsp->vfc_flags; 3659 xvfsp.vfc_vfsops = 0; 3660 xvfsp.vfc_next = 0; 3661 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3662 } 3663 #endif 3664 3665 /* 3666 * Top level filesystem related information gathering. 
3667 */ 3668 static int 3669 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 3670 { 3671 struct vfsconf *vfsp; 3672 int error; 3673 3674 error = 0; 3675 vfsconf_slock(); 3676 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3677 #ifdef COMPAT_FREEBSD32 3678 if (req->flags & SCTL_MASK32) 3679 error = vfsconf2x32(req, vfsp); 3680 else 3681 #endif 3682 error = vfsconf2x(req, vfsp); 3683 if (error) 3684 break; 3685 } 3686 vfsconf_sunlock(); 3687 return (error); 3688 } 3689 3690 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 3691 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 3692 "S,xvfsconf", "List of all configured filesystems"); 3693 3694 #ifndef BURN_BRIDGES 3695 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 3696 3697 static int 3698 vfs_sysctl(SYSCTL_HANDLER_ARGS) 3699 { 3700 int *name = (int *)arg1 - 1; /* XXX */ 3701 u_int namelen = arg2 + 1; /* XXX */ 3702 struct vfsconf *vfsp; 3703 3704 log(LOG_WARNING, "userland calling deprecated sysctl, " 3705 "please rebuild world\n"); 3706 3707 #if 1 || defined(COMPAT_PRELITE2) 3708 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 3709 if (namelen == 1) 3710 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 3711 #endif 3712 3713 switch (name[1]) { 3714 case VFS_MAXTYPENUM: 3715 if (namelen != 2) 3716 return (ENOTDIR); 3717 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 3718 case VFS_CONF: 3719 if (namelen != 3) 3720 return (ENOTDIR); /* overloaded */ 3721 vfsconf_slock(); 3722 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3723 if (vfsp->vfc_typenum == name[2]) 3724 break; 3725 } 3726 vfsconf_sunlock(); 3727 if (vfsp == NULL) 3728 return (EOPNOTSUPP); 3729 #ifdef COMPAT_FREEBSD32 3730 if (req->flags & SCTL_MASK32) 3731 return (vfsconf2x32(req, vfsp)); 3732 else 3733 #endif 3734 return (vfsconf2x(req, vfsp)); 3735 } 3736 return (EOPNOTSUPP); 3737 } 3738 3739 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 3740 CTLFLAG_MPSAFE, vfs_sysctl, 3741 "Generic filesystem"); 3742 3743 #if 1 || defined(COMPAT_PRELITE2) 3744 3745 static int 3746 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 3747 { 3748 int error; 3749 struct vfsconf *vfsp; 3750 struct ovfsconf ovfs; 3751 3752 vfsconf_slock(); 3753 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3754 bzero(&ovfs, sizeof(ovfs)); 3755 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 3756 strcpy(ovfs.vfc_name, vfsp->vfc_name); 3757 ovfs.vfc_index = vfsp->vfc_typenum; 3758 ovfs.vfc_refcount = vfsp->vfc_refcount; 3759 ovfs.vfc_flags = vfsp->vfc_flags; 3760 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 3761 if (error != 0) { 3762 vfsconf_sunlock(); 3763 return (error); 3764 } 3765 } 3766 vfsconf_sunlock(); 3767 return (0); 3768 } 3769 3770 #endif /* 1 || COMPAT_PRELITE2 */ 3771 #endif /* !BURN_BRIDGES */ 3772 3773 #define KINFO_VNODESLOP 10 3774 #ifdef notyet 3775 /* 3776 * Dump vnode list (via sysctl). 
3777 */ 3778 /* ARGSUSED */ 3779 static int 3780 sysctl_vnode(SYSCTL_HANDLER_ARGS) 3781 { 3782 struct xvnode *xvn; 3783 struct mount *mp; 3784 struct vnode *vp; 3785 int error, len, n; 3786 3787 /* 3788 * Stale numvnodes access is not fatal here. 3789 */ 3790 req->lock = 0; 3791 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 3792 if (!req->oldptr) 3793 /* Make an estimate */ 3794 return (SYSCTL_OUT(req, 0, len)); 3795 3796 error = sysctl_wire_old_buffer(req, 0); 3797 if (error != 0) 3798 return (error); 3799 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 3800 n = 0; 3801 mtx_lock(&mountlist_mtx); 3802 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3803 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 3804 continue; 3805 MNT_ILOCK(mp); 3806 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3807 if (n == len) 3808 break; 3809 vref(vp); 3810 xvn[n].xv_size = sizeof *xvn; 3811 xvn[n].xv_vnode = vp; 3812 xvn[n].xv_id = 0; /* XXX compat */ 3813 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 3814 XV_COPY(usecount); 3815 XV_COPY(writecount); 3816 XV_COPY(holdcnt); 3817 XV_COPY(mount); 3818 XV_COPY(numoutput); 3819 XV_COPY(type); 3820 #undef XV_COPY 3821 xvn[n].xv_flag = vp->v_vflag; 3822 3823 switch (vp->v_type) { 3824 case VREG: 3825 case VDIR: 3826 case VLNK: 3827 break; 3828 case VBLK: 3829 case VCHR: 3830 if (vp->v_rdev == NULL) { 3831 vrele(vp); 3832 continue; 3833 } 3834 xvn[n].xv_dev = dev2udev(vp->v_rdev); 3835 break; 3836 case VSOCK: 3837 xvn[n].xv_socket = vp->v_socket; 3838 break; 3839 case VFIFO: 3840 xvn[n].xv_fifo = vp->v_fifoinfo; 3841 break; 3842 case VNON: 3843 case VBAD: 3844 default: 3845 /* shouldn't happen? 
*/ 3846 vrele(vp); 3847 continue; 3848 } 3849 vrele(vp); 3850 ++n; 3851 } 3852 MNT_IUNLOCK(mp); 3853 mtx_lock(&mountlist_mtx); 3854 vfs_unbusy(mp); 3855 if (n == len) 3856 break; 3857 } 3858 mtx_unlock(&mountlist_mtx); 3859 3860 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 3861 free(xvn, M_TEMP); 3862 return (error); 3863 } 3864 3865 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | 3866 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", 3867 ""); 3868 #endif 3869 3870 static void 3871 unmount_or_warn(struct mount *mp) 3872 { 3873 int error; 3874 3875 error = dounmount(mp, MNT_FORCE, curthread); 3876 if (error != 0) { 3877 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 3878 if (error == EBUSY) 3879 printf("BUSY)\n"); 3880 else 3881 printf("%d)\n", error); 3882 } 3883 } 3884 3885 /* 3886 * Unmount all filesystems. The list is traversed in reverse order 3887 * of mounting to avoid dependencies. 3888 */ 3889 void 3890 vfs_unmountall(void) 3891 { 3892 struct mount *mp, *tmp; 3893 3894 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 3895 3896 /* 3897 * Since this only runs when rebooting, it is not interlocked. 3898 */ 3899 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 3900 vfs_ref(mp); 3901 3902 /* 3903 * Forcibly unmounting "/dev" before "/" would prevent clean 3904 * unmount of the latter. 3905 */ 3906 if (mp == rootdevmp) 3907 continue; 3908 3909 unmount_or_warn(mp); 3910 } 3911 3912 if (rootdevmp != NULL) 3913 unmount_or_warn(rootdevmp); 3914 } 3915 3916 /* 3917 * perform msync on all vnodes under a mount point 3918 * the mount point must be locked. 
3919 */ 3920 void 3921 vfs_msync(struct mount *mp, int flags) 3922 { 3923 struct vnode *vp, *mvp; 3924 struct vm_object *obj; 3925 3926 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 3927 MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { 3928 obj = vp->v_object; 3929 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && 3930 (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { 3931 if (!vget(vp, 3932 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, 3933 curthread)) { 3934 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ 3935 vput(vp); 3936 continue; 3937 } 3938 3939 obj = vp->v_object; 3940 if (obj != NULL) { 3941 VM_OBJECT_WLOCK(obj); 3942 vm_object_page_clean(obj, 0, 0, 3943 flags == MNT_WAIT ? 3944 OBJPC_SYNC : OBJPC_NOSYNC); 3945 VM_OBJECT_WUNLOCK(obj); 3946 } 3947 vput(vp); 3948 } 3949 } else 3950 VI_UNLOCK(vp); 3951 } 3952 } 3953 3954 static void 3955 destroy_vpollinfo_free(struct vpollinfo *vi) 3956 { 3957 3958 knlist_destroy(&vi->vpi_selinfo.si_note); 3959 mtx_destroy(&vi->vpi_lock); 3960 uma_zfree(vnodepoll_zone, vi); 3961 } 3962 3963 static void 3964 destroy_vpollinfo(struct vpollinfo *vi) 3965 { 3966 3967 knlist_clear(&vi->vpi_selinfo.si_note, 1); 3968 seldrain(&vi->vpi_selinfo); 3969 destroy_vpollinfo_free(vi); 3970 } 3971 3972 /* 3973 * Initialize per-vnode helper structure to hold poll-related state. 3974 */ 3975 void 3976 v_addpollinfo(struct vnode *vp) 3977 { 3978 struct vpollinfo *vi; 3979 3980 if (vp->v_pollinfo != NULL) 3981 return; 3982 vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); 3983 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 3984 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 3985 vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); 3986 VI_LOCK(vp); 3987 if (vp->v_pollinfo != NULL) { 3988 VI_UNLOCK(vp); 3989 destroy_vpollinfo_free(vi); 3990 return; 3991 } 3992 vp->v_pollinfo = vi; 3993 VI_UNLOCK(vp); 3994 } 3995 3996 /* 3997 * Record a process's interest in events which might happen to 3998 * a vnode. 
Because poll uses the historic select-style interface 3999 * internally, this routine serves as both the ``check for any 4000 * pending events'' and the ``record my interest in future events'' 4001 * functions. (These are done together, while the lock is held, 4002 * to avoid race conditions.) 4003 */ 4004 int 4005 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 4006 { 4007 4008 v_addpollinfo(vp); 4009 mtx_lock(&vp->v_pollinfo->vpi_lock); 4010 if (vp->v_pollinfo->vpi_revents & events) { 4011 /* 4012 * This leaves events we are not interested 4013 * in available for the other process which 4014 * which presumably had requested them 4015 * (otherwise they would never have been 4016 * recorded). 4017 */ 4018 events &= vp->v_pollinfo->vpi_revents; 4019 vp->v_pollinfo->vpi_revents &= ~events; 4020 4021 mtx_unlock(&vp->v_pollinfo->vpi_lock); 4022 return (events); 4023 } 4024 vp->v_pollinfo->vpi_events |= events; 4025 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 4026 mtx_unlock(&vp->v_pollinfo->vpi_lock); 4027 return (0); 4028 } 4029 4030 /* 4031 * Routine to create and manage a filesystem syncer vnode. 4032 */ 4033 #define sync_close ((int (*)(struct vop_close_args *))nullop) 4034 static int sync_fsync(struct vop_fsync_args *); 4035 static int sync_inactive(struct vop_inactive_args *); 4036 static int sync_reclaim(struct vop_reclaim_args *); 4037 4038 static struct vop_vector sync_vnodeops = { 4039 .vop_bypass = VOP_EOPNOTSUPP, 4040 .vop_close = sync_close, /* close */ 4041 .vop_fsync = sync_fsync, /* fsync */ 4042 .vop_inactive = sync_inactive, /* inactive */ 4043 .vop_reclaim = sync_reclaim, /* reclaim */ 4044 .vop_lock1 = vop_stdlock, /* lock */ 4045 .vop_unlock = vop_stdunlock, /* unlock */ 4046 .vop_islocked = vop_stdislocked, /* islocked */ 4047 }; 4048 4049 /* 4050 * Create a new filesystem syncer vnode for the specified mount point. 
4051 */ 4052 void 4053 vfs_allocate_syncvnode(struct mount *mp) 4054 { 4055 struct vnode *vp; 4056 struct bufobj *bo; 4057 static long start, incr, next; 4058 int error; 4059 4060 /* Allocate a new vnode */ 4061 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 4062 if (error != 0) 4063 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 4064 vp->v_type = VNON; 4065 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4066 vp->v_vflag |= VV_FORCEINSMQ; 4067 error = insmntque(vp, mp); 4068 if (error != 0) 4069 panic("vfs_allocate_syncvnode: insmntque() failed"); 4070 vp->v_vflag &= ~VV_FORCEINSMQ; 4071 VOP_UNLOCK(vp, 0); 4072 /* 4073 * Place the vnode onto the syncer worklist. We attempt to 4074 * scatter them about on the list so that they will go off 4075 * at evenly distributed times even if all the filesystems 4076 * are mounted at once. 4077 */ 4078 next += incr; 4079 if (next == 0 || next > syncer_maxdelay) { 4080 start /= 2; 4081 incr /= 2; 4082 if (start == 0) { 4083 start = syncer_maxdelay / 2; 4084 incr = syncer_maxdelay; 4085 } 4086 next = start; 4087 } 4088 bo = &vp->v_bufobj; 4089 BO_LOCK(bo); 4090 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 4091 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 4092 mtx_lock(&sync_mtx); 4093 sync_vnode_count++; 4094 if (mp->mnt_syncer == NULL) { 4095 mp->mnt_syncer = vp; 4096 vp = NULL; 4097 } 4098 mtx_unlock(&sync_mtx); 4099 BO_UNLOCK(bo); 4100 if (vp != NULL) { 4101 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4102 vgone(vp); 4103 vput(vp); 4104 } 4105 } 4106 4107 void 4108 vfs_deallocate_syncvnode(struct mount *mp) 4109 { 4110 struct vnode *vp; 4111 4112 mtx_lock(&sync_mtx); 4113 vp = mp->mnt_syncer; 4114 if (vp != NULL) 4115 mp->mnt_syncer = NULL; 4116 mtx_unlock(&sync_mtx); 4117 if (vp != NULL) 4118 vrele(vp); 4119 } 4120 4121 /* 4122 * Do a lazy sync of the filesystem. 
4123 */ 4124 static int 4125 sync_fsync(struct vop_fsync_args *ap) 4126 { 4127 struct vnode *syncvp = ap->a_vp; 4128 struct mount *mp = syncvp->v_mount; 4129 int error, save; 4130 struct bufobj *bo; 4131 4132 /* 4133 * We only need to do something if this is a lazy evaluation. 4134 */ 4135 if (ap->a_waitfor != MNT_LAZY) 4136 return (0); 4137 4138 /* 4139 * Move ourselves to the back of the sync list. 4140 */ 4141 bo = &syncvp->v_bufobj; 4142 BO_LOCK(bo); 4143 vn_syncer_add_to_worklist(bo, syncdelay); 4144 BO_UNLOCK(bo); 4145 4146 /* 4147 * Walk the list of vnodes pushing all that are dirty and 4148 * not already on the sync list. 4149 */ 4150 if (vfs_busy(mp, MBF_NOWAIT) != 0) 4151 return (0); 4152 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 4153 vfs_unbusy(mp); 4154 return (0); 4155 } 4156 save = curthread_pflags_set(TDP_SYNCIO); 4157 vfs_msync(mp, MNT_NOWAIT); 4158 error = VFS_SYNC(mp, MNT_LAZY); 4159 curthread_pflags_restore(save); 4160 vn_finished_write(mp); 4161 vfs_unbusy(mp); 4162 return (error); 4163 } 4164 4165 /* 4166 * The syncer vnode is no referenced. 4167 */ 4168 static int 4169 sync_inactive(struct vop_inactive_args *ap) 4170 { 4171 4172 vgone(ap->a_vp); 4173 return (0); 4174 } 4175 4176 /* 4177 * The syncer vnode is no longer needed and is being decommissioned. 4178 * 4179 * Modifications to the worklist must be protected by sync_mtx. 
4180 */ 4181 static int 4182 sync_reclaim(struct vop_reclaim_args *ap) 4183 { 4184 struct vnode *vp = ap->a_vp; 4185 struct bufobj *bo; 4186 4187 bo = &vp->v_bufobj; 4188 BO_LOCK(bo); 4189 mtx_lock(&sync_mtx); 4190 if (vp->v_mount->mnt_syncer == vp) 4191 vp->v_mount->mnt_syncer = NULL; 4192 if (bo->bo_flag & BO_ONWORKLST) { 4193 LIST_REMOVE(bo, bo_synclist); 4194 syncer_worklist_len--; 4195 sync_vnode_count--; 4196 bo->bo_flag &= ~BO_ONWORKLST; 4197 } 4198 mtx_unlock(&sync_mtx); 4199 BO_UNLOCK(bo); 4200 4201 return (0); 4202 } 4203 4204 /* 4205 * Check if vnode represents a disk device 4206 */ 4207 int 4208 vn_isdisk(struct vnode *vp, int *errp) 4209 { 4210 int error; 4211 4212 if (vp->v_type != VCHR) { 4213 error = ENOTBLK; 4214 goto out; 4215 } 4216 error = 0; 4217 dev_lock(); 4218 if (vp->v_rdev == NULL) 4219 error = ENXIO; 4220 else if (vp->v_rdev->si_devsw == NULL) 4221 error = ENXIO; 4222 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 4223 error = ENOTBLK; 4224 dev_unlock(); 4225 out: 4226 if (errp != NULL) 4227 *errp = error; 4228 return (error == 0); 4229 } 4230 4231 /* 4232 * Common filesystem object access control check routine. Accepts a 4233 * vnode's type, "mode", uid and gid, requested access mode, credentials, 4234 * and optional call-by-reference privused argument allowing vaccess() 4235 * to indicate to the caller whether privilege was used to satisfy the 4236 * request (obsoleted). Returns 0 on success, or an errno on failure. 
4237 */ 4238 int 4239 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 4240 accmode_t accmode, struct ucred *cred, int *privused) 4241 { 4242 accmode_t dac_granted; 4243 accmode_t priv_granted; 4244 4245 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 4246 ("invalid bit in accmode")); 4247 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 4248 ("VAPPEND without VWRITE")); 4249 4250 /* 4251 * Look for a normal, non-privileged way to access the file/directory 4252 * as requested. If it exists, go with that. 4253 */ 4254 4255 if (privused != NULL) 4256 *privused = 0; 4257 4258 dac_granted = 0; 4259 4260 /* Check the owner. */ 4261 if (cred->cr_uid == file_uid) { 4262 dac_granted |= VADMIN; 4263 if (file_mode & S_IXUSR) 4264 dac_granted |= VEXEC; 4265 if (file_mode & S_IRUSR) 4266 dac_granted |= VREAD; 4267 if (file_mode & S_IWUSR) 4268 dac_granted |= (VWRITE | VAPPEND); 4269 4270 if ((accmode & dac_granted) == accmode) 4271 return (0); 4272 4273 goto privcheck; 4274 } 4275 4276 /* Otherwise, check the groups (first match) */ 4277 if (groupmember(file_gid, cred)) { 4278 if (file_mode & S_IXGRP) 4279 dac_granted |= VEXEC; 4280 if (file_mode & S_IRGRP) 4281 dac_granted |= VREAD; 4282 if (file_mode & S_IWGRP) 4283 dac_granted |= (VWRITE | VAPPEND); 4284 4285 if ((accmode & dac_granted) == accmode) 4286 return (0); 4287 4288 goto privcheck; 4289 } 4290 4291 /* Otherwise, check everyone else. */ 4292 if (file_mode & S_IXOTH) 4293 dac_granted |= VEXEC; 4294 if (file_mode & S_IROTH) 4295 dac_granted |= VREAD; 4296 if (file_mode & S_IWOTH) 4297 dac_granted |= (VWRITE | VAPPEND); 4298 if ((accmode & dac_granted) == accmode) 4299 return (0); 4300 4301 privcheck: 4302 /* 4303 * Build a privilege mask to determine if the set of privileges 4304 * satisfies the requirements when combined with the granted mask 4305 * from above. 
For each privilege, if the privilege is required, 4306 * bitwise or the request type onto the priv_granted mask. 4307 */ 4308 priv_granted = 0; 4309 4310 if (type == VDIR) { 4311 /* 4312 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 4313 * requests, instead of PRIV_VFS_EXEC. 4314 */ 4315 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 4316 !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) 4317 priv_granted |= VEXEC; 4318 } else { 4319 /* 4320 * Ensure that at least one execute bit is on. Otherwise, 4321 * a privileged user will always succeed, and we don't want 4322 * this to happen unless the file really is executable. 4323 */ 4324 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 4325 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 4326 !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) 4327 priv_granted |= VEXEC; 4328 } 4329 4330 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 4331 !priv_check_cred(cred, PRIV_VFS_READ, 0)) 4332 priv_granted |= VREAD; 4333 4334 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 4335 !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) 4336 priv_granted |= (VWRITE | VAPPEND); 4337 4338 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 4339 !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) 4340 priv_granted |= VADMIN; 4341 4342 if ((accmode & (priv_granted | dac_granted)) == accmode) { 4343 /* XXX audit: privilege used */ 4344 if (privused != NULL) 4345 *privused = 1; 4346 return (0); 4347 } 4348 4349 return ((accmode & VADMIN) ? EPERM : EACCES); 4350 } 4351 4352 /* 4353 * Credential check based on process requesting service, and per-attribute 4354 * permissions. 4355 */ 4356 int 4357 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 4358 struct thread *td, accmode_t accmode) 4359 { 4360 4361 /* 4362 * Kernel-invoked always succeeds. 
4363 */ 4364 if (cred == NOCRED) 4365 return (0); 4366 4367 /* 4368 * Do not allow privileged processes in jail to directly manipulate 4369 * system attributes. 4370 */ 4371 switch (attrnamespace) { 4372 case EXTATTR_NAMESPACE_SYSTEM: 4373 /* Potentially should be: return (EPERM); */ 4374 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); 4375 case EXTATTR_NAMESPACE_USER: 4376 return (VOP_ACCESS(vp, accmode, cred, td)); 4377 default: 4378 return (EPERM); 4379 } 4380 } 4381 4382 #ifdef DEBUG_VFS_LOCKS 4383 /* 4384 * This only exists to suppress warnings from unlocked specfs accesses. It is 4385 * no longer ok to have an unlocked VFS. 4386 */ 4387 #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ 4388 (vp)->v_type == VCHR || (vp)->v_type == VBAD) 4389 4390 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 4391 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 4392 "Drop into debugger on lock violation"); 4393 4394 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 4395 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 4396 0, "Check for interlock across VOPs"); 4397 4398 int vfs_badlock_print = 1; /* Print lock violations. */ 4399 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 4400 0, "Print lock violations"); 4401 4402 #ifdef KDB 4403 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
*/ 4404 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 4405 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 4406 #endif 4407 4408 static void 4409 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 4410 { 4411 4412 #ifdef KDB 4413 if (vfs_badlock_backtrace) 4414 kdb_backtrace(); 4415 #endif 4416 if (vfs_badlock_print) 4417 printf("%s: %p %s\n", str, (void *)vp, msg); 4418 if (vfs_badlock_ddb) 4419 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4420 } 4421 4422 void 4423 assert_vi_locked(struct vnode *vp, const char *str) 4424 { 4425 4426 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 4427 vfs_badlock("interlock is not locked but should be", str, vp); 4428 } 4429 4430 void 4431 assert_vi_unlocked(struct vnode *vp, const char *str) 4432 { 4433 4434 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 4435 vfs_badlock("interlock is locked but should not be", str, vp); 4436 } 4437 4438 void 4439 assert_vop_locked(struct vnode *vp, const char *str) 4440 { 4441 int locked; 4442 4443 if (!IGNORE_LOCK(vp)) { 4444 locked = VOP_ISLOCKED(vp); 4445 if (locked == 0 || locked == LK_EXCLOTHER) 4446 vfs_badlock("is not locked but should be", str, vp); 4447 } 4448 } 4449 4450 void 4451 assert_vop_unlocked(struct vnode *vp, const char *str) 4452 { 4453 4454 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 4455 vfs_badlock("is locked but should not be", str, vp); 4456 } 4457 4458 void 4459 assert_vop_elocked(struct vnode *vp, const char *str) 4460 { 4461 4462 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 4463 vfs_badlock("is not exclusive locked but should be", str, vp); 4464 } 4465 4466 #if 0 4467 void 4468 assert_vop_elocked_other(struct vnode *vp, const char *str) 4469 { 4470 4471 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER) 4472 vfs_badlock("is not exclusive locked by another thread", 4473 str, vp); 4474 } 4475 4476 void 4477 assert_vop_slocked(struct vnode *vp, const char *str) 4478 { 4479 4480 if 
(!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED) 4481 vfs_badlock("is not locked shared but should be", str, vp); 4482 } 4483 #endif /* 0 */ 4484 #endif /* DEBUG_VFS_LOCKS */ 4485 4486 void 4487 vop_rename_fail(struct vop_rename_args *ap) 4488 { 4489 4490 if (ap->a_tvp != NULL) 4491 vput(ap->a_tvp); 4492 if (ap->a_tdvp == ap->a_tvp) 4493 vrele(ap->a_tdvp); 4494 else 4495 vput(ap->a_tdvp); 4496 vrele(ap->a_fdvp); 4497 vrele(ap->a_fvp); 4498 } 4499 4500 void 4501 vop_rename_pre(void *ap) 4502 { 4503 struct vop_rename_args *a = ap; 4504 4505 #ifdef DEBUG_VFS_LOCKS 4506 if (a->a_tvp) 4507 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 4508 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 4509 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 4510 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 4511 4512 /* Check the source (from). */ 4513 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 4514 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 4515 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 4516 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 4517 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 4518 4519 /* Check the target. */ 4520 if (a->a_tvp) 4521 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 4522 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 4523 #endif 4524 if (a->a_tdvp != a->a_fdvp) 4525 vhold(a->a_fdvp); 4526 if (a->a_tvp != a->a_fvp) 4527 vhold(a->a_fvp); 4528 vhold(a->a_tdvp); 4529 if (a->a_tvp) 4530 vhold(a->a_tvp); 4531 } 4532 4533 #ifdef DEBUG_VFS_LOCKS 4534 void 4535 vop_strategy_pre(void *ap) 4536 { 4537 struct vop_strategy_args *a; 4538 struct buf *bp; 4539 4540 a = ap; 4541 bp = a->a_bp; 4542 4543 /* 4544 * Cluster ops lock their component buffers but not the IO container. 
4545 */ 4546 if ((bp->b_flags & B_CLUSTER) != 0) 4547 return; 4548 4549 if (panicstr == NULL && !BUF_ISLOCKED(bp)) { 4550 if (vfs_badlock_print) 4551 printf( 4552 "VOP_STRATEGY: bp is not locked but should be\n"); 4553 if (vfs_badlock_ddb) 4554 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4555 } 4556 } 4557 4558 void 4559 vop_lock_pre(void *ap) 4560 { 4561 struct vop_lock1_args *a = ap; 4562 4563 if ((a->a_flags & LK_INTERLOCK) == 0) 4564 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4565 else 4566 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 4567 } 4568 4569 void 4570 vop_lock_post(void *ap, int rc) 4571 { 4572 struct vop_lock1_args *a = ap; 4573 4574 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4575 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 4576 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 4577 } 4578 4579 void 4580 vop_unlock_pre(void *ap) 4581 { 4582 struct vop_unlock_args *a = ap; 4583 4584 if (a->a_flags & LK_INTERLOCK) 4585 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); 4586 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 4587 } 4588 4589 void 4590 vop_unlock_post(void *ap, int rc) 4591 { 4592 struct vop_unlock_args *a = ap; 4593 4594 if (a->a_flags & LK_INTERLOCK) 4595 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); 4596 } 4597 #endif 4598 4599 void 4600 vop_create_post(void *ap, int rc) 4601 { 4602 struct vop_create_args *a = ap; 4603 4604 if (!rc) 4605 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4606 } 4607 4608 void 4609 vop_deleteextattr_post(void *ap, int rc) 4610 { 4611 struct vop_deleteextattr_args *a = ap; 4612 4613 if (!rc) 4614 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4615 } 4616 4617 void 4618 vop_link_post(void *ap, int rc) 4619 { 4620 struct vop_link_args *a = ap; 4621 4622 if (!rc) { 4623 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); 4624 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); 4625 } 4626 } 4627 4628 void 4629 vop_mkdir_post(void *ap, int rc) 4630 { 4631 struct vop_mkdir_args *a = ap; 4632 4633 if (!rc) 4634 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4635 } 4636 4637 void 
4638 vop_mknod_post(void *ap, int rc) 4639 { 4640 struct vop_mknod_args *a = ap; 4641 4642 if (!rc) 4643 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4644 } 4645 4646 void 4647 vop_reclaim_post(void *ap, int rc) 4648 { 4649 struct vop_reclaim_args *a = ap; 4650 4651 if (!rc) 4652 VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE); 4653 } 4654 4655 void 4656 vop_remove_post(void *ap, int rc) 4657 { 4658 struct vop_remove_args *a = ap; 4659 4660 if (!rc) { 4661 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4662 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4663 } 4664 } 4665 4666 void 4667 vop_rename_post(void *ap, int rc) 4668 { 4669 struct vop_rename_args *a = ap; 4670 long hint; 4671 4672 if (!rc) { 4673 hint = NOTE_WRITE; 4674 if (a->a_fdvp == a->a_tdvp) { 4675 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 4676 hint |= NOTE_LINK; 4677 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 4678 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 4679 } else { 4680 hint |= NOTE_EXTEND; 4681 if (a->a_fvp->v_type == VDIR) 4682 hint |= NOTE_LINK; 4683 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 4684 4685 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 4686 a->a_tvp->v_type == VDIR) 4687 hint &= ~NOTE_LINK; 4688 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 4689 } 4690 4691 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 4692 if (a->a_tvp) 4693 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 4694 } 4695 if (a->a_tdvp != a->a_fdvp) 4696 vdrop(a->a_fdvp); 4697 if (a->a_tvp != a->a_fvp) 4698 vdrop(a->a_fvp); 4699 vdrop(a->a_tdvp); 4700 if (a->a_tvp) 4701 vdrop(a->a_tvp); 4702 } 4703 4704 void 4705 vop_rmdir_post(void *ap, int rc) 4706 { 4707 struct vop_rmdir_args *a = ap; 4708 4709 if (!rc) { 4710 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4711 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4712 } 4713 } 4714 4715 void 4716 vop_setattr_post(void *ap, int rc) 4717 { 4718 struct vop_setattr_args *a = ap; 4719 4720 if (!rc) 4721 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4722 } 4723 4724 void 4725 vop_setextattr_post(void *ap, int rc) 4726 { 4727 struct 
vop_setextattr_args *a = ap; 4728 4729 if (!rc) 4730 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4731 } 4732 4733 void 4734 vop_symlink_post(void *ap, int rc) 4735 { 4736 struct vop_symlink_args *a = ap; 4737 4738 if (!rc) 4739 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4740 } 4741 4742 void 4743 vop_open_post(void *ap, int rc) 4744 { 4745 struct vop_open_args *a = ap; 4746 4747 if (!rc) 4748 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 4749 } 4750 4751 void 4752 vop_close_post(void *ap, int rc) 4753 { 4754 struct vop_close_args *a = ap; 4755 4756 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 4757 (a->a_vp->v_iflag & VI_DOOMED) == 0)) { 4758 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 4759 NOTE_CLOSE_WRITE : NOTE_CLOSE); 4760 } 4761 } 4762 4763 void 4764 vop_read_post(void *ap, int rc) 4765 { 4766 struct vop_read_args *a = ap; 4767 4768 if (!rc) 4769 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 4770 } 4771 4772 void 4773 vop_readdir_post(void *ap, int rc) 4774 { 4775 struct vop_readdir_args *a = ap; 4776 4777 if (!rc) 4778 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 4779 } 4780 4781 static struct knlist fs_knlist; 4782 4783 static void 4784 vfs_event_init(void *arg) 4785 { 4786 knlist_init_mtx(&fs_knlist, NULL); 4787 } 4788 /* XXX - correct order? 
*/ 4789 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 4790 4791 void 4792 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 4793 { 4794 4795 KNOTE_UNLOCKED(&fs_knlist, event); 4796 } 4797 4798 static int filt_fsattach(struct knote *kn); 4799 static void filt_fsdetach(struct knote *kn); 4800 static int filt_fsevent(struct knote *kn, long hint); 4801 4802 struct filterops fs_filtops = { 4803 .f_isfd = 0, 4804 .f_attach = filt_fsattach, 4805 .f_detach = filt_fsdetach, 4806 .f_event = filt_fsevent 4807 }; 4808 4809 static int 4810 filt_fsattach(struct knote *kn) 4811 { 4812 4813 kn->kn_flags |= EV_CLEAR; 4814 knlist_add(&fs_knlist, kn, 0); 4815 return (0); 4816 } 4817 4818 static void 4819 filt_fsdetach(struct knote *kn) 4820 { 4821 4822 knlist_remove(&fs_knlist, kn, 0); 4823 } 4824 4825 static int 4826 filt_fsevent(struct knote *kn, long hint) 4827 { 4828 4829 kn->kn_fflags |= hint; 4830 return (kn->kn_fflags != 0); 4831 } 4832 4833 static int 4834 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 4835 { 4836 struct vfsidctl vc; 4837 int error; 4838 struct mount *mp; 4839 4840 error = SYSCTL_IN(req, &vc, sizeof(vc)); 4841 if (error) 4842 return (error); 4843 if (vc.vc_vers != VFS_CTL_VERS1) 4844 return (EINVAL); 4845 mp = vfs_getvfs(&vc.vc_fsid); 4846 if (mp == NULL) 4847 return (ENOENT); 4848 /* ensure that a specific sysctl goes to the right filesystem. */ 4849 if (strcmp(vc.vc_fstypename, "*") != 0 && 4850 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 4851 vfs_rel(mp); 4852 return (EINVAL); 4853 } 4854 VCTLTOREQ(&vc, req); 4855 error = VFS_SYSCTL(mp, vc.vc_op, req); 4856 vfs_rel(mp); 4857 return (error); 4858 } 4859 4860 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, 4861 NULL, 0, sysctl_vfs_ctl, "", 4862 "Sysctl by fsid"); 4863 4864 /* 4865 * Function to initialize a va_filerev field sensibly. 4866 * XXX: Wouldn't a random number make a lot more sense ?? 
4867 */ 4868 u_quad_t 4869 init_va_filerev(void) 4870 { 4871 struct bintime bt; 4872 4873 getbinuptime(&bt); 4874 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 4875 } 4876 4877 static int filt_vfsread(struct knote *kn, long hint); 4878 static int filt_vfswrite(struct knote *kn, long hint); 4879 static int filt_vfsvnode(struct knote *kn, long hint); 4880 static void filt_vfsdetach(struct knote *kn); 4881 static struct filterops vfsread_filtops = { 4882 .f_isfd = 1, 4883 .f_detach = filt_vfsdetach, 4884 .f_event = filt_vfsread 4885 }; 4886 static struct filterops vfswrite_filtops = { 4887 .f_isfd = 1, 4888 .f_detach = filt_vfsdetach, 4889 .f_event = filt_vfswrite 4890 }; 4891 static struct filterops vfsvnode_filtops = { 4892 .f_isfd = 1, 4893 .f_detach = filt_vfsdetach, 4894 .f_event = filt_vfsvnode 4895 }; 4896 4897 static void 4898 vfs_knllock(void *arg) 4899 { 4900 struct vnode *vp = arg; 4901 4902 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4903 } 4904 4905 static void 4906 vfs_knlunlock(void *arg) 4907 { 4908 struct vnode *vp = arg; 4909 4910 VOP_UNLOCK(vp, 0); 4911 } 4912 4913 static void 4914 vfs_knl_assert_locked(void *arg) 4915 { 4916 #ifdef DEBUG_VFS_LOCKS 4917 struct vnode *vp = arg; 4918 4919 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 4920 #endif 4921 } 4922 4923 static void 4924 vfs_knl_assert_unlocked(void *arg) 4925 { 4926 #ifdef DEBUG_VFS_LOCKS 4927 struct vnode *vp = arg; 4928 4929 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 4930 #endif 4931 } 4932 4933 int 4934 vfs_kqfilter(struct vop_kqfilter_args *ap) 4935 { 4936 struct vnode *vp = ap->a_vp; 4937 struct knote *kn = ap->a_kn; 4938 struct knlist *knl; 4939 4940 switch (kn->kn_filter) { 4941 case EVFILT_READ: 4942 kn->kn_fop = &vfsread_filtops; 4943 break; 4944 case EVFILT_WRITE: 4945 kn->kn_fop = &vfswrite_filtops; 4946 break; 4947 case EVFILT_VNODE: 4948 kn->kn_fop = &vfsvnode_filtops; 4949 break; 4950 default: 4951 return (EINVAL); 4952 } 4953 4954 kn->kn_hook = (caddr_t)vp; 4955 
4956 v_addpollinfo(vp); 4957 if (vp->v_pollinfo == NULL) 4958 return (ENOMEM); 4959 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 4960 vhold(vp); 4961 knlist_add(knl, kn, 0); 4962 4963 return (0); 4964 } 4965 4966 /* 4967 * Detach knote from vnode 4968 */ 4969 static void 4970 filt_vfsdetach(struct knote *kn) 4971 { 4972 struct vnode *vp = (struct vnode *)kn->kn_hook; 4973 4974 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 4975 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 4976 vdrop(vp); 4977 } 4978 4979 /*ARGSUSED*/ 4980 static int 4981 filt_vfsread(struct knote *kn, long hint) 4982 { 4983 struct vnode *vp = (struct vnode *)kn->kn_hook; 4984 struct vattr va; 4985 int res; 4986 4987 /* 4988 * filesystem is gone, so set the EOF flag and schedule 4989 * the knote for deletion. 4990 */ 4991 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 4992 VI_LOCK(vp); 4993 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 4994 VI_UNLOCK(vp); 4995 return (1); 4996 } 4997 4998 if (VOP_GETATTR(vp, &va, curthread->td_ucred)) 4999 return (0); 5000 5001 VI_LOCK(vp); 5002 kn->kn_data = va.va_size - kn->kn_fp->f_offset; 5003 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 5004 VI_UNLOCK(vp); 5005 return (res); 5006 } 5007 5008 /*ARGSUSED*/ 5009 static int 5010 filt_vfswrite(struct knote *kn, long hint) 5011 { 5012 struct vnode *vp = (struct vnode *)kn->kn_hook; 5013 5014 VI_LOCK(vp); 5015 5016 /* 5017 * filesystem is gone, so set the EOF flag and schedule 5018 * the knote for deletion. 
5019 */ 5020 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 5021 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 5022 5023 kn->kn_data = 0; 5024 VI_UNLOCK(vp); 5025 return (1); 5026 } 5027 5028 static int 5029 filt_vfsvnode(struct knote *kn, long hint) 5030 { 5031 struct vnode *vp = (struct vnode *)kn->kn_hook; 5032 int res; 5033 5034 VI_LOCK(vp); 5035 if (kn->kn_sfflags & hint) 5036 kn->kn_fflags |= hint; 5037 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 5038 kn->kn_flags |= EV_EOF; 5039 VI_UNLOCK(vp); 5040 return (1); 5041 } 5042 res = (kn->kn_fflags != 0); 5043 VI_UNLOCK(vp); 5044 return (res); 5045 } 5046 5047 int 5048 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 5049 { 5050 int error; 5051 5052 if (dp->d_reclen > ap->a_uio->uio_resid) 5053 return (ENAMETOOLONG); 5054 error = uiomove(dp, dp->d_reclen, ap->a_uio); 5055 if (error) { 5056 if (ap->a_ncookies != NULL) { 5057 if (ap->a_cookies != NULL) 5058 free(ap->a_cookies, M_TEMP); 5059 ap->a_cookies = NULL; 5060 *ap->a_ncookies = 0; 5061 } 5062 return (error); 5063 } 5064 if (ap->a_ncookies == NULL) 5065 return (0); 5066 5067 KASSERT(ap->a_cookies, 5068 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 5069 5070 *ap->a_cookies = realloc(*ap->a_cookies, 5071 (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); 5072 (*ap->a_cookies)[*ap->a_ncookies] = off; 5073 *ap->a_ncookies += 1; 5074 return (0); 5075 } 5076 5077 /* 5078 * Mark for update the access time of the file if the filesystem 5079 * supports VOP_MARKATIME. This functionality is used by execve and 5080 * mmap, so we want to avoid the I/O implied by directly setting 5081 * va_atime for the sake of efficiency. 
5082 */ 5083 void 5084 vfs_mark_atime(struct vnode *vp, struct ucred *cred) 5085 { 5086 struct mount *mp; 5087 5088 mp = vp->v_mount; 5089 ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); 5090 if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) 5091 (void)VOP_MARKATIME(vp); 5092 } 5093 5094 /* 5095 * The purpose of this routine is to remove granularity from accmode_t, 5096 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 5097 * VADMIN and VAPPEND. 5098 * 5099 * If it returns 0, the caller is supposed to continue with the usual 5100 * access checks using 'accmode' as modified by this routine. If it 5101 * returns nonzero value, the caller is supposed to return that value 5102 * as errno. 5103 * 5104 * Note that after this routine runs, accmode may be zero. 5105 */ 5106 int 5107 vfs_unixify_accmode(accmode_t *accmode) 5108 { 5109 /* 5110 * There is no way to specify explicit "deny" rule using 5111 * file mode or POSIX.1e ACLs. 5112 */ 5113 if (*accmode & VEXPLICIT_DENY) { 5114 *accmode = 0; 5115 return (0); 5116 } 5117 5118 /* 5119 * None of these can be translated into usual access bits. 5120 * Also, the common case for NFSv4 ACLs is to not contain 5121 * either of these bits. Caller should check for VWRITE 5122 * on the containing directory instead. 5123 */ 5124 if (*accmode & (VDELETE_CHILD | VDELETE)) 5125 return (EPERM); 5126 5127 if (*accmode & VADMIN_PERMS) { 5128 *accmode &= ~VADMIN_PERMS; 5129 *accmode |= VADMIN; 5130 } 5131 5132 /* 5133 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 5134 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 5135 */ 5136 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 5137 5138 return (0); 5139 } 5140 5141 /* 5142 * These are helper functions for filesystems to traverse all 5143 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 5144 * 5145 * This interface replaces MNT_VNODE_FOREACH. 
 */

MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");

/*
 * Step an all-vnodes iteration.  Starting after the marker *mvp, skip
 * marker vnodes and vnodes flagged VI_DOOMED.  If a vnode is found, the
 * marker is re-queued right behind it and the vnode is returned with its
 * interlock held and the mount interlock released.  At the end of the list
 * the marker is freed through __mnt_vnode_markerfree_all() and NULL is
 * returned.
 */
struct vnode *
__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	if (should_yield())
		kern_yield(PRI_USER);
	MNT_ILOCK(mp);
	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
	while (vp != NULL && (vp->v_type == VMARKER ||
	    (vp->v_iflag & VI_DOOMED) != 0))
		vp = TAILQ_NEXT(vp, v_nmntvnodes);

	/* Check if we are done */
	if (vp == NULL) {
		__mnt_vnode_markerfree_all(mvp, mp);
		/* MNT_IUNLOCK(mp); -- done in above function */
		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
		return (NULL);
	}
	/* Move the marker so the next step resumes after vp. */
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	VI_LOCK(vp);
	MNT_IUNLOCK(mp);
	return (vp);
}

/*
 * Begin an all-vnodes iteration.  Allocates the marker vnode, takes a
 * reference on the mount and returns the first vnode that is neither a
 * marker nor VI_DOOMED, with its interlock held and the marker queued
 * right behind it.  When there is no such vnode the mount reference and
 * the marker are released again, *mvp is set to NULL and NULL is returned.
 */
struct vnode *
__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
	MNT_ILOCK(mp);
	MNT_REF(mp);
	(*mvp)->v_type = VMARKER;

	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
	while (vp != NULL && (vp->v_type == VMARKER ||
	    (vp->v_iflag & VI_DOOMED) != 0))
		vp = TAILQ_NEXT(vp, v_nmntvnodes);

	/* Check if we are done */
	if (vp == NULL) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		free(*mvp, M_VNODE_MARKER);
		*mvp = NULL;
		return (NULL);
	}
	(*mvp)->v_mount = mp;
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	VI_LOCK(vp);
	MNT_IUNLOCK(mp);
	return (vp);
}


/*
 * End an all-vnodes iteration.  The mount interlock is released on every
 * path.  When a marker is still present the interlock must be held on
 * entry (asserted); the marker is unlinked from the mount's vnode list,
 * the mount reference is dropped, and the marker is freed with *mvp reset
 * to NULL.
 */
void
__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
{

	if (*mvp == NULL) {
		MNT_IUNLOCK(mp);
		return;
	}

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	free(*mvp, M_VNODE_MARKER);
	*mvp = NULL;
}

/*
 * These are helper functions for filesystems to traverse their
 * active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
 */

/*
 * Drop the mount reference taken for an active-list iteration and free the
 * marker, resetting *mvp to NULL.  This function does not unlink the
 * marker from the active list; the callers in this file remove it first or
 * never inserted it.
 */
static void
mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
{

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));

	MNT_ILOCK(mp);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	free(*mvp, M_VNODE_MARKER);
	*mvp = NULL;
}

/*
 * Core of the active-list iteration.  Must be entered with
 * vnode_free_list_mtx held (asserted); the mutex is released before
 * returning.  Returns the next vnode after the marker that belongs to mp
 * and is not VI_DOOMED, with its interlock held and the marker re-queued
 * right behind it.  Returns NULL, with the marker freed, when the list is
 * exhausted.
 */
static struct vnode *
mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp, *nvp;

	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
restart:
	/* Pick up the marker's successor, then take the marker off the list. */
	vp = TAILQ_NEXT(*mvp, v_actfreelist);
	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
	while (vp != NULL) {
		if (vp->v_type == VMARKER) {
			vp = TAILQ_NEXT(vp, v_actfreelist);
			continue;
		}
		if (!VI_TRYLOCK(vp)) {
			/*
			 * The interlock is busy.  On a uniprocessor, or when
			 * this thread should yield, park the marker in front
			 * of vp, drop the list mutex, sleep briefly and start
			 * over from the marker.  Otherwise retry the same
			 * vnode immediately.
			 */
			if (mp_ncpus == 1 || should_yield()) {
				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
				mtx_unlock(&vnode_free_list_mtx);
				pause("vnacti", 1);
				mtx_lock(&vnode_free_list_mtx);
				goto restart;
			}
			continue;
		}
		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
		    ("alien vnode on the active list %p %p", vp, mp));
		if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
			break;
		/* Not usable: unlock it and move on to its successor. */
		nvp = TAILQ_NEXT(vp, v_actfreelist);
		VI_UNLOCK(vp);
		vp = nvp;
	}

	/* Check if we are done */
	if (vp == NULL) {
		mtx_unlock(&vnode_free_list_mtx);
		mnt_vnode_markerfree_active(mvp, mp);
		return (NULL);
	}
	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
	mtx_unlock(&vnode_free_list_mtx);
	ASSERT_VI_LOCKED(vp, "active iter");
	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
	return (vp);
}

/*
 * Step an active-list iteration: yield if appropriate, take
 * vnode_free_list_mtx and defer to mnt_vnode_next_active().
 */
struct vnode *
__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
{

	if (should_yield())
		kern_yield(PRI_USER);
	mtx_lock(&vnode_free_list_mtx);
	return (mnt_vnode_next_active(mvp, mp));
}

/*
 * Begin an active-list iteration.  Allocates the marker and references the
 * mount.  If the active list is empty, both are released again and NULL is
 * returned.  Otherwise the marker is queued in front of the first entry
 * and the first usable vnode is fetched via mnt_vnode_next_active().
 */
struct vnode *
__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
	MNT_ILOCK(mp);
	MNT_REF(mp);
	MNT_IUNLOCK(mp);
	(*mvp)->v_type = VMARKER;
	(*mvp)->v_mount = mp;

	mtx_lock(&vnode_free_list_mtx);
	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
	if (vp == NULL) {
		mtx_unlock(&vnode_free_list_mtx);
		mnt_vnode_markerfree_active(mvp, mp);
		return (NULL);
	}
	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
	return (mnt_vnode_next_active(mvp, mp));
}

/*
 * Abort an active-list iteration early.  Does nothing when the marker has
 * already been freed (*mvp == NULL).  Otherwise unlinks the marker from
 * the active list under vnode_free_list_mtx and releases it.
 */
void
__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
{

	if (*mvp == NULL)
		return;

	mtx_lock(&vnode_free_list_mtx);
	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
	mtx_unlock(&vnode_free_list_mtx);
	mnt_vnode_markerfree_active(mvp, mp);
}