1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 * 36 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 37 */ 38 39 /* 40 * External virtual filesystem routines 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_ddb.h" 47 #include "opt_watchdog.h" 48 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/bio.h> 52 #include <sys/buf.h> 53 #include <sys/condvar.h> 54 #include <sys/conf.h> 55 #include <sys/counter.h> 56 #include <sys/dirent.h> 57 #include <sys/event.h> 58 #include <sys/eventhandler.h> 59 #include <sys/extattr.h> 60 #include <sys/file.h> 61 #include <sys/fcntl.h> 62 #include <sys/jail.h> 63 #include <sys/kdb.h> 64 #include <sys/kernel.h> 65 #include <sys/kthread.h> 66 #include <sys/ktr.h> 67 #include <sys/lockf.h> 68 #include <sys/malloc.h> 69 #include <sys/mount.h> 70 #include <sys/namei.h> 71 #include <sys/pctrie.h> 72 #include <sys/priv.h> 73 #include <sys/reboot.h> 74 #include <sys/refcount.h> 75 #include <sys/rwlock.h> 76 #include <sys/sched.h> 77 #include <sys/sleepqueue.h> 78 #include <sys/smp.h> 79 #include <sys/stat.h> 80 #include <sys/sysctl.h> 81 #include <sys/syslog.h> 82 #include <sys/vmmeter.h> 83 #include <sys/vnode.h> 84 #include <sys/watchdog.h> 85 86 #include <machine/stdarg.h> 87 88 #include <security/mac/mac_framework.h> 89 90 #include <vm/vm.h> 91 #include <vm/vm_object.h> 92 #include <vm/vm_extern.h> 93 #include <vm/pmap.h> 94 #include <vm/vm_map.h> 95 #include <vm/vm_page.h> 96 #include <vm/vm_kern.h> 97 #include <vm/uma.h> 98 99 #ifdef DDB 100 #include <ddb/ddb.h> 101 #endif 102 103 static void delmntque(struct vnode *vp); 104 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 105 int slpflag, int slptimeo); 106 static void syncer_shutdown(void *arg, int howto); 107 static int vtryrecycle(struct vnode *vp); 108 static void v_init_counters(struct vnode *); 109 static void v_incr_usecount(struct vnode *); 110 static void v_incr_usecount_locked(struct vnode *); 111 static void v_incr_devcount(struct vnode *); 112 static void v_decr_devcount(struct vnode *); 113 static void vgonel(struct vnode *); 114 static void vfs_knllock(void *arg); 115 static void vfs_knlunlock(void *arg); 116 static void vfs_knl_assert_locked(void *arg); 117 static void vfs_knl_assert_unlocked(void *arg); 118 static void vnlru_return_batches(struct vfsops *mnt_op); 119 static void destroy_vpollinfo(struct vpollinfo *vi); 120 121 /* 122 * These fences are intended for cases where some synchronization is 123 * needed between access of v_iflags and lockless vnode refcount (v_holdcnt 124 * and v_usecount) updates. Access to v_iflags is generally synchronized 125 * by the interlock, but we have some internal assertions that check vnode 126 * flags without acquiring the lock. Thus, these fences are INVARIANTS-only 127 * for now. 128 */ 129 #ifdef INVARIANTS 130 #define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq() 131 #define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel() 132 #else 133 #define VNODE_REFCOUNT_FENCE_ACQ() 134 #define VNODE_REFCOUNT_FENCE_REL() 135 #endif 136 137 /* 138 * Number of vnodes in existence. Increased whenever getnewvnode() 139 * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode. 
140 */ 141 static unsigned long numvnodes; 142 143 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, 144 "Number of vnodes in existence"); 145 146 static counter_u64_t vnodes_created; 147 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 148 "Number of vnodes created by getnewvnode"); 149 150 static u_long mnt_free_list_batch = 128; 151 SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW, 152 &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list"); 153 154 /* 155 * Conversion tables for conversion from vnode types to inode formats 156 * and back. 157 */ 158 enum vtype iftovt_tab[16] = { 159 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 160 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON 161 }; 162 int vttoif_tab[10] = { 163 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 164 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT 165 }; 166 167 /* 168 * List of vnodes that are ready for recycling. 169 */ 170 static TAILQ_HEAD(freelst, vnode) vnode_free_list; 171 172 /* 173 * "Free" vnode target. Free vnodes are rarely completely free, but are 174 * just ones that are cheap to recycle. Usually they are for files which 175 * have been stat'd but not read; these usually have inode and namecache 176 * data attached to them. This target is the preferred minimum size of a 177 * sub-cache consisting mostly of such files. The system balances the size 178 * of this sub-cache with its complement to try to prevent either from 179 * thrashing while the other is relatively inactive. The targets express 180 * a preference for the best balance. 181 * 182 * "Above" this target there are 2 further targets (watermarks) related 183 * to recyling of free vnodes. In the best-operating case, the cache is 184 * exactly full, the free list has size between vlowat and vhiwat above the 185 * free target, and recycling from it and normal use maintains this state. 186 * Sometimes the free list is below vlowat or even empty, but this state 187 * is even better for immediate use provided the cache is not full. 188 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free 189 * ones) to reach one of these states. The watermarks are currently hard- 190 * coded as 4% and 9% of the available space higher. These and the default 191 * of 25% for wantfreevnodes are too large if the memory size is large. 192 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim 193 * whenever vnlru_proc() becomes active. 194 */ 195 static u_long wantfreevnodes; 196 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, 197 &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes"); 198 static u_long freevnodes; 199 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, 200 &freevnodes, 0, "Number of \"free\" vnodes"); 201 202 static counter_u64_t recycles_count; 203 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 204 "Number of vnodes recycled to meet vnode cache targets"); 205 206 /* 207 * Various variables used for debugging the new implementation of 208 * reassignbuf(). 209 * XXX these are probably of (very) limited utility now. 
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
    "Number of calls to reassignbuf");

static counter_u64_t free_owe_inact;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact,
    "Number of times free vnodes kept on active list due to VFS "
    "owing inactivation");

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx vnode_free_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

static uma_zone_t buf_trie_zone;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata updates on
 * filesystems mounted on block devices are delayed only about half the
 * time that file data is delayed.  Similarly, directory updates are more
 * critical, so they are delayed only about a third of the time that file
 * data is delayed.  Thus, there are SYNCER_MAXDELAY queues that are
 * processed round-robin at a rate of one each second (driven off the
 * filesystem syncer process).  The syncer_delayno variable indicates the
 * next queue that is to be processed.
255 * Items that need to be processed soon are placed in this queue: 256 * 257 * syncer_workitem_pending[syncer_delayno] 258 * 259 * A delay of fifteen seconds is done by placing the request fifteen 260 * entries later in the queue: 261 * 262 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 263 * 264 */ 265 static int syncer_delayno; 266 static long syncer_mask; 267 LIST_HEAD(synclist, bufobj); 268 static struct synclist *syncer_workitem_pending; 269 /* 270 * The sync_mtx protects: 271 * bo->bo_synclist 272 * sync_vnode_count 273 * syncer_delayno 274 * syncer_state 275 * syncer_workitem_pending 276 * syncer_worklist_len 277 * rushjob 278 */ 279 static struct mtx sync_mtx; 280 static struct cv sync_wakeup; 281 282 #define SYNCER_MAXDELAY 32 283 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 284 static int syncdelay = 30; /* max time to delay syncing data */ 285 static int filedelay = 30; /* time to delay syncing files */ 286 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 287 "Time to delay syncing files (in seconds)"); 288 static int dirdelay = 29; /* time to delay syncing directories */ 289 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 290 "Time to delay syncing directories (in seconds)"); 291 static int metadelay = 28; /* time to delay syncing metadata */ 292 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 293 "Time to delay syncing metadata (in seconds)"); 294 static int rushjob; /* number of slots to run ASAP */ 295 static int stat_rush_requests; /* number of times I/O speeded up */ 296 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 297 "Number of times I/O speeded up (rush requests)"); 298 299 /* 300 * When shutting down the syncer, run it at four times normal speed. 301 */ 302 #define SYNCER_SHUTDOWN_SPEEDUP 4 303 static int sync_vnode_count; 304 static int syncer_worklist_len; 305 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 306 syncer_state; 307 308 /* Target for maximum number of vnodes. */ 309 int desiredvnodes; 310 static int gapvnodes; /* gap between wanted and desired */ 311 static int vhiwat; /* enough extras after expansion */ 312 static int vlowat; /* minimal extras before expansion */ 313 static int vstir; /* nonzero to stir non-free vnodes */ 314 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 315 316 static int 317 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) 318 { 319 int error, old_desiredvnodes; 320 321 old_desiredvnodes = desiredvnodes; 322 if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) 323 return (error); 324 if (old_desiredvnodes != desiredvnodes) { 325 wantfreevnodes = desiredvnodes / 4; 326 /* XXX locking seems to be incomplete. */ 327 vfs_hash_changesize(desiredvnodes); 328 cache_changesize(desiredvnodes); 329 } 330 return (0); 331 } 332 333 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 334 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, 335 sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes"); 336 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 337 &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); 338 static int vnlru_nowhere; 339 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, 340 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 341 342 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. 
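 * vnsz2log is set in vntblinit() below to the log2 of the largest power of
 * two not exceeding sizeof(struct vnode).  Since any two vnodes are at least
 * sizeof(struct vnode) bytes apart, distinct vnodes still hash to distinct
 * v_hash values after the shift, while the discarded low address bits carry
 * little information.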
*/ 343 static int vnsz2log; 344 345 /* 346 * Support for the bufobj clean & dirty pctrie. 347 */ 348 static void * 349 buf_trie_alloc(struct pctrie *ptree) 350 { 351 352 return uma_zalloc(buf_trie_zone, M_NOWAIT); 353 } 354 355 static void 356 buf_trie_free(struct pctrie *ptree, void *node) 357 { 358 359 uma_zfree(buf_trie_zone, node); 360 } 361 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); 362 363 /* 364 * Initialize the vnode management data structures. 365 * 366 * Reevaluate the following cap on the number of vnodes after the physical 367 * memory size exceeds 512GB. In the limit, as the physical memory size 368 * grows, the ratio of the memory size in KB to vnodes approaches 64:1. 369 */ 370 #ifndef MAXVNODES_MAX 371 #define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */ 372 #endif 373 374 /* 375 * Initialize a vnode as it first enters the zone. 376 */ 377 static int 378 vnode_init(void *mem, int size, int flags) 379 { 380 struct vnode *vp; 381 382 vp = mem; 383 bzero(vp, size); 384 /* 385 * Setup locks. 386 */ 387 vp->v_vnlock = &vp->v_lock; 388 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 389 /* 390 * By default, don't allow shared locks unless filesystems opt-in. 391 */ 392 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 393 LK_NOSHARE | LK_IS_VNODE); 394 /* 395 * Initialize bufobj. 396 */ 397 bufobj_init(&vp->v_bufobj, vp); 398 /* 399 * Initialize namecache. 400 */ 401 LIST_INIT(&vp->v_cache_src); 402 TAILQ_INIT(&vp->v_cache_dst); 403 /* 404 * Initialize rangelocks. 405 */ 406 rangelock_init(&vp->v_rl); 407 return (0); 408 } 409 410 /* 411 * Free a vnode when it is cleared from the zone. 412 */ 413 static void 414 vnode_fini(void *mem, int size) 415 { 416 struct vnode *vp; 417 struct bufobj *bo; 418 419 vp = mem; 420 rangelock_destroy(&vp->v_rl); 421 lockdestroy(vp->v_vnlock); 422 mtx_destroy(&vp->v_interlock); 423 bo = &vp->v_bufobj; 424 rw_destroy(BO_LOCKPTR(bo)); 425 } 426 427 /* 428 * Provide the size of NFS nclnode and NFS fh for calculation of the 429 * vnode memory consumption. The size is specified directly to 430 * eliminate dependency on NFS-private header. 431 * 432 * Other filesystems may use bigger or smaller (like UFS and ZFS) 433 * private inode data, but the NFS-based estimation is ample enough. 434 * Still, we care about differences in the size between 64- and 32-bit 435 * platforms. 436 * 437 * Namecache structure size is heuristically 438 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. 439 */ 440 #ifdef _LP64 441 #define NFS_NCLNODE_SZ (528 + 64) 442 #define NC_SZ 148 443 #else 444 #define NFS_NCLNODE_SZ (360 + 32) 445 #define NC_SZ 92 446 #endif 447 448 static void 449 vntblinit(void *dummy __unused) 450 { 451 u_int i; 452 int physvnodes, virtvnodes; 453 454 /* 455 * Desiredvnodes is a function of the physical memory size and the 456 * kernel's heap size. Generally speaking, it scales with the 457 * physical memory size. The ratio of desiredvnodes to the physical 458 * memory size is 1:16 until desiredvnodes exceeds 98,304. 459 * Thereafter, the 460 * marginal ratio of desiredvnodes to the physical memory size is 461 * 1:64. However, desiredvnodes is limited by the kernel's heap 462 * size. The memory required by desiredvnodes vnodes and vm objects 463 * must not exceed 1/10th of the kernel's heap size. 
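	 *
	 * As a rough illustration only (exact figures vary by platform and
	 * tuning): with 4GB of physical memory, pgtok(vm_cnt.v_page_count)
	 * is about 4,194,304 (KB), so the formula below yields physvnodes of
	 * roughly maxproc + 4194304/64 + 3 * 1572864/64, i.e. about
	 * maxproc + 139,264, possibly capped further by virtvnodes on
	 * systems with a small kernel heap.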
464 */ 465 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 466 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; 467 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + 468 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); 469 desiredvnodes = min(physvnodes, virtvnodes); 470 if (desiredvnodes > MAXVNODES_MAX) { 471 if (bootverbose) 472 printf("Reducing kern.maxvnodes %d -> %d\n", 473 desiredvnodes, MAXVNODES_MAX); 474 desiredvnodes = MAXVNODES_MAX; 475 } 476 wantfreevnodes = desiredvnodes / 4; 477 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 478 TAILQ_INIT(&vnode_free_list); 479 mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); 480 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, 481 vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); 482 vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), 483 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 484 /* 485 * Preallocate enough nodes to support one-per buf so that 486 * we can not fail an insert. reassignbuf() callers can not 487 * tolerate the insertion failure. 488 */ 489 buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), 490 NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 491 UMA_ZONE_NOFREE | UMA_ZONE_VM); 492 uma_prealloc(buf_trie_zone, nbuf); 493 494 vnodes_created = counter_u64_alloc(M_WAITOK); 495 recycles_count = counter_u64_alloc(M_WAITOK); 496 free_owe_inact = counter_u64_alloc(M_WAITOK); 497 498 /* 499 * Initialize the filesystem syncer. 500 */ 501 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 502 &syncer_mask); 503 syncer_maxdelay = syncer_mask + 1; 504 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); 505 cv_init(&sync_wakeup, "syncer"); 506 for (i = 1; i <= sizeof(struct vnode); i <<= 1) 507 vnsz2log++; 508 vnsz2log--; 509 } 510 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); 511 512 513 /* 514 * Mark a mount point as busy. Used to synchronize access and to delay 515 * unmounting. Eventually, mountlist_mtx is not released on failure. 516 * 517 * vfs_busy() is a custom lock, it can block the caller. 518 * vfs_busy() only sleeps if the unmount is active on the mount point. 519 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any 520 * vnode belonging to mp. 521 * 522 * Lookup uses vfs_busy() to traverse mount points. 523 * root fs var fs 524 * / vnode lock A / vnode lock (/var) D 525 * /var vnode lock B /log vnode lock(/var/log) E 526 * vfs_busy lock C vfs_busy lock F 527 * 528 * Within each file system, the lock order is C->A->B and F->D->E. 529 * 530 * When traversing across mounts, the system follows that lock order: 531 * 532 * C->A->B 533 * | 534 * +->F->D->E 535 * 536 * The lookup() process for namei("/var") illustrates the process: 537 * VOP_LOOKUP() obtains B while A is held 538 * vfs_busy() obtains a shared lock on F while A and B are held 539 * vput() releases lock on B 540 * vput() releases lock on A 541 * VFS_ROOT() obtains lock on D while shared lock on F is held 542 * vfs_unbusy() releases shared lock on F 543 * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. 544 * Attempt to lock A (instead of vp_crossmp) while D is held would 545 * violate the global order, causing deadlocks. 546 * 547 * dounmount() locks B while F is drained. 
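 *
 * With MBF_NOWAIT, vfs_busy() does not sleep; it fails with ENOENT as soon
 * as it sees an unmount in progress.  With MBF_MNTLSTLOCK, the caller holds
 * mountlist_mtx, which is dropped around the sleep and released on the
 * successful return path, but is left held when the busy attempt fails.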
 */
int
vfs_busy(struct mount *mp, int flags)
{

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	MNT_ILOCK(mp);
	MNT_REF(mp);
	/*
	 * If the mount point is currently being unmounted, sleep until the
	 * mount point's fate is decided.  If the thread doing the unmounting
	 * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
	 * indicating that this mount point has survived the unmount attempt
	 * and vfs_busy should retry.  Otherwise the unmounter thread will set
	 * the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating that
	 * the mount point is about to be really destroyed.  vfs_busy needs to
	 * release its reference on the mount point in this case and return
	 * with ENOENT, telling the caller that the mount point it tried to
	 * busy is no longer valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	MNT_ILOCK(mp);
	MNT_REL(mp);
	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
	mp->mnt_lockref--;
	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}

/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 *
 * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
 * cache for popular filesystem identifiers.  The cache is lockless, using
 * the fact that struct mount's are never freed.  In the worst case we may
 * get a pointer to an unmounted or even a different filesystem, so we have
 * to check what we got, and go the slow way if so.
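 *
 * The cache index below is derived from the fsid by xor-ing val[0] with
 * val[1], folding the upper 16 bits of the result into the lower ones and
 * masking with FSID_CACHE_SIZE - 1.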
646 */ 647 struct mount * 648 vfs_busyfs(fsid_t *fsid) 649 { 650 #define FSID_CACHE_SIZE 256 651 typedef struct mount * volatile vmp_t; 652 static vmp_t cache[FSID_CACHE_SIZE]; 653 struct mount *mp; 654 int error; 655 uint32_t hash; 656 657 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 658 hash = fsid->val[0] ^ fsid->val[1]; 659 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 660 mp = cache[hash]; 661 if (mp == NULL || 662 mp->mnt_stat.f_fsid.val[0] != fsid->val[0] || 663 mp->mnt_stat.f_fsid.val[1] != fsid->val[1]) 664 goto slow; 665 if (vfs_busy(mp, 0) != 0) { 666 cache[hash] = NULL; 667 goto slow; 668 } 669 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 670 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) 671 return (mp); 672 else 673 vfs_unbusy(mp); 674 675 slow: 676 mtx_lock(&mountlist_mtx); 677 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 678 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 679 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { 680 error = vfs_busy(mp, MBF_MNTLSTLOCK); 681 if (error) { 682 cache[hash] = NULL; 683 mtx_unlock(&mountlist_mtx); 684 return (NULL); 685 } 686 cache[hash] = mp; 687 return (mp); 688 } 689 } 690 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 691 mtx_unlock(&mountlist_mtx); 692 return ((struct mount *) 0); 693 } 694 695 /* 696 * Check if a user can access privileged mount options. 697 */ 698 int 699 vfs_suser(struct mount *mp, struct thread *td) 700 { 701 int error; 702 703 if (jailed(td->td_ucred)) { 704 /* 705 * If the jail of the calling thread lacks permission for 706 * this type of file system, deny immediately. 707 */ 708 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) 709 return (EPERM); 710 711 /* 712 * If the file system was mounted outside the jail of the 713 * calling thread, deny immediately. 714 */ 715 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 716 return (EPERM); 717 } 718 719 /* 720 * If file system supports delegated administration, we don't check 721 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 722 * by the file system itself. 723 * If this is not the user that did original mount, we check for 724 * the PRIV_VFS_MOUNT_OWNER privilege. 725 */ 726 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 727 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 728 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 729 return (error); 730 } 731 return (0); 732 } 733 734 /* 735 * Get a new unique fsid. Try to make its val[0] unique, since this value 736 * will be used to create fake device numbers for stat(). Also try (but 737 * not so hard) make its val[0] unique mod 2^16, since some emulators only 738 * support 16-bit device numbers. We end up with unique val[0]'s for the 739 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 740 * 741 * Keep in mind that several mounts may be running in parallel. Starting 742 * the search one past where the previous search terminated is both a 743 * micro-optimization and a defense against returning the same fsid to 744 * different mounts. 
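 *
 * As implemented below, val[1] is simply the filesystem type number, while
 * val[0] is a fake device number with major 255 whose minor combines the
 * low byte of the type number with the 16-bit mntid_base counter.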
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static uint16_t mntid_base;
	struct mount *nmp;
	fsid_t tfsid;
	int mtype;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
			break;
		vfs_rel(nmp);
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 *   0 = seconds only; nanoseconds zeroed.
 *   1 = seconds and nanoseconds, accurate within 1/HZ.
 *   2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL.
 */
void
vattr_null(struct vattr *vap)
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed:
 * the buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
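 *
 * Each call below targets roughly 10% of the vnodes on the mount point's
 * vnode list, scaled down by the ratio gapvnodes/desiredvnodes, rather
 * than trying to reclaim everything at once.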
860 */ 861 static int 862 vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger) 863 { 864 struct vnode *vp; 865 int count, done, target; 866 867 done = 0; 868 vn_start_write(NULL, &mp, V_WAIT); 869 MNT_ILOCK(mp); 870 count = mp->mnt_nvnodelistsize; 871 target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1); 872 target = target / 10 + 1; 873 while (count != 0 && done < target) { 874 vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 875 while (vp != NULL && vp->v_type == VMARKER) 876 vp = TAILQ_NEXT(vp, v_nmntvnodes); 877 if (vp == NULL) 878 break; 879 /* 880 * XXX LRU is completely broken for non-free vnodes. First 881 * by calling here in mountpoint order, then by moving 882 * unselected vnodes to the end here, and most grossly by 883 * removing the vlruvp() function that was supposed to 884 * maintain the order. (This function was born broken 885 * since syncer problems prevented it doing anything.) The 886 * order is closer to LRC (C = Created). 887 * 888 * LRU reclaiming of vnodes seems to have last worked in 889 * FreeBSD-3 where LRU wasn't mentioned under any spelling. 890 * Then there was no hold count, and inactive vnodes were 891 * simply put on the free list in LRU order. The separate 892 * lists also break LRU. We prefer to reclaim from the 893 * free list for technical reasons. This tends to thrash 894 * the free list to keep very unrecently used held vnodes. 895 * The problem is mitigated by keeping the free list large. 896 */ 897 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 898 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 899 --count; 900 if (!VI_TRYLOCK(vp)) 901 goto next_iter; 902 /* 903 * If it's been deconstructed already, it's still 904 * referenced, or it exceeds the trigger, skip it. 905 * Also skip free vnodes. We are trying to make space 906 * to expand the free list, not reduce it. 907 */ 908 if (vp->v_usecount || 909 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 910 ((vp->v_iflag & VI_FREE) != 0) || 911 (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && 912 vp->v_object->resident_page_count > trigger)) { 913 VI_UNLOCK(vp); 914 goto next_iter; 915 } 916 MNT_IUNLOCK(mp); 917 vholdl(vp); 918 if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { 919 vdrop(vp); 920 goto next_iter_mntunlocked; 921 } 922 VI_LOCK(vp); 923 /* 924 * v_usecount may have been bumped after VOP_LOCK() dropped 925 * the vnode interlock and before it was locked again. 926 * 927 * It is not necessary to recheck VI_DOOMED because it can 928 * only be set by another thread that holds both the vnode 929 * lock and vnode interlock. If another thread has the 930 * vnode lock before we get to VOP_LOCK() and obtains the 931 * vnode interlock after VOP_LOCK() drops the vnode 932 * interlock, the other thread will be unable to drop the 933 * vnode lock before our VOP_LOCK() call fails. 
934 */ 935 if (vp->v_usecount || 936 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 937 (vp->v_iflag & VI_FREE) != 0 || 938 (vp->v_object != NULL && 939 vp->v_object->resident_page_count > trigger)) { 940 VOP_UNLOCK(vp, LK_INTERLOCK); 941 vdrop(vp); 942 goto next_iter_mntunlocked; 943 } 944 KASSERT((vp->v_iflag & VI_DOOMED) == 0, 945 ("VI_DOOMED unexpectedly detected in vlrureclaim()")); 946 counter_u64_add(recycles_count, 1); 947 vgonel(vp); 948 VOP_UNLOCK(vp, 0); 949 vdropl(vp); 950 done++; 951 next_iter_mntunlocked: 952 if (!should_yield()) 953 goto relock_mnt; 954 goto yield; 955 next_iter: 956 if (!should_yield()) 957 continue; 958 MNT_IUNLOCK(mp); 959 yield: 960 kern_yield(PRI_USER); 961 relock_mnt: 962 MNT_ILOCK(mp); 963 } 964 MNT_IUNLOCK(mp); 965 vn_finished_write(mp); 966 return done; 967 } 968 969 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ 970 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 971 0, 972 "limit on vnode free requests per call to the vnlru_free routine"); 973 974 /* 975 * Attempt to reduce the free list by the requested amount. 976 */ 977 static void 978 vnlru_free_locked(int count, struct vfsops *mnt_op) 979 { 980 struct vnode *vp; 981 struct mount *mp; 982 bool tried_batches; 983 984 tried_batches = false; 985 mtx_assert(&vnode_free_list_mtx, MA_OWNED); 986 if (count > max_vnlru_free) 987 count = max_vnlru_free; 988 for (; count > 0; count--) { 989 vp = TAILQ_FIRST(&vnode_free_list); 990 /* 991 * The list can be modified while the free_list_mtx 992 * has been dropped and vp could be NULL here. 993 */ 994 if (vp == NULL) { 995 if (tried_batches) 996 break; 997 mtx_unlock(&vnode_free_list_mtx); 998 vnlru_return_batches(mnt_op); 999 tried_batches = true; 1000 mtx_lock(&vnode_free_list_mtx); 1001 continue; 1002 } 1003 1004 VNASSERT(vp->v_op != NULL, vp, 1005 ("vnlru_free: vnode already reclaimed.")); 1006 KASSERT((vp->v_iflag & VI_FREE) != 0, 1007 ("Removing vnode not on freelist")); 1008 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 1009 ("Mangling active vnode")); 1010 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 1011 1012 /* 1013 * Don't recycle if our vnode is from different type 1014 * of mount point. Note that mp is type-safe, the 1015 * check does not reach unmapped address even if 1016 * vnode is reclaimed. 1017 * Don't recycle if we can't get the interlock without 1018 * blocking. 1019 */ 1020 if ((mnt_op != NULL && (mp = vp->v_mount) != NULL && 1021 mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) { 1022 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); 1023 continue; 1024 } 1025 VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0, 1026 vp, ("vp inconsistent on freelist")); 1027 1028 /* 1029 * The clear of VI_FREE prevents activation of the 1030 * vnode. There is no sense in putting the vnode on 1031 * the mount point active list, only to remove it 1032 * later during recycling. Inline the relevant part 1033 * of vholdl(), to avoid triggering assertions or 1034 * activating. 1035 */ 1036 freevnodes--; 1037 vp->v_iflag &= ~VI_FREE; 1038 VNODE_REFCOUNT_FENCE_REL(); 1039 refcount_acquire(&vp->v_holdcnt); 1040 1041 mtx_unlock(&vnode_free_list_mtx); 1042 VI_UNLOCK(vp); 1043 vtryrecycle(vp); 1044 /* 1045 * If the recycled succeeded this vdrop will actually free 1046 * the vnode. If not it will simply place it back on 1047 * the free list. 
1048 */ 1049 vdrop(vp); 1050 mtx_lock(&vnode_free_list_mtx); 1051 } 1052 } 1053 1054 void 1055 vnlru_free(int count, struct vfsops *mnt_op) 1056 { 1057 1058 mtx_lock(&vnode_free_list_mtx); 1059 vnlru_free_locked(count, mnt_op); 1060 mtx_unlock(&vnode_free_list_mtx); 1061 } 1062 1063 1064 /* XXX some names and initialization are bad for limits and watermarks. */ 1065 static int 1066 vspace(void) 1067 { 1068 int space; 1069 1070 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); 1071 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ 1072 vlowat = vhiwat / 2; 1073 if (numvnodes > desiredvnodes) 1074 return (0); 1075 space = desiredvnodes - numvnodes; 1076 if (freevnodes > wantfreevnodes) 1077 space += freevnodes - wantfreevnodes; 1078 return (space); 1079 } 1080 1081 static void 1082 vnlru_return_batch_locked(struct mount *mp) 1083 { 1084 struct vnode *vp; 1085 1086 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 1087 1088 if (mp->mnt_tmpfreevnodelistsize == 0) 1089 return; 1090 1091 TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) { 1092 VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp, 1093 ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist")); 1094 vp->v_mflag &= ~VMP_TMPMNTFREELIST; 1095 } 1096 mtx_lock(&vnode_free_list_mtx); 1097 TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist); 1098 freevnodes += mp->mnt_tmpfreevnodelistsize; 1099 mtx_unlock(&vnode_free_list_mtx); 1100 mp->mnt_tmpfreevnodelistsize = 0; 1101 } 1102 1103 static void 1104 vnlru_return_batch(struct mount *mp) 1105 { 1106 1107 mtx_lock(&mp->mnt_listmtx); 1108 vnlru_return_batch_locked(mp); 1109 mtx_unlock(&mp->mnt_listmtx); 1110 } 1111 1112 static void 1113 vnlru_return_batches(struct vfsops *mnt_op) 1114 { 1115 struct mount *mp, *nmp; 1116 bool need_unbusy; 1117 1118 mtx_lock(&mountlist_mtx); 1119 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 1120 need_unbusy = false; 1121 if (mnt_op != NULL && mp->mnt_op != mnt_op) 1122 goto next; 1123 if (mp->mnt_tmpfreevnodelistsize == 0) 1124 goto next; 1125 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) { 1126 vnlru_return_batch(mp); 1127 need_unbusy = true; 1128 mtx_lock(&mountlist_mtx); 1129 } 1130 next: 1131 nmp = TAILQ_NEXT(mp, mnt_list); 1132 if (need_unbusy) 1133 vfs_unbusy(mp); 1134 } 1135 mtx_unlock(&mountlist_mtx); 1136 } 1137 1138 /* 1139 * Attempt to recycle vnodes in a context that is always safe to block. 1140 * Calling vlrurecycle() from the bowels of filesystem code has some 1141 * interesting deadlock problems. 1142 */ 1143 static struct proc *vnlruproc; 1144 static int vnlruproc_sig; 1145 1146 static void 1147 vnlru_proc(void) 1148 { 1149 struct mount *mp, *nmp; 1150 unsigned long onumvnodes; 1151 int done, force, reclaim_nc_src, trigger, usevnodes; 1152 1153 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, 1154 SHUTDOWN_PRI_FIRST); 1155 1156 force = 0; 1157 for (;;) { 1158 kproc_suspend_check(vnlruproc); 1159 mtx_lock(&vnode_free_list_mtx); 1160 /* 1161 * If numvnodes is too large (due to desiredvnodes being 1162 * adjusted using its sysctl, or emergency growth), first 1163 * try to reduce it by discarding from the free list. 1164 */ 1165 if (numvnodes > desiredvnodes) 1166 vnlru_free_locked(numvnodes - desiredvnodes, NULL); 1167 /* 1168 * Sleep if the vnode cache is in a good state. This is 1169 * when it is not over-full and has space for about a 4% 1170 * or 9% expansion (by growing its size or inexcessively 1171 * reducing its free list). 
		 * Otherwise, try to reclaim space for a 10% expansion.
		 */
		if (vstir && force == 0) {
			force = 1;
			vstir = 0;
		}
		if (vspace() >= vlowat && force == 0) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_free_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		mtx_unlock(&vnode_free_list_mtx);
		done = 0;
		onumvnodes = numvnodes;
		/*
		 * Calculate parameters for recycling.  These are the same
		 * throughout the loop to give some semblance of fairness.
		 * The trigger point is to avoid recycling vnodes with lots
		 * of resident pages.  We aren't trying to free memory; we
		 * are trying to recycle or at least free vnodes.
		 */
		if (numvnodes <= desiredvnodes)
			usevnodes = numvnodes - freevnodes;
		else
			usevnodes = numvnodes;
		if (usevnodes <= 0)
			usevnodes = 1;
		/*
		 * The trigger value is chosen to give a conservatively
		 * large value to ensure that it alone doesn't prevent
		 * making progress.  The value can easily be so large that
		 * it is effectively infinite in some congested and
		 * misconfigured cases, and this is necessary.  Normally
		 * it is about 8 to 100 (pages), which is quite large.
		 */
		trigger = vm_cnt.v_page_count * 2 / usevnodes;
		if (force < 2)
			trigger = vsmalltrigger;
		reclaim_nc_src = force >= 3;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			done += vlrureclaim(mp, reclaim_nc_src, trigger);
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp);
		}
		mtx_unlock(&mountlist_mtx);
		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
			uma_reclaim();
		if (done == 0) {
			if (force == 0 || force == 1) {
				force = 2;
				continue;
			}
			if (force == 2) {
				force = 3;
				continue;
			}
			force = 0;
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		} else
			kern_yield(PRI_USER);
		/*
		 * After becoming active to expand above low water, keep
		 * active until above high water.
		 */
		force = vspace() < vhiwat;
	}
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
    &vnlru_kp);

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 * before we actually vgone().  This function must be called with the vnode
 * held to prevent the vnode from being returned to the free list midway
 * through vgone().
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct mount *vnmp;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	VNASSERT(vp->v_holdcnt, vp,
	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list; if so we
	 * can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, vp %p lock is already held",
		    __func__, vp);
		return (EWOULDBLOCK);
	}
	/*
	 * Don't recycle if its filesystem is being suspended.
1287 */ 1288 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1289 VOP_UNLOCK(vp, 0); 1290 CTR2(KTR_VFS, 1291 "%s: impossible to recycle, cannot start the write for %p", 1292 __func__, vp); 1293 return (EBUSY); 1294 } 1295 /* 1296 * If we got this far, we need to acquire the interlock and see if 1297 * anyone picked up this vnode from another list. If not, we will 1298 * mark it with DOOMED via vgonel() so that anyone who does find it 1299 * will skip over it. 1300 */ 1301 VI_LOCK(vp); 1302 if (vp->v_usecount) { 1303 VOP_UNLOCK(vp, LK_INTERLOCK); 1304 vn_finished_write(vnmp); 1305 CTR2(KTR_VFS, 1306 "%s: impossible to recycle, %p is already referenced", 1307 __func__, vp); 1308 return (EBUSY); 1309 } 1310 if ((vp->v_iflag & VI_DOOMED) == 0) { 1311 counter_u64_add(recycles_count, 1); 1312 vgonel(vp); 1313 } 1314 VOP_UNLOCK(vp, LK_INTERLOCK); 1315 vn_finished_write(vnmp); 1316 return (0); 1317 } 1318 1319 static void 1320 vcheckspace(void) 1321 { 1322 1323 if (vspace() < vlowat && vnlruproc_sig == 0) { 1324 vnlruproc_sig = 1; 1325 wakeup(vnlruproc); 1326 } 1327 } 1328 1329 /* 1330 * Wait if necessary for space for a new vnode. 1331 */ 1332 static int 1333 getnewvnode_wait(int suspended) 1334 { 1335 1336 mtx_assert(&vnode_free_list_mtx, MA_OWNED); 1337 if (numvnodes >= desiredvnodes) { 1338 if (suspended) { 1339 /* 1340 * The file system is being suspended. We cannot 1341 * risk a deadlock here, so allow allocation of 1342 * another vnode even if this would give too many. 1343 */ 1344 return (0); 1345 } 1346 if (vnlruproc_sig == 0) { 1347 vnlruproc_sig = 1; /* avoid unnecessary wakeups */ 1348 wakeup(vnlruproc); 1349 } 1350 msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, 1351 "vlruwk", hz); 1352 } 1353 /* Post-adjust like the pre-adjust in getnewvnode(). */ 1354 if (numvnodes + 1 > desiredvnodes && freevnodes > 1) 1355 vnlru_free_locked(1, NULL); 1356 return (numvnodes >= desiredvnodes ? ENFILE : 0); 1357 } 1358 1359 /* 1360 * This hack is fragile, and probably not needed any more now that the 1361 * watermark handling works. 1362 */ 1363 void 1364 getnewvnode_reserve(u_int count) 1365 { 1366 struct thread *td; 1367 1368 /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */ 1369 /* XXX no longer so quick, but this part is not racy. */ 1370 mtx_lock(&vnode_free_list_mtx); 1371 if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes) 1372 vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes, 1373 freevnodes - wantfreevnodes), NULL); 1374 mtx_unlock(&vnode_free_list_mtx); 1375 1376 td = curthread; 1377 /* First try to be quick and racy. */ 1378 if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { 1379 td->td_vp_reserv += count; 1380 vcheckspace(); /* XXX no longer so quick, but more racy */ 1381 return; 1382 } else 1383 atomic_subtract_long(&numvnodes, count); 1384 1385 mtx_lock(&vnode_free_list_mtx); 1386 while (count > 0) { 1387 if (getnewvnode_wait(0) == 0) { 1388 count--; 1389 td->td_vp_reserv++; 1390 atomic_add_long(&numvnodes, 1); 1391 } 1392 } 1393 vcheckspace(); 1394 mtx_unlock(&vnode_free_list_mtx); 1395 } 1396 1397 /* 1398 * This hack is fragile, especially if desiredvnodes or wantvnodes are 1399 * misconfgured or changed significantly. Reducing desiredvnodes below 1400 * the reserved amount should cause bizarre behaviour like reducing it 1401 * below the number of active vnodes -- the system will try to reduce 1402 * numvnodes to match, but should fail, so the subtraction below should 1403 * not overflow. 
 */
void
getnewvnode_drop_reserve(void)
{
	struct thread *td;

	td = curthread;
	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
	td->td_vp_reserv = 0;
}

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp;
	struct thread *td;
	struct lock_object *lo;
	static int cyclecount;
	int error __unused;

	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
	vp = NULL;
	td = curthread;
	if (td->td_vp_reserv > 0) {
		td->td_vp_reserv -= 1;
		goto alloc;
	}
	mtx_lock(&vnode_free_list_mtx);
	if (numvnodes < desiredvnodes)
		cyclecount = 0;
	else if (cyclecount++ >= freevnodes) {
		cyclecount = 0;
		vstir = 1;
	}
	/*
	 * Grow the vnode cache if it will not be above its target max
	 * after growing.  Otherwise, if the free list is nonempty, try
	 * to reclaim 1 item from it before growing the cache (possibly
	 * above its target max if the reclamation failed or is delayed).
	 * Otherwise, wait for some space.  In all cases, schedule
	 * vnlru_proc() if we are getting short of space.  The watermarks
	 * should be chosen so that we never wait or even reclaim from
	 * the free list to below its target minimum.
	 */
	if (numvnodes + 1 <= desiredvnodes)
		;
	else if (freevnodes > 0)
		vnlru_free_locked(1, NULL);
	else {
		error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
		    MNTK_SUSPEND));
#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
		if (error != 0) {
			mtx_unlock(&vnode_free_list_mtx);
			return (error);
		}
#endif
	}
	vcheckspace();
	atomic_add_long(&numvnodes, 1);
	mtx_unlock(&vnode_free_list_mtx);
alloc:
	counter_u64_add(vnodes_created, 1);
	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
	/*
	 * Locks are given the generic name "vnode" when created.
	 * Follow the historic practice of using the filesystem
	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
	 *
	 * Locks live in a witness group keyed on their name.  Thus,
	 * when a lock is renamed, it must also move from the witness
	 * group of its old name to the witness group of its new name.
	 *
	 * The change only needs to be made when the vnode moves
	 * from one filesystem type to another.  We ensure that each
	 * filesystem uses a single static name pointer for its tag so
	 * that we can compare pointers rather than doing a strcmp().
	 */
	lo = &vp->v_vnlock->lock_object;
	if (lo->lo_name != tag) {
		lo->lo_name = tag;
		WITNESS_DESTROY(lo);
		WITNESS_INIT(lo, tag);
	}
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
	/*
	 * Finalize various vnode identity bits.
1498 */ 1499 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 1500 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 1501 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 1502 vp->v_type = VNON; 1503 vp->v_tag = tag; 1504 vp->v_op = vops; 1505 v_init_counters(vp); 1506 vp->v_bufobj.bo_ops = &buf_ops_bio; 1507 #ifdef DIAGNOSTIC 1508 if (mp == NULL && vops != &dead_vnodeops) 1509 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 1510 #endif 1511 #ifdef MAC 1512 mac_vnode_init(vp); 1513 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 1514 mac_vnode_associate_singlelabel(mp, vp); 1515 #endif 1516 if (mp != NULL) { 1517 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 1518 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) 1519 vp->v_vflag |= VV_NOKNOTE; 1520 } 1521 1522 /* 1523 * For the filesystems which do not use vfs_hash_insert(), 1524 * still initialize v_hash to have vfs_hash_index() useful. 1525 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 1526 * its own hashing. 1527 */ 1528 vp->v_hash = (uintptr_t)vp >> vnsz2log; 1529 1530 *vpp = vp; 1531 return (0); 1532 } 1533 1534 /* 1535 * Delete from old mount point vnode list, if on one. 1536 */ 1537 static void 1538 delmntque(struct vnode *vp) 1539 { 1540 struct mount *mp; 1541 int active; 1542 1543 mp = vp->v_mount; 1544 if (mp == NULL) 1545 return; 1546 MNT_ILOCK(mp); 1547 VI_LOCK(vp); 1548 KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, 1549 ("Active vnode list size %d > Vnode list size %d", 1550 mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); 1551 active = vp->v_iflag & VI_ACTIVE; 1552 vp->v_iflag &= ~VI_ACTIVE; 1553 if (active) { 1554 mtx_lock(&mp->mnt_listmtx); 1555 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); 1556 mp->mnt_activevnodelistsize--; 1557 mtx_unlock(&mp->mnt_listmtx); 1558 } 1559 vp->v_mount = NULL; 1560 VI_UNLOCK(vp); 1561 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1562 ("bad mount point vnode list size")); 1563 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1564 mp->mnt_nvnodelistsize--; 1565 MNT_REL(mp); 1566 MNT_IUNLOCK(mp); 1567 } 1568 1569 static void 1570 insmntque_stddtr(struct vnode *vp, void *dtr_arg) 1571 { 1572 1573 vp->v_data = NULL; 1574 vp->v_op = &dead_vnodeops; 1575 vgone(vp); 1576 vput(vp); 1577 } 1578 1579 /* 1580 * Insert into list of vnodes for the new mount point, if available. 1581 */ 1582 int 1583 insmntque1(struct vnode *vp, struct mount *mp, 1584 void (*dtr)(struct vnode *, void *), void *dtr_arg) 1585 { 1586 1587 KASSERT(vp->v_mount == NULL, 1588 ("insmntque: vnode already on per mount vnode list")); 1589 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1590 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 1591 1592 /* 1593 * We acquire the vnode interlock early to ensure that the 1594 * vnode cannot be recycled by another process releasing a 1595 * holdcnt on it before we get it on both the vnode list 1596 * and the active vnode list. The mount mutex protects only 1597 * manipulation of the vnode list and the vnode freelist 1598 * mutex protects only manipulation of the active vnode list. 1599 * Hence the need to hold the vnode interlock throughout. 
1600 */ 1601 MNT_ILOCK(mp); 1602 VI_LOCK(vp); 1603 if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && 1604 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 1605 mp->mnt_nvnodelistsize == 0)) && 1606 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 1607 VI_UNLOCK(vp); 1608 MNT_IUNLOCK(mp); 1609 if (dtr != NULL) 1610 dtr(vp, dtr_arg); 1611 return (EBUSY); 1612 } 1613 vp->v_mount = mp; 1614 MNT_REF(mp); 1615 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1616 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 1617 ("neg mount point vnode list size")); 1618 mp->mnt_nvnodelistsize++; 1619 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 1620 ("Activating already active vnode")); 1621 vp->v_iflag |= VI_ACTIVE; 1622 mtx_lock(&mp->mnt_listmtx); 1623 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 1624 mp->mnt_activevnodelistsize++; 1625 mtx_unlock(&mp->mnt_listmtx); 1626 VI_UNLOCK(vp); 1627 MNT_IUNLOCK(mp); 1628 return (0); 1629 } 1630 1631 int 1632 insmntque(struct vnode *vp, struct mount *mp) 1633 { 1634 1635 return (insmntque1(vp, mp, insmntque_stddtr, NULL)); 1636 } 1637 1638 /* 1639 * Flush out and invalidate all buffers associated with a bufobj 1640 * Called with the underlying object locked. 1641 */ 1642 int 1643 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 1644 { 1645 int error; 1646 1647 BO_LOCK(bo); 1648 if (flags & V_SAVE) { 1649 error = bufobj_wwait(bo, slpflag, slptimeo); 1650 if (error) { 1651 BO_UNLOCK(bo); 1652 return (error); 1653 } 1654 if (bo->bo_dirty.bv_cnt > 0) { 1655 BO_UNLOCK(bo); 1656 if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) 1657 return (error); 1658 /* 1659 * XXX We could save a lock/unlock if this was only 1660 * enabled under INVARIANTS 1661 */ 1662 BO_LOCK(bo); 1663 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 1664 panic("vinvalbuf: dirty bufs"); 1665 } 1666 } 1667 /* 1668 * If you alter this loop please notice that interlock is dropped and 1669 * reacquired in flushbuflist. Special care is needed to ensure that 1670 * no race conditions occur from this. 1671 */ 1672 do { 1673 error = flushbuflist(&bo->bo_clean, 1674 flags, bo, slpflag, slptimeo); 1675 if (error == 0 && !(flags & V_CLEANONLY)) 1676 error = flushbuflist(&bo->bo_dirty, 1677 flags, bo, slpflag, slptimeo); 1678 if (error != 0 && error != EAGAIN) { 1679 BO_UNLOCK(bo); 1680 return (error); 1681 } 1682 } while (error != 0); 1683 1684 /* 1685 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 1686 * have write I/O in-progress but if there is a VM object then the 1687 * VM object can also have read-I/O in-progress. 1688 */ 1689 do { 1690 bufobj_wwait(bo, 0, 0); 1691 if ((flags & V_VMIO) == 0) { 1692 BO_UNLOCK(bo); 1693 if (bo->bo_object != NULL) { 1694 VM_OBJECT_WLOCK(bo->bo_object); 1695 vm_object_pip_wait(bo->bo_object, "bovlbx"); 1696 VM_OBJECT_WUNLOCK(bo->bo_object); 1697 } 1698 BO_LOCK(bo); 1699 } 1700 } while (bo->bo_numoutput > 0); 1701 BO_UNLOCK(bo); 1702 1703 /* 1704 * Destroy the copy in the VM cache, too. 1705 */ 1706 if (bo->bo_object != NULL && 1707 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 1708 VM_OBJECT_WLOCK(bo->bo_object); 1709 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 
1710 OBJPR_CLEANONLY : 0); 1711 VM_OBJECT_WUNLOCK(bo->bo_object); 1712 } 1713 1714 #ifdef INVARIANTS 1715 BO_LOCK(bo); 1716 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 1717 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 1718 bo->bo_clean.bv_cnt > 0)) 1719 panic("vinvalbuf: flush failed"); 1720 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 1721 bo->bo_dirty.bv_cnt > 0) 1722 panic("vinvalbuf: flush dirty failed"); 1723 BO_UNLOCK(bo); 1724 #endif 1725 return (0); 1726 } 1727 1728 /* 1729 * Flush out and invalidate all buffers associated with a vnode. 1730 * Called with the underlying object locked. 1731 */ 1732 int 1733 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 1734 { 1735 1736 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 1737 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 1738 if (vp->v_object != NULL && vp->v_object->handle != vp) 1739 return (0); 1740 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 1741 } 1742 1743 /* 1744 * Flush out buffers on the specified list. 1745 * 1746 */ 1747 static int 1748 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 1749 int slptimeo) 1750 { 1751 struct buf *bp, *nbp; 1752 int retval, error; 1753 daddr_t lblkno; 1754 b_xflags_t xflags; 1755 1756 ASSERT_BO_WLOCKED(bo); 1757 1758 retval = 0; 1759 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 1760 /* 1761 * If we are flushing both V_NORMAL and V_ALT buffers then 1762 * do not skip any buffers. If we are flushing only V_NORMAL 1763 * buffers then skip buffers marked as BX_ALTDATA. If we are 1764 * flushing only V_ALT buffers then skip buffers not marked 1765 * as BX_ALTDATA. 1766 */ 1767 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 1768 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 1769 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 1770 continue; 1771 } 1772 if (nbp != NULL) { 1773 lblkno = nbp->b_lblkno; 1774 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 1775 } 1776 retval = EAGAIN; 1777 error = BUF_TIMELOCK(bp, 1778 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 1779 "flushbuf", slpflag, slptimeo); 1780 if (error) { 1781 BO_LOCK(bo); 1782 return (error != ENOLCK ? error : EAGAIN); 1783 } 1784 KASSERT(bp->b_bufobj == bo, 1785 ("bp %p wrong b_bufobj %p should be %p", 1786 bp, bp->b_bufobj, bo)); 1787 /* 1788 * XXX Since there are no node locks for NFS, I 1789 * believe there is a slight chance that a delayed 1790 * write will occur while sleeping just above, so 1791 * check for it. 1792 */ 1793 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 1794 (flags & V_SAVE)) { 1795 bremfree(bp); 1796 bp->b_flags |= B_ASYNC; 1797 bwrite(bp); 1798 BO_LOCK(bo); 1799 return (EAGAIN); /* XXX: why not loop ? 
*/ 1800 } 1801 bremfree(bp); 1802 bp->b_flags |= (B_INVAL | B_RELBUF); 1803 bp->b_flags &= ~B_ASYNC; 1804 brelse(bp); 1805 BO_LOCK(bo); 1806 if (nbp == NULL) 1807 break; 1808 nbp = gbincore(bo, lblkno); 1809 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 1810 != xflags) 1811 break; /* nbp invalid */ 1812 } 1813 return (retval); 1814 } 1815 1816 int 1817 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 1818 { 1819 struct buf *bp; 1820 int error; 1821 daddr_t lblkno; 1822 1823 ASSERT_BO_LOCKED(bo); 1824 1825 for (lblkno = startn;;) { 1826 again: 1827 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 1828 if (bp == NULL || bp->b_lblkno >= endn || 1829 bp->b_lblkno < startn) 1830 break; 1831 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 1832 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 1833 if (error != 0) { 1834 BO_RLOCK(bo); 1835 if (error == ENOLCK) 1836 goto again; 1837 return (error); 1838 } 1839 KASSERT(bp->b_bufobj == bo, 1840 ("bp %p wrong b_bufobj %p should be %p", 1841 bp, bp->b_bufobj, bo)); 1842 lblkno = bp->b_lblkno + 1; 1843 if ((bp->b_flags & B_MANAGED) == 0) 1844 bremfree(bp); 1845 bp->b_flags |= B_RELBUF; 1846 /* 1847 * In the VMIO case, use the B_NOREUSE flag to hint that the 1848 * pages backing each buffer in the range are unlikely to be 1849 * reused. Dirty buffers will have the hint applied once 1850 * they've been written. 1851 */ 1852 if ((bp->b_flags & B_VMIO) != 0) 1853 bp->b_flags |= B_NOREUSE; 1854 brelse(bp); 1855 BO_RLOCK(bo); 1856 } 1857 return (0); 1858 } 1859 1860 /* 1861 * Truncate a file's buffer and pages to a specified length. This 1862 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 1863 * sync activity. 1864 */ 1865 int 1866 vtruncbuf(struct vnode *vp, off_t length, int blksize) 1867 { 1868 struct buf *bp, *nbp; 1869 int anyfreed; 1870 daddr_t trunclbn; 1871 struct bufobj *bo; 1872 1873 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 1874 vp, blksize, (uintmax_t)length); 1875 1876 /* 1877 * Round up to the *next* lbn. 
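 * howmany() rounds up, so e.g. with a 16384-byte block size a length of 1
 * yields trunclbn 1: the partially valid block at lbn 0 is kept and every
 * buffer at lbn >= 1 is invalidated below, while a length of 0 yields
 * trunclbn 0 and discards everything.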
1878 */ 1879 trunclbn = howmany(length, blksize); 1880 1881 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 1882 restart: 1883 bo = &vp->v_bufobj; 1884 BO_LOCK(bo); 1885 anyfreed = 1; 1886 for (;anyfreed;) { 1887 anyfreed = 0; 1888 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 1889 if (bp->b_lblkno < trunclbn) 1890 continue; 1891 if (BUF_LOCK(bp, 1892 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1893 BO_LOCKPTR(bo)) == ENOLCK) 1894 goto restart; 1895 1896 bremfree(bp); 1897 bp->b_flags |= (B_INVAL | B_RELBUF); 1898 bp->b_flags &= ~B_ASYNC; 1899 brelse(bp); 1900 anyfreed = 1; 1901 1902 BO_LOCK(bo); 1903 if (nbp != NULL && 1904 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 1905 (nbp->b_vp != vp) || 1906 (nbp->b_flags & B_DELWRI))) { 1907 BO_UNLOCK(bo); 1908 goto restart; 1909 } 1910 } 1911 1912 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1913 if (bp->b_lblkno < trunclbn) 1914 continue; 1915 if (BUF_LOCK(bp, 1916 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1917 BO_LOCKPTR(bo)) == ENOLCK) 1918 goto restart; 1919 bremfree(bp); 1920 bp->b_flags |= (B_INVAL | B_RELBUF); 1921 bp->b_flags &= ~B_ASYNC; 1922 brelse(bp); 1923 anyfreed = 1; 1924 1925 BO_LOCK(bo); 1926 if (nbp != NULL && 1927 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 1928 (nbp->b_vp != vp) || 1929 (nbp->b_flags & B_DELWRI) == 0)) { 1930 BO_UNLOCK(bo); 1931 goto restart; 1932 } 1933 } 1934 } 1935 1936 if (length > 0) { 1937 restartsync: 1938 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1939 if (bp->b_lblkno > 0) 1940 continue; 1941 /* 1942 * Since we hold the vnode lock this should only 1943 * fail if we're racing with the buf daemon. 1944 */ 1945 if (BUF_LOCK(bp, 1946 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1947 BO_LOCKPTR(bo)) == ENOLCK) { 1948 goto restart; 1949 } 1950 VNASSERT((bp->b_flags & B_DELWRI), vp, 1951 ("buf(%p) on dirty queue without DELWRI", bp)); 1952 1953 bremfree(bp); 1954 bawrite(bp); 1955 BO_LOCK(bo); 1956 goto restartsync; 1957 } 1958 } 1959 1960 bufobj_wwait(bo, 0, 0); 1961 BO_UNLOCK(bo); 1962 vnode_pager_setsize(vp, length); 1963 1964 return (0); 1965 } 1966 1967 static void 1968 buf_vlist_remove(struct buf *bp) 1969 { 1970 struct bufv *bv; 1971 1972 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 1973 ASSERT_BO_WLOCKED(bp->b_bufobj); 1974 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != 1975 (BX_VNDIRTY|BX_VNCLEAN), 1976 ("buf_vlist_remove: Buf %p is on two lists", bp)); 1977 if (bp->b_xflags & BX_VNDIRTY) 1978 bv = &bp->b_bufobj->bo_dirty; 1979 else 1980 bv = &bp->b_bufobj->bo_clean; 1981 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 1982 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 1983 bv->bv_cnt--; 1984 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1985 } 1986 1987 /* 1988 * Add the buffer to the sorted clean or dirty block list. 1989 * 1990 * NOTE: xflags is passed as a constant, optimizing this inline function! 1991 */ 1992 static void 1993 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 1994 { 1995 struct bufv *bv; 1996 struct buf *n; 1997 int error; 1998 1999 ASSERT_BO_WLOCKED(bo); 2000 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2001 ("dead bo %p", bo)); 2002 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 2003 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2004 bp->b_xflags |= xflags; 2005 if (xflags & BX_VNDIRTY) 2006 bv = &bo->bo_dirty; 2007 else 2008 bv = &bo->bo_clean; 2009 2010 /* 2011 * Keep the list ordered. Optimize empty list insertion. 
Assume 2012 * we tend to grow at the tail so lookup_le should usually be cheaper 2013 * than _ge. 2014 */ 2015 if (bv->bv_cnt == 0 || 2016 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 2017 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 2018 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 2019 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2020 else 2021 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2022 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 2023 if (error) 2024 panic("buf_vlist_add: Preallocated nodes insufficient."); 2025 bv->bv_cnt++; 2026 } 2027 2028 /* 2029 * Look up a buffer using the buffer tries. 2030 */ 2031 struct buf * 2032 gbincore(struct bufobj *bo, daddr_t lblkno) 2033 { 2034 struct buf *bp; 2035 2036 ASSERT_BO_LOCKED(bo); 2037 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2038 if (bp != NULL) 2039 return (bp); 2040 return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); 2041 } 2042 2043 /* 2044 * Associate a buffer with a vnode. 2045 */ 2046 void 2047 bgetvp(struct vnode *vp, struct buf *bp) 2048 { 2049 struct bufobj *bo; 2050 2051 bo = &vp->v_bufobj; 2052 ASSERT_BO_WLOCKED(bo); 2053 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2054 2055 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2056 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2057 ("bgetvp: bp already attached! %p", bp)); 2058 2059 vhold(vp); 2060 bp->b_vp = vp; 2061 bp->b_bufobj = bo; 2062 /* 2063 * Insert onto list for new vnode. 2064 */ 2065 buf_vlist_add(bp, bo, BX_VNCLEAN); 2066 } 2067 2068 /* 2069 * Disassociate a buffer from a vnode. 2070 */ 2071 void 2072 brelvp(struct buf *bp) 2073 { 2074 struct bufobj *bo; 2075 struct vnode *vp; 2076 2077 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2078 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2079 2080 /* 2081 * Delete from old vnode list, if on one. 2082 */ 2083 vp = bp->b_vp; /* XXX */ 2084 bo = bp->b_bufobj; 2085 BO_LOCK(bo); 2086 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2087 buf_vlist_remove(bp); 2088 else 2089 panic("brelvp: Buffer %p not on queue.", bp); 2090 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2091 bo->bo_flag &= ~BO_ONWORKLST; 2092 mtx_lock(&sync_mtx); 2093 LIST_REMOVE(bo, bo_synclist); 2094 syncer_worklist_len--; 2095 mtx_unlock(&sync_mtx); 2096 } 2097 bp->b_vp = NULL; 2098 bp->b_bufobj = NULL; 2099 BO_UNLOCK(bo); 2100 vdrop(vp); 2101 } 2102 2103 /* 2104 * Add an item to the syncer work queue. 
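 * The bufobj is (re)inserted into the syncer_workitem_pending wheel at
 * slot (syncer_delayno + delay) & syncer_mask, with the delay clamped to
 * syncer_maxdelay - 2, so the syncer thread will visit it roughly 'delay'
 * seconds from now.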
2105 */ 2106 static void 2107 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2108 { 2109 int slot; 2110 2111 ASSERT_BO_WLOCKED(bo); 2112 2113 mtx_lock(&sync_mtx); 2114 if (bo->bo_flag & BO_ONWORKLST) 2115 LIST_REMOVE(bo, bo_synclist); 2116 else { 2117 bo->bo_flag |= BO_ONWORKLST; 2118 syncer_worklist_len++; 2119 } 2120 2121 if (delay > syncer_maxdelay - 2) 2122 delay = syncer_maxdelay - 2; 2123 slot = (syncer_delayno + delay) & syncer_mask; 2124 2125 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2126 mtx_unlock(&sync_mtx); 2127 } 2128 2129 static int 2130 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2131 { 2132 int error, len; 2133 2134 mtx_lock(&sync_mtx); 2135 len = syncer_worklist_len - sync_vnode_count; 2136 mtx_unlock(&sync_mtx); 2137 error = SYSCTL_OUT(req, &len, sizeof(len)); 2138 return (error); 2139 } 2140 2141 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, 2142 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2143 2144 static struct proc *updateproc; 2145 static void sched_sync(void); 2146 static struct kproc_desc up_kp = { 2147 "syncer", 2148 sched_sync, 2149 &updateproc 2150 }; 2151 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2152 2153 static int 2154 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2155 { 2156 struct vnode *vp; 2157 struct mount *mp; 2158 2159 *bo = LIST_FIRST(slp); 2160 if (*bo == NULL) 2161 return (0); 2162 vp = bo2vnode(*bo); 2163 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2164 return (1); 2165 /* 2166 * We use vhold in case the vnode does not 2167 * successfully sync. vhold prevents the vnode from 2168 * going away when we unlock the sync_mtx so that 2169 * we can acquire the vnode interlock. 2170 */ 2171 vholdl(vp); 2172 mtx_unlock(&sync_mtx); 2173 VI_UNLOCK(vp); 2174 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2175 vdrop(vp); 2176 mtx_lock(&sync_mtx); 2177 return (*bo == LIST_FIRST(slp)); 2178 } 2179 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2180 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2181 VOP_UNLOCK(vp, 0); 2182 vn_finished_write(mp); 2183 BO_LOCK(*bo); 2184 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2185 /* 2186 * Put us back on the worklist. The worklist 2187 * routine will remove us from our current 2188 * position and then add us back in at a later 2189 * position. 2190 */ 2191 vn_syncer_add_to_worklist(*bo, syncdelay); 2192 } 2193 BO_UNLOCK(*bo); 2194 vdrop(vp); 2195 mtx_lock(&sync_mtx); 2196 return (0); 2197 } 2198 2199 static int first_printf = 1; 2200 2201 /* 2202 * System filesystem synchronizer daemon. 
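 * Once per second the daemon advances syncer_delayno and works through that
 * slot of the wheel. While shutting down it moves to SYNCER_SHUTTING_DOWN
 * and skips empty slots; once nothing but syncer vnodes remains it enters
 * SYNCER_FINAL_DELAY for a few quick final passes before suspending.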
2203 */ 2204 static void 2205 sched_sync(void) 2206 { 2207 struct synclist *next, *slp; 2208 struct bufobj *bo; 2209 long starttime; 2210 struct thread *td = curthread; 2211 int last_work_seen; 2212 int net_worklist_len; 2213 int syncer_final_iter; 2214 int error; 2215 2216 last_work_seen = 0; 2217 syncer_final_iter = 0; 2218 syncer_state = SYNCER_RUNNING; 2219 starttime = time_uptime; 2220 td->td_pflags |= TDP_NORUNNINGBUF; 2221 2222 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2223 SHUTDOWN_PRI_LAST); 2224 2225 mtx_lock(&sync_mtx); 2226 for (;;) { 2227 if (syncer_state == SYNCER_FINAL_DELAY && 2228 syncer_final_iter == 0) { 2229 mtx_unlock(&sync_mtx); 2230 kproc_suspend_check(td->td_proc); 2231 mtx_lock(&sync_mtx); 2232 } 2233 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2234 if (syncer_state != SYNCER_RUNNING && 2235 starttime != time_uptime) { 2236 if (first_printf) { 2237 printf("\nSyncing disks, vnodes remaining... "); 2238 first_printf = 0; 2239 } 2240 printf("%d ", net_worklist_len); 2241 } 2242 starttime = time_uptime; 2243 2244 /* 2245 * Push files whose dirty time has expired. Be careful 2246 * of interrupt race on slp queue. 2247 * 2248 * Skip over empty worklist slots when shutting down. 2249 */ 2250 do { 2251 slp = &syncer_workitem_pending[syncer_delayno]; 2252 syncer_delayno += 1; 2253 if (syncer_delayno == syncer_maxdelay) 2254 syncer_delayno = 0; 2255 next = &syncer_workitem_pending[syncer_delayno]; 2256 /* 2257 * If the worklist has wrapped since the 2258 * it was emptied of all but syncer vnodes, 2259 * switch to the FINAL_DELAY state and run 2260 * for one more second. 2261 */ 2262 if (syncer_state == SYNCER_SHUTTING_DOWN && 2263 net_worklist_len == 0 && 2264 last_work_seen == syncer_delayno) { 2265 syncer_state = SYNCER_FINAL_DELAY; 2266 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2267 } 2268 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2269 syncer_worklist_len > 0); 2270 2271 /* 2272 * Keep track of the last time there was anything 2273 * on the worklist other than syncer vnodes. 2274 * Return to the SHUTTING_DOWN state if any 2275 * new work appears. 2276 */ 2277 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 2278 last_work_seen = syncer_delayno; 2279 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 2280 syncer_state = SYNCER_SHUTTING_DOWN; 2281 while (!LIST_EMPTY(slp)) { 2282 error = sync_vnode(slp, &bo, td); 2283 if (error == 1) { 2284 LIST_REMOVE(bo, bo_synclist); 2285 LIST_INSERT_HEAD(next, bo, bo_synclist); 2286 continue; 2287 } 2288 2289 if (first_printf == 0) { 2290 /* 2291 * Drop the sync mutex, because some watchdog 2292 * drivers need to sleep while patting 2293 */ 2294 mtx_unlock(&sync_mtx); 2295 wdog_kern_pat(WD_LASTVAL); 2296 mtx_lock(&sync_mtx); 2297 } 2298 2299 } 2300 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 2301 syncer_final_iter--; 2302 /* 2303 * The variable rushjob allows the kernel to speed up the 2304 * processing of the filesystem syncer process. A rushjob 2305 * value of N tells the filesystem syncer to process the next 2306 * N seconds worth of work on its queue ASAP. Currently rushjob 2307 * is used by the soft update code to speed up the filesystem 2308 * syncer process when the incore state is getting so far 2309 * ahead of the disk that the kernel memory pool is being 2310 * threatened with exhaustion. 
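 * Each unit of rushjob consumed below makes the loop skip its normal
 * one-second sleep for one iteration, so a value of N lets the syncer work
 * through the next N wheel slots back to back. speedup_syncer() is what
 * increments it, and never past syncdelay / 2.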
2311 */ 2312 if (rushjob > 0) { 2313 rushjob -= 1; 2314 continue; 2315 } 2316 /* 2317 * Just sleep for a short period of time between 2318 * iterations when shutting down to allow some I/O 2319 * to happen. 2320 * 2321 * If it has taken us less than a second to process the 2322 * current work, then wait. Otherwise start right over 2323 * again. We can still lose time if any single round 2324 * takes more than two seconds, but it does not really 2325 * matter as we are just trying to generally pace the 2326 * filesystem activity. 2327 */ 2328 if (syncer_state != SYNCER_RUNNING || 2329 time_uptime == starttime) { 2330 thread_lock(td); 2331 sched_prio(td, PPAUSE); 2332 thread_unlock(td); 2333 } 2334 if (syncer_state != SYNCER_RUNNING) 2335 cv_timedwait(&sync_wakeup, &sync_mtx, 2336 hz / SYNCER_SHUTDOWN_SPEEDUP); 2337 else if (time_uptime == starttime) 2338 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 2339 } 2340 } 2341 2342 /* 2343 * Request the syncer daemon to speed up its work. 2344 * We never push it to speed up more than half of its 2345 * normal turn time, otherwise it could take over the cpu. 2346 */ 2347 int 2348 speedup_syncer(void) 2349 { 2350 int ret = 0; 2351 2352 mtx_lock(&sync_mtx); 2353 if (rushjob < syncdelay / 2) { 2354 rushjob += 1; 2355 stat_rush_requests += 1; 2356 ret = 1; 2357 } 2358 mtx_unlock(&sync_mtx); 2359 cv_broadcast(&sync_wakeup); 2360 return (ret); 2361 } 2362 2363 /* 2364 * Tell the syncer to speed up its work and run though its work 2365 * list several times, then tell it to shut down. 2366 */ 2367 static void 2368 syncer_shutdown(void *arg, int howto) 2369 { 2370 2371 if (howto & RB_NOSYNC) 2372 return; 2373 mtx_lock(&sync_mtx); 2374 syncer_state = SYNCER_SHUTTING_DOWN; 2375 rushjob = 0; 2376 mtx_unlock(&sync_mtx); 2377 cv_broadcast(&sync_wakeup); 2378 kproc_shutdown(arg, howto); 2379 } 2380 2381 void 2382 syncer_suspend(void) 2383 { 2384 2385 syncer_shutdown(updateproc, 0); 2386 } 2387 2388 void 2389 syncer_resume(void) 2390 { 2391 2392 mtx_lock(&sync_mtx); 2393 first_printf = 1; 2394 syncer_state = SYNCER_RUNNING; 2395 mtx_unlock(&sync_mtx); 2396 cv_broadcast(&sync_wakeup); 2397 kproc_resume(updateproc); 2398 } 2399 2400 /* 2401 * Reassign a buffer from one vnode to another. 2402 * Used to assign file specific control information 2403 * (indirect blocks) to the vnode to which they belong. 2404 */ 2405 void 2406 reassignbuf(struct buf *bp) 2407 { 2408 struct vnode *vp; 2409 struct bufobj *bo; 2410 int delay; 2411 #ifdef INVARIANTS 2412 struct bufv *bv; 2413 #endif 2414 2415 vp = bp->b_vp; 2416 bo = bp->b_bufobj; 2417 ++reassignbufcalls; 2418 2419 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2420 bp, bp->b_vp, bp->b_flags); 2421 /* 2422 * B_PAGING flagged buffers cannot be reassigned because their vp 2423 * is not fully linked in. 2424 */ 2425 if (bp->b_flags & B_PAGING) 2426 panic("cannot reassign paging buffer"); 2427 2428 /* 2429 * Delete from old vnode list, if on one. 2430 */ 2431 BO_LOCK(bo); 2432 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2433 buf_vlist_remove(bp); 2434 else 2435 panic("reassignbuf: Buffer %p not on queue.", bp); 2436 /* 2437 * If dirty, put on list of dirty buffers; otherwise insert onto list 2438 * of clean buffers. 
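 * Adding the first dirty buffer to a bufobj that is not yet on the syncer
 * worklist also schedules it there, with a delay chosen by vnode type
 * below; once the last dirty buffer is gone the bufobj is taken back off
 * the worklist.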
2439 */ 2440 if (bp->b_flags & B_DELWRI) { 2441 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2442 switch (vp->v_type) { 2443 case VDIR: 2444 delay = dirdelay; 2445 break; 2446 case VCHR: 2447 delay = metadelay; 2448 break; 2449 default: 2450 delay = filedelay; 2451 } 2452 vn_syncer_add_to_worklist(bo, delay); 2453 } 2454 buf_vlist_add(bp, bo, BX_VNDIRTY); 2455 } else { 2456 buf_vlist_add(bp, bo, BX_VNCLEAN); 2457 2458 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2459 mtx_lock(&sync_mtx); 2460 LIST_REMOVE(bo, bo_synclist); 2461 syncer_worklist_len--; 2462 mtx_unlock(&sync_mtx); 2463 bo->bo_flag &= ~BO_ONWORKLST; 2464 } 2465 } 2466 #ifdef INVARIANTS 2467 bv = &bo->bo_clean; 2468 bp = TAILQ_FIRST(&bv->bv_hd); 2469 KASSERT(bp == NULL || bp->b_bufobj == bo, 2470 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2471 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2472 KASSERT(bp == NULL || bp->b_bufobj == bo, 2473 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2474 bv = &bo->bo_dirty; 2475 bp = TAILQ_FIRST(&bv->bv_hd); 2476 KASSERT(bp == NULL || bp->b_bufobj == bo, 2477 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2478 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2479 KASSERT(bp == NULL || bp->b_bufobj == bo, 2480 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2481 #endif 2482 BO_UNLOCK(bo); 2483 } 2484 2485 static void 2486 v_init_counters(struct vnode *vp) 2487 { 2488 2489 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2490 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2491 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2492 2493 refcount_init(&vp->v_holdcnt, 1); 2494 refcount_init(&vp->v_usecount, 1); 2495 } 2496 2497 static void 2498 v_incr_usecount_locked(struct vnode *vp) 2499 { 2500 2501 ASSERT_VI_LOCKED(vp, __func__); 2502 if ((vp->v_iflag & VI_OWEINACT) != 0) { 2503 VNASSERT(vp->v_usecount == 0, vp, 2504 ("vnode with usecount and VI_OWEINACT set")); 2505 vp->v_iflag &= ~VI_OWEINACT; 2506 } 2507 refcount_acquire(&vp->v_usecount); 2508 v_incr_devcount(vp); 2509 } 2510 2511 /* 2512 * Increment the use count on the vnode, taking care to reference 2513 * the driver's usecount if this is a chardev. 2514 */ 2515 static void 2516 v_incr_usecount(struct vnode *vp) 2517 { 2518 2519 ASSERT_VI_UNLOCKED(vp, __func__); 2520 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2521 2522 if (vp->v_type != VCHR && 2523 refcount_acquire_if_not_zero(&vp->v_usecount)) { 2524 VNODE_REFCOUNT_FENCE_ACQ(); 2525 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2526 ("vnode with usecount and VI_OWEINACT set")); 2527 } else { 2528 VI_LOCK(vp); 2529 v_incr_usecount_locked(vp); 2530 VI_UNLOCK(vp); 2531 } 2532 } 2533 2534 /* 2535 * Increment si_usecount of the associated device, if any. 2536 */ 2537 static void 2538 v_incr_devcount(struct vnode *vp) 2539 { 2540 2541 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2542 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2543 dev_lock(); 2544 vp->v_rdev->si_usecount++; 2545 dev_unlock(); 2546 } 2547 } 2548 2549 /* 2550 * Decrement si_usecount of the associated device, if any. 2551 */ 2552 static void 2553 v_decr_devcount(struct vnode *vp) 2554 { 2555 2556 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2557 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2558 dev_lock(); 2559 vp->v_rdev->si_usecount--; 2560 dev_unlock(); 2561 } 2562 } 2563 2564 /* 2565 * Grab a particular vnode from the free list, increment its 2566 * reference count and lock it. VI_DOOMED is set if the vnode 2567 * is being destroyed. 
Only callers who specify LK_RETRY will 2568 * see doomed vnodes. If inactive processing was delayed in 2569 * vput try to do it here. 2570 * 2571 * Notes on lockless counter manipulation: 2572 * _vhold, vputx and other routines make various decisions based 2573 * on either holdcnt or usecount being 0. As long as either counter 2574 * is not transitioning 0->1 nor 1->0, the manipulation can be done 2575 * with atomic operations. Otherwise the interlock is taken covering 2576 * both the atomic and additional actions. 2577 */ 2578 int 2579 vget(struct vnode *vp, int flags, struct thread *td) 2580 { 2581 int error, oweinact; 2582 2583 VNASSERT((flags & LK_TYPE_MASK) != 0, vp, 2584 ("vget: invalid lock operation")); 2585 2586 if ((flags & LK_INTERLOCK) != 0) 2587 ASSERT_VI_LOCKED(vp, __func__); 2588 else 2589 ASSERT_VI_UNLOCKED(vp, __func__); 2590 if ((flags & LK_VNHELD) != 0) 2591 VNASSERT((vp->v_holdcnt > 0), vp, 2592 ("vget: LK_VNHELD passed but vnode not held")); 2593 2594 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2595 2596 if ((flags & LK_VNHELD) == 0) 2597 _vhold(vp, (flags & LK_INTERLOCK) != 0); 2598 2599 if ((error = vn_lock(vp, flags)) != 0) { 2600 vdrop(vp); 2601 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 2602 vp); 2603 return (error); 2604 } 2605 if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) 2606 panic("vget: vn_lock failed to return ENOENT\n"); 2607 /* 2608 * We don't guarantee that any particular close will 2609 * trigger inactive processing so just make a best effort 2610 * here at preventing a reference to a removed file. If 2611 * we don't succeed no harm is done. 2612 * 2613 * Upgrade our holdcnt to a usecount. 2614 */ 2615 if (vp->v_type == VCHR || 2616 !refcount_acquire_if_not_zero(&vp->v_usecount)) { 2617 VI_LOCK(vp); 2618 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2619 oweinact = 0; 2620 } else { 2621 oweinact = 1; 2622 vp->v_iflag &= ~VI_OWEINACT; 2623 VNODE_REFCOUNT_FENCE_REL(); 2624 } 2625 refcount_acquire(&vp->v_usecount); 2626 v_incr_devcount(vp); 2627 if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE && 2628 (flags & LK_NOWAIT) == 0) 2629 vinactive(vp, td); 2630 VI_UNLOCK(vp); 2631 } 2632 return (0); 2633 } 2634 2635 /* 2636 * Increase the reference (use) and hold count of a vnode. 2637 * This will also remove the vnode from the free list if it is presently free. 2638 */ 2639 void 2640 vref(struct vnode *vp) 2641 { 2642 2643 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2644 _vhold(vp, false); 2645 v_incr_usecount(vp); 2646 } 2647 2648 void 2649 vrefl(struct vnode *vp) 2650 { 2651 2652 ASSERT_VI_LOCKED(vp, __func__); 2653 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2654 _vhold(vp, true); 2655 v_incr_usecount_locked(vp); 2656 } 2657 2658 void 2659 vrefact(struct vnode *vp) 2660 { 2661 2662 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2663 if (__predict_false(vp->v_type == VCHR)) { 2664 VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp, 2665 ("%s: wrong ref counts", __func__)); 2666 vref(vp); 2667 return; 2668 } 2669 #ifdef INVARIANTS 2670 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 2671 VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__)); 2672 old = atomic_fetchadd_int(&vp->v_usecount, 1); 2673 VNASSERT(old > 0, vp, ("%s: wrong use count", __func__)); 2674 #else 2675 refcount_acquire(&vp->v_holdcnt); 2676 refcount_acquire(&vp->v_usecount); 2677 #endif 2678 } 2679 2680 /* 2681 * Return reference count of a vnode. 
2682 * 2683 * The results of this call are only guaranteed when some mechanism is used to 2684 * stop other processes from gaining references to the vnode. This may be the 2685 * case if the caller holds the only reference. This is also useful when stale 2686 * data is acceptable as race conditions may be accounted for by some other 2687 * means. 2688 */ 2689 int 2690 vrefcnt(struct vnode *vp) 2691 { 2692 2693 return (vp->v_usecount); 2694 } 2695 2696 #define VPUTX_VRELE 1 2697 #define VPUTX_VPUT 2 2698 #define VPUTX_VUNREF 3 2699 2700 /* 2701 * Decrement the use and hold counts for a vnode. 2702 * 2703 * See an explanation near vget() as to why atomic operation is safe. 2704 */ 2705 static void 2706 vputx(struct vnode *vp, int func) 2707 { 2708 int error; 2709 2710 KASSERT(vp != NULL, ("vputx: null vp")); 2711 if (func == VPUTX_VUNREF) 2712 ASSERT_VOP_LOCKED(vp, "vunref"); 2713 else if (func == VPUTX_VPUT) 2714 ASSERT_VOP_LOCKED(vp, "vput"); 2715 else 2716 KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); 2717 ASSERT_VI_UNLOCKED(vp, __func__); 2718 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2719 2720 if (vp->v_type != VCHR && 2721 refcount_release_if_not_last(&vp->v_usecount)) { 2722 if (func == VPUTX_VPUT) 2723 VOP_UNLOCK(vp, 0); 2724 vdrop(vp); 2725 return; 2726 } 2727 2728 VI_LOCK(vp); 2729 2730 /* 2731 * We want to hold the vnode until the inactive finishes to 2732 * prevent vgone() races. We drop the use count here and the 2733 * hold count below when we're done. 2734 */ 2735 if (!refcount_release(&vp->v_usecount) || 2736 (vp->v_iflag & VI_DOINGINACT)) { 2737 if (func == VPUTX_VPUT) 2738 VOP_UNLOCK(vp, 0); 2739 v_decr_devcount(vp); 2740 vdropl(vp); 2741 return; 2742 } 2743 2744 v_decr_devcount(vp); 2745 2746 error = 0; 2747 2748 if (vp->v_usecount != 0) { 2749 vn_printf(vp, "vputx: usecount not zero for vnode "); 2750 panic("vputx: usecount not zero"); 2751 } 2752 2753 CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); 2754 2755 /* 2756 * We must call VOP_INACTIVE with the node locked. Mark 2757 * as VI_DOINGINACT to avoid recursion. 2758 */ 2759 vp->v_iflag |= VI_OWEINACT; 2760 switch (func) { 2761 case VPUTX_VRELE: 2762 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 2763 VI_LOCK(vp); 2764 break; 2765 case VPUTX_VPUT: 2766 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2767 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 2768 LK_NOWAIT); 2769 VI_LOCK(vp); 2770 } 2771 break; 2772 case VPUTX_VUNREF: 2773 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2774 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 2775 VI_LOCK(vp); 2776 } 2777 break; 2778 } 2779 VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp, 2780 ("vnode with usecount and VI_OWEINACT set")); 2781 if (error == 0) { 2782 if (vp->v_iflag & VI_OWEINACT) 2783 vinactive(vp, curthread); 2784 if (func != VPUTX_VUNREF) 2785 VOP_UNLOCK(vp, 0); 2786 } 2787 vdropl(vp); 2788 } 2789 2790 /* 2791 * Vnode put/release. 2792 * If count drops to zero, call inactive routine and return to freelist. 2793 */ 2794 void 2795 vrele(struct vnode *vp) 2796 { 2797 2798 vputx(vp, VPUTX_VRELE); 2799 } 2800 2801 /* 2802 * Release an already locked vnode. This give the same effects as 2803 * unlock+vrele(), but takes less time and avoids releasing and 2804 * re-aquiring the lock (as vrele() acquires the lock internally.) 2805 */ 2806 void 2807 vput(struct vnode *vp) 2808 { 2809 2810 vputx(vp, VPUTX_VPUT); 2811 } 2812 2813 /* 2814 * Release an exclusively locked vnode. Do not unlock the vnode lock. 
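 * To summarize the three wrappers around vputx(): vrele() takes an
 * unlocked vnode and may lock it internally to run VOP_INACTIVE(),
 * vput() takes a locked vnode and unlocks it, and vunref() takes a
 * locked vnode and leaves it locked. An illustrative caller sketch
 * (hypothetical, error handling elided):
 *
 *	if (vget(vp, LK_EXCLUSIVE, curthread) == 0) {
 *		... use the locked, referenced vnode ...
 *		vput(vp);	drops the use count and the lock
 *	}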
2815 */ 2816 void 2817 vunref(struct vnode *vp) 2818 { 2819 2820 vputx(vp, VPUTX_VUNREF); 2821 } 2822 2823 /* 2824 * Increase the hold count and activate if this is the first reference. 2825 */ 2826 void 2827 _vhold(struct vnode *vp, bool locked) 2828 { 2829 struct mount *mp; 2830 2831 if (locked) 2832 ASSERT_VI_LOCKED(vp, __func__); 2833 else 2834 ASSERT_VI_UNLOCKED(vp, __func__); 2835 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2836 if (!locked) { 2837 if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) { 2838 VNODE_REFCOUNT_FENCE_ACQ(); 2839 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2840 ("_vhold: vnode with holdcnt is free")); 2841 return; 2842 } 2843 VI_LOCK(vp); 2844 } 2845 if ((vp->v_iflag & VI_FREE) == 0) { 2846 refcount_acquire(&vp->v_holdcnt); 2847 if (!locked) 2848 VI_UNLOCK(vp); 2849 return; 2850 } 2851 VNASSERT(vp->v_holdcnt == 0, vp, 2852 ("%s: wrong hold count", __func__)); 2853 VNASSERT(vp->v_op != NULL, vp, 2854 ("%s: vnode already reclaimed.", __func__)); 2855 /* 2856 * Remove a vnode from the free list, mark it as in use, 2857 * and put it on the active list. 2858 */ 2859 VNASSERT(vp->v_mount != NULL, vp, 2860 ("_vhold: vnode not on per mount vnode list")); 2861 mp = vp->v_mount; 2862 mtx_lock(&mp->mnt_listmtx); 2863 if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) { 2864 TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist); 2865 mp->mnt_tmpfreevnodelistsize--; 2866 vp->v_mflag &= ~VMP_TMPMNTFREELIST; 2867 } else { 2868 mtx_lock(&vnode_free_list_mtx); 2869 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 2870 freevnodes--; 2871 mtx_unlock(&vnode_free_list_mtx); 2872 } 2873 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 2874 ("Activating already active vnode")); 2875 vp->v_iflag &= ~VI_FREE; 2876 vp->v_iflag |= VI_ACTIVE; 2877 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 2878 mp->mnt_activevnodelistsize++; 2879 mtx_unlock(&mp->mnt_listmtx); 2880 refcount_acquire(&vp->v_holdcnt); 2881 if (!locked) 2882 VI_UNLOCK(vp); 2883 } 2884 2885 /* 2886 * Drop the hold count of the vnode. If this is the last reference to 2887 * the vnode we place it on the free list unless it has been vgone'd 2888 * (marked VI_DOOMED) in which case we will free it. 2889 * 2890 * Because the vnode vm object keeps a hold reference on the vnode if 2891 * there is at least one resident non-cached page, the vnode cannot 2892 * leave the active list without the page cleanup done. 2893 */ 2894 void 2895 _vdrop(struct vnode *vp, bool locked) 2896 { 2897 struct bufobj *bo; 2898 struct mount *mp; 2899 int active; 2900 2901 if (locked) 2902 ASSERT_VI_LOCKED(vp, __func__); 2903 else 2904 ASSERT_VI_UNLOCKED(vp, __func__); 2905 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2906 if ((int)vp->v_holdcnt <= 0) 2907 panic("vdrop: holdcnt %d", vp->v_holdcnt); 2908 if (!locked) { 2909 if (refcount_release_if_not_last(&vp->v_holdcnt)) 2910 return; 2911 VI_LOCK(vp); 2912 } 2913 if (refcount_release(&vp->v_holdcnt) == 0) { 2914 VI_UNLOCK(vp); 2915 return; 2916 } 2917 if ((vp->v_iflag & VI_DOOMED) == 0) { 2918 /* 2919 * Mark a vnode as free: remove it from its active list 2920 * and put it up for recycling on the freelist. 
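 * Vnodes that belong to a mount point go onto the per-mount
 * mnt_tmpfreevnodelist first and are migrated to the global free list in
 * batches of mnt_free_list_batch by vnlru_return_batch_locked(); mountless
 * vnodes go straight onto vnode_free_list.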
2921 */ 2922 VNASSERT(vp->v_op != NULL, vp, 2923 ("vdropl: vnode already reclaimed.")); 2924 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2925 ("vnode already free")); 2926 VNASSERT(vp->v_holdcnt == 0, vp, 2927 ("vdropl: freeing when we shouldn't")); 2928 active = vp->v_iflag & VI_ACTIVE; 2929 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2930 vp->v_iflag &= ~VI_ACTIVE; 2931 mp = vp->v_mount; 2932 if (mp != NULL) { 2933 mtx_lock(&mp->mnt_listmtx); 2934 if (active) { 2935 TAILQ_REMOVE(&mp->mnt_activevnodelist, 2936 vp, v_actfreelist); 2937 mp->mnt_activevnodelistsize--; 2938 } 2939 TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, 2940 vp, v_actfreelist); 2941 mp->mnt_tmpfreevnodelistsize++; 2942 vp->v_iflag |= VI_FREE; 2943 vp->v_mflag |= VMP_TMPMNTFREELIST; 2944 VI_UNLOCK(vp); 2945 if (mp->mnt_tmpfreevnodelistsize >= 2946 mnt_free_list_batch) 2947 vnlru_return_batch_locked(mp); 2948 mtx_unlock(&mp->mnt_listmtx); 2949 } else { 2950 VNASSERT(active == 0, vp, 2951 ("vdropl: active vnode not on per mount " 2952 "vnode list")); 2953 mtx_lock(&vnode_free_list_mtx); 2954 TAILQ_INSERT_TAIL(&vnode_free_list, vp, 2955 v_actfreelist); 2956 freevnodes++; 2957 vp->v_iflag |= VI_FREE; 2958 VI_UNLOCK(vp); 2959 mtx_unlock(&vnode_free_list_mtx); 2960 } 2961 } else { 2962 VI_UNLOCK(vp); 2963 counter_u64_add(free_owe_inact, 1); 2964 } 2965 return; 2966 } 2967 /* 2968 * The vnode has been marked for destruction, so free it. 2969 * 2970 * The vnode will be returned to the zone where it will 2971 * normally remain until it is needed for another vnode. We 2972 * need to cleanup (or verify that the cleanup has already 2973 * been done) any residual data left from its current use 2974 * so as not to contaminate the freshly allocated vnode. 2975 */ 2976 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2977 atomic_subtract_long(&numvnodes, 1); 2978 bo = &vp->v_bufobj; 2979 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2980 ("cleaned vnode still on the free list.")); 2981 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2982 VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); 2983 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2984 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2985 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2986 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2987 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 2988 ("clean blk trie not empty")); 2989 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2990 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 2991 ("dirty blk trie not empty")); 2992 VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); 2993 VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); 2994 VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); 2995 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, 2996 ("Dangling rangelock waiters")); 2997 VI_UNLOCK(vp); 2998 #ifdef MAC 2999 mac_vnode_destroy(vp); 3000 #endif 3001 if (vp->v_pollinfo != NULL) { 3002 destroy_vpollinfo(vp->v_pollinfo); 3003 vp->v_pollinfo = NULL; 3004 } 3005 #ifdef INVARIANTS 3006 /* XXX Elsewhere we detect an already freed vnode via NULL v_op. 
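Clearing it here keeps that check effective for vnodes that have been
returned to the zone, helping catch late uses of a destroyed vnode.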
*/ 3007 vp->v_op = NULL; 3008 #endif 3009 vp->v_mountedhere = NULL; 3010 vp->v_unpcb = NULL; 3011 vp->v_rdev = NULL; 3012 vp->v_fifoinfo = NULL; 3013 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 3014 vp->v_iflag = 0; 3015 vp->v_vflag = 0; 3016 bo->bo_flag = 0; 3017 uma_zfree(vnode_zone, vp); 3018 } 3019 3020 /* 3021 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 3022 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 3023 * OWEINACT tracks whether a vnode missed a call to inactive due to a 3024 * failed lock upgrade. 3025 */ 3026 void 3027 vinactive(struct vnode *vp, struct thread *td) 3028 { 3029 struct vm_object *obj; 3030 3031 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3032 ASSERT_VI_LOCKED(vp, "vinactive"); 3033 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 3034 ("vinactive: recursed on VI_DOINGINACT")); 3035 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3036 vp->v_iflag |= VI_DOINGINACT; 3037 vp->v_iflag &= ~VI_OWEINACT; 3038 VI_UNLOCK(vp); 3039 /* 3040 * Before moving off the active list, we must be sure that any 3041 * modified pages are converted into the vnode's dirty 3042 * buffers, since these will no longer be checked once the 3043 * vnode is on the inactive list. 3044 * 3045 * The write-out of the dirty pages is asynchronous. At the 3046 * point that VOP_INACTIVE() is called, there could still be 3047 * pending I/O and dirty pages in the object. 3048 */ 3049 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 3050 (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { 3051 VM_OBJECT_WLOCK(obj); 3052 vm_object_page_clean(obj, 0, 0, 0); 3053 VM_OBJECT_WUNLOCK(obj); 3054 } 3055 VOP_INACTIVE(vp, td); 3056 VI_LOCK(vp); 3057 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 3058 ("vinactive: lost VI_DOINGINACT")); 3059 vp->v_iflag &= ~VI_DOINGINACT; 3060 } 3061 3062 /* 3063 * Remove any vnodes in the vnode table belonging to mount point mp. 3064 * 3065 * If FORCECLOSE is not specified, there should not be any active ones, 3066 * return error if any are found (nb: this is a user error, not a 3067 * system error). If FORCECLOSE is specified, detach any active vnodes 3068 * that are found. 3069 * 3070 * If WRITECLOSE is set, only flush out regular file vnodes open for 3071 * writing. 3072 * 3073 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 3074 * 3075 * `rootrefs' specifies the base reference count for the root vnode 3076 * of this filesystem. The root vnode is considered busy if its 3077 * v_usecount exceeds this value. On a successful return, vflush(, td) 3078 * will call vrele() on the root vnode exactly rootrefs times. 3079 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 3080 * be zero. 3081 */ 3082 #ifdef DIAGNOSTIC 3083 static int busyprt = 0; /* print out busy vnodes */ 3084 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 3085 #endif 3086 3087 int 3088 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 3089 { 3090 struct vnode *vp, *mvp, *rootvp = NULL; 3091 struct vattr vattr; 3092 int busy = 0, error; 3093 3094 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 3095 rootrefs, flags); 3096 if (rootrefs > 0) { 3097 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 3098 ("vflush: bad args")); 3099 /* 3100 * Get the filesystem root vnode. We can vput() it 3101 * immediately, since with rootrefs > 0, it won't go away. 
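 * The rootvp pointer is kept only so that the busy-root special case and
 * the final vrele() loop at the bottom of this function can find it.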
3102 */ 3103 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 3104 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 3105 __func__, error); 3106 return (error); 3107 } 3108 vput(rootvp); 3109 } 3110 loop: 3111 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3112 vholdl(vp); 3113 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 3114 if (error) { 3115 vdrop(vp); 3116 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3117 goto loop; 3118 } 3119 /* 3120 * Skip over a vnodes marked VV_SYSTEM. 3121 */ 3122 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 3123 VOP_UNLOCK(vp, 0); 3124 vdrop(vp); 3125 continue; 3126 } 3127 /* 3128 * If WRITECLOSE is set, flush out unlinked but still open 3129 * files (even if open only for reading) and regular file 3130 * vnodes open for writing. 3131 */ 3132 if (flags & WRITECLOSE) { 3133 if (vp->v_object != NULL) { 3134 VM_OBJECT_WLOCK(vp->v_object); 3135 vm_object_page_clean(vp->v_object, 0, 0, 0); 3136 VM_OBJECT_WUNLOCK(vp->v_object); 3137 } 3138 error = VOP_FSYNC(vp, MNT_WAIT, td); 3139 if (error != 0) { 3140 VOP_UNLOCK(vp, 0); 3141 vdrop(vp); 3142 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3143 return (error); 3144 } 3145 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 3146 VI_LOCK(vp); 3147 3148 if ((vp->v_type == VNON || 3149 (error == 0 && vattr.va_nlink > 0)) && 3150 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 3151 VOP_UNLOCK(vp, 0); 3152 vdropl(vp); 3153 continue; 3154 } 3155 } else 3156 VI_LOCK(vp); 3157 /* 3158 * With v_usecount == 0, all we need to do is clear out the 3159 * vnode data structures and we are done. 3160 * 3161 * If FORCECLOSE is set, forcibly close the vnode. 3162 */ 3163 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 3164 vgonel(vp); 3165 } else { 3166 busy++; 3167 #ifdef DIAGNOSTIC 3168 if (busyprt) 3169 vn_printf(vp, "vflush: busy vnode "); 3170 #endif 3171 } 3172 VOP_UNLOCK(vp, 0); 3173 vdropl(vp); 3174 } 3175 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 3176 /* 3177 * If just the root vnode is busy, and if its refcount 3178 * is equal to `rootrefs', then go ahead and kill it. 3179 */ 3180 VI_LOCK(rootvp); 3181 KASSERT(busy > 0, ("vflush: not busy")); 3182 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3183 ("vflush: usecount %d < rootrefs %d", 3184 rootvp->v_usecount, rootrefs)); 3185 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3186 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3187 vgone(rootvp); 3188 VOP_UNLOCK(rootvp, 0); 3189 busy = 0; 3190 } else 3191 VI_UNLOCK(rootvp); 3192 } 3193 if (busy) { 3194 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3195 busy); 3196 return (EBUSY); 3197 } 3198 for (; rootrefs > 0; rootrefs--) 3199 vrele(rootvp); 3200 return (0); 3201 } 3202 3203 /* 3204 * Recycle an unused vnode to the front of the free list. 3205 */ 3206 int 3207 vrecycle(struct vnode *vp) 3208 { 3209 int recycled; 3210 3211 VI_LOCK(vp); 3212 recycled = vrecyclel(vp); 3213 VI_UNLOCK(vp); 3214 return (recycled); 3215 } 3216 3217 /* 3218 * vrecycle, with the vp interlock held. 3219 */ 3220 int 3221 vrecyclel(struct vnode *vp) 3222 { 3223 int recycled; 3224 3225 ASSERT_VOP_ELOCKED(vp, __func__); 3226 ASSERT_VI_LOCKED(vp, __func__); 3227 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3228 recycled = 0; 3229 if (vp->v_usecount == 0) { 3230 recycled = 1; 3231 vgonel(vp); 3232 } 3233 return (recycled); 3234 } 3235 3236 /* 3237 * Eliminate all activity associated with a vnode 3238 * in preparation for reuse. 
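 * This is a thin wrapper that takes the vnode interlock around vgonel();
 * the vnode lock must already be held exclusively and the caller must hold
 * a reference (both asserted in vgonel()). An illustrative, hypothetical
 * forced-reclaim sequence:
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	vgone(vp);
 *	VOP_UNLOCK(vp, 0);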
3239 */ 3240 void 3241 vgone(struct vnode *vp) 3242 { 3243 VI_LOCK(vp); 3244 vgonel(vp); 3245 VI_UNLOCK(vp); 3246 } 3247 3248 static void 3249 notify_lowervp_vfs_dummy(struct mount *mp __unused, 3250 struct vnode *lowervp __unused) 3251 { 3252 } 3253 3254 /* 3255 * Notify upper mounts about reclaimed or unlinked vnode. 3256 */ 3257 void 3258 vfs_notify_upper(struct vnode *vp, int event) 3259 { 3260 static struct vfsops vgonel_vfsops = { 3261 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, 3262 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, 3263 }; 3264 struct mount *mp, *ump, *mmp; 3265 3266 mp = vp->v_mount; 3267 if (mp == NULL) 3268 return; 3269 3270 MNT_ILOCK(mp); 3271 if (TAILQ_EMPTY(&mp->mnt_uppers)) 3272 goto unlock; 3273 MNT_IUNLOCK(mp); 3274 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); 3275 mmp->mnt_op = &vgonel_vfsops; 3276 mmp->mnt_kern_flag |= MNTK_MARKER; 3277 MNT_ILOCK(mp); 3278 mp->mnt_kern_flag |= MNTK_VGONE_UPPER; 3279 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { 3280 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { 3281 ump = TAILQ_NEXT(ump, mnt_upper_link); 3282 continue; 3283 } 3284 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); 3285 MNT_IUNLOCK(mp); 3286 switch (event) { 3287 case VFS_NOTIFY_UPPER_RECLAIM: 3288 VFS_RECLAIM_LOWERVP(ump, vp); 3289 break; 3290 case VFS_NOTIFY_UPPER_UNLINK: 3291 VFS_UNLINK_LOWERVP(ump, vp); 3292 break; 3293 default: 3294 KASSERT(0, ("invalid event %d", event)); 3295 break; 3296 } 3297 MNT_ILOCK(mp); 3298 ump = TAILQ_NEXT(mmp, mnt_upper_link); 3299 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); 3300 } 3301 free(mmp, M_TEMP); 3302 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; 3303 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { 3304 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; 3305 wakeup(&mp->mnt_uppers); 3306 } 3307 unlock: 3308 MNT_IUNLOCK(mp); 3309 } 3310 3311 /* 3312 * vgone, with the vp interlock held. 3313 */ 3314 static void 3315 vgonel(struct vnode *vp) 3316 { 3317 struct thread *td; 3318 int oweinact; 3319 int active; 3320 struct mount *mp; 3321 3322 ASSERT_VOP_ELOCKED(vp, "vgonel"); 3323 ASSERT_VI_LOCKED(vp, "vgonel"); 3324 VNASSERT(vp->v_holdcnt, vp, 3325 ("vgonel: vp %p has no reference.", vp)); 3326 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3327 td = curthread; 3328 3329 /* 3330 * Don't vgonel if we're already doomed. 3331 */ 3332 if (vp->v_iflag & VI_DOOMED) 3333 return; 3334 vp->v_iflag |= VI_DOOMED; 3335 3336 /* 3337 * Check to see if the vnode is in use. If so, we have to call 3338 * VOP_CLOSE() and VOP_INACTIVE(). 3339 */ 3340 active = vp->v_usecount; 3341 oweinact = (vp->v_iflag & VI_OWEINACT); 3342 VI_UNLOCK(vp); 3343 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 3344 3345 /* 3346 * If purging an active vnode, it must be closed and 3347 * deactivated before being reclaimed. 3348 */ 3349 if (active) 3350 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 3351 if (oweinact || active) { 3352 VI_LOCK(vp); 3353 if ((vp->v_iflag & VI_DOINGINACT) == 0) 3354 vinactive(vp, td); 3355 VI_UNLOCK(vp); 3356 } 3357 if (vp->v_type == VSOCK) 3358 vfs_unp_reclaim(vp); 3359 3360 /* 3361 * Clean out any buffers associated with the vnode. 3362 * If the flush fails, just toss the buffers. 
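 * The first vinvalbuf() pass below uses V_SAVE to push dirty data to disk;
 * if that fails the buffers are invalidated without V_SAVE until both
 * queues are empty.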
3363 */ 3364 mp = NULL; 3365 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 3366 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 3367 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 3368 while (vinvalbuf(vp, 0, 0, 0) != 0) 3369 ; 3370 } 3371 3372 BO_LOCK(&vp->v_bufobj); 3373 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 3374 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 3375 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 3376 vp->v_bufobj.bo_clean.bv_cnt == 0, 3377 ("vp %p bufobj not invalidated", vp)); 3378 3379 /* 3380 * For VMIO bufobj, BO_DEAD is set in vm_object_terminate() 3381 * after the object's page queue is flushed. 3382 */ 3383 if (vp->v_bufobj.bo_object == NULL) 3384 vp->v_bufobj.bo_flag |= BO_DEAD; 3385 BO_UNLOCK(&vp->v_bufobj); 3386 3387 /* 3388 * Reclaim the vnode. 3389 */ 3390 if (VOP_RECLAIM(vp, td)) 3391 panic("vgone: cannot reclaim"); 3392 if (mp != NULL) 3393 vn_finished_secondary_write(mp); 3394 VNASSERT(vp->v_object == NULL, vp, 3395 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); 3396 /* 3397 * Clear the advisory locks and wake up waiting threads. 3398 */ 3399 (void)VOP_ADVLOCKPURGE(vp); 3400 vp->v_lockf = NULL; 3401 /* 3402 * Delete from old mount point vnode list. 3403 */ 3404 delmntque(vp); 3405 cache_purge(vp); 3406 /* 3407 * Done with purge, reset to the standard lock and invalidate 3408 * the vnode. 3409 */ 3410 VI_LOCK(vp); 3411 vp->v_vnlock = &vp->v_lock; 3412 vp->v_op = &dead_vnodeops; 3413 vp->v_tag = "none"; 3414 vp->v_type = VBAD; 3415 } 3416 3417 /* 3418 * Calculate the total number of references to a special device. 3419 */ 3420 int 3421 vcount(struct vnode *vp) 3422 { 3423 int count; 3424 3425 dev_lock(); 3426 count = vp->v_rdev->si_usecount; 3427 dev_unlock(); 3428 return (count); 3429 } 3430 3431 /* 3432 * Same as above, but using the struct cdev *as argument 3433 */ 3434 int 3435 count_dev(struct cdev *dev) 3436 { 3437 int count; 3438 3439 dev_lock(); 3440 count = dev->si_usecount; 3441 dev_unlock(); 3442 return(count); 3443 } 3444 3445 /* 3446 * Print out a description of a vnode. 3447 */ 3448 static char *typename[] = 3449 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", 3450 "VMARKER"}; 3451 3452 void 3453 vn_printf(struct vnode *vp, const char *fmt, ...) 
3454 { 3455 va_list ap; 3456 char buf[256], buf2[16]; 3457 u_long flags; 3458 3459 va_start(ap, fmt); 3460 vprintf(fmt, ap); 3461 va_end(ap); 3462 printf("%p: ", (void *)vp); 3463 printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); 3464 printf(" usecount %d, writecount %d, refcount %d", 3465 vp->v_usecount, vp->v_writecount, vp->v_holdcnt); 3466 switch (vp->v_type) { 3467 case VDIR: 3468 printf(" mountedhere %p\n", vp->v_mountedhere); 3469 break; 3470 case VCHR: 3471 printf(" rdev %p\n", vp->v_rdev); 3472 break; 3473 case VSOCK: 3474 printf(" socket %p\n", vp->v_unpcb); 3475 break; 3476 case VFIFO: 3477 printf(" fifoinfo %p\n", vp->v_fifoinfo); 3478 break; 3479 default: 3480 printf("\n"); 3481 break; 3482 } 3483 buf[0] = '\0'; 3484 buf[1] = '\0'; 3485 if (vp->v_vflag & VV_ROOT) 3486 strlcat(buf, "|VV_ROOT", sizeof(buf)); 3487 if (vp->v_vflag & VV_ISTTY) 3488 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 3489 if (vp->v_vflag & VV_NOSYNC) 3490 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 3491 if (vp->v_vflag & VV_ETERNALDEV) 3492 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 3493 if (vp->v_vflag & VV_CACHEDLABEL) 3494 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 3495 if (vp->v_vflag & VV_COPYONWRITE) 3496 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 3497 if (vp->v_vflag & VV_SYSTEM) 3498 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 3499 if (vp->v_vflag & VV_PROCDEP) 3500 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 3501 if (vp->v_vflag & VV_NOKNOTE) 3502 strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); 3503 if (vp->v_vflag & VV_DELETED) 3504 strlcat(buf, "|VV_DELETED", sizeof(buf)); 3505 if (vp->v_vflag & VV_MD) 3506 strlcat(buf, "|VV_MD", sizeof(buf)); 3507 if (vp->v_vflag & VV_FORCEINSMQ) 3508 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 3509 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 3510 VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | 3511 VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); 3512 if (flags != 0) { 3513 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 3514 strlcat(buf, buf2, sizeof(buf)); 3515 } 3516 if (vp->v_iflag & VI_MOUNT) 3517 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 3518 if (vp->v_iflag & VI_DOOMED) 3519 strlcat(buf, "|VI_DOOMED", sizeof(buf)); 3520 if (vp->v_iflag & VI_FREE) 3521 strlcat(buf, "|VI_FREE", sizeof(buf)); 3522 if (vp->v_iflag & VI_ACTIVE) 3523 strlcat(buf, "|VI_ACTIVE", sizeof(buf)); 3524 if (vp->v_iflag & VI_DOINGINACT) 3525 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 3526 if (vp->v_iflag & VI_OWEINACT) 3527 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 3528 flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE | 3529 VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT); 3530 if (flags != 0) { 3531 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 3532 strlcat(buf, buf2, sizeof(buf)); 3533 } 3534 printf(" flags (%s)\n", buf + 1); 3535 if (mtx_owned(VI_MTX(vp))) 3536 printf(" VI_LOCKed"); 3537 if (vp->v_object != NULL) 3538 printf(" v_object %p ref %d pages %d " 3539 "cleanbuf %d dirtybuf %d\n", 3540 vp->v_object, vp->v_object->ref_count, 3541 vp->v_object->resident_page_count, 3542 vp->v_bufobj.bo_clean.bv_cnt, 3543 vp->v_bufobj.bo_dirty.bv_cnt); 3544 printf(" "); 3545 lockmgr_printinfo(vp->v_vnlock); 3546 if (vp->v_data != NULL) 3547 VOP_PRINT(vp); 3548 } 3549 3550 #ifdef DDB 3551 /* 3552 * List all of the locked vnodes in the system. 3553 * Called when debugging the kernel. 
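 * From the ddb prompt this command and the related ones below are invoked
 * as, for example:
 *
 *	show lockedvnods
 *	show vnode <addr>
 *	show mount <addr>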
3554 */ 3555 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 3556 { 3557 struct mount *mp; 3558 struct vnode *vp; 3559 3560 /* 3561 * Note: because this is DDB, we can't obey the locking semantics 3562 * for these structures, which means we could catch an inconsistent 3563 * state and dereference a nasty pointer. Not much to be done 3564 * about that. 3565 */ 3566 db_printf("Locked vnodes\n"); 3567 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3568 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3569 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 3570 vn_printf(vp, "vnode "); 3571 } 3572 } 3573 } 3574 3575 /* 3576 * Show details about the given vnode. 3577 */ 3578 DB_SHOW_COMMAND(vnode, db_show_vnode) 3579 { 3580 struct vnode *vp; 3581 3582 if (!have_addr) 3583 return; 3584 vp = (struct vnode *)addr; 3585 vn_printf(vp, "vnode "); 3586 } 3587 3588 /* 3589 * Show details about the given mount point. 3590 */ 3591 DB_SHOW_COMMAND(mount, db_show_mount) 3592 { 3593 struct mount *mp; 3594 struct vfsopt *opt; 3595 struct statfs *sp; 3596 struct vnode *vp; 3597 char buf[512]; 3598 uint64_t mflags; 3599 u_int flags; 3600 3601 if (!have_addr) { 3602 /* No address given, print short info about all mount points. */ 3603 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3604 db_printf("%p %s on %s (%s)\n", mp, 3605 mp->mnt_stat.f_mntfromname, 3606 mp->mnt_stat.f_mntonname, 3607 mp->mnt_stat.f_fstypename); 3608 if (db_pager_quit) 3609 break; 3610 } 3611 db_printf("\nMore info: show mount <addr>\n"); 3612 return; 3613 } 3614 3615 mp = (struct mount *)addr; 3616 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 3617 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 3618 3619 buf[0] = '\0'; 3620 mflags = mp->mnt_flag; 3621 #define MNT_FLAG(flag) do { \ 3622 if (mflags & (flag)) { \ 3623 if (buf[0] != '\0') \ 3624 strlcat(buf, ", ", sizeof(buf)); \ 3625 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 3626 mflags &= ~(flag); \ 3627 } \ 3628 } while (0) 3629 MNT_FLAG(MNT_RDONLY); 3630 MNT_FLAG(MNT_SYNCHRONOUS); 3631 MNT_FLAG(MNT_NOEXEC); 3632 MNT_FLAG(MNT_NOSUID); 3633 MNT_FLAG(MNT_NFS4ACLS); 3634 MNT_FLAG(MNT_UNION); 3635 MNT_FLAG(MNT_ASYNC); 3636 MNT_FLAG(MNT_SUIDDIR); 3637 MNT_FLAG(MNT_SOFTDEP); 3638 MNT_FLAG(MNT_NOSYMFOLLOW); 3639 MNT_FLAG(MNT_GJOURNAL); 3640 MNT_FLAG(MNT_MULTILABEL); 3641 MNT_FLAG(MNT_ACLS); 3642 MNT_FLAG(MNT_NOATIME); 3643 MNT_FLAG(MNT_NOCLUSTERR); 3644 MNT_FLAG(MNT_NOCLUSTERW); 3645 MNT_FLAG(MNT_SUJ); 3646 MNT_FLAG(MNT_EXRDONLY); 3647 MNT_FLAG(MNT_EXPORTED); 3648 MNT_FLAG(MNT_DEFEXPORTED); 3649 MNT_FLAG(MNT_EXPORTANON); 3650 MNT_FLAG(MNT_EXKERB); 3651 MNT_FLAG(MNT_EXPUBLIC); 3652 MNT_FLAG(MNT_LOCAL); 3653 MNT_FLAG(MNT_QUOTA); 3654 MNT_FLAG(MNT_ROOTFS); 3655 MNT_FLAG(MNT_USER); 3656 MNT_FLAG(MNT_IGNORE); 3657 MNT_FLAG(MNT_UPDATE); 3658 MNT_FLAG(MNT_DELEXPORT); 3659 MNT_FLAG(MNT_RELOAD); 3660 MNT_FLAG(MNT_FORCE); 3661 MNT_FLAG(MNT_SNAPSHOT); 3662 MNT_FLAG(MNT_BYFSID); 3663 #undef MNT_FLAG 3664 if (mflags != 0) { 3665 if (buf[0] != '\0') 3666 strlcat(buf, ", ", sizeof(buf)); 3667 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3668 "0x%016jx", mflags); 3669 } 3670 db_printf(" mnt_flag = %s\n", buf); 3671 3672 buf[0] = '\0'; 3673 flags = mp->mnt_kern_flag; 3674 #define MNT_KERN_FLAG(flag) do { \ 3675 if (flags & (flag)) { \ 3676 if (buf[0] != '\0') \ 3677 strlcat(buf, ", ", sizeof(buf)); \ 3678 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 3679 flags &= ~(flag); \ 3680 } \ 3681 } while (0) 3682 MNT_KERN_FLAG(MNTK_UNMOUNTF); 3683 MNT_KERN_FLAG(MNTK_ASYNC); 3684 
MNT_KERN_FLAG(MNTK_SOFTDEP); 3685 MNT_KERN_FLAG(MNTK_NOINSMNTQ); 3686 MNT_KERN_FLAG(MNTK_DRAINING); 3687 MNT_KERN_FLAG(MNTK_REFEXPIRE); 3688 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 3689 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 3690 MNT_KERN_FLAG(MNTK_NO_IOPF); 3691 MNT_KERN_FLAG(MNTK_VGONE_UPPER); 3692 MNT_KERN_FLAG(MNTK_VGONE_WAITER); 3693 MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); 3694 MNT_KERN_FLAG(MNTK_MARKER); 3695 MNT_KERN_FLAG(MNTK_USES_BCACHE); 3696 MNT_KERN_FLAG(MNTK_NOASYNC); 3697 MNT_KERN_FLAG(MNTK_UNMOUNT); 3698 MNT_KERN_FLAG(MNTK_MWAIT); 3699 MNT_KERN_FLAG(MNTK_SUSPEND); 3700 MNT_KERN_FLAG(MNTK_SUSPEND2); 3701 MNT_KERN_FLAG(MNTK_SUSPENDED); 3702 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 3703 MNT_KERN_FLAG(MNTK_NOKNOTE); 3704 #undef MNT_KERN_FLAG 3705 if (flags != 0) { 3706 if (buf[0] != '\0') 3707 strlcat(buf, ", ", sizeof(buf)); 3708 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3709 "0x%08x", flags); 3710 } 3711 db_printf(" mnt_kern_flag = %s\n", buf); 3712 3713 db_printf(" mnt_opt = "); 3714 opt = TAILQ_FIRST(mp->mnt_opt); 3715 if (opt != NULL) { 3716 db_printf("%s", opt->name); 3717 opt = TAILQ_NEXT(opt, link); 3718 while (opt != NULL) { 3719 db_printf(", %s", opt->name); 3720 opt = TAILQ_NEXT(opt, link); 3721 } 3722 } 3723 db_printf("\n"); 3724 3725 sp = &mp->mnt_stat; 3726 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 3727 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 3728 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 3729 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 3730 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 3731 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 3732 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 3733 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 3734 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 3735 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 3736 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 3737 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 3738 3739 db_printf(" mnt_cred = { uid=%u ruid=%u", 3740 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 3741 if (jailed(mp->mnt_cred)) 3742 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 3743 db_printf(" }\n"); 3744 db_printf(" mnt_ref = %d\n", mp->mnt_ref); 3745 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 3746 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 3747 db_printf(" mnt_activevnodelistsize = %d\n", 3748 mp->mnt_activevnodelistsize); 3749 db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); 3750 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); 3751 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 3752 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 3753 db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); 3754 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 3755 db_printf(" mnt_secondary_accwrites = %d\n", 3756 mp->mnt_secondary_accwrites); 3757 db_printf(" mnt_gjprovider = %s\n", 3758 mp->mnt_gjprovider != NULL ? 
mp->mnt_gjprovider : "NULL"); 3759 3760 db_printf("\n\nList of active vnodes\n"); 3761 TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { 3762 if (vp->v_type != VMARKER) { 3763 vn_printf(vp, "vnode "); 3764 if (db_pager_quit) 3765 break; 3766 } 3767 } 3768 db_printf("\n\nList of inactive vnodes\n"); 3769 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3770 if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { 3771 vn_printf(vp, "vnode "); 3772 if (db_pager_quit) 3773 break; 3774 } 3775 } 3776 } 3777 #endif /* DDB */ 3778 3779 /* 3780 * Fill in a struct xvfsconf based on a struct vfsconf. 3781 */ 3782 static int 3783 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 3784 { 3785 struct xvfsconf xvfsp; 3786 3787 bzero(&xvfsp, sizeof(xvfsp)); 3788 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3789 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3790 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3791 xvfsp.vfc_flags = vfsp->vfc_flags; 3792 /* 3793 * These are unused in userland, we keep them 3794 * to not break binary compatibility. 3795 */ 3796 xvfsp.vfc_vfsops = NULL; 3797 xvfsp.vfc_next = NULL; 3798 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3799 } 3800 3801 #ifdef COMPAT_FREEBSD32 3802 struct xvfsconf32 { 3803 uint32_t vfc_vfsops; 3804 char vfc_name[MFSNAMELEN]; 3805 int32_t vfc_typenum; 3806 int32_t vfc_refcount; 3807 int32_t vfc_flags; 3808 uint32_t vfc_next; 3809 }; 3810 3811 static int 3812 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 3813 { 3814 struct xvfsconf32 xvfsp; 3815 3816 bzero(&xvfsp, sizeof(xvfsp)); 3817 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3818 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3819 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3820 xvfsp.vfc_flags = vfsp->vfc_flags; 3821 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3822 } 3823 #endif 3824 3825 /* 3826 * Top level filesystem related information gathering. 3827 */ 3828 static int 3829 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 3830 { 3831 struct vfsconf *vfsp; 3832 int error; 3833 3834 error = 0; 3835 vfsconf_slock(); 3836 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3837 #ifdef COMPAT_FREEBSD32 3838 if (req->flags & SCTL_MASK32) 3839 error = vfsconf2x32(req, vfsp); 3840 else 3841 #endif 3842 error = vfsconf2x(req, vfsp); 3843 if (error) 3844 break; 3845 } 3846 vfsconf_sunlock(); 3847 return (error); 3848 } 3849 3850 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 3851 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 3852 "S,xvfsconf", "List of all configured filesystems"); 3853 3854 #ifndef BURN_BRIDGES 3855 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 3856 3857 static int 3858 vfs_sysctl(SYSCTL_HANDLER_ARGS) 3859 { 3860 int *name = (int *)arg1 - 1; /* XXX */ 3861 u_int namelen = arg2 + 1; /* XXX */ 3862 struct vfsconf *vfsp; 3863 3864 log(LOG_WARNING, "userland calling deprecated sysctl, " 3865 "please rebuild world\n"); 3866 3867 #if 1 || defined(COMPAT_PRELITE2) 3868 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 3869 if (namelen == 1) 3870 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 3871 #endif 3872 3873 switch (name[1]) { 3874 case VFS_MAXTYPENUM: 3875 if (namelen != 2) 3876 return (ENOTDIR); 3877 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 3878 case VFS_CONF: 3879 if (namelen != 3) 3880 return (ENOTDIR); /* overloaded */ 3881 vfsconf_slock(); 3882 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3883 if (vfsp->vfc_typenum == name[2]) 3884 break; 3885 } 3886 vfsconf_sunlock(); 3887 if (vfsp == NULL) 3888 return (EOPNOTSUPP); 3889 #ifdef COMPAT_FREEBSD32 3890 if (req->flags & SCTL_MASK32) 3891 return (vfsconf2x32(req, vfsp)); 3892 else 3893 #endif 3894 return (vfsconf2x(req, vfsp)); 3895 } 3896 return (EOPNOTSUPP); 3897 } 3898 3899 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 3900 CTLFLAG_MPSAFE, vfs_sysctl, 3901 "Generic filesystem"); 3902 3903 #if 1 || defined(COMPAT_PRELITE2) 3904 3905 static int 3906 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 3907 { 3908 int error; 3909 struct vfsconf *vfsp; 3910 struct ovfsconf ovfs; 3911 3912 vfsconf_slock(); 3913 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3914 bzero(&ovfs, sizeof(ovfs)); 3915 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 3916 strcpy(ovfs.vfc_name, vfsp->vfc_name); 3917 ovfs.vfc_index = vfsp->vfc_typenum; 3918 ovfs.vfc_refcount = vfsp->vfc_refcount; 3919 ovfs.vfc_flags = vfsp->vfc_flags; 3920 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 3921 if (error != 0) { 3922 vfsconf_sunlock(); 3923 return (error); 3924 } 3925 } 3926 vfsconf_sunlock(); 3927 return (0); 3928 } 3929 3930 #endif /* 1 || COMPAT_PRELITE2 */ 3931 #endif /* !BURN_BRIDGES */ 3932 3933 #define KINFO_VNODESLOP 10 3934 #ifdef notyet 3935 /* 3936 * Dump vnode list (via sysctl). 3937 */ 3938 /* ARGSUSED */ 3939 static int 3940 sysctl_vnode(SYSCTL_HANDLER_ARGS) 3941 { 3942 struct xvnode *xvn; 3943 struct mount *mp; 3944 struct vnode *vp; 3945 int error, len, n; 3946 3947 /* 3948 * Stale numvnodes access is not fatal here. 3949 */ 3950 req->lock = 0; 3951 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 3952 if (!req->oldptr) 3953 /* Make an estimate */ 3954 return (SYSCTL_OUT(req, 0, len)); 3955 3956 error = sysctl_wire_old_buffer(req, 0); 3957 if (error != 0) 3958 return (error); 3959 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 3960 n = 0; 3961 mtx_lock(&mountlist_mtx); 3962 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3963 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 3964 continue; 3965 MNT_ILOCK(mp); 3966 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3967 if (n == len) 3968 break; 3969 vref(vp); 3970 xvn[n].xv_size = sizeof *xvn; 3971 xvn[n].xv_vnode = vp; 3972 xvn[n].xv_id = 0; /* XXX compat */ 3973 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 3974 XV_COPY(usecount); 3975 XV_COPY(writecount); 3976 XV_COPY(holdcnt); 3977 XV_COPY(mount); 3978 XV_COPY(numoutput); 3979 XV_COPY(type); 3980 #undef XV_COPY 3981 xvn[n].xv_flag = vp->v_vflag; 3982 3983 switch (vp->v_type) { 3984 case VREG: 3985 case VDIR: 3986 case VLNK: 3987 break; 3988 case VBLK: 3989 case VCHR: 3990 if (vp->v_rdev == NULL) { 3991 vrele(vp); 3992 continue; 3993 } 3994 xvn[n].xv_dev = dev2udev(vp->v_rdev); 3995 break; 3996 case VSOCK: 3997 xvn[n].xv_socket = vp->v_socket; 3998 break; 3999 case VFIFO: 4000 xvn[n].xv_fifo = vp->v_fifoinfo; 4001 break; 4002 case VNON: 4003 case VBAD: 4004 default: 4005 /* shouldn't happen? 
*/ 4006 vrele(vp); 4007 continue; 4008 } 4009 vrele(vp); 4010 ++n; 4011 } 4012 MNT_IUNLOCK(mp); 4013 mtx_lock(&mountlist_mtx); 4014 vfs_unbusy(mp); 4015 if (n == len) 4016 break; 4017 } 4018 mtx_unlock(&mountlist_mtx); 4019 4020 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 4021 free(xvn, M_TEMP); 4022 return (error); 4023 } 4024 4025 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | 4026 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", 4027 ""); 4028 #endif 4029 4030 static void 4031 unmount_or_warn(struct mount *mp) 4032 { 4033 int error; 4034 4035 error = dounmount(mp, MNT_FORCE, curthread); 4036 if (error != 0) { 4037 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 4038 if (error == EBUSY) 4039 printf("BUSY)\n"); 4040 else 4041 printf("%d)\n", error); 4042 } 4043 } 4044 4045 /* 4046 * Unmount all filesystems. The list is traversed in reverse order 4047 * of mounting to avoid dependencies. 4048 */ 4049 void 4050 vfs_unmountall(void) 4051 { 4052 struct mount *mp, *tmp; 4053 4054 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 4055 4056 /* 4057 * Since this only runs when rebooting, it is not interlocked. 4058 */ 4059 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 4060 vfs_ref(mp); 4061 4062 /* 4063 * Forcibly unmounting "/dev" before "/" would prevent clean 4064 * unmount of the latter. 4065 */ 4066 if (mp == rootdevmp) 4067 continue; 4068 4069 unmount_or_warn(mp); 4070 } 4071 4072 if (rootdevmp != NULL) 4073 unmount_or_warn(rootdevmp); 4074 } 4075 4076 /* 4077 * perform msync on all vnodes under a mount point 4078 * the mount point must be locked. 4079 */ 4080 void 4081 vfs_msync(struct mount *mp, int flags) 4082 { 4083 struct vnode *vp, *mvp; 4084 struct vm_object *obj; 4085 4086 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 4087 4088 vnlru_return_batch(mp); 4089 4090 MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { 4091 obj = vp->v_object; 4092 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && 4093 (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { 4094 if (!vget(vp, 4095 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, 4096 curthread)) { 4097 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ 4098 vput(vp); 4099 continue; 4100 } 4101 4102 obj = vp->v_object; 4103 if (obj != NULL) { 4104 VM_OBJECT_WLOCK(obj); 4105 vm_object_page_clean(obj, 0, 0, 4106 flags == MNT_WAIT ? 4107 OBJPC_SYNC : OBJPC_NOSYNC); 4108 VM_OBJECT_WUNLOCK(obj); 4109 } 4110 vput(vp); 4111 } 4112 } else 4113 VI_UNLOCK(vp); 4114 } 4115 } 4116 4117 static void 4118 destroy_vpollinfo_free(struct vpollinfo *vi) 4119 { 4120 4121 knlist_destroy(&vi->vpi_selinfo.si_note); 4122 mtx_destroy(&vi->vpi_lock); 4123 uma_zfree(vnodepoll_zone, vi); 4124 } 4125 4126 static void 4127 destroy_vpollinfo(struct vpollinfo *vi) 4128 { 4129 4130 knlist_clear(&vi->vpi_selinfo.si_note, 1); 4131 seldrain(&vi->vpi_selinfo); 4132 destroy_vpollinfo_free(vi); 4133 } 4134 4135 /* 4136 * Initialize per-vnode helper structure to hold poll-related state. 
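 *
 * The structure is allocated before the vnode interlock is taken; the
 * recheck under VI_LOCK() below discards the new allocation if another
 * thread installed one first.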
 */
void
v_addpollinfo(struct vnode *vp)
{
	struct vpollinfo *vi;

	if (vp->v_pollinfo != NULL)
		return;
	vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
	VI_LOCK(vp);
	if (vp->v_pollinfo != NULL) {
		VI_UNLOCK(vp);
		destroy_vpollinfo_free(vi);
		return;
	}
	vp->v_pollinfo = vi;
	VI_UNLOCK(vp);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{

	v_addpollinfo(vp);
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	if (vp->v_pollinfo->vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo->vpi_revents;
		vp->v_pollinfo->vpi_revents &= ~events;

		mtx_unlock(&vp->v_pollinfo->vpi_lock);
		return (events);
	}
	vp->v_pollinfo->vpi_events |= events;
	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	return (0);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close ((int (*)(struct vop_close_args *))nullop)
static int	sync_fsync(struct vop_fsync_args *);
static int	sync_inactive(struct vop_inactive_args *);
static int	sync_reclaim(struct vop_reclaim_args *);

static struct vop_vector sync_vnodeops = {
	.vop_bypass =	VOP_EOPNOTSUPP,
	.vop_close =	sync_close,		/* close */
	.vop_fsync =	sync_fsync,		/* fsync */
	.vop_inactive =	sync_inactive,		/* inactive */
	.vop_reclaim =	sync_reclaim,		/* reclaim */
	.vop_lock1 =	vop_stdlock,		/* lock */
	.vop_unlock =	vop_stdunlock,		/* unlock */
	.vop_islocked =	vop_stdislocked,	/* islocked */
};

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
void
vfs_allocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;
	struct bufobj *bo;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: getnewvnode() failed");
	vp->v_type = VNON;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	vp->v_vflag |= VV_FORCEINSMQ;
	error = insmntque(vp, mp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: insmntque() failed");
	vp->v_vflag &= ~VV_FORCEINSMQ;
	VOP_UNLOCK(vp, 0);
	/*
	 * Place the vnode onto the syncer worklist.  We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
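	/*
	 * Illustrative walk-through (not part of the original comment):
	 * assuming the default syncer_maxdelay of 32, successive calls
	 * compute next = 16, 8, 24, 4, 12, 20, 28, 2, 6, ..., so each new
	 * syncer vnode lands roughly halfway between worklist slots that
	 * are already in use instead of piling onto a single slot.
	 */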
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
	mtx_lock(&sync_mtx);
	sync_vnode_count++;
	if (mp->mnt_syncer == NULL) {
		mp->mnt_syncer = vp;
		vp = NULL;
	}
	mtx_unlock(&sync_mtx);
	BO_UNLOCK(bo);
	if (vp != NULL) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vgone(vp);
		vput(vp);
	}
}

void
vfs_deallocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;

	mtx_lock(&sync_mtx);
	vp = mp->mnt_syncer;
	if (vp != NULL)
		mp->mnt_syncer = NULL;
	mtx_unlock(&sync_mtx);
	if (vp != NULL)
		vrele(vp);
}

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(struct vop_fsync_args *ap)
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	int error, save;
	struct bufobj *bo;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	bo = &syncvp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay);
	BO_UNLOCK(bo);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	if (vfs_busy(mp, MBF_NOWAIT) != 0)
		return (0);
	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
		vfs_unbusy(mp);
		return (0);
	}
	save = curthread_pflags_set(TDP_SYNCIO);
	vfs_msync(mp, MNT_NOWAIT);
	error = VFS_SYNC(mp, MNT_LAZY);
	curthread_pflags_restore(save);
	vn_finished_write(mp);
	vfs_unbusy(mp);
	return (error);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(struct vop_inactive_args *ap)
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected by sync_mtx.
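 *
 * The lock order below matches vfs_allocate_syncvnode() above: the bufobj
 * lock is taken before sync_mtx.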
4343 */ 4344 static int 4345 sync_reclaim(struct vop_reclaim_args *ap) 4346 { 4347 struct vnode *vp = ap->a_vp; 4348 struct bufobj *bo; 4349 4350 bo = &vp->v_bufobj; 4351 BO_LOCK(bo); 4352 mtx_lock(&sync_mtx); 4353 if (vp->v_mount->mnt_syncer == vp) 4354 vp->v_mount->mnt_syncer = NULL; 4355 if (bo->bo_flag & BO_ONWORKLST) { 4356 LIST_REMOVE(bo, bo_synclist); 4357 syncer_worklist_len--; 4358 sync_vnode_count--; 4359 bo->bo_flag &= ~BO_ONWORKLST; 4360 } 4361 mtx_unlock(&sync_mtx); 4362 BO_UNLOCK(bo); 4363 4364 return (0); 4365 } 4366 4367 /* 4368 * Check if vnode represents a disk device 4369 */ 4370 int 4371 vn_isdisk(struct vnode *vp, int *errp) 4372 { 4373 int error; 4374 4375 if (vp->v_type != VCHR) { 4376 error = ENOTBLK; 4377 goto out; 4378 } 4379 error = 0; 4380 dev_lock(); 4381 if (vp->v_rdev == NULL) 4382 error = ENXIO; 4383 else if (vp->v_rdev->si_devsw == NULL) 4384 error = ENXIO; 4385 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 4386 error = ENOTBLK; 4387 dev_unlock(); 4388 out: 4389 if (errp != NULL) 4390 *errp = error; 4391 return (error == 0); 4392 } 4393 4394 /* 4395 * Common filesystem object access control check routine. Accepts a 4396 * vnode's type, "mode", uid and gid, requested access mode, credentials, 4397 * and optional call-by-reference privused argument allowing vaccess() 4398 * to indicate to the caller whether privilege was used to satisfy the 4399 * request (obsoleted). Returns 0 on success, or an errno on failure. 4400 */ 4401 int 4402 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 4403 accmode_t accmode, struct ucred *cred, int *privused) 4404 { 4405 accmode_t dac_granted; 4406 accmode_t priv_granted; 4407 4408 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 4409 ("invalid bit in accmode")); 4410 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 4411 ("VAPPEND without VWRITE")); 4412 4413 /* 4414 * Look for a normal, non-privileged way to access the file/directory 4415 * as requested. If it exists, go with that. 4416 */ 4417 4418 if (privused != NULL) 4419 *privused = 0; 4420 4421 dac_granted = 0; 4422 4423 /* Check the owner. */ 4424 if (cred->cr_uid == file_uid) { 4425 dac_granted |= VADMIN; 4426 if (file_mode & S_IXUSR) 4427 dac_granted |= VEXEC; 4428 if (file_mode & S_IRUSR) 4429 dac_granted |= VREAD; 4430 if (file_mode & S_IWUSR) 4431 dac_granted |= (VWRITE | VAPPEND); 4432 4433 if ((accmode & dac_granted) == accmode) 4434 return (0); 4435 4436 goto privcheck; 4437 } 4438 4439 /* Otherwise, check the groups (first match) */ 4440 if (groupmember(file_gid, cred)) { 4441 if (file_mode & S_IXGRP) 4442 dac_granted |= VEXEC; 4443 if (file_mode & S_IRGRP) 4444 dac_granted |= VREAD; 4445 if (file_mode & S_IWGRP) 4446 dac_granted |= (VWRITE | VAPPEND); 4447 4448 if ((accmode & dac_granted) == accmode) 4449 return (0); 4450 4451 goto privcheck; 4452 } 4453 4454 /* Otherwise, check everyone else. */ 4455 if (file_mode & S_IXOTH) 4456 dac_granted |= VEXEC; 4457 if (file_mode & S_IROTH) 4458 dac_granted |= VREAD; 4459 if (file_mode & S_IWOTH) 4460 dac_granted |= (VWRITE | VAPPEND); 4461 if ((accmode & dac_granted) == accmode) 4462 return (0); 4463 4464 privcheck: 4465 /* 4466 * Build a privilege mask to determine if the set of privileges 4467 * satisfies the requirements when combined with the granted mask 4468 * from above. For each privilege, if the privilege is required, 4469 * bitwise or the request type onto the priv_granted mask. 
4470 */ 4471 priv_granted = 0; 4472 4473 if (type == VDIR) { 4474 /* 4475 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 4476 * requests, instead of PRIV_VFS_EXEC. 4477 */ 4478 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 4479 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 4480 priv_granted |= VEXEC; 4481 } else { 4482 /* 4483 * Ensure that at least one execute bit is on. Otherwise, 4484 * a privileged user will always succeed, and we don't want 4485 * this to happen unless the file really is executable. 4486 */ 4487 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 4488 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 4489 !priv_check_cred(cred, PRIV_VFS_EXEC)) 4490 priv_granted |= VEXEC; 4491 } 4492 4493 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 4494 !priv_check_cred(cred, PRIV_VFS_READ)) 4495 priv_granted |= VREAD; 4496 4497 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 4498 !priv_check_cred(cred, PRIV_VFS_WRITE)) 4499 priv_granted |= (VWRITE | VAPPEND); 4500 4501 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 4502 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 4503 priv_granted |= VADMIN; 4504 4505 if ((accmode & (priv_granted | dac_granted)) == accmode) { 4506 /* XXX audit: privilege used */ 4507 if (privused != NULL) 4508 *privused = 1; 4509 return (0); 4510 } 4511 4512 return ((accmode & VADMIN) ? EPERM : EACCES); 4513 } 4514 4515 /* 4516 * Credential check based on process requesting service, and per-attribute 4517 * permissions. 4518 */ 4519 int 4520 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 4521 struct thread *td, accmode_t accmode) 4522 { 4523 4524 /* 4525 * Kernel-invoked always succeeds. 4526 */ 4527 if (cred == NOCRED) 4528 return (0); 4529 4530 /* 4531 * Do not allow privileged processes in jail to directly manipulate 4532 * system attributes. 4533 */ 4534 switch (attrnamespace) { 4535 case EXTATTR_NAMESPACE_SYSTEM: 4536 /* Potentially should be: return (EPERM); */ 4537 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 4538 case EXTATTR_NAMESPACE_USER: 4539 return (VOP_ACCESS(vp, accmode, cred, td)); 4540 default: 4541 return (EPERM); 4542 } 4543 } 4544 4545 #ifdef DEBUG_VFS_LOCKS 4546 /* 4547 * This only exists to suppress warnings from unlocked specfs accesses. It is 4548 * no longer ok to have an unlocked VFS. 4549 */ 4550 #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ 4551 (vp)->v_type == VCHR || (vp)->v_type == VBAD) 4552 4553 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 4554 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 4555 "Drop into debugger on lock violation"); 4556 4557 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 4558 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 4559 0, "Check for interlock across VOPs"); 4560 4561 int vfs_badlock_print = 1; /* Print lock violations. */ 4562 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 4563 0, "Print lock violations"); 4564 4565 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 4566 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 4567 0, "Print vnode details on lock violations"); 4568 4569 #ifdef KDB 4570 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
*/ 4571 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 4572 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 4573 #endif 4574 4575 static void 4576 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 4577 { 4578 4579 #ifdef KDB 4580 if (vfs_badlock_backtrace) 4581 kdb_backtrace(); 4582 #endif 4583 if (vfs_badlock_vnode) 4584 vn_printf(vp, "vnode "); 4585 if (vfs_badlock_print) 4586 printf("%s: %p %s\n", str, (void *)vp, msg); 4587 if (vfs_badlock_ddb) 4588 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4589 } 4590 4591 void 4592 assert_vi_locked(struct vnode *vp, const char *str) 4593 { 4594 4595 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 4596 vfs_badlock("interlock is not locked but should be", str, vp); 4597 } 4598 4599 void 4600 assert_vi_unlocked(struct vnode *vp, const char *str) 4601 { 4602 4603 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 4604 vfs_badlock("interlock is locked but should not be", str, vp); 4605 } 4606 4607 void 4608 assert_vop_locked(struct vnode *vp, const char *str) 4609 { 4610 int locked; 4611 4612 if (!IGNORE_LOCK(vp)) { 4613 locked = VOP_ISLOCKED(vp); 4614 if (locked == 0 || locked == LK_EXCLOTHER) 4615 vfs_badlock("is not locked but should be", str, vp); 4616 } 4617 } 4618 4619 void 4620 assert_vop_unlocked(struct vnode *vp, const char *str) 4621 { 4622 4623 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 4624 vfs_badlock("is locked but should not be", str, vp); 4625 } 4626 4627 void 4628 assert_vop_elocked(struct vnode *vp, const char *str) 4629 { 4630 4631 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 4632 vfs_badlock("is not exclusive locked but should be", str, vp); 4633 } 4634 #endif /* DEBUG_VFS_LOCKS */ 4635 4636 void 4637 vop_rename_fail(struct vop_rename_args *ap) 4638 { 4639 4640 if (ap->a_tvp != NULL) 4641 vput(ap->a_tvp); 4642 if (ap->a_tdvp == ap->a_tvp) 4643 vrele(ap->a_tdvp); 4644 else 4645 vput(ap->a_tdvp); 4646 vrele(ap->a_fdvp); 4647 vrele(ap->a_fvp); 4648 } 4649 4650 void 4651 vop_rename_pre(void *ap) 4652 { 4653 struct vop_rename_args *a = ap; 4654 4655 #ifdef DEBUG_VFS_LOCKS 4656 if (a->a_tvp) 4657 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 4658 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 4659 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 4660 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 4661 4662 /* Check the source (from). */ 4663 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 4664 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 4665 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 4666 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 4667 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 4668 4669 /* Check the target. */ 4670 if (a->a_tvp) 4671 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 4672 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 4673 #endif 4674 if (a->a_tdvp != a->a_fdvp) 4675 vhold(a->a_fdvp); 4676 if (a->a_tvp != a->a_fvp) 4677 vhold(a->a_fvp); 4678 vhold(a->a_tdvp); 4679 if (a->a_tvp) 4680 vhold(a->a_tvp); 4681 } 4682 4683 #ifdef DEBUG_VFS_LOCKS 4684 void 4685 vop_strategy_pre(void *ap) 4686 { 4687 struct vop_strategy_args *a; 4688 struct buf *bp; 4689 4690 a = ap; 4691 bp = a->a_bp; 4692 4693 /* 4694 * Cluster ops lock their component buffers but not the IO container. 
4695 */ 4696 if ((bp->b_flags & B_CLUSTER) != 0) 4697 return; 4698 4699 if (panicstr == NULL && !BUF_ISLOCKED(bp)) { 4700 if (vfs_badlock_print) 4701 printf( 4702 "VOP_STRATEGY: bp is not locked but should be\n"); 4703 if (vfs_badlock_ddb) 4704 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4705 } 4706 } 4707 4708 void 4709 vop_lock_pre(void *ap) 4710 { 4711 struct vop_lock1_args *a = ap; 4712 4713 if ((a->a_flags & LK_INTERLOCK) == 0) 4714 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4715 else 4716 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 4717 } 4718 4719 void 4720 vop_lock_post(void *ap, int rc) 4721 { 4722 struct vop_lock1_args *a = ap; 4723 4724 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4725 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 4726 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 4727 } 4728 4729 void 4730 vop_unlock_pre(void *ap) 4731 { 4732 struct vop_unlock_args *a = ap; 4733 4734 if (a->a_flags & LK_INTERLOCK) 4735 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); 4736 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 4737 } 4738 4739 void 4740 vop_unlock_post(void *ap, int rc) 4741 { 4742 struct vop_unlock_args *a = ap; 4743 4744 if (a->a_flags & LK_INTERLOCK) 4745 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); 4746 } 4747 #endif 4748 4749 void 4750 vop_create_post(void *ap, int rc) 4751 { 4752 struct vop_create_args *a = ap; 4753 4754 if (!rc) 4755 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4756 } 4757 4758 void 4759 vop_deleteextattr_post(void *ap, int rc) 4760 { 4761 struct vop_deleteextattr_args *a = ap; 4762 4763 if (!rc) 4764 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4765 } 4766 4767 void 4768 vop_link_post(void *ap, int rc) 4769 { 4770 struct vop_link_args *a = ap; 4771 4772 if (!rc) { 4773 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); 4774 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); 4775 } 4776 } 4777 4778 void 4779 vop_mkdir_post(void *ap, int rc) 4780 { 4781 struct vop_mkdir_args *a = ap; 4782 4783 if (!rc) 4784 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4785 } 4786 4787 void 4788 vop_mknod_post(void *ap, int rc) 4789 { 4790 struct vop_mknod_args *a = ap; 4791 4792 if (!rc) 4793 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4794 } 4795 4796 void 4797 vop_reclaim_post(void *ap, int rc) 4798 { 4799 struct vop_reclaim_args *a = ap; 4800 4801 if (!rc) 4802 VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE); 4803 } 4804 4805 void 4806 vop_remove_post(void *ap, int rc) 4807 { 4808 struct vop_remove_args *a = ap; 4809 4810 if (!rc) { 4811 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4812 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4813 } 4814 } 4815 4816 void 4817 vop_rename_post(void *ap, int rc) 4818 { 4819 struct vop_rename_args *a = ap; 4820 long hint; 4821 4822 if (!rc) { 4823 hint = NOTE_WRITE; 4824 if (a->a_fdvp == a->a_tdvp) { 4825 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 4826 hint |= NOTE_LINK; 4827 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 4828 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 4829 } else { 4830 hint |= NOTE_EXTEND; 4831 if (a->a_fvp->v_type == VDIR) 4832 hint |= NOTE_LINK; 4833 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 4834 4835 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 4836 a->a_tvp->v_type == VDIR) 4837 hint &= ~NOTE_LINK; 4838 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 4839 } 4840 4841 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 4842 if (a->a_tvp) 4843 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 4844 } 4845 if (a->a_tdvp != a->a_fdvp) 4846 vdrop(a->a_fdvp); 4847 if (a->a_tvp != a->a_fvp) 4848 vdrop(a->a_fvp); 4849 vdrop(a->a_tdvp); 4850 if (a->a_tvp) 4851 vdrop(a->a_tvp); 4852 } 4853 4854 void 4855 
vop_rmdir_post(void *ap, int rc) 4856 { 4857 struct vop_rmdir_args *a = ap; 4858 4859 if (!rc) { 4860 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4861 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4862 } 4863 } 4864 4865 void 4866 vop_setattr_post(void *ap, int rc) 4867 { 4868 struct vop_setattr_args *a = ap; 4869 4870 if (!rc) 4871 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4872 } 4873 4874 void 4875 vop_setextattr_post(void *ap, int rc) 4876 { 4877 struct vop_setextattr_args *a = ap; 4878 4879 if (!rc) 4880 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4881 } 4882 4883 void 4884 vop_symlink_post(void *ap, int rc) 4885 { 4886 struct vop_symlink_args *a = ap; 4887 4888 if (!rc) 4889 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4890 } 4891 4892 void 4893 vop_open_post(void *ap, int rc) 4894 { 4895 struct vop_open_args *a = ap; 4896 4897 if (!rc) 4898 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 4899 } 4900 4901 void 4902 vop_close_post(void *ap, int rc) 4903 { 4904 struct vop_close_args *a = ap; 4905 4906 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 4907 (a->a_vp->v_iflag & VI_DOOMED) == 0)) { 4908 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 4909 NOTE_CLOSE_WRITE : NOTE_CLOSE); 4910 } 4911 } 4912 4913 void 4914 vop_read_post(void *ap, int rc) 4915 { 4916 struct vop_read_args *a = ap; 4917 4918 if (!rc) 4919 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 4920 } 4921 4922 void 4923 vop_readdir_post(void *ap, int rc) 4924 { 4925 struct vop_readdir_args *a = ap; 4926 4927 if (!rc) 4928 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 4929 } 4930 4931 static struct knlist fs_knlist; 4932 4933 static void 4934 vfs_event_init(void *arg) 4935 { 4936 knlist_init_mtx(&fs_knlist, NULL); 4937 } 4938 /* XXX - correct order? */ 4939 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 4940 4941 void 4942 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 4943 { 4944 4945 KNOTE_UNLOCKED(&fs_knlist, event); 4946 } 4947 4948 static int filt_fsattach(struct knote *kn); 4949 static void filt_fsdetach(struct knote *kn); 4950 static int filt_fsevent(struct knote *kn, long hint); 4951 4952 struct filterops fs_filtops = { 4953 .f_isfd = 0, 4954 .f_attach = filt_fsattach, 4955 .f_detach = filt_fsdetach, 4956 .f_event = filt_fsevent 4957 }; 4958 4959 static int 4960 filt_fsattach(struct knote *kn) 4961 { 4962 4963 kn->kn_flags |= EV_CLEAR; 4964 knlist_add(&fs_knlist, kn, 0); 4965 return (0); 4966 } 4967 4968 static void 4969 filt_fsdetach(struct knote *kn) 4970 { 4971 4972 knlist_remove(&fs_knlist, kn, 0); 4973 } 4974 4975 static int 4976 filt_fsevent(struct knote *kn, long hint) 4977 { 4978 4979 kn->kn_fflags |= hint; 4980 return (kn->kn_fflags != 0); 4981 } 4982 4983 static int 4984 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 4985 { 4986 struct vfsidctl vc; 4987 int error; 4988 struct mount *mp; 4989 4990 error = SYSCTL_IN(req, &vc, sizeof(vc)); 4991 if (error) 4992 return (error); 4993 if (vc.vc_vers != VFS_CTL_VERS1) 4994 return (EINVAL); 4995 mp = vfs_getvfs(&vc.vc_fsid); 4996 if (mp == NULL) 4997 return (ENOENT); 4998 /* ensure that a specific sysctl goes to the right filesystem. 
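 * (a vc_fstypename of "*" is treated as a wildcard and matches any
 * filesystem type)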
*/ 4999 if (strcmp(vc.vc_fstypename, "*") != 0 && 5000 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 5001 vfs_rel(mp); 5002 return (EINVAL); 5003 } 5004 VCTLTOREQ(&vc, req); 5005 error = VFS_SYSCTL(mp, vc.vc_op, req); 5006 vfs_rel(mp); 5007 return (error); 5008 } 5009 5010 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, 5011 NULL, 0, sysctl_vfs_ctl, "", 5012 "Sysctl by fsid"); 5013 5014 /* 5015 * Function to initialize a va_filerev field sensibly. 5016 * XXX: Wouldn't a random number make a lot more sense ?? 5017 */ 5018 u_quad_t 5019 init_va_filerev(void) 5020 { 5021 struct bintime bt; 5022 5023 getbinuptime(&bt); 5024 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 5025 } 5026 5027 static int filt_vfsread(struct knote *kn, long hint); 5028 static int filt_vfswrite(struct knote *kn, long hint); 5029 static int filt_vfsvnode(struct knote *kn, long hint); 5030 static void filt_vfsdetach(struct knote *kn); 5031 static struct filterops vfsread_filtops = { 5032 .f_isfd = 1, 5033 .f_detach = filt_vfsdetach, 5034 .f_event = filt_vfsread 5035 }; 5036 static struct filterops vfswrite_filtops = { 5037 .f_isfd = 1, 5038 .f_detach = filt_vfsdetach, 5039 .f_event = filt_vfswrite 5040 }; 5041 static struct filterops vfsvnode_filtops = { 5042 .f_isfd = 1, 5043 .f_detach = filt_vfsdetach, 5044 .f_event = filt_vfsvnode 5045 }; 5046 5047 static void 5048 vfs_knllock(void *arg) 5049 { 5050 struct vnode *vp = arg; 5051 5052 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5053 } 5054 5055 static void 5056 vfs_knlunlock(void *arg) 5057 { 5058 struct vnode *vp = arg; 5059 5060 VOP_UNLOCK(vp, 0); 5061 } 5062 5063 static void 5064 vfs_knl_assert_locked(void *arg) 5065 { 5066 #ifdef DEBUG_VFS_LOCKS 5067 struct vnode *vp = arg; 5068 5069 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 5070 #endif 5071 } 5072 5073 static void 5074 vfs_knl_assert_unlocked(void *arg) 5075 { 5076 #ifdef DEBUG_VFS_LOCKS 5077 struct vnode *vp = arg; 5078 5079 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 5080 #endif 5081 } 5082 5083 int 5084 vfs_kqfilter(struct vop_kqfilter_args *ap) 5085 { 5086 struct vnode *vp = ap->a_vp; 5087 struct knote *kn = ap->a_kn; 5088 struct knlist *knl; 5089 5090 switch (kn->kn_filter) { 5091 case EVFILT_READ: 5092 kn->kn_fop = &vfsread_filtops; 5093 break; 5094 case EVFILT_WRITE: 5095 kn->kn_fop = &vfswrite_filtops; 5096 break; 5097 case EVFILT_VNODE: 5098 kn->kn_fop = &vfsvnode_filtops; 5099 break; 5100 default: 5101 return (EINVAL); 5102 } 5103 5104 kn->kn_hook = (caddr_t)vp; 5105 5106 v_addpollinfo(vp); 5107 if (vp->v_pollinfo == NULL) 5108 return (ENOMEM); 5109 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 5110 vhold(vp); 5111 knlist_add(knl, kn, 0); 5112 5113 return (0); 5114 } 5115 5116 /* 5117 * Detach knote from vnode 5118 */ 5119 static void 5120 filt_vfsdetach(struct knote *kn) 5121 { 5122 struct vnode *vp = (struct vnode *)kn->kn_hook; 5123 5124 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 5125 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 5126 vdrop(vp); 5127 } 5128 5129 /*ARGSUSED*/ 5130 static int 5131 filt_vfsread(struct knote *kn, long hint) 5132 { 5133 struct vnode *vp = (struct vnode *)kn->kn_hook; 5134 struct vattr va; 5135 int res; 5136 5137 /* 5138 * filesystem is gone, so set the EOF flag and schedule 5139 * the knote for deletion. 
5140 */ 5141 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 5142 VI_LOCK(vp); 5143 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 5144 VI_UNLOCK(vp); 5145 return (1); 5146 } 5147 5148 if (VOP_GETATTR(vp, &va, curthread->td_ucred)) 5149 return (0); 5150 5151 VI_LOCK(vp); 5152 kn->kn_data = va.va_size - kn->kn_fp->f_offset; 5153 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 5154 VI_UNLOCK(vp); 5155 return (res); 5156 } 5157 5158 /*ARGSUSED*/ 5159 static int 5160 filt_vfswrite(struct knote *kn, long hint) 5161 { 5162 struct vnode *vp = (struct vnode *)kn->kn_hook; 5163 5164 VI_LOCK(vp); 5165 5166 /* 5167 * filesystem is gone, so set the EOF flag and schedule 5168 * the knote for deletion. 5169 */ 5170 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 5171 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 5172 5173 kn->kn_data = 0; 5174 VI_UNLOCK(vp); 5175 return (1); 5176 } 5177 5178 static int 5179 filt_vfsvnode(struct knote *kn, long hint) 5180 { 5181 struct vnode *vp = (struct vnode *)kn->kn_hook; 5182 int res; 5183 5184 VI_LOCK(vp); 5185 if (kn->kn_sfflags & hint) 5186 kn->kn_fflags |= hint; 5187 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 5188 kn->kn_flags |= EV_EOF; 5189 VI_UNLOCK(vp); 5190 return (1); 5191 } 5192 res = (kn->kn_fflags != 0); 5193 VI_UNLOCK(vp); 5194 return (res); 5195 } 5196 5197 int 5198 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 5199 { 5200 int error; 5201 5202 if (dp->d_reclen > ap->a_uio->uio_resid) 5203 return (ENAMETOOLONG); 5204 error = uiomove(dp, dp->d_reclen, ap->a_uio); 5205 if (error) { 5206 if (ap->a_ncookies != NULL) { 5207 if (ap->a_cookies != NULL) 5208 free(ap->a_cookies, M_TEMP); 5209 ap->a_cookies = NULL; 5210 *ap->a_ncookies = 0; 5211 } 5212 return (error); 5213 } 5214 if (ap->a_ncookies == NULL) 5215 return (0); 5216 5217 KASSERT(ap->a_cookies, 5218 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 5219 5220 *ap->a_cookies = realloc(*ap->a_cookies, 5221 (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); 5222 (*ap->a_cookies)[*ap->a_ncookies] = off; 5223 *ap->a_ncookies += 1; 5224 return (0); 5225 } 5226 5227 /* 5228 * Mark for update the access time of the file if the filesystem 5229 * supports VOP_MARKATIME. This functionality is used by execve and 5230 * mmap, so we want to avoid the I/O implied by directly setting 5231 * va_atime for the sake of efficiency. 5232 */ 5233 void 5234 vfs_mark_atime(struct vnode *vp, struct ucred *cred) 5235 { 5236 struct mount *mp; 5237 5238 mp = vp->v_mount; 5239 ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); 5240 if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) 5241 (void)VOP_MARKATIME(vp); 5242 } 5243 5244 /* 5245 * The purpose of this routine is to remove granularity from accmode_t, 5246 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 5247 * VADMIN and VAPPEND. 5248 * 5249 * If it returns 0, the caller is supposed to continue with the usual 5250 * access checks using 'accmode' as modified by this routine. If it 5251 * returns nonzero value, the caller is supposed to return that value 5252 * as errno. 5253 * 5254 * Note that after this routine runs, accmode may be zero. 5255 */ 5256 int 5257 vfs_unixify_accmode(accmode_t *accmode) 5258 { 5259 /* 5260 * There is no way to specify explicit "deny" rule using 5261 * file mode or POSIX.1e ACLs. 
5262 */ 5263 if (*accmode & VEXPLICIT_DENY) { 5264 *accmode = 0; 5265 return (0); 5266 } 5267 5268 /* 5269 * None of these can be translated into usual access bits. 5270 * Also, the common case for NFSv4 ACLs is to not contain 5271 * either of these bits. Caller should check for VWRITE 5272 * on the containing directory instead. 5273 */ 5274 if (*accmode & (VDELETE_CHILD | VDELETE)) 5275 return (EPERM); 5276 5277 if (*accmode & VADMIN_PERMS) { 5278 *accmode &= ~VADMIN_PERMS; 5279 *accmode |= VADMIN; 5280 } 5281 5282 /* 5283 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 5284 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 5285 */ 5286 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 5287 5288 return (0); 5289 } 5290 5291 /* 5292 * These are helper functions for filesystems to traverse all 5293 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 5294 * 5295 * This interface replaces MNT_VNODE_FOREACH. 5296 */ 5297 5298 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 5299 5300 struct vnode * 5301 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 5302 { 5303 struct vnode *vp; 5304 5305 if (should_yield()) 5306 kern_yield(PRI_USER); 5307 MNT_ILOCK(mp); 5308 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5309 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 5310 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 5311 /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */ 5312 if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0) 5313 continue; 5314 VI_LOCK(vp); 5315 if ((vp->v_iflag & VI_DOOMED) != 0) { 5316 VI_UNLOCK(vp); 5317 continue; 5318 } 5319 break; 5320 } 5321 if (vp == NULL) { 5322 __mnt_vnode_markerfree_all(mvp, mp); 5323 /* MNT_IUNLOCK(mp); -- done in above function */ 5324 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 5325 return (NULL); 5326 } 5327 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 5328 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 5329 MNT_IUNLOCK(mp); 5330 return (vp); 5331 } 5332 5333 struct vnode * 5334 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 5335 { 5336 struct vnode *vp; 5337 5338 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 5339 MNT_ILOCK(mp); 5340 MNT_REF(mp); 5341 (*mvp)->v_mount = mp; 5342 (*mvp)->v_type = VMARKER; 5343 5344 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 5345 /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */ 5346 if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0) 5347 continue; 5348 VI_LOCK(vp); 5349 if ((vp->v_iflag & VI_DOOMED) != 0) { 5350 VI_UNLOCK(vp); 5351 continue; 5352 } 5353 break; 5354 } 5355 if (vp == NULL) { 5356 MNT_REL(mp); 5357 MNT_IUNLOCK(mp); 5358 free(*mvp, M_VNODE_MARKER); 5359 *mvp = NULL; 5360 return (NULL); 5361 } 5362 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 5363 MNT_IUNLOCK(mp); 5364 return (vp); 5365 } 5366 5367 void 5368 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 5369 { 5370 5371 if (*mvp == NULL) { 5372 MNT_IUNLOCK(mp); 5373 return; 5374 } 5375 5376 mtx_assert(MNT_MTX(mp), MA_OWNED); 5377 5378 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5379 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 5380 MNT_REL(mp); 5381 MNT_IUNLOCK(mp); 5382 free(*mvp, M_VNODE_MARKER); 5383 *mvp = NULL; 5384 } 5385 5386 /* 5387 * These are helper functions for filesystems to traverse their 5388 * active vnodes. 
See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h 5389 */ 5390 static void 5391 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 5392 { 5393 5394 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5395 5396 MNT_ILOCK(mp); 5397 MNT_REL(mp); 5398 MNT_IUNLOCK(mp); 5399 free(*mvp, M_VNODE_MARKER); 5400 *mvp = NULL; 5401 } 5402 5403 /* 5404 * Relock the mp mount vnode list lock with the vp vnode interlock in the 5405 * conventional lock order during mnt_vnode_next_active iteration. 5406 * 5407 * On entry, the mount vnode list lock is held and the vnode interlock is not. 5408 * The list lock is dropped and reacquired. On success, both locks are held. 5409 * On failure, the mount vnode list lock is held but the vnode interlock is 5410 * not, and the procedure may have yielded. 5411 */ 5412 static bool 5413 mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp, 5414 struct vnode *vp) 5415 { 5416 const struct vnode *tmp; 5417 bool held, ret; 5418 5419 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 5420 TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp, 5421 ("%s: bad marker", __func__)); 5422 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 5423 ("%s: inappropriate vnode", __func__)); 5424 ASSERT_VI_UNLOCKED(vp, __func__); 5425 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 5426 5427 ret = false; 5428 5429 TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist); 5430 TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist); 5431 5432 /* 5433 * Use a hold to prevent vp from disappearing while the mount vnode 5434 * list lock is dropped and reacquired. Normally a hold would be 5435 * acquired with vhold(), but that might try to acquire the vnode 5436 * interlock, which would be a LOR with the mount vnode list lock. 5437 */ 5438 held = refcount_acquire_if_not_zero(&vp->v_holdcnt); 5439 mtx_unlock(&mp->mnt_listmtx); 5440 if (!held) 5441 goto abort; 5442 VI_LOCK(vp); 5443 if (!refcount_release_if_not_last(&vp->v_holdcnt)) { 5444 vdropl(vp); 5445 goto abort; 5446 } 5447 mtx_lock(&mp->mnt_listmtx); 5448 5449 /* 5450 * Determine whether the vnode is still the next one after the marker, 5451 * excepting any other markers. If the vnode has not been doomed by 5452 * vgone() then the hold should have ensured that it remained on the 5453 * active list. If it has been doomed but is still on the active list, 5454 * don't abort, but rather skip over it (avoid spinning on doomed 5455 * vnodes). 5456 */ 5457 tmp = mvp; 5458 do { 5459 tmp = TAILQ_NEXT(tmp, v_actfreelist); 5460 } while (tmp != NULL && tmp->v_type == VMARKER); 5461 if (tmp != vp) { 5462 mtx_unlock(&mp->mnt_listmtx); 5463 VI_UNLOCK(vp); 5464 goto abort; 5465 } 5466 5467 ret = true; 5468 goto out; 5469 abort: 5470 maybe_yield(); 5471 mtx_lock(&mp->mnt_listmtx); 5472 out: 5473 if (ret) 5474 ASSERT_VI_LOCKED(vp, __func__); 5475 else 5476 ASSERT_VI_UNLOCKED(vp, __func__); 5477 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 5478 return (ret); 5479 } 5480 5481 static struct vnode * 5482 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 5483 { 5484 struct vnode *vp, *nvp; 5485 5486 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 5487 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5488 restart: 5489 vp = TAILQ_NEXT(*mvp, v_actfreelist); 5490 while (vp != NULL) { 5491 if (vp->v_type == VMARKER) { 5492 vp = TAILQ_NEXT(vp, v_actfreelist); 5493 continue; 5494 } 5495 /* 5496 * Try-lock because this is the wrong lock order. 
If that does 5497 * not succeed, drop the mount vnode list lock and try to 5498 * reacquire it and the vnode interlock in the right order. 5499 */ 5500 if (!VI_TRYLOCK(vp) && 5501 !mnt_vnode_next_active_relock(*mvp, mp, vp)) 5502 goto restart; 5503 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 5504 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 5505 ("alien vnode on the active list %p %p", vp, mp)); 5506 if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) 5507 break; 5508 nvp = TAILQ_NEXT(vp, v_actfreelist); 5509 VI_UNLOCK(vp); 5510 vp = nvp; 5511 } 5512 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 5513 5514 /* Check if we are done */ 5515 if (vp == NULL) { 5516 mtx_unlock(&mp->mnt_listmtx); 5517 mnt_vnode_markerfree_active(mvp, mp); 5518 return (NULL); 5519 } 5520 TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); 5521 mtx_unlock(&mp->mnt_listmtx); 5522 ASSERT_VI_LOCKED(vp, "active iter"); 5523 KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); 5524 return (vp); 5525 } 5526 5527 struct vnode * 5528 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 5529 { 5530 5531 if (should_yield()) 5532 kern_yield(PRI_USER); 5533 mtx_lock(&mp->mnt_listmtx); 5534 return (mnt_vnode_next_active(mvp, mp)); 5535 } 5536 5537 struct vnode * 5538 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) 5539 { 5540 struct vnode *vp; 5541 5542 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 5543 MNT_ILOCK(mp); 5544 MNT_REF(mp); 5545 MNT_IUNLOCK(mp); 5546 (*mvp)->v_type = VMARKER; 5547 (*mvp)->v_mount = mp; 5548 5549 mtx_lock(&mp->mnt_listmtx); 5550 vp = TAILQ_FIRST(&mp->mnt_activevnodelist); 5551 if (vp == NULL) { 5552 mtx_unlock(&mp->mnt_listmtx); 5553 mnt_vnode_markerfree_active(mvp, mp); 5554 return (NULL); 5555 } 5556 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); 5557 return (mnt_vnode_next_active(mvp, mp)); 5558 } 5559 5560 void 5561 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 5562 { 5563 5564 if (*mvp == NULL) 5565 return; 5566 5567 mtx_lock(&mp->mnt_listmtx); 5568 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 5569 mtx_unlock(&mp->mnt_listmtx); 5570 mnt_vnode_markerfree_active(mvp, mp); 5571 } 5572
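
/*
 * Illustrative sketch (not part of the original file): how a filesystem
 * would typically consume the iterators above through the
 * MNT_VNODE_FOREACH_ACTIVE() macro from sys/mount.h.  Each vnode is
 * returned with its interlock held, so the loop body must either drop
 * the interlock with VI_UNLOCK() or pass it on to vget() with
 * LK_INTERLOCK; an early exit from the loop must use
 * MNT_VNODE_FOREACH_ACTIVE_ABORT() so the marker vnode is freed.  The
 * function name is made up for the example.
 */
static int __unused
example_sync_active_vnodes(struct mount *mp, int waitfor)
{
	struct vnode *vp, *mvp;
	int error, allerror;

	allerror = 0;
	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
		/* Skip vnodes with nothing to sync for this example. */
		if (vp->v_type == VNON) {
			VI_UNLOCK(vp);
			continue;
		}
		/* vget() consumes the interlock via LK_INTERLOCK. */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
		    curthread) != 0)
			continue;
		error = VOP_FSYNC(vp, waitfor, curthread);
		if (error != 0)
			allerror = error;
		vput(vp);
	}
	return (allerror);
}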