1 /*- 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 35 */ 36 37 /* 38 * External virtual filesystem routines 39 */ 40 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 #include "opt_compat.h" 45 #include "opt_ddb.h" 46 #include "opt_watchdog.h" 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/bio.h> 51 #include <sys/buf.h> 52 #include <sys/condvar.h> 53 #include <sys/conf.h> 54 #include <sys/dirent.h> 55 #include <sys/event.h> 56 #include <sys/eventhandler.h> 57 #include <sys/extattr.h> 58 #include <sys/file.h> 59 #include <sys/fcntl.h> 60 #include <sys/jail.h> 61 #include <sys/kdb.h> 62 #include <sys/kernel.h> 63 #include <sys/kthread.h> 64 #include <sys/lockf.h> 65 #include <sys/malloc.h> 66 #include <sys/mount.h> 67 #include <sys/namei.h> 68 #include <sys/pctrie.h> 69 #include <sys/priv.h> 70 #include <sys/reboot.h> 71 #include <sys/refcount.h> 72 #include <sys/rwlock.h> 73 #include <sys/sched.h> 74 #include <sys/sleepqueue.h> 75 #include <sys/smp.h> 76 #include <sys/stat.h> 77 #include <sys/sysctl.h> 78 #include <sys/syslog.h> 79 #include <sys/vmmeter.h> 80 #include <sys/vnode.h> 81 #include <sys/watchdog.h> 82 83 #include <machine/stdarg.h> 84 85 #include <security/mac/mac_framework.h> 86 87 #include <vm/vm.h> 88 #include <vm/vm_object.h> 89 #include <vm/vm_extern.h> 90 #include <vm/pmap.h> 91 #include <vm/vm_map.h> 92 #include <vm/vm_page.h> 93 #include <vm/vm_kern.h> 94 #include <vm/uma.h> 95 96 #ifdef DDB 97 #include <ddb/ddb.h> 98 #endif 99 100 static void delmntque(struct vnode *vp); 101 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 102 int slpflag, int slptimeo); 103 static void syncer_shutdown(void *arg, int howto); 104 static int vtryrecycle(struct vnode *vp); 105 static void v_init_counters(struct vnode *); 106 static void v_incr_usecount(struct vnode *); 107 static void v_incr_usecount_locked(struct vnode *); 108 static void v_incr_devcount(struct vnode *); 109 static void v_decr_devcount(struct vnode *); 110 static void vgonel(struct vnode *); 111 static void vfs_knllock(void *arg); 112 static void vfs_knlunlock(void *arg); 113 static void vfs_knl_assert_locked(void *arg); 114 static void vfs_knl_assert_unlocked(void *arg); 115 static void destroy_vpollinfo(struct vpollinfo *vi); 116 117 /* 118 * Number of vnodes in existence. Increased whenever getnewvnode() 119 * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode. 120 */ 121 static unsigned long numvnodes; 122 123 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, 124 "Number of vnodes in existence"); 125 126 static u_long vnodes_created; 127 SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 128 0, "Number of vnodes created by getnewvnode"); 129 130 /* 131 * Conversion tables for conversion from vnode types to inode formats 132 * and back. 133 */ 134 enum vtype iftovt_tab[16] = { 135 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 136 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 137 }; 138 int vttoif_tab[10] = { 139 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 140 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT 141 }; 142 143 /* 144 * List of vnodes that are ready for recycling. 145 */ 146 static TAILQ_HEAD(freelst, vnode) vnode_free_list; 147 148 /* 149 * "Free" vnode target. Free vnodes are rarely completely free, but are 150 * just ones that are cheap to recycle. Usually they are for files which 151 * have been stat'd but not read; these usually have inode and namecache 152 * data attached to them. 
This target is the preferred minimum size of a 153 * sub-cache consisting mostly of such files. The system balances the size 154 * of this sub-cache with its complement to try to prevent either from 155 * thrashing while the other is relatively inactive. The targets express 156 * a preference for the best balance. 157 * 158 * "Above" this target there are 2 further targets (watermarks) related 159 * to recyling of free vnodes. In the best-operating case, the cache is 160 * exactly full, the free list has size between vlowat and vhiwat above the 161 * free target, and recycling from it and normal use maintains this state. 162 * Sometimes the free list is below vlowat or even empty, but this state 163 * is even better for immediate use provided the cache is not full. 164 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free 165 * ones) to reach one of these states. The watermarks are currently hard- 166 * coded as 4% and 9% of the available space higher. These and the default 167 * of 25% for wantfreevnodes are too large if the memory size is large. 168 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim 169 * whenever vnlru_proc() becomes active. 170 */ 171 static u_long wantfreevnodes; 172 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, 173 &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes"); 174 static u_long freevnodes; 175 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, 176 &freevnodes, 0, "Number of \"free\" vnodes"); 177 178 static u_long recycles_count; 179 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0, 180 "Number of vnodes recycled to meet vnode cache targets"); 181 182 /* 183 * Various variables used for debugging the new implementation of 184 * reassignbuf(). 185 * XXX these are probably of (very) limited utility now. 186 */ 187 static int reassignbufcalls; 188 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, 189 "Number of calls to reassignbuf"); 190 191 static u_long free_owe_inact; 192 SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0, 193 "Number of times free vnodes kept on active list due to VFS " 194 "owing inactivation"); 195 196 /* To keep more than one thread at a time from running vfs_getnewfsid */ 197 static struct mtx mntid_mtx; 198 199 /* 200 * Lock for any access to the following: 201 * vnode_free_list 202 * numvnodes 203 * freevnodes 204 */ 205 static struct mtx vnode_free_list_mtx; 206 207 /* Publicly exported FS */ 208 struct nfs_public nfs_pub; 209 210 static uma_zone_t buf_trie_zone; 211 212 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 213 static uma_zone_t vnode_zone; 214 static uma_zone_t vnodepoll_zone; 215 216 /* 217 * The workitem queue. 218 * 219 * It is useful to delay writes of file data and filesystem metadata 220 * for tens of seconds so that quickly created and deleted files need 221 * not waste disk bandwidth being created and removed. To realize this, 222 * we append vnodes to a "workitem" queue. When running with a soft 223 * updates implementation, most pending metadata dependencies should 224 * not wait for more than a few seconds. Thus, mounted on block devices 225 * are delayed only about a half the time that file data is delayed. 226 * Similarly, directory updates are more critical, so are only delayed 227 * about a third the time that file data is delayed. 
Thus, there are 228 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 229 * one each second (driven off the filesystem syncer process). The 230 * syncer_delayno variable indicates the next queue that is to be processed. 231 * Items that need to be processed soon are placed in this queue: 232 * 233 * syncer_workitem_pending[syncer_delayno] 234 * 235 * A delay of fifteen seconds is done by placing the request fifteen 236 * entries later in the queue: 237 * 238 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 239 * 240 */ 241 static int syncer_delayno; 242 static long syncer_mask; 243 LIST_HEAD(synclist, bufobj); 244 static struct synclist *syncer_workitem_pending; 245 /* 246 * The sync_mtx protects: 247 * bo->bo_synclist 248 * sync_vnode_count 249 * syncer_delayno 250 * syncer_state 251 * syncer_workitem_pending 252 * syncer_worklist_len 253 * rushjob 254 */ 255 static struct mtx sync_mtx; 256 static struct cv sync_wakeup; 257 258 #define SYNCER_MAXDELAY 32 259 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 260 static int syncdelay = 30; /* max time to delay syncing data */ 261 static int filedelay = 30; /* time to delay syncing files */ 262 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 263 "Time to delay syncing files (in seconds)"); 264 static int dirdelay = 29; /* time to delay syncing directories */ 265 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 266 "Time to delay syncing directories (in seconds)"); 267 static int metadelay = 28; /* time to delay syncing metadata */ 268 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 269 "Time to delay syncing metadata (in seconds)"); 270 static int rushjob; /* number of slots to run ASAP */ 271 static int stat_rush_requests; /* number of times I/O speeded up */ 272 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 273 "Number of times I/O speeded up (rush requests)"); 274 275 /* 276 * When shutting down the syncer, run it at four times normal speed. 277 */ 278 #define SYNCER_SHUTDOWN_SPEEDUP 4 279 static int sync_vnode_count; 280 static int syncer_worklist_len; 281 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 282 syncer_state; 283 284 /* Target for maximum number of vnodes. */ 285 int desiredvnodes; 286 static int gapvnodes; /* gap between wanted and desired */ 287 static int vhiwat; /* enough extras after expansion */ 288 static int vlowat; /* minimal extras before expansion */ 289 static int vstir; /* nonzero to stir non-free vnodes */ 290 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 291 292 static int 293 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) 294 { 295 int error, old_desiredvnodes; 296 297 old_desiredvnodes = desiredvnodes; 298 if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) 299 return (error); 300 if (old_desiredvnodes != desiredvnodes) { 301 wantfreevnodes = desiredvnodes / 4; 302 /* XXX locking seems to be incomplete. 
*/ 303 vfs_hash_changesize(desiredvnodes); 304 cache_changesize(desiredvnodes); 305 } 306 return (0); 307 } 308 309 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 310 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, 311 sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes"); 312 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 313 &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); 314 static int vnlru_nowhere; 315 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, 316 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 317 318 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 319 static int vnsz2log; 320 321 /* 322 * Support for the bufobj clean & dirty pctrie. 323 */ 324 static void * 325 buf_trie_alloc(struct pctrie *ptree) 326 { 327 328 return uma_zalloc(buf_trie_zone, M_NOWAIT); 329 } 330 331 static void 332 buf_trie_free(struct pctrie *ptree, void *node) 333 { 334 335 uma_zfree(buf_trie_zone, node); 336 } 337 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); 338 339 /* 340 * Initialize the vnode management data structures. 341 * 342 * Reevaluate the following cap on the number of vnodes after the physical 343 * memory size exceeds 512GB. In the limit, as the physical memory size 344 * grows, the ratio of the memory size in KB to to vnodes approaches 64:1. 345 */ 346 #ifndef MAXVNODES_MAX 347 #define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */ 348 #endif 349 350 /* 351 * Initialize a vnode as it first enters the zone. 352 */ 353 static int 354 vnode_init(void *mem, int size, int flags) 355 { 356 struct vnode *vp; 357 struct bufobj *bo; 358 359 vp = mem; 360 bzero(vp, size); 361 /* 362 * Setup locks. 363 */ 364 vp->v_vnlock = &vp->v_lock; 365 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 366 /* 367 * By default, don't allow shared locks unless filesystems opt-in. 368 */ 369 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 370 LK_NOSHARE | LK_IS_VNODE); 371 /* 372 * Initialize bufobj. 373 */ 374 bo = &vp->v_bufobj; 375 bo->__bo_vnode = vp; 376 rw_init(BO_LOCKPTR(bo), "bufobj interlock"); 377 bo->bo_private = vp; 378 TAILQ_INIT(&bo->bo_clean.bv_hd); 379 TAILQ_INIT(&bo->bo_dirty.bv_hd); 380 /* 381 * Initialize namecache. 382 */ 383 LIST_INIT(&vp->v_cache_src); 384 TAILQ_INIT(&vp->v_cache_dst); 385 /* 386 * Initialize rangelocks. 387 */ 388 rangelock_init(&vp->v_rl); 389 return (0); 390 } 391 392 /* 393 * Free a vnode when it is cleared from the zone. 394 */ 395 static void 396 vnode_fini(void *mem, int size) 397 { 398 struct vnode *vp; 399 struct bufobj *bo; 400 401 vp = mem; 402 rangelock_destroy(&vp->v_rl); 403 lockdestroy(vp->v_vnlock); 404 mtx_destroy(&vp->v_interlock); 405 bo = &vp->v_bufobj; 406 rw_destroy(BO_LOCKPTR(bo)); 407 } 408 409 /* 410 * Provide the size of NFS nclnode and NFS fh for calculation of the 411 * vnode memory consumption. The size is specified directly to 412 * eliminate dependency on NFS-private header. 413 * 414 * Other filesystems may use bigger or smaller (like UFS and ZFS) 415 * private inode data, but the NFS-based estimation is ample enough. 416 * Still, we care about differences in the size between 64- and 32-bit 417 * platforms. 418 * 419 * Namecache structure size is heuristically 420 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. 
421 */ 422 #ifdef _LP64 423 #define NFS_NCLNODE_SZ (528 + 64) 424 #define NC_SZ 148 425 #else 426 #define NFS_NCLNODE_SZ (360 + 32) 427 #define NC_SZ 92 428 #endif 429 430 static void 431 vntblinit(void *dummy __unused) 432 { 433 u_int i; 434 int physvnodes, virtvnodes; 435 436 /* 437 * Desiredvnodes is a function of the physical memory size and the 438 * kernel's heap size. Generally speaking, it scales with the 439 * physical memory size. The ratio of desiredvnodes to the physical 440 * memory size is 1:16 until desiredvnodes exceeds 98,304. 441 * Thereafter, the 442 * marginal ratio of desiredvnodes to the physical memory size is 443 * 1:64. However, desiredvnodes is limited by the kernel's heap 444 * size. The memory required by desiredvnodes vnodes and vm objects 445 * must not exceed 1/10th of the kernel's heap size. 446 */ 447 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 448 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; 449 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + 450 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); 451 desiredvnodes = min(physvnodes, virtvnodes); 452 if (desiredvnodes > MAXVNODES_MAX) { 453 if (bootverbose) 454 printf("Reducing kern.maxvnodes %d -> %d\n", 455 desiredvnodes, MAXVNODES_MAX); 456 desiredvnodes = MAXVNODES_MAX; 457 } 458 wantfreevnodes = desiredvnodes / 4; 459 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 460 TAILQ_INIT(&vnode_free_list); 461 mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); 462 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, 463 vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); 464 vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), 465 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 466 /* 467 * Preallocate enough nodes to support one-per buf so that 468 * we can not fail an insert. reassignbuf() callers can not 469 * tolerate the insertion failure. 470 */ 471 buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), 472 NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 473 UMA_ZONE_NOFREE | UMA_ZONE_VM); 474 uma_prealloc(buf_trie_zone, nbuf); 475 /* 476 * Initialize the filesystem syncer. 477 */ 478 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 479 &syncer_mask); 480 syncer_maxdelay = syncer_mask + 1; 481 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); 482 cv_init(&sync_wakeup, "syncer"); 483 for (i = 1; i <= sizeof(struct vnode); i <<= 1) 484 vnsz2log++; 485 vnsz2log--; 486 } 487 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); 488 489 490 /* 491 * Mark a mount point as busy. Used to synchronize access and to delay 492 * unmounting. Eventually, mountlist_mtx is not released on failure. 493 * 494 * vfs_busy() is a custom lock, it can block the caller. 495 * vfs_busy() only sleeps if the unmount is active on the mount point. 496 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any 497 * vnode belonging to mp. 498 * 499 * Lookup uses vfs_busy() to traverse mount points. 500 * root fs var fs 501 * / vnode lock A / vnode lock (/var) D 502 * /var vnode lock B /log vnode lock(/var/log) E 503 * vfs_busy lock C vfs_busy lock F 504 * 505 * Within each file system, the lock order is C->A->B and F->D->E. 
506 * 507 * When traversing across mounts, the system follows that lock order: 508 * 509 * C->A->B 510 * | 511 * +->F->D->E 512 * 513 * The lookup() process for namei("/var") illustrates the process: 514 * VOP_LOOKUP() obtains B while A is held 515 * vfs_busy() obtains a shared lock on F while A and B are held 516 * vput() releases lock on B 517 * vput() releases lock on A 518 * VFS_ROOT() obtains lock on D while shared lock on F is held 519 * vfs_unbusy() releases shared lock on F 520 * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. 521 * Attempt to lock A (instead of vp_crossmp) while D is held would 522 * violate the global order, causing deadlocks. 523 * 524 * dounmount() locks B while F is drained. 525 */ 526 int 527 vfs_busy(struct mount *mp, int flags) 528 { 529 530 MPASS((flags & ~MBF_MASK) == 0); 531 CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); 532 533 MNT_ILOCK(mp); 534 MNT_REF(mp); 535 /* 536 * If mount point is currently being unmounted, sleep until the 537 * mount point fate is decided. If thread doing the unmounting fails, 538 * it will clear MNTK_UNMOUNT flag before waking us up, indicating 539 * that this mount point has survived the unmount attempt and vfs_busy 540 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE 541 * flag in addition to MNTK_UNMOUNT, indicating that mount point is 542 * about to be really destroyed. vfs_busy needs to release its 543 * reference on the mount point in this case and return with ENOENT, 544 * telling the caller that mount mount it tried to busy is no longer 545 * valid. 546 */ 547 while (mp->mnt_kern_flag & MNTK_UNMOUNT) { 548 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { 549 MNT_REL(mp); 550 MNT_IUNLOCK(mp); 551 CTR1(KTR_VFS, "%s: failed busying before sleeping", 552 __func__); 553 return (ENOENT); 554 } 555 if (flags & MBF_MNTLSTLOCK) 556 mtx_unlock(&mountlist_mtx); 557 mp->mnt_kern_flag |= MNTK_MWAIT; 558 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); 559 if (flags & MBF_MNTLSTLOCK) 560 mtx_lock(&mountlist_mtx); 561 MNT_ILOCK(mp); 562 } 563 if (flags & MBF_MNTLSTLOCK) 564 mtx_unlock(&mountlist_mtx); 565 mp->mnt_lockref++; 566 MNT_IUNLOCK(mp); 567 return (0); 568 } 569 570 /* 571 * Free a busy filesystem. 572 */ 573 void 574 vfs_unbusy(struct mount *mp) 575 { 576 577 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 578 MNT_ILOCK(mp); 579 MNT_REL(mp); 580 KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref")); 581 mp->mnt_lockref--; 582 if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { 583 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); 584 CTR1(KTR_VFS, "%s: waking up waiters", __func__); 585 mp->mnt_kern_flag &= ~MNTK_DRAINING; 586 wakeup(&mp->mnt_lockref); 587 } 588 MNT_IUNLOCK(mp); 589 } 590 591 /* 592 * Lookup a mount point by filesystem identifier. 593 */ 594 struct mount * 595 vfs_getvfs(fsid_t *fsid) 596 { 597 struct mount *mp; 598 599 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 600 mtx_lock(&mountlist_mtx); 601 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 602 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 603 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { 604 vfs_ref(mp); 605 mtx_unlock(&mountlist_mtx); 606 return (mp); 607 } 608 } 609 mtx_unlock(&mountlist_mtx); 610 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 611 return ((struct mount *) 0); 612 } 613 614 /* 615 * Lookup a mount point by filesystem identifier, busying it before 616 * returning. 
617 * 618 * To avoid congestion on mountlist_mtx, implement simple direct-mapped 619 * cache for popular filesystem identifiers. The cache is lockess, using 620 * the fact that struct mount's are never freed. In worst case we may 621 * get pointer to unmounted or even different filesystem, so we have to 622 * check what we got, and go slow way if so. 623 */ 624 struct mount * 625 vfs_busyfs(fsid_t *fsid) 626 { 627 #define FSID_CACHE_SIZE 256 628 typedef struct mount * volatile vmp_t; 629 static vmp_t cache[FSID_CACHE_SIZE]; 630 struct mount *mp; 631 int error; 632 uint32_t hash; 633 634 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 635 hash = fsid->val[0] ^ fsid->val[1]; 636 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 637 mp = cache[hash]; 638 if (mp == NULL || 639 mp->mnt_stat.f_fsid.val[0] != fsid->val[0] || 640 mp->mnt_stat.f_fsid.val[1] != fsid->val[1]) 641 goto slow; 642 if (vfs_busy(mp, 0) != 0) { 643 cache[hash] = NULL; 644 goto slow; 645 } 646 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 647 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) 648 return (mp); 649 else 650 vfs_unbusy(mp); 651 652 slow: 653 mtx_lock(&mountlist_mtx); 654 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 655 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 656 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { 657 error = vfs_busy(mp, MBF_MNTLSTLOCK); 658 if (error) { 659 cache[hash] = NULL; 660 mtx_unlock(&mountlist_mtx); 661 return (NULL); 662 } 663 cache[hash] = mp; 664 return (mp); 665 } 666 } 667 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 668 mtx_unlock(&mountlist_mtx); 669 return ((struct mount *) 0); 670 } 671 672 /* 673 * Check if a user can access privileged mount options. 674 */ 675 int 676 vfs_suser(struct mount *mp, struct thread *td) 677 { 678 int error; 679 680 /* 681 * If the thread is jailed, but this is not a jail-friendly file 682 * system, deny immediately. 683 */ 684 if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred)) 685 return (EPERM); 686 687 /* 688 * If the file system was mounted outside the jail of the calling 689 * thread, deny immediately. 690 */ 691 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 692 return (EPERM); 693 694 /* 695 * If file system supports delegated administration, we don't check 696 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 697 * by the file system itself. 698 * If this is not the user that did original mount, we check for 699 * the PRIV_VFS_MOUNT_OWNER privilege. 700 */ 701 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 702 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 703 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 704 return (error); 705 } 706 return (0); 707 } 708 709 /* 710 * Get a new unique fsid. Try to make its val[0] unique, since this value 711 * will be used to create fake device numbers for stat(). Also try (but 712 * not so hard) make its val[0] unique mod 2^16, since some emulators only 713 * support 16-bit device numbers. We end up with unique val[0]'s for the 714 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 715 * 716 * Keep in mind that several mounts may be running in parallel. Starting 717 * the search one past where the previous search terminated is both a 718 * micro-optimization and a defense against returning the same fsid to 719 * different mounts. 
720 */ 721 void 722 vfs_getnewfsid(struct mount *mp) 723 { 724 static uint16_t mntid_base; 725 struct mount *nmp; 726 fsid_t tfsid; 727 int mtype; 728 729 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 730 mtx_lock(&mntid_mtx); 731 mtype = mp->mnt_vfc->vfc_typenum; 732 tfsid.val[1] = mtype; 733 mtype = (mtype & 0xFF) << 24; 734 for (;;) { 735 tfsid.val[0] = makedev(255, 736 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 737 mntid_base++; 738 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 739 break; 740 vfs_rel(nmp); 741 } 742 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 743 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 744 mtx_unlock(&mntid_mtx); 745 } 746 747 /* 748 * Knob to control the precision of file timestamps: 749 * 750 * 0 = seconds only; nanoseconds zeroed. 751 * 1 = seconds and nanoseconds, accurate within 1/HZ. 752 * 2 = seconds and nanoseconds, truncated to microseconds. 753 * >=3 = seconds and nanoseconds, maximum precision. 754 */ 755 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 756 757 static int timestamp_precision = TSP_USEC; 758 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 759 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 760 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, " 761 "3+: sec + ns (max. precision))"); 762 763 /* 764 * Get a current timestamp. 765 */ 766 void 767 vfs_timestamp(struct timespec *tsp) 768 { 769 struct timeval tv; 770 771 switch (timestamp_precision) { 772 case TSP_SEC: 773 tsp->tv_sec = time_second; 774 tsp->tv_nsec = 0; 775 break; 776 case TSP_HZ: 777 getnanotime(tsp); 778 break; 779 case TSP_USEC: 780 microtime(&tv); 781 TIMEVAL_TO_TIMESPEC(&tv, tsp); 782 break; 783 case TSP_NSEC: 784 default: 785 nanotime(tsp); 786 break; 787 } 788 } 789 790 /* 791 * Set vnode attributes to VNOVAL 792 */ 793 void 794 vattr_null(struct vattr *vap) 795 { 796 797 vap->va_type = VNON; 798 vap->va_size = VNOVAL; 799 vap->va_bytes = VNOVAL; 800 vap->va_mode = VNOVAL; 801 vap->va_nlink = VNOVAL; 802 vap->va_uid = VNOVAL; 803 vap->va_gid = VNOVAL; 804 vap->va_fsid = VNOVAL; 805 vap->va_fileid = VNOVAL; 806 vap->va_blocksize = VNOVAL; 807 vap->va_rdev = VNOVAL; 808 vap->va_atime.tv_sec = VNOVAL; 809 vap->va_atime.tv_nsec = VNOVAL; 810 vap->va_mtime.tv_sec = VNOVAL; 811 vap->va_mtime.tv_nsec = VNOVAL; 812 vap->va_ctime.tv_sec = VNOVAL; 813 vap->va_ctime.tv_nsec = VNOVAL; 814 vap->va_birthtime.tv_sec = VNOVAL; 815 vap->va_birthtime.tv_nsec = VNOVAL; 816 vap->va_flags = VNOVAL; 817 vap->va_gen = VNOVAL; 818 vap->va_vaflags = 0; 819 } 820 821 /* 822 * This routine is called when we have too many vnodes. It attempts 823 * to free <count> vnodes and will potentially free vnodes that still 824 * have VM backing store (VM backing store is typically the cause 825 * of a vnode blowout so we want to do this). Therefore, this operation 826 * is not considered cheap. 827 * 828 * A number of conditions may prevent a vnode from being reclaimed. 829 * the buffer cache may have references on the vnode, a directory 830 * vnode may still have references due to the namei cache representing 831 * underlying files, or the vnode may be in active use. It is not 832 * desirable to reuse such vnodes. These conditions may cause the 833 * number of vnodes to reach some minimum value regardless of what 834 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. 
835 */ 836 static int 837 vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger) 838 { 839 struct vnode *vp; 840 int count, done, target; 841 842 done = 0; 843 vn_start_write(NULL, &mp, V_WAIT); 844 MNT_ILOCK(mp); 845 count = mp->mnt_nvnodelistsize; 846 target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1); 847 target = target / 10 + 1; 848 while (count != 0 && done < target) { 849 vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 850 while (vp != NULL && vp->v_type == VMARKER) 851 vp = TAILQ_NEXT(vp, v_nmntvnodes); 852 if (vp == NULL) 853 break; 854 /* 855 * XXX LRU is completely broken for non-free vnodes. First 856 * by calling here in mountpoint order, then by moving 857 * unselected vnodes to the end here, and most grossly by 858 * removing the vlruvp() function that was supposed to 859 * maintain the order. (This function was born broken 860 * since syncer problems prevented it doing anything.) The 861 * order is closer to LRC (C = Created). 862 * 863 * LRU reclaiming of vnodes seems to have last worked in 864 * FreeBSD-3 where LRU wasn't mentioned under any spelling. 865 * Then there was no hold count, and inactive vnodes were 866 * simply put on the free list in LRU order. The separate 867 * lists also break LRU. We prefer to reclaim from the 868 * free list for technical reasons. This tends to thrash 869 * the free list to keep very unrecently used held vnodes. 870 * The problem is mitigated by keeping the free list large. 871 */ 872 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 873 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 874 --count; 875 if (!VI_TRYLOCK(vp)) 876 goto next_iter; 877 /* 878 * If it's been deconstructed already, it's still 879 * referenced, or it exceeds the trigger, skip it. 880 * Also skip free vnodes. We are trying to make space 881 * to expand the free list, not reduce it. 882 */ 883 if (vp->v_usecount || 884 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 885 ((vp->v_iflag & VI_FREE) != 0) || 886 (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && 887 vp->v_object->resident_page_count > trigger)) { 888 VI_UNLOCK(vp); 889 goto next_iter; 890 } 891 MNT_IUNLOCK(mp); 892 vholdl(vp); 893 if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { 894 vdrop(vp); 895 goto next_iter_mntunlocked; 896 } 897 VI_LOCK(vp); 898 /* 899 * v_usecount may have been bumped after VOP_LOCK() dropped 900 * the vnode interlock and before it was locked again. 901 * 902 * It is not necessary to recheck VI_DOOMED because it can 903 * only be set by another thread that holds both the vnode 904 * lock and vnode interlock. If another thread has the 905 * vnode lock before we get to VOP_LOCK() and obtains the 906 * vnode interlock after VOP_LOCK() drops the vnode 907 * interlock, the other thread will be unable to drop the 908 * vnode lock before our VOP_LOCK() call fails. 
909 */ 910 if (vp->v_usecount || 911 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 912 (vp->v_iflag & VI_FREE) != 0 || 913 (vp->v_object != NULL && 914 vp->v_object->resident_page_count > trigger)) { 915 VOP_UNLOCK(vp, LK_INTERLOCK); 916 vdrop(vp); 917 goto next_iter_mntunlocked; 918 } 919 KASSERT((vp->v_iflag & VI_DOOMED) == 0, 920 ("VI_DOOMED unexpectedly detected in vlrureclaim()")); 921 atomic_add_long(&recycles_count, 1); 922 vgonel(vp); 923 VOP_UNLOCK(vp, 0); 924 vdropl(vp); 925 done++; 926 next_iter_mntunlocked: 927 if (!should_yield()) 928 goto relock_mnt; 929 goto yield; 930 next_iter: 931 if (!should_yield()) 932 continue; 933 MNT_IUNLOCK(mp); 934 yield: 935 kern_yield(PRI_USER); 936 relock_mnt: 937 MNT_ILOCK(mp); 938 } 939 MNT_IUNLOCK(mp); 940 vn_finished_write(mp); 941 return done; 942 } 943 944 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ 945 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 946 0, 947 "limit on vnode free requests per call to the vnlru_free routine"); 948 949 /* 950 * Attempt to reduce the free list by the requested amount. 951 */ 952 static void 953 vnlru_free_locked(int count, struct vfsops *mnt_op) 954 { 955 struct vnode *vp; 956 struct mount *mp; 957 958 mtx_assert(&vnode_free_list_mtx, MA_OWNED); 959 if (count > max_vnlru_free) 960 count = max_vnlru_free; 961 for (; count > 0; count--) { 962 vp = TAILQ_FIRST(&vnode_free_list); 963 /* 964 * The list can be modified while the free_list_mtx 965 * has been dropped and vp could be NULL here. 966 */ 967 if (!vp) 968 break; 969 VNASSERT(vp->v_op != NULL, vp, 970 ("vnlru_free: vnode already reclaimed.")); 971 KASSERT((vp->v_iflag & VI_FREE) != 0, 972 ("Removing vnode not on freelist")); 973 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 974 ("Mangling active vnode")); 975 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 976 977 /* 978 * Don't recycle if our vnode is from different type 979 * of mount point. Note that mp is type-safe, the 980 * check does not reach unmapped address even if 981 * vnode is reclaimed. 982 * Don't recycle if we can't get the interlock without 983 * blocking. 984 */ 985 if ((mnt_op != NULL && (mp = vp->v_mount) != NULL && 986 mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) { 987 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); 988 continue; 989 } 990 VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0, 991 vp, ("vp inconsistent on freelist")); 992 993 /* 994 * The clear of VI_FREE prevents activation of the 995 * vnode. There is no sense in putting the vnode on 996 * the mount point active list, only to remove it 997 * later during recycling. Inline the relevant part 998 * of vholdl(), to avoid triggering assertions or 999 * activating. 1000 */ 1001 freevnodes--; 1002 vp->v_iflag &= ~VI_FREE; 1003 refcount_acquire(&vp->v_holdcnt); 1004 1005 mtx_unlock(&vnode_free_list_mtx); 1006 VI_UNLOCK(vp); 1007 vtryrecycle(vp); 1008 /* 1009 * If the recycled succeeded this vdrop will actually free 1010 * the vnode. If not it will simply place it back on 1011 * the free list. 1012 */ 1013 vdrop(vp); 1014 mtx_lock(&vnode_free_list_mtx); 1015 } 1016 } 1017 1018 void 1019 vnlru_free(int count, struct vfsops *mnt_op) 1020 { 1021 1022 mtx_lock(&vnode_free_list_mtx); 1023 vnlru_free_locked(count, mnt_op); 1024 mtx_unlock(&vnode_free_list_mtx); 1025 } 1026 1027 1028 /* XXX some names and initialization are bad for limits and watermarks. 
*/ 1029 static int 1030 vspace(void) 1031 { 1032 int space; 1033 1034 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); 1035 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ 1036 vlowat = vhiwat / 2; 1037 if (numvnodes > desiredvnodes) 1038 return (0); 1039 space = desiredvnodes - numvnodes; 1040 if (freevnodes > wantfreevnodes) 1041 space += freevnodes - wantfreevnodes; 1042 return (space); 1043 } 1044 1045 /* 1046 * Attempt to recycle vnodes in a context that is always safe to block. 1047 * Calling vlrurecycle() from the bowels of filesystem code has some 1048 * interesting deadlock problems. 1049 */ 1050 static struct proc *vnlruproc; 1051 static int vnlruproc_sig; 1052 1053 static void 1054 vnlru_proc(void) 1055 { 1056 struct mount *mp, *nmp; 1057 unsigned long ofreevnodes, onumvnodes; 1058 int done, force, reclaim_nc_src, trigger, usevnodes; 1059 1060 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, 1061 SHUTDOWN_PRI_FIRST); 1062 1063 force = 0; 1064 for (;;) { 1065 kproc_suspend_check(vnlruproc); 1066 mtx_lock(&vnode_free_list_mtx); 1067 /* 1068 * If numvnodes is too large (due to desiredvnodes being 1069 * adjusted using its sysctl, or emergency growth), first 1070 * try to reduce it by discarding from the free list. 1071 */ 1072 if (numvnodes > desiredvnodes && freevnodes > 0) 1073 vnlru_free_locked(ulmin(numvnodes - desiredvnodes, 1074 freevnodes), NULL); 1075 /* 1076 * Sleep if the vnode cache is in a good state. This is 1077 * when it is not over-full and has space for about a 4% 1078 * or 9% expansion (by growing its size or inexcessively 1079 * reducing its free list). Otherwise, try to reclaim 1080 * space for a 10% expansion. 1081 */ 1082 if (vstir && force == 0) { 1083 force = 1; 1084 vstir = 0; 1085 } 1086 if (vspace() >= vlowat && force == 0) { 1087 vnlruproc_sig = 0; 1088 wakeup(&vnlruproc_sig); 1089 msleep(vnlruproc, &vnode_free_list_mtx, 1090 PVFS|PDROP, "vlruwt", hz); 1091 continue; 1092 } 1093 mtx_unlock(&vnode_free_list_mtx); 1094 done = 0; 1095 ofreevnodes = freevnodes; 1096 onumvnodes = numvnodes; 1097 /* 1098 * Calculate parameters for recycling. These are the same 1099 * throughout the loop to give some semblance of fairness. 1100 * The trigger point is to avoid recycling vnodes with lots 1101 * of resident pages. We aren't trying to free memory; we 1102 * are trying to recycle or at least free vnodes. 1103 */ 1104 if (numvnodes <= desiredvnodes) 1105 usevnodes = numvnodes - freevnodes; 1106 else 1107 usevnodes = numvnodes; 1108 if (usevnodes <= 0) 1109 usevnodes = 1; 1110 /* 1111 * The trigger value is is chosen to give a conservatively 1112 * large value to ensure that it alone doesn't prevent 1113 * making progress. The value can easily be so large that 1114 * it is effectively infinite in some congested and 1115 * misconfigured cases, and this is necessary. Normally 1116 * it is about 8 to 100 (pages), which is quite large. 
1117 */ 1118 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1119 if (force < 2) 1120 trigger = vsmalltrigger; 1121 reclaim_nc_src = force >= 3; 1122 mtx_lock(&mountlist_mtx); 1123 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 1124 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { 1125 nmp = TAILQ_NEXT(mp, mnt_list); 1126 continue; 1127 } 1128 done += vlrureclaim(mp, reclaim_nc_src, trigger); 1129 mtx_lock(&mountlist_mtx); 1130 nmp = TAILQ_NEXT(mp, mnt_list); 1131 vfs_unbusy(mp); 1132 } 1133 mtx_unlock(&mountlist_mtx); 1134 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) 1135 uma_reclaim(); 1136 if (done == 0) { 1137 if (force == 0 || force == 1) { 1138 force = 2; 1139 continue; 1140 } 1141 if (force == 2) { 1142 force = 3; 1143 continue; 1144 } 1145 force = 0; 1146 vnlru_nowhere++; 1147 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1148 } else 1149 kern_yield(PRI_USER); 1150 /* 1151 * After becoming active to expand above low water, keep 1152 * active until above high water. 1153 */ 1154 force = vspace() < vhiwat; 1155 } 1156 } 1157 1158 static struct kproc_desc vnlru_kp = { 1159 "vnlru", 1160 vnlru_proc, 1161 &vnlruproc 1162 }; 1163 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1164 &vnlru_kp); 1165 1166 /* 1167 * Routines having to do with the management of the vnode table. 1168 */ 1169 1170 /* 1171 * Try to recycle a freed vnode. We abort if anyone picks up a reference 1172 * before we actually vgone(). This function must be called with the vnode 1173 * held to prevent the vnode from being returned to the free list midway 1174 * through vgone(). 1175 */ 1176 static int 1177 vtryrecycle(struct vnode *vp) 1178 { 1179 struct mount *vnmp; 1180 1181 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1182 VNASSERT(vp->v_holdcnt, vp, 1183 ("vtryrecycle: Recycling vp %p without a reference.", vp)); 1184 /* 1185 * This vnode may found and locked via some other list, if so we 1186 * can't recycle it yet. 1187 */ 1188 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1189 CTR2(KTR_VFS, 1190 "%s: impossible to recycle, vp %p lock is already held", 1191 __func__, vp); 1192 return (EWOULDBLOCK); 1193 } 1194 /* 1195 * Don't recycle if its filesystem is being suspended. 1196 */ 1197 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1198 VOP_UNLOCK(vp, 0); 1199 CTR2(KTR_VFS, 1200 "%s: impossible to recycle, cannot start the write for %p", 1201 __func__, vp); 1202 return (EBUSY); 1203 } 1204 /* 1205 * If we got this far, we need to acquire the interlock and see if 1206 * anyone picked up this vnode from another list. If not, we will 1207 * mark it with DOOMED via vgonel() so that anyone who does find it 1208 * will skip over it. 1209 */ 1210 VI_LOCK(vp); 1211 if (vp->v_usecount) { 1212 VOP_UNLOCK(vp, LK_INTERLOCK); 1213 vn_finished_write(vnmp); 1214 CTR2(KTR_VFS, 1215 "%s: impossible to recycle, %p is already referenced", 1216 __func__, vp); 1217 return (EBUSY); 1218 } 1219 if ((vp->v_iflag & VI_DOOMED) == 0) { 1220 atomic_add_long(&recycles_count, 1); 1221 vgonel(vp); 1222 } 1223 VOP_UNLOCK(vp, LK_INTERLOCK); 1224 vn_finished_write(vnmp); 1225 return (0); 1226 } 1227 1228 static void 1229 vcheckspace(void) 1230 { 1231 1232 if (vspace() < vlowat && vnlruproc_sig == 0) { 1233 vnlruproc_sig = 1; 1234 wakeup(vnlruproc); 1235 } 1236 } 1237 1238 /* 1239 * Wait if necessary for space for a new vnode. 
1240 */ 1241 static int 1242 getnewvnode_wait(int suspended) 1243 { 1244 1245 mtx_assert(&vnode_free_list_mtx, MA_OWNED); 1246 if (numvnodes >= desiredvnodes) { 1247 if (suspended) { 1248 /* 1249 * The file system is being suspended. We cannot 1250 * risk a deadlock here, so allow allocation of 1251 * another vnode even if this would give too many. 1252 */ 1253 return (0); 1254 } 1255 if (vnlruproc_sig == 0) { 1256 vnlruproc_sig = 1; /* avoid unnecessary wakeups */ 1257 wakeup(vnlruproc); 1258 } 1259 msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, 1260 "vlruwk", hz); 1261 } 1262 /* Post-adjust like the pre-adjust in getnewvnode(). */ 1263 if (numvnodes + 1 > desiredvnodes && freevnodes > 1) 1264 vnlru_free_locked(1, NULL); 1265 return (numvnodes >= desiredvnodes ? ENFILE : 0); 1266 } 1267 1268 /* 1269 * This hack is fragile, and probably not needed any more now that the 1270 * watermark handling works. 1271 */ 1272 void 1273 getnewvnode_reserve(u_int count) 1274 { 1275 struct thread *td; 1276 1277 /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */ 1278 /* XXX no longer so quick, but this part is not racy. */ 1279 mtx_lock(&vnode_free_list_mtx); 1280 if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes) 1281 vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes, 1282 freevnodes - wantfreevnodes), NULL); 1283 mtx_unlock(&vnode_free_list_mtx); 1284 1285 td = curthread; 1286 /* First try to be quick and racy. */ 1287 if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { 1288 td->td_vp_reserv += count; 1289 vcheckspace(); /* XXX no longer so quick, but more racy */ 1290 return; 1291 } else 1292 atomic_subtract_long(&numvnodes, count); 1293 1294 mtx_lock(&vnode_free_list_mtx); 1295 while (count > 0) { 1296 if (getnewvnode_wait(0) == 0) { 1297 count--; 1298 td->td_vp_reserv++; 1299 atomic_add_long(&numvnodes, 1); 1300 } 1301 } 1302 vcheckspace(); 1303 mtx_unlock(&vnode_free_list_mtx); 1304 } 1305 1306 /* 1307 * This hack is fragile, especially if desiredvnodes or wantvnodes are 1308 * misconfgured or changed significantly. Reducing desiredvnodes below 1309 * the reserved amount should cause bizarre behaviour like reducing it 1310 * below the number of active vnodes -- the system will try to reduce 1311 * numvnodes to match, but should fail, so the subtraction below should 1312 * not overflow. 1313 */ 1314 void 1315 getnewvnode_drop_reserve(void) 1316 { 1317 struct thread *td; 1318 1319 td = curthread; 1320 atomic_subtract_long(&numvnodes, td->td_vp_reserv); 1321 td->td_vp_reserv = 0; 1322 } 1323 1324 /* 1325 * Return the next vnode from the free list. 1326 */ 1327 int 1328 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 1329 struct vnode **vpp) 1330 { 1331 struct vnode *vp; 1332 struct thread *td; 1333 struct lock_object *lo; 1334 static int cyclecount; 1335 int error; 1336 1337 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 1338 vp = NULL; 1339 td = curthread; 1340 if (td->td_vp_reserv > 0) { 1341 td->td_vp_reserv -= 1; 1342 goto alloc; 1343 } 1344 mtx_lock(&vnode_free_list_mtx); 1345 if (numvnodes < desiredvnodes) 1346 cyclecount = 0; 1347 else if (cyclecount++ >= freevnodes) { 1348 cyclecount = 0; 1349 vstir = 1; 1350 } 1351 /* 1352 * Grow the vnode cache if it will not be above its target max 1353 * after growing. Otherwise, if the free list is nonempty, try 1354 * to reclaim 1 item from it before growing the cache (possibly 1355 * above its target max if the reclamation failed or is delayed). 
1356 * Otherwise, wait for some space. In all cases, schedule 1357 * vnlru_proc() if we are getting short of space. The watermarks 1358 * should be chosen so that we never wait or even reclaim from 1359 * the free list to below its target minimum. 1360 */ 1361 if (numvnodes + 1 <= desiredvnodes) 1362 ; 1363 else if (freevnodes > 0) 1364 vnlru_free_locked(1, NULL); 1365 else { 1366 error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & 1367 MNTK_SUSPEND)); 1368 #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ 1369 if (error != 0) { 1370 mtx_unlock(&vnode_free_list_mtx); 1371 return (error); 1372 } 1373 #endif 1374 } 1375 vcheckspace(); 1376 atomic_add_long(&numvnodes, 1); 1377 mtx_unlock(&vnode_free_list_mtx); 1378 alloc: 1379 atomic_add_long(&vnodes_created, 1); 1380 vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK); 1381 /* 1382 * Locks are given the generic name "vnode" when created. 1383 * Follow the historic practice of using the filesystem 1384 * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. 1385 * 1386 * Locks live in a witness group keyed on their name. Thus, 1387 * when a lock is renamed, it must also move from the witness 1388 * group of its old name to the witness group of its new name. 1389 * 1390 * The change only needs to be made when the vnode moves 1391 * from one filesystem type to another. We ensure that each 1392 * filesystem use a single static name pointer for its tag so 1393 * that we can compare pointers rather than doing a strcmp(). 1394 */ 1395 lo = &vp->v_vnlock->lock_object; 1396 if (lo->lo_name != tag) { 1397 lo->lo_name = tag; 1398 WITNESS_DESTROY(lo); 1399 WITNESS_INIT(lo, tag); 1400 } 1401 /* 1402 * By default, don't allow shared locks unless filesystems opt-in. 1403 */ 1404 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 1405 /* 1406 * Finalize various vnode identity bits. 1407 */ 1408 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 1409 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 1410 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 1411 vp->v_type = VNON; 1412 vp->v_tag = tag; 1413 vp->v_op = vops; 1414 v_init_counters(vp); 1415 vp->v_bufobj.bo_ops = &buf_ops_bio; 1416 #ifdef MAC 1417 mac_vnode_init(vp); 1418 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 1419 mac_vnode_associate_singlelabel(mp, vp); 1420 else if (mp == NULL && vops != &dead_vnodeops) 1421 printf("NULL mp in getnewvnode()\n"); 1422 #endif 1423 if (mp != NULL) { 1424 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 1425 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) 1426 vp->v_vflag |= VV_NOKNOTE; 1427 } 1428 1429 /* 1430 * For the filesystems which do not use vfs_hash_insert(), 1431 * still initialize v_hash to have vfs_hash_index() useful. 1432 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 1433 * its own hashing. 1434 */ 1435 vp->v_hash = (uintptr_t)vp >> vnsz2log; 1436 1437 *vpp = vp; 1438 return (0); 1439 } 1440 1441 /* 1442 * Delete from old mount point vnode list, if on one. 
1443 */ 1444 static void 1445 delmntque(struct vnode *vp) 1446 { 1447 struct mount *mp; 1448 int active; 1449 1450 mp = vp->v_mount; 1451 if (mp == NULL) 1452 return; 1453 MNT_ILOCK(mp); 1454 VI_LOCK(vp); 1455 KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, 1456 ("Active vnode list size %d > Vnode list size %d", 1457 mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); 1458 active = vp->v_iflag & VI_ACTIVE; 1459 vp->v_iflag &= ~VI_ACTIVE; 1460 if (active) { 1461 mtx_lock(&vnode_free_list_mtx); 1462 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); 1463 mp->mnt_activevnodelistsize--; 1464 mtx_unlock(&vnode_free_list_mtx); 1465 } 1466 vp->v_mount = NULL; 1467 VI_UNLOCK(vp); 1468 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1469 ("bad mount point vnode list size")); 1470 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1471 mp->mnt_nvnodelistsize--; 1472 MNT_REL(mp); 1473 MNT_IUNLOCK(mp); 1474 } 1475 1476 static void 1477 insmntque_stddtr(struct vnode *vp, void *dtr_arg) 1478 { 1479 1480 vp->v_data = NULL; 1481 vp->v_op = &dead_vnodeops; 1482 vgone(vp); 1483 vput(vp); 1484 } 1485 1486 /* 1487 * Insert into list of vnodes for the new mount point, if available. 1488 */ 1489 int 1490 insmntque1(struct vnode *vp, struct mount *mp, 1491 void (*dtr)(struct vnode *, void *), void *dtr_arg) 1492 { 1493 1494 KASSERT(vp->v_mount == NULL, 1495 ("insmntque: vnode already on per mount vnode list")); 1496 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1497 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 1498 1499 /* 1500 * We acquire the vnode interlock early to ensure that the 1501 * vnode cannot be recycled by another process releasing a 1502 * holdcnt on it before we get it on both the vnode list 1503 * and the active vnode list. The mount mutex protects only 1504 * manipulation of the vnode list and the vnode freelist 1505 * mutex protects only manipulation of the active vnode list. 1506 * Hence the need to hold the vnode interlock throughout. 1507 */ 1508 MNT_ILOCK(mp); 1509 VI_LOCK(vp); 1510 if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && 1511 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 1512 mp->mnt_nvnodelistsize == 0)) && 1513 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 1514 VI_UNLOCK(vp); 1515 MNT_IUNLOCK(mp); 1516 if (dtr != NULL) 1517 dtr(vp, dtr_arg); 1518 return (EBUSY); 1519 } 1520 vp->v_mount = mp; 1521 MNT_REF(mp); 1522 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1523 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 1524 ("neg mount point vnode list size")); 1525 mp->mnt_nvnodelistsize++; 1526 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 1527 ("Activating already active vnode")); 1528 vp->v_iflag |= VI_ACTIVE; 1529 mtx_lock(&vnode_free_list_mtx); 1530 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 1531 mp->mnt_activevnodelistsize++; 1532 mtx_unlock(&vnode_free_list_mtx); 1533 VI_UNLOCK(vp); 1534 MNT_IUNLOCK(mp); 1535 return (0); 1536 } 1537 1538 int 1539 insmntque(struct vnode *vp, struct mount *mp) 1540 { 1541 1542 return (insmntque1(vp, mp, insmntque_stddtr, NULL)); 1543 } 1544 1545 /* 1546 * Flush out and invalidate all buffers associated with a bufobj 1547 * Called with the underlying object locked. 
1548 */ 1549 int 1550 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 1551 { 1552 int error; 1553 1554 BO_LOCK(bo); 1555 if (flags & V_SAVE) { 1556 error = bufobj_wwait(bo, slpflag, slptimeo); 1557 if (error) { 1558 BO_UNLOCK(bo); 1559 return (error); 1560 } 1561 if (bo->bo_dirty.bv_cnt > 0) { 1562 BO_UNLOCK(bo); 1563 if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) 1564 return (error); 1565 /* 1566 * XXX We could save a lock/unlock if this was only 1567 * enabled under INVARIANTS 1568 */ 1569 BO_LOCK(bo); 1570 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 1571 panic("vinvalbuf: dirty bufs"); 1572 } 1573 } 1574 /* 1575 * If you alter this loop please notice that interlock is dropped and 1576 * reacquired in flushbuflist. Special care is needed to ensure that 1577 * no race conditions occur from this. 1578 */ 1579 do { 1580 error = flushbuflist(&bo->bo_clean, 1581 flags, bo, slpflag, slptimeo); 1582 if (error == 0 && !(flags & V_CLEANONLY)) 1583 error = flushbuflist(&bo->bo_dirty, 1584 flags, bo, slpflag, slptimeo); 1585 if (error != 0 && error != EAGAIN) { 1586 BO_UNLOCK(bo); 1587 return (error); 1588 } 1589 } while (error != 0); 1590 1591 /* 1592 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 1593 * have write I/O in-progress but if there is a VM object then the 1594 * VM object can also have read-I/O in-progress. 1595 */ 1596 do { 1597 bufobj_wwait(bo, 0, 0); 1598 BO_UNLOCK(bo); 1599 if (bo->bo_object != NULL) { 1600 VM_OBJECT_WLOCK(bo->bo_object); 1601 vm_object_pip_wait(bo->bo_object, "bovlbx"); 1602 VM_OBJECT_WUNLOCK(bo->bo_object); 1603 } 1604 BO_LOCK(bo); 1605 } while (bo->bo_numoutput > 0); 1606 BO_UNLOCK(bo); 1607 1608 /* 1609 * Destroy the copy in the VM cache, too. 1610 */ 1611 if (bo->bo_object != NULL && 1612 (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) { 1613 VM_OBJECT_WLOCK(bo->bo_object); 1614 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 1615 OBJPR_CLEANONLY : 0); 1616 VM_OBJECT_WUNLOCK(bo->bo_object); 1617 } 1618 1619 #ifdef INVARIANTS 1620 BO_LOCK(bo); 1621 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 && 1622 (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) 1623 panic("vinvalbuf: flush failed"); 1624 BO_UNLOCK(bo); 1625 #endif 1626 return (0); 1627 } 1628 1629 /* 1630 * Flush out and invalidate all buffers associated with a vnode. 1631 * Called with the underlying object locked. 1632 */ 1633 int 1634 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 1635 { 1636 1637 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 1638 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 1639 if (vp->v_object != NULL && vp->v_object->handle != vp) 1640 return (0); 1641 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 1642 } 1643 1644 /* 1645 * Flush out buffers on the specified list. 
1646 * 1647 */ 1648 static int 1649 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 1650 int slptimeo) 1651 { 1652 struct buf *bp, *nbp; 1653 int retval, error; 1654 daddr_t lblkno; 1655 b_xflags_t xflags; 1656 1657 ASSERT_BO_WLOCKED(bo); 1658 1659 retval = 0; 1660 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 1661 if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || 1662 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { 1663 continue; 1664 } 1665 lblkno = 0; 1666 xflags = 0; 1667 if (nbp != NULL) { 1668 lblkno = nbp->b_lblkno; 1669 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 1670 } 1671 retval = EAGAIN; 1672 error = BUF_TIMELOCK(bp, 1673 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 1674 "flushbuf", slpflag, slptimeo); 1675 if (error) { 1676 BO_LOCK(bo); 1677 return (error != ENOLCK ? error : EAGAIN); 1678 } 1679 KASSERT(bp->b_bufobj == bo, 1680 ("bp %p wrong b_bufobj %p should be %p", 1681 bp, bp->b_bufobj, bo)); 1682 /* 1683 * XXX Since there are no node locks for NFS, I 1684 * believe there is a slight chance that a delayed 1685 * write will occur while sleeping just above, so 1686 * check for it. 1687 */ 1688 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 1689 (flags & V_SAVE)) { 1690 bremfree(bp); 1691 bp->b_flags |= B_ASYNC; 1692 bwrite(bp); 1693 BO_LOCK(bo); 1694 return (EAGAIN); /* XXX: why not loop ? */ 1695 } 1696 bremfree(bp); 1697 bp->b_flags |= (B_INVAL | B_RELBUF); 1698 bp->b_flags &= ~B_ASYNC; 1699 brelse(bp); 1700 BO_LOCK(bo); 1701 nbp = gbincore(bo, lblkno); 1702 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 1703 != xflags) 1704 break; /* nbp invalid */ 1705 } 1706 return (retval); 1707 } 1708 1709 int 1710 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 1711 { 1712 struct buf *bp; 1713 int error; 1714 daddr_t lblkno; 1715 1716 ASSERT_BO_LOCKED(bo); 1717 1718 for (lblkno = startn;;) { 1719 again: 1720 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 1721 if (bp == NULL || bp->b_lblkno >= endn || 1722 bp->b_lblkno < startn) 1723 break; 1724 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 1725 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 1726 if (error != 0) { 1727 BO_RLOCK(bo); 1728 if (error == ENOLCK) 1729 goto again; 1730 return (error); 1731 } 1732 KASSERT(bp->b_bufobj == bo, 1733 ("bp %p wrong b_bufobj %p should be %p", 1734 bp, bp->b_bufobj, bo)); 1735 lblkno = bp->b_lblkno + 1; 1736 if ((bp->b_flags & B_MANAGED) == 0) 1737 bremfree(bp); 1738 bp->b_flags |= B_RELBUF; 1739 /* 1740 * In the VMIO case, use the B_NOREUSE flag to hint that the 1741 * pages backing each buffer in the range are unlikely to be 1742 * reused. Dirty buffers will have the hint applied once 1743 * they've been written. 1744 */ 1745 if (bp->b_vp->v_object != NULL) 1746 bp->b_flags |= B_NOREUSE; 1747 brelse(bp); 1748 BO_RLOCK(bo); 1749 } 1750 return (0); 1751 } 1752 1753 /* 1754 * Truncate a file's buffer and pages to a specified length. This 1755 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 1756 * sync activity. 1757 */ 1758 int 1759 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize) 1760 { 1761 struct buf *bp, *nbp; 1762 int anyfreed; 1763 int trunclbn; 1764 struct bufobj *bo; 1765 1766 CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__, 1767 vp, cred, blksize, (uintmax_t)length); 1768 1769 /* 1770 * Round up to the *next* lbn. 
1771 */ 1772 trunclbn = howmany(length, blksize); 1773 1774 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 1775 restart: 1776 bo = &vp->v_bufobj; 1777 BO_LOCK(bo); 1778 anyfreed = 1; 1779 for (;anyfreed;) { 1780 anyfreed = 0; 1781 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 1782 if (bp->b_lblkno < trunclbn) 1783 continue; 1784 if (BUF_LOCK(bp, 1785 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1786 BO_LOCKPTR(bo)) == ENOLCK) 1787 goto restart; 1788 1789 bremfree(bp); 1790 bp->b_flags |= (B_INVAL | B_RELBUF); 1791 bp->b_flags &= ~B_ASYNC; 1792 brelse(bp); 1793 anyfreed = 1; 1794 1795 BO_LOCK(bo); 1796 if (nbp != NULL && 1797 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 1798 (nbp->b_vp != vp) || 1799 (nbp->b_flags & B_DELWRI))) { 1800 BO_UNLOCK(bo); 1801 goto restart; 1802 } 1803 } 1804 1805 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1806 if (bp->b_lblkno < trunclbn) 1807 continue; 1808 if (BUF_LOCK(bp, 1809 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1810 BO_LOCKPTR(bo)) == ENOLCK) 1811 goto restart; 1812 bremfree(bp); 1813 bp->b_flags |= (B_INVAL | B_RELBUF); 1814 bp->b_flags &= ~B_ASYNC; 1815 brelse(bp); 1816 anyfreed = 1; 1817 1818 BO_LOCK(bo); 1819 if (nbp != NULL && 1820 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 1821 (nbp->b_vp != vp) || 1822 (nbp->b_flags & B_DELWRI) == 0)) { 1823 BO_UNLOCK(bo); 1824 goto restart; 1825 } 1826 } 1827 } 1828 1829 if (length > 0) { 1830 restartsync: 1831 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1832 if (bp->b_lblkno > 0) 1833 continue; 1834 /* 1835 * Since we hold the vnode lock this should only 1836 * fail if we're racing with the buf daemon. 1837 */ 1838 if (BUF_LOCK(bp, 1839 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1840 BO_LOCKPTR(bo)) == ENOLCK) { 1841 goto restart; 1842 } 1843 VNASSERT((bp->b_flags & B_DELWRI), vp, 1844 ("buf(%p) on dirty queue without DELWRI", bp)); 1845 1846 bremfree(bp); 1847 bawrite(bp); 1848 BO_LOCK(bo); 1849 goto restartsync; 1850 } 1851 } 1852 1853 bufobj_wwait(bo, 0, 0); 1854 BO_UNLOCK(bo); 1855 vnode_pager_setsize(vp, length); 1856 1857 return (0); 1858 } 1859 1860 static void 1861 buf_vlist_remove(struct buf *bp) 1862 { 1863 struct bufv *bv; 1864 1865 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 1866 ASSERT_BO_WLOCKED(bp->b_bufobj); 1867 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != 1868 (BX_VNDIRTY|BX_VNCLEAN), 1869 ("buf_vlist_remove: Buf %p is on two lists", bp)); 1870 if (bp->b_xflags & BX_VNDIRTY) 1871 bv = &bp->b_bufobj->bo_dirty; 1872 else 1873 bv = &bp->b_bufobj->bo_clean; 1874 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 1875 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 1876 bv->bv_cnt--; 1877 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1878 } 1879 1880 /* 1881 * Add the buffer to the sorted clean or dirty block list. 1882 * 1883 * NOTE: xflags is passed as a constant, optimizing this inline function! 1884 */ 1885 static void 1886 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 1887 { 1888 struct bufv *bv; 1889 struct buf *n; 1890 int error; 1891 1892 ASSERT_BO_WLOCKED(bo); 1893 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 1894 ("dead bo %p", bo)); 1895 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 1896 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 1897 bp->b_xflags |= xflags; 1898 if (xflags & BX_VNDIRTY) 1899 bv = &bo->bo_dirty; 1900 else 1901 bv = &bo->bo_clean; 1902 1903 /* 1904 * Keep the list ordered. Optimize empty list insertion. 
Assume 1905 * we tend to grow at the tail so lookup_le should usually be cheaper 1906 * than _ge. 1907 */ 1908 if (bv->bv_cnt == 0 || 1909 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 1910 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 1911 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 1912 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 1913 else 1914 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 1915 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 1916 if (error) 1917 panic("buf_vlist_add: Preallocated nodes insufficient."); 1918 bv->bv_cnt++; 1919 } 1920 1921 /* 1922 * Look up a buffer using the buffer tries. 1923 */ 1924 struct buf * 1925 gbincore(struct bufobj *bo, daddr_t lblkno) 1926 { 1927 struct buf *bp; 1928 1929 ASSERT_BO_LOCKED(bo); 1930 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 1931 if (bp != NULL) 1932 return (bp); 1933 return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); 1934 } 1935 1936 /* 1937 * Associate a buffer with a vnode. 1938 */ 1939 void 1940 bgetvp(struct vnode *vp, struct buf *bp) 1941 { 1942 struct bufobj *bo; 1943 1944 bo = &vp->v_bufobj; 1945 ASSERT_BO_WLOCKED(bo); 1946 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 1947 1948 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 1949 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 1950 ("bgetvp: bp already attached! %p", bp)); 1951 1952 vhold(vp); 1953 bp->b_vp = vp; 1954 bp->b_bufobj = bo; 1955 /* 1956 * Insert onto list for new vnode. 1957 */ 1958 buf_vlist_add(bp, bo, BX_VNCLEAN); 1959 } 1960 1961 /* 1962 * Disassociate a buffer from a vnode. 1963 */ 1964 void 1965 brelvp(struct buf *bp) 1966 { 1967 struct bufobj *bo; 1968 struct vnode *vp; 1969 1970 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1971 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 1972 1973 /* 1974 * Delete from old vnode list, if on one. 1975 */ 1976 vp = bp->b_vp; /* XXX */ 1977 bo = bp->b_bufobj; 1978 BO_LOCK(bo); 1979 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 1980 buf_vlist_remove(bp); 1981 else 1982 panic("brelvp: Buffer %p not on queue.", bp); 1983 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 1984 bo->bo_flag &= ~BO_ONWORKLST; 1985 mtx_lock(&sync_mtx); 1986 LIST_REMOVE(bo, bo_synclist); 1987 syncer_worklist_len--; 1988 mtx_unlock(&sync_mtx); 1989 } 1990 bp->b_vp = NULL; 1991 bp->b_bufobj = NULL; 1992 BO_UNLOCK(bo); 1993 vdrop(vp); 1994 } 1995 1996 /* 1997 * Add an item to the syncer work queue. 
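 *
 * The requested delay is clamped to syncer_maxdelay - 2 and the target
 * slot is computed as (syncer_delayno + delay) & syncer_mask, so the
 * worklist behaves as a ring of buckets drained at roughly one bucket
 * per second.  With illustrative values only: syncer_delayno = 10,
 * delay = 30 and a mask of 0x7f place the bufobj in slot 40, to be
 * visited about 30 seconds later.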
1998 */ 1999 static void 2000 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2001 { 2002 int slot; 2003 2004 ASSERT_BO_WLOCKED(bo); 2005 2006 mtx_lock(&sync_mtx); 2007 if (bo->bo_flag & BO_ONWORKLST) 2008 LIST_REMOVE(bo, bo_synclist); 2009 else { 2010 bo->bo_flag |= BO_ONWORKLST; 2011 syncer_worklist_len++; 2012 } 2013 2014 if (delay > syncer_maxdelay - 2) 2015 delay = syncer_maxdelay - 2; 2016 slot = (syncer_delayno + delay) & syncer_mask; 2017 2018 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2019 mtx_unlock(&sync_mtx); 2020 } 2021 2022 static int 2023 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2024 { 2025 int error, len; 2026 2027 mtx_lock(&sync_mtx); 2028 len = syncer_worklist_len - sync_vnode_count; 2029 mtx_unlock(&sync_mtx); 2030 error = SYSCTL_OUT(req, &len, sizeof(len)); 2031 return (error); 2032 } 2033 2034 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, 2035 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2036 2037 static struct proc *updateproc; 2038 static void sched_sync(void); 2039 static struct kproc_desc up_kp = { 2040 "syncer", 2041 sched_sync, 2042 &updateproc 2043 }; 2044 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2045 2046 static int 2047 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2048 { 2049 struct vnode *vp; 2050 struct mount *mp; 2051 2052 *bo = LIST_FIRST(slp); 2053 if (*bo == NULL) 2054 return (0); 2055 vp = (*bo)->__bo_vnode; /* XXX */ 2056 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2057 return (1); 2058 /* 2059 * We use vhold in case the vnode does not 2060 * successfully sync. vhold prevents the vnode from 2061 * going away when we unlock the sync_mtx so that 2062 * we can acquire the vnode interlock. 2063 */ 2064 vholdl(vp); 2065 mtx_unlock(&sync_mtx); 2066 VI_UNLOCK(vp); 2067 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2068 vdrop(vp); 2069 mtx_lock(&sync_mtx); 2070 return (*bo == LIST_FIRST(slp)); 2071 } 2072 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2073 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2074 VOP_UNLOCK(vp, 0); 2075 vn_finished_write(mp); 2076 BO_LOCK(*bo); 2077 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2078 /* 2079 * Put us back on the worklist. The worklist 2080 * routine will remove us from our current 2081 * position and then add us back in at a later 2082 * position. 2083 */ 2084 vn_syncer_add_to_worklist(*bo, syncdelay); 2085 } 2086 BO_UNLOCK(*bo); 2087 vdrop(vp); 2088 mtx_lock(&sync_mtx); 2089 return (0); 2090 } 2091 2092 static int first_printf = 1; 2093 2094 /* 2095 * System filesystem synchronizer daemon. 
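 *
 * The daemon runs a small state machine: SYNCER_RUNNING during normal
 * operation, SYNCER_SHUTTING_DOWN once syncer_shutdown() has been
 * called, and SYNCER_FINAL_DELAY for a short, bounded number of final
 * passes before the kernel process is allowed to suspend.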
2096 */ 2097 static void 2098 sched_sync(void) 2099 { 2100 struct synclist *next, *slp; 2101 struct bufobj *bo; 2102 long starttime; 2103 struct thread *td = curthread; 2104 int last_work_seen; 2105 int net_worklist_len; 2106 int syncer_final_iter; 2107 int error; 2108 2109 last_work_seen = 0; 2110 syncer_final_iter = 0; 2111 syncer_state = SYNCER_RUNNING; 2112 starttime = time_uptime; 2113 td->td_pflags |= TDP_NORUNNINGBUF; 2114 2115 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2116 SHUTDOWN_PRI_LAST); 2117 2118 mtx_lock(&sync_mtx); 2119 for (;;) { 2120 if (syncer_state == SYNCER_FINAL_DELAY && 2121 syncer_final_iter == 0) { 2122 mtx_unlock(&sync_mtx); 2123 kproc_suspend_check(td->td_proc); 2124 mtx_lock(&sync_mtx); 2125 } 2126 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2127 if (syncer_state != SYNCER_RUNNING && 2128 starttime != time_uptime) { 2129 if (first_printf) { 2130 printf("\nSyncing disks, vnodes remaining... "); 2131 first_printf = 0; 2132 } 2133 printf("%d ", net_worklist_len); 2134 } 2135 starttime = time_uptime; 2136 2137 /* 2138 * Push files whose dirty time has expired. Be careful 2139 * of interrupt race on slp queue. 2140 * 2141 * Skip over empty worklist slots when shutting down. 2142 */ 2143 do { 2144 slp = &syncer_workitem_pending[syncer_delayno]; 2145 syncer_delayno += 1; 2146 if (syncer_delayno == syncer_maxdelay) 2147 syncer_delayno = 0; 2148 next = &syncer_workitem_pending[syncer_delayno]; 2149 /* 2150 * If the worklist has wrapped since the 2151 * it was emptied of all but syncer vnodes, 2152 * switch to the FINAL_DELAY state and run 2153 * for one more second. 2154 */ 2155 if (syncer_state == SYNCER_SHUTTING_DOWN && 2156 net_worklist_len == 0 && 2157 last_work_seen == syncer_delayno) { 2158 syncer_state = SYNCER_FINAL_DELAY; 2159 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2160 } 2161 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2162 syncer_worklist_len > 0); 2163 2164 /* 2165 * Keep track of the last time there was anything 2166 * on the worklist other than syncer vnodes. 2167 * Return to the SHUTTING_DOWN state if any 2168 * new work appears. 2169 */ 2170 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 2171 last_work_seen = syncer_delayno; 2172 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 2173 syncer_state = SYNCER_SHUTTING_DOWN; 2174 while (!LIST_EMPTY(slp)) { 2175 error = sync_vnode(slp, &bo, td); 2176 if (error == 1) { 2177 LIST_REMOVE(bo, bo_synclist); 2178 LIST_INSERT_HEAD(next, bo, bo_synclist); 2179 continue; 2180 } 2181 2182 if (first_printf == 0) { 2183 /* 2184 * Drop the sync mutex, because some watchdog 2185 * drivers need to sleep while patting 2186 */ 2187 mtx_unlock(&sync_mtx); 2188 wdog_kern_pat(WD_LASTVAL); 2189 mtx_lock(&sync_mtx); 2190 } 2191 2192 } 2193 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 2194 syncer_final_iter--; 2195 /* 2196 * The variable rushjob allows the kernel to speed up the 2197 * processing of the filesystem syncer process. A rushjob 2198 * value of N tells the filesystem syncer to process the next 2199 * N seconds worth of work on its queue ASAP. Currently rushjob 2200 * is used by the soft update code to speed up the filesystem 2201 * syncer process when the incore state is getting so far 2202 * ahead of the disk that the kernel memory pool is being 2203 * threatened with exhaustion. 
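 *
 * speedup_syncer() below raises rushjob by one per request and caps it
 * at syncdelay / 2, so the syncer is never pushed to consume more than
 * half of its normal turn time.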
2204 */ 2205 if (rushjob > 0) { 2206 rushjob -= 1; 2207 continue; 2208 } 2209 /* 2210 * Just sleep for a short period of time between 2211 * iterations when shutting down to allow some I/O 2212 * to happen. 2213 * 2214 * If it has taken us less than a second to process the 2215 * current work, then wait. Otherwise start right over 2216 * again. We can still lose time if any single round 2217 * takes more than two seconds, but it does not really 2218 * matter as we are just trying to generally pace the 2219 * filesystem activity. 2220 */ 2221 if (syncer_state != SYNCER_RUNNING || 2222 time_uptime == starttime) { 2223 thread_lock(td); 2224 sched_prio(td, PPAUSE); 2225 thread_unlock(td); 2226 } 2227 if (syncer_state != SYNCER_RUNNING) 2228 cv_timedwait(&sync_wakeup, &sync_mtx, 2229 hz / SYNCER_SHUTDOWN_SPEEDUP); 2230 else if (time_uptime == starttime) 2231 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 2232 } 2233 } 2234 2235 /* 2236 * Request the syncer daemon to speed up its work. 2237 * We never push it to speed up more than half of its 2238 * normal turn time, otherwise it could take over the cpu. 2239 */ 2240 int 2241 speedup_syncer(void) 2242 { 2243 int ret = 0; 2244 2245 mtx_lock(&sync_mtx); 2246 if (rushjob < syncdelay / 2) { 2247 rushjob += 1; 2248 stat_rush_requests += 1; 2249 ret = 1; 2250 } 2251 mtx_unlock(&sync_mtx); 2252 cv_broadcast(&sync_wakeup); 2253 return (ret); 2254 } 2255 2256 /* 2257 * Tell the syncer to speed up its work and run though its work 2258 * list several times, then tell it to shut down. 2259 */ 2260 static void 2261 syncer_shutdown(void *arg, int howto) 2262 { 2263 2264 if (howto & RB_NOSYNC) 2265 return; 2266 mtx_lock(&sync_mtx); 2267 syncer_state = SYNCER_SHUTTING_DOWN; 2268 rushjob = 0; 2269 mtx_unlock(&sync_mtx); 2270 cv_broadcast(&sync_wakeup); 2271 kproc_shutdown(arg, howto); 2272 } 2273 2274 void 2275 syncer_suspend(void) 2276 { 2277 2278 syncer_shutdown(updateproc, 0); 2279 } 2280 2281 void 2282 syncer_resume(void) 2283 { 2284 2285 mtx_lock(&sync_mtx); 2286 first_printf = 1; 2287 syncer_state = SYNCER_RUNNING; 2288 mtx_unlock(&sync_mtx); 2289 cv_broadcast(&sync_wakeup); 2290 kproc_resume(updateproc); 2291 } 2292 2293 /* 2294 * Reassign a buffer from one vnode to another. 2295 * Used to assign file specific control information 2296 * (indirect blocks) to the vnode to which they belong. 2297 */ 2298 void 2299 reassignbuf(struct buf *bp) 2300 { 2301 struct vnode *vp; 2302 struct bufobj *bo; 2303 int delay; 2304 #ifdef INVARIANTS 2305 struct bufv *bv; 2306 #endif 2307 2308 vp = bp->b_vp; 2309 bo = bp->b_bufobj; 2310 ++reassignbufcalls; 2311 2312 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2313 bp, bp->b_vp, bp->b_flags); 2314 /* 2315 * B_PAGING flagged buffers cannot be reassigned because their vp 2316 * is not fully linked in. 2317 */ 2318 if (bp->b_flags & B_PAGING) 2319 panic("cannot reassign paging buffer"); 2320 2321 /* 2322 * Delete from old vnode list, if on one. 2323 */ 2324 BO_LOCK(bo); 2325 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2326 buf_vlist_remove(bp); 2327 else 2328 panic("reassignbuf: Buffer %p not on queue.", bp); 2329 /* 2330 * If dirty, put on list of dirty buffers; otherwise insert onto list 2331 * of clean buffers. 
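 *
 * A dirty buffer also places the bufobj on the syncer worklist if it
 * is not there already, using a delay chosen by vnode type: dirdelay
 * for directories, metadelay for character devices and filedelay for
 * everything else.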
2332 */ 2333 if (bp->b_flags & B_DELWRI) { 2334 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2335 switch (vp->v_type) { 2336 case VDIR: 2337 delay = dirdelay; 2338 break; 2339 case VCHR: 2340 delay = metadelay; 2341 break; 2342 default: 2343 delay = filedelay; 2344 } 2345 vn_syncer_add_to_worklist(bo, delay); 2346 } 2347 buf_vlist_add(bp, bo, BX_VNDIRTY); 2348 } else { 2349 buf_vlist_add(bp, bo, BX_VNCLEAN); 2350 2351 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2352 mtx_lock(&sync_mtx); 2353 LIST_REMOVE(bo, bo_synclist); 2354 syncer_worklist_len--; 2355 mtx_unlock(&sync_mtx); 2356 bo->bo_flag &= ~BO_ONWORKLST; 2357 } 2358 } 2359 #ifdef INVARIANTS 2360 bv = &bo->bo_clean; 2361 bp = TAILQ_FIRST(&bv->bv_hd); 2362 KASSERT(bp == NULL || bp->b_bufobj == bo, 2363 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2364 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2365 KASSERT(bp == NULL || bp->b_bufobj == bo, 2366 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2367 bv = &bo->bo_dirty; 2368 bp = TAILQ_FIRST(&bv->bv_hd); 2369 KASSERT(bp == NULL || bp->b_bufobj == bo, 2370 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2371 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2372 KASSERT(bp == NULL || bp->b_bufobj == bo, 2373 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2374 #endif 2375 BO_UNLOCK(bo); 2376 } 2377 2378 /* 2379 * A temporary hack until refcount_* APIs are sorted out. 2380 */ 2381 static __inline int 2382 vfs_refcount_acquire_if_not_zero(volatile u_int *count) 2383 { 2384 u_int old; 2385 2386 for (;;) { 2387 old = *count; 2388 if (old == 0) 2389 return (0); 2390 if (atomic_cmpset_int(count, old, old + 1)) 2391 return (1); 2392 } 2393 } 2394 2395 static __inline int 2396 vfs_refcount_release_if_not_last(volatile u_int *count) 2397 { 2398 u_int old; 2399 2400 for (;;) { 2401 old = *count; 2402 if (old == 1) 2403 return (0); 2404 if (atomic_cmpset_int(count, old, old - 1)) 2405 return (1); 2406 } 2407 } 2408 2409 static void 2410 v_init_counters(struct vnode *vp) 2411 { 2412 2413 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2414 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2415 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2416 2417 refcount_init(&vp->v_holdcnt, 1); 2418 refcount_init(&vp->v_usecount, 1); 2419 } 2420 2421 static void 2422 v_incr_usecount_locked(struct vnode *vp) 2423 { 2424 2425 ASSERT_VI_LOCKED(vp, __func__); 2426 if ((vp->v_iflag & VI_OWEINACT) != 0) { 2427 VNASSERT(vp->v_usecount == 0, vp, 2428 ("vnode with usecount and VI_OWEINACT set")); 2429 vp->v_iflag &= ~VI_OWEINACT; 2430 } 2431 refcount_acquire(&vp->v_usecount); 2432 v_incr_devcount(vp); 2433 } 2434 2435 /* 2436 * Increment the use and hold counts on the vnode, taking care to reference 2437 * the driver's usecount if this is a chardev. The _vhold() will remove 2438 * the vnode from the free list if it is presently free. 2439 */ 2440 static void 2441 v_incr_usecount(struct vnode *vp) 2442 { 2443 2444 ASSERT_VI_UNLOCKED(vp, __func__); 2445 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2446 2447 if (vp->v_type != VCHR && 2448 vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) { 2449 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2450 ("vnode with usecount and VI_OWEINACT set")); 2451 } else { 2452 VI_LOCK(vp); 2453 v_incr_usecount_locked(vp); 2454 VI_UNLOCK(vp); 2455 } 2456 } 2457 2458 /* 2459 * Increment si_usecount of the associated device, if any. 
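 *
 * Both v_incr_devcount() and v_decr_devcount() must be called with the
 * vnode interlock held and are no-ops unless the vnode is a VCHR with
 * an attached cdev.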
2460 */ 2461 static void 2462 v_incr_devcount(struct vnode *vp) 2463 { 2464 2465 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2466 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2467 dev_lock(); 2468 vp->v_rdev->si_usecount++; 2469 dev_unlock(); 2470 } 2471 } 2472 2473 /* 2474 * Decrement si_usecount of the associated device, if any. 2475 */ 2476 static void 2477 v_decr_devcount(struct vnode *vp) 2478 { 2479 2480 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2481 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2482 dev_lock(); 2483 vp->v_rdev->si_usecount--; 2484 dev_unlock(); 2485 } 2486 } 2487 2488 /* 2489 * Grab a particular vnode from the free list, increment its 2490 * reference count and lock it. VI_DOOMED is set if the vnode 2491 * is being destroyed. Only callers who specify LK_RETRY will 2492 * see doomed vnodes. If inactive processing was delayed in 2493 * vput try to do it here. 2494 * 2495 * Notes on lockless counter manipulation: 2496 * _vhold, vputx and other routines make various decisions based 2497 * on either holdcnt or usecount being 0. As long as either counter 2498 * is not transitioning 0->1 nor 1->0, the manipulation can be done 2499 * with atomic operations. Otherwise the interlock is taken covering 2500 * both the atomic and additional actions. 2501 */ 2502 int 2503 vget(struct vnode *vp, int flags, struct thread *td) 2504 { 2505 int error, oweinact; 2506 2507 VNASSERT((flags & LK_TYPE_MASK) != 0, vp, 2508 ("vget: invalid lock operation")); 2509 2510 if ((flags & LK_INTERLOCK) != 0) 2511 ASSERT_VI_LOCKED(vp, __func__); 2512 else 2513 ASSERT_VI_UNLOCKED(vp, __func__); 2514 if ((flags & LK_VNHELD) != 0) 2515 VNASSERT((vp->v_holdcnt > 0), vp, 2516 ("vget: LK_VNHELD passed but vnode not held")); 2517 2518 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2519 2520 if ((flags & LK_VNHELD) == 0) 2521 _vhold(vp, (flags & LK_INTERLOCK) != 0); 2522 2523 if ((error = vn_lock(vp, flags)) != 0) { 2524 vdrop(vp); 2525 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 2526 vp); 2527 return (error); 2528 } 2529 if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) 2530 panic("vget: vn_lock failed to return ENOENT\n"); 2531 /* 2532 * We don't guarantee that any particular close will 2533 * trigger inactive processing so just make a best effort 2534 * here at preventing a reference to a removed file. If 2535 * we don't succeed no harm is done. 2536 * 2537 * Upgrade our holdcnt to a usecount. 2538 */ 2539 if (vp->v_type == VCHR || 2540 !vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) { 2541 VI_LOCK(vp); 2542 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2543 oweinact = 0; 2544 } else { 2545 oweinact = 1; 2546 vp->v_iflag &= ~VI_OWEINACT; 2547 } 2548 refcount_acquire(&vp->v_usecount); 2549 v_incr_devcount(vp); 2550 if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE && 2551 (flags & LK_NOWAIT) == 0) 2552 vinactive(vp, td); 2553 VI_UNLOCK(vp); 2554 } 2555 return (0); 2556 } 2557 2558 /* 2559 * Increase the reference count of a vnode. 2560 */ 2561 void 2562 vref(struct vnode *vp) 2563 { 2564 2565 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2566 _vhold(vp, false); 2567 v_incr_usecount(vp); 2568 } 2569 2570 void 2571 vrefl(struct vnode *vp) 2572 { 2573 2574 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2575 _vhold(vp, true); 2576 v_incr_usecount_locked(vp); 2577 } 2578 2579 /* 2580 * Return reference count of a vnode. 2581 * 2582 * The results of this call are only guaranteed when some mechanism is used to 2583 * stop other processes from gaining references to the vnode. 
This may be the 2584 * case if the caller holds the only reference. This is also useful when stale 2585 * data is acceptable as race conditions may be accounted for by some other 2586 * means. 2587 */ 2588 int 2589 vrefcnt(struct vnode *vp) 2590 { 2591 2592 return (vp->v_usecount); 2593 } 2594 2595 #define VPUTX_VRELE 1 2596 #define VPUTX_VPUT 2 2597 #define VPUTX_VUNREF 3 2598 2599 /* 2600 * Decrement the use and hold counts for a vnode. 2601 * 2602 * See an explanation near vget() as to why atomic operation is safe. 2603 */ 2604 static void 2605 vputx(struct vnode *vp, int func) 2606 { 2607 int error; 2608 2609 KASSERT(vp != NULL, ("vputx: null vp")); 2610 if (func == VPUTX_VUNREF) 2611 ASSERT_VOP_LOCKED(vp, "vunref"); 2612 else if (func == VPUTX_VPUT) 2613 ASSERT_VOP_LOCKED(vp, "vput"); 2614 else 2615 KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); 2616 ASSERT_VI_UNLOCKED(vp, __func__); 2617 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2618 2619 if (vp->v_type != VCHR && 2620 vfs_refcount_release_if_not_last(&vp->v_usecount)) { 2621 if (func == VPUTX_VPUT) 2622 VOP_UNLOCK(vp, 0); 2623 vdrop(vp); 2624 return; 2625 } 2626 2627 VI_LOCK(vp); 2628 2629 /* 2630 * We want to hold the vnode until the inactive finishes to 2631 * prevent vgone() races. We drop the use count here and the 2632 * hold count below when we're done. 2633 */ 2634 if (!refcount_release(&vp->v_usecount) || 2635 (vp->v_iflag & VI_DOINGINACT)) { 2636 if (func == VPUTX_VPUT) 2637 VOP_UNLOCK(vp, 0); 2638 v_decr_devcount(vp); 2639 vdropl(vp); 2640 return; 2641 } 2642 2643 v_decr_devcount(vp); 2644 2645 error = 0; 2646 2647 if (vp->v_usecount != 0) { 2648 vn_printf(vp, "vputx: usecount not zero for vnode "); 2649 panic("vputx: usecount not zero"); 2650 } 2651 2652 CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); 2653 2654 /* 2655 * We must call VOP_INACTIVE with the node locked. Mark 2656 * as VI_DOINGINACT to avoid recursion. 2657 */ 2658 vp->v_iflag |= VI_OWEINACT; 2659 switch (func) { 2660 case VPUTX_VRELE: 2661 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 2662 VI_LOCK(vp); 2663 break; 2664 case VPUTX_VPUT: 2665 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2666 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 2667 LK_NOWAIT); 2668 VI_LOCK(vp); 2669 } 2670 break; 2671 case VPUTX_VUNREF: 2672 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2673 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 2674 VI_LOCK(vp); 2675 } 2676 break; 2677 } 2678 VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp, 2679 ("vnode with usecount and VI_OWEINACT set")); 2680 if (error == 0) { 2681 if (vp->v_iflag & VI_OWEINACT) 2682 vinactive(vp, curthread); 2683 if (func != VPUTX_VUNREF) 2684 VOP_UNLOCK(vp, 0); 2685 } 2686 vdropl(vp); 2687 } 2688 2689 /* 2690 * Vnode put/release. 2691 * If count drops to zero, call inactive routine and return to freelist. 2692 */ 2693 void 2694 vrele(struct vnode *vp) 2695 { 2696 2697 vputx(vp, VPUTX_VRELE); 2698 } 2699 2700 /* 2701 * Release an already locked vnode. This give the same effects as 2702 * unlock+vrele(), but takes less time and avoids releasing and 2703 * re-aquiring the lock (as vrele() acquires the lock internally.) 2704 */ 2705 void 2706 vput(struct vnode *vp) 2707 { 2708 2709 vputx(vp, VPUTX_VPUT); 2710 } 2711 2712 /* 2713 * Release an exclusively locked vnode. Do not unlock the vnode lock. 
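 *
 * The three release primitives differ only in their locking contract:
 * vrele() takes an unlocked vnode, vput() takes a locked vnode and
 * unlocks it, and vunref() takes a locked vnode and leaves the lock
 * held.  All three drop one use reference through vputx().  A minimal
 * illustrative sketch (not taken from any particular caller):
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_FSYNC(vp, MNT_WAIT, curthread);
 *	vput(vp);		drops both the reference and the lock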
2714 */ 2715 void 2716 vunref(struct vnode *vp) 2717 { 2718 2719 vputx(vp, VPUTX_VUNREF); 2720 } 2721 2722 /* 2723 * Increase the hold count and activate if this is the first reference. 2724 */ 2725 void 2726 _vhold(struct vnode *vp, bool locked) 2727 { 2728 struct mount *mp; 2729 2730 if (locked) 2731 ASSERT_VI_LOCKED(vp, __func__); 2732 else 2733 ASSERT_VI_UNLOCKED(vp, __func__); 2734 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2735 if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) { 2736 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2737 ("_vhold: vnode with holdcnt is free")); 2738 return; 2739 } 2740 2741 if (!locked) 2742 VI_LOCK(vp); 2743 if ((vp->v_iflag & VI_FREE) == 0) { 2744 refcount_acquire(&vp->v_holdcnt); 2745 if (!locked) 2746 VI_UNLOCK(vp); 2747 return; 2748 } 2749 VNASSERT(vp->v_holdcnt == 0, vp, 2750 ("%s: wrong hold count", __func__)); 2751 VNASSERT(vp->v_op != NULL, vp, 2752 ("%s: vnode already reclaimed.", __func__)); 2753 /* 2754 * Remove a vnode from the free list, mark it as in use, 2755 * and put it on the active list. 2756 */ 2757 mtx_lock(&vnode_free_list_mtx); 2758 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 2759 freevnodes--; 2760 vp->v_iflag &= ~VI_FREE; 2761 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 2762 ("Activating already active vnode")); 2763 vp->v_iflag |= VI_ACTIVE; 2764 mp = vp->v_mount; 2765 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 2766 mp->mnt_activevnodelistsize++; 2767 mtx_unlock(&vnode_free_list_mtx); 2768 refcount_acquire(&vp->v_holdcnt); 2769 if (!locked) 2770 VI_UNLOCK(vp); 2771 } 2772 2773 /* 2774 * Drop the hold count of the vnode. If this is the last reference to 2775 * the vnode we place it on the free list unless it has been vgone'd 2776 * (marked VI_DOOMED) in which case we will free it. 2777 * 2778 * Because the vnode vm object keeps a hold reference on the vnode if 2779 * there is at least one resident non-cached page, the vnode cannot 2780 * leave the active list without the page cleanup done. 2781 */ 2782 void 2783 _vdrop(struct vnode *vp, bool locked) 2784 { 2785 struct bufobj *bo; 2786 struct mount *mp; 2787 int active; 2788 2789 if (locked) 2790 ASSERT_VI_LOCKED(vp, __func__); 2791 else 2792 ASSERT_VI_UNLOCKED(vp, __func__); 2793 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2794 if ((int)vp->v_holdcnt <= 0) 2795 panic("vdrop: holdcnt %d", vp->v_holdcnt); 2796 if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) { 2797 if (locked) 2798 VI_UNLOCK(vp); 2799 return; 2800 } 2801 2802 if (!locked) 2803 VI_LOCK(vp); 2804 if (refcount_release(&vp->v_holdcnt) == 0) { 2805 VI_UNLOCK(vp); 2806 return; 2807 } 2808 if ((vp->v_iflag & VI_DOOMED) == 0) { 2809 /* 2810 * Mark a vnode as free: remove it from its active list 2811 * and put it up for recycling on the freelist. 
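 *
 * If inactive processing is still owed (VI_OWEINACT is set), the
 * vnode is left where it is and only accounted in free_owe_inact
 * instead of being moved to the free list.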
2812 */ 2813 VNASSERT(vp->v_op != NULL, vp, 2814 ("vdropl: vnode already reclaimed.")); 2815 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2816 ("vnode already free")); 2817 VNASSERT(vp->v_holdcnt == 0, vp, 2818 ("vdropl: freeing when we shouldn't")); 2819 active = vp->v_iflag & VI_ACTIVE; 2820 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2821 vp->v_iflag &= ~VI_ACTIVE; 2822 mp = vp->v_mount; 2823 mtx_lock(&vnode_free_list_mtx); 2824 if (active) { 2825 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, 2826 v_actfreelist); 2827 mp->mnt_activevnodelistsize--; 2828 } 2829 TAILQ_INSERT_TAIL(&vnode_free_list, vp, 2830 v_actfreelist); 2831 freevnodes++; 2832 vp->v_iflag |= VI_FREE; 2833 mtx_unlock(&vnode_free_list_mtx); 2834 } else { 2835 atomic_add_long(&free_owe_inact, 1); 2836 } 2837 VI_UNLOCK(vp); 2838 return; 2839 } 2840 /* 2841 * The vnode has been marked for destruction, so free it. 2842 * 2843 * The vnode will be returned to the zone where it will 2844 * normally remain until it is needed for another vnode. We 2845 * need to cleanup (or verify that the cleanup has already 2846 * been done) any residual data left from its current use 2847 * so as not to contaminate the freshly allocated vnode. 2848 */ 2849 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2850 atomic_subtract_long(&numvnodes, 1); 2851 bo = &vp->v_bufobj; 2852 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2853 ("cleaned vnode still on the free list.")); 2854 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2855 VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); 2856 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2857 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2858 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2859 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2860 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 2861 ("clean blk trie not empty")); 2862 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2863 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 2864 ("dirty blk trie not empty")); 2865 VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); 2866 VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); 2867 VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); 2868 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, 2869 ("Dangling rangelock waiters")); 2870 VI_UNLOCK(vp); 2871 #ifdef MAC 2872 mac_vnode_destroy(vp); 2873 #endif 2874 if (vp->v_pollinfo != NULL) { 2875 destroy_vpollinfo(vp->v_pollinfo); 2876 vp->v_pollinfo = NULL; 2877 } 2878 #ifdef INVARIANTS 2879 /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ 2880 vp->v_op = NULL; 2881 #endif 2882 bzero(&vp->v_un, sizeof(vp->v_un)); 2883 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 2884 vp->v_iflag = 0; 2885 vp->v_vflag = 0; 2886 bo->bo_flag = 0; 2887 uma_zfree(vnode_zone, vp); 2888 } 2889 2890 /* 2891 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 2892 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 2893 * OWEINACT tracks whether a vnode missed a call to inactive due to a 2894 * failed lock upgrade. 
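 *
 * OWEINACT is set by vputx() before it attempts to acquire or upgrade
 * the vnode lock and is cleared again either here or when a fresh use
 * reference is taken in vget() or v_incr_usecount_locked().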
2895 */ 2896 void 2897 vinactive(struct vnode *vp, struct thread *td) 2898 { 2899 struct vm_object *obj; 2900 2901 ASSERT_VOP_ELOCKED(vp, "vinactive"); 2902 ASSERT_VI_LOCKED(vp, "vinactive"); 2903 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 2904 ("vinactive: recursed on VI_DOINGINACT")); 2905 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2906 vp->v_iflag |= VI_DOINGINACT; 2907 vp->v_iflag &= ~VI_OWEINACT; 2908 VI_UNLOCK(vp); 2909 /* 2910 * Before moving off the active list, we must be sure that any 2911 * modified pages are converted into the vnode's dirty 2912 * buffers, since these will no longer be checked once the 2913 * vnode is on the inactive list. 2914 * 2915 * The write-out of the dirty pages is asynchronous. At the 2916 * point that VOP_INACTIVE() is called, there could still be 2917 * pending I/O and dirty pages in the object. 2918 */ 2919 obj = vp->v_object; 2920 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { 2921 VM_OBJECT_WLOCK(obj); 2922 vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC); 2923 VM_OBJECT_WUNLOCK(obj); 2924 } 2925 VOP_INACTIVE(vp, td); 2926 VI_LOCK(vp); 2927 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 2928 ("vinactive: lost VI_DOINGINACT")); 2929 vp->v_iflag &= ~VI_DOINGINACT; 2930 } 2931 2932 /* 2933 * Remove any vnodes in the vnode table belonging to mount point mp. 2934 * 2935 * If FORCECLOSE is not specified, there should not be any active ones, 2936 * return error if any are found (nb: this is a user error, not a 2937 * system error). If FORCECLOSE is specified, detach any active vnodes 2938 * that are found. 2939 * 2940 * If WRITECLOSE is set, only flush out regular file vnodes open for 2941 * writing. 2942 * 2943 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 2944 * 2945 * `rootrefs' specifies the base reference count for the root vnode 2946 * of this filesystem. The root vnode is considered busy if its 2947 * v_usecount exceeds this value. On a successful return, vflush(, td) 2948 * will call vrele() on the root vnode exactly rootrefs times. 2949 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 2950 * be zero. 2951 */ 2952 #ifdef DIAGNOSTIC 2953 static int busyprt = 0; /* print out busy vnodes */ 2954 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 2955 #endif 2956 2957 int 2958 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 2959 { 2960 struct vnode *vp, *mvp, *rootvp = NULL; 2961 struct vattr vattr; 2962 int busy = 0, error; 2963 2964 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 2965 rootrefs, flags); 2966 if (rootrefs > 0) { 2967 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 2968 ("vflush: bad args")); 2969 /* 2970 * Get the filesystem root vnode. We can vput() it 2971 * immediately, since with rootrefs > 0, it won't go away. 2972 */ 2973 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 2974 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 2975 __func__, error); 2976 return (error); 2977 } 2978 vput(rootvp); 2979 } 2980 loop: 2981 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2982 vholdl(vp); 2983 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 2984 if (error) { 2985 vdrop(vp); 2986 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2987 goto loop; 2988 } 2989 /* 2990 * Skip over a vnodes marked VV_SYSTEM. 
2991 */ 2992 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 2993 VOP_UNLOCK(vp, 0); 2994 vdrop(vp); 2995 continue; 2996 } 2997 /* 2998 * If WRITECLOSE is set, flush out unlinked but still open 2999 * files (even if open only for reading) and regular file 3000 * vnodes open for writing. 3001 */ 3002 if (flags & WRITECLOSE) { 3003 if (vp->v_object != NULL) { 3004 VM_OBJECT_WLOCK(vp->v_object); 3005 vm_object_page_clean(vp->v_object, 0, 0, 0); 3006 VM_OBJECT_WUNLOCK(vp->v_object); 3007 } 3008 error = VOP_FSYNC(vp, MNT_WAIT, td); 3009 if (error != 0) { 3010 VOP_UNLOCK(vp, 0); 3011 vdrop(vp); 3012 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3013 return (error); 3014 } 3015 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 3016 VI_LOCK(vp); 3017 3018 if ((vp->v_type == VNON || 3019 (error == 0 && vattr.va_nlink > 0)) && 3020 (vp->v_writecount == 0 || vp->v_type != VREG)) { 3021 VOP_UNLOCK(vp, 0); 3022 vdropl(vp); 3023 continue; 3024 } 3025 } else 3026 VI_LOCK(vp); 3027 /* 3028 * With v_usecount == 0, all we need to do is clear out the 3029 * vnode data structures and we are done. 3030 * 3031 * If FORCECLOSE is set, forcibly close the vnode. 3032 */ 3033 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 3034 vgonel(vp); 3035 } else { 3036 busy++; 3037 #ifdef DIAGNOSTIC 3038 if (busyprt) 3039 vn_printf(vp, "vflush: busy vnode "); 3040 #endif 3041 } 3042 VOP_UNLOCK(vp, 0); 3043 vdropl(vp); 3044 } 3045 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 3046 /* 3047 * If just the root vnode is busy, and if its refcount 3048 * is equal to `rootrefs', then go ahead and kill it. 3049 */ 3050 VI_LOCK(rootvp); 3051 KASSERT(busy > 0, ("vflush: not busy")); 3052 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3053 ("vflush: usecount %d < rootrefs %d", 3054 rootvp->v_usecount, rootrefs)); 3055 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3056 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3057 vgone(rootvp); 3058 VOP_UNLOCK(rootvp, 0); 3059 busy = 0; 3060 } else 3061 VI_UNLOCK(rootvp); 3062 } 3063 if (busy) { 3064 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3065 busy); 3066 return (EBUSY); 3067 } 3068 for (; rootrefs > 0; rootrefs--) 3069 vrele(rootvp); 3070 return (0); 3071 } 3072 3073 /* 3074 * Recycle an unused vnode to the front of the free list. 3075 */ 3076 int 3077 vrecycle(struct vnode *vp) 3078 { 3079 int recycled; 3080 3081 ASSERT_VOP_ELOCKED(vp, "vrecycle"); 3082 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3083 recycled = 0; 3084 VI_LOCK(vp); 3085 if (vp->v_usecount == 0) { 3086 recycled = 1; 3087 vgonel(vp); 3088 } 3089 VI_UNLOCK(vp); 3090 return (recycled); 3091 } 3092 3093 /* 3094 * Eliminate all activity associated with a vnode 3095 * in preparation for reuse. 3096 */ 3097 void 3098 vgone(struct vnode *vp) 3099 { 3100 VI_LOCK(vp); 3101 vgonel(vp); 3102 VI_UNLOCK(vp); 3103 } 3104 3105 static void 3106 notify_lowervp_vfs_dummy(struct mount *mp __unused, 3107 struct vnode *lowervp __unused) 3108 { 3109 } 3110 3111 /* 3112 * Notify upper mounts about reclaimed or unlinked vnode. 
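 *
 * The traversal of mnt_uppers inserts a temporary marker mount
 * (MNTK_MARKER) after the current entry so that the mount interlock
 * can be dropped while the upper filesystem's callback runs; the scan
 * then resumes from the marker.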
3113 */ 3114 void 3115 vfs_notify_upper(struct vnode *vp, int event) 3116 { 3117 static struct vfsops vgonel_vfsops = { 3118 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, 3119 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, 3120 }; 3121 struct mount *mp, *ump, *mmp; 3122 3123 mp = vp->v_mount; 3124 if (mp == NULL) 3125 return; 3126 3127 MNT_ILOCK(mp); 3128 if (TAILQ_EMPTY(&mp->mnt_uppers)) 3129 goto unlock; 3130 MNT_IUNLOCK(mp); 3131 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); 3132 mmp->mnt_op = &vgonel_vfsops; 3133 mmp->mnt_kern_flag |= MNTK_MARKER; 3134 MNT_ILOCK(mp); 3135 mp->mnt_kern_flag |= MNTK_VGONE_UPPER; 3136 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { 3137 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { 3138 ump = TAILQ_NEXT(ump, mnt_upper_link); 3139 continue; 3140 } 3141 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); 3142 MNT_IUNLOCK(mp); 3143 switch (event) { 3144 case VFS_NOTIFY_UPPER_RECLAIM: 3145 VFS_RECLAIM_LOWERVP(ump, vp); 3146 break; 3147 case VFS_NOTIFY_UPPER_UNLINK: 3148 VFS_UNLINK_LOWERVP(ump, vp); 3149 break; 3150 default: 3151 KASSERT(0, ("invalid event %d", event)); 3152 break; 3153 } 3154 MNT_ILOCK(mp); 3155 ump = TAILQ_NEXT(mmp, mnt_upper_link); 3156 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); 3157 } 3158 free(mmp, M_TEMP); 3159 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; 3160 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { 3161 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; 3162 wakeup(&mp->mnt_uppers); 3163 } 3164 unlock: 3165 MNT_IUNLOCK(mp); 3166 } 3167 3168 /* 3169 * vgone, with the vp interlock held. 3170 */ 3171 static void 3172 vgonel(struct vnode *vp) 3173 { 3174 struct thread *td; 3175 int oweinact; 3176 int active; 3177 struct mount *mp; 3178 3179 ASSERT_VOP_ELOCKED(vp, "vgonel"); 3180 ASSERT_VI_LOCKED(vp, "vgonel"); 3181 VNASSERT(vp->v_holdcnt, vp, 3182 ("vgonel: vp %p has no reference.", vp)); 3183 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3184 td = curthread; 3185 3186 /* 3187 * Don't vgonel if we're already doomed. 3188 */ 3189 if (vp->v_iflag & VI_DOOMED) 3190 return; 3191 vp->v_iflag |= VI_DOOMED; 3192 3193 /* 3194 * Check to see if the vnode is in use. If so, we have to call 3195 * VOP_CLOSE() and VOP_INACTIVE(). 3196 */ 3197 active = vp->v_usecount; 3198 oweinact = (vp->v_iflag & VI_OWEINACT); 3199 VI_UNLOCK(vp); 3200 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 3201 3202 /* 3203 * If purging an active vnode, it must be closed and 3204 * deactivated before being reclaimed. 3205 */ 3206 if (active) 3207 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 3208 if (oweinact || active) { 3209 VI_LOCK(vp); 3210 if ((vp->v_iflag & VI_DOINGINACT) == 0) 3211 vinactive(vp, td); 3212 VI_UNLOCK(vp); 3213 } 3214 if (vp->v_type == VSOCK) 3215 vfs_unp_reclaim(vp); 3216 3217 /* 3218 * Clean out any buffers associated with the vnode. 3219 * If the flush fails, just toss the buffers. 3220 */ 3221 mp = NULL; 3222 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 3223 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 3224 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 3225 while (vinvalbuf(vp, 0, 0, 0) != 0) 3226 ; 3227 } 3228 3229 BO_LOCK(&vp->v_bufobj); 3230 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 3231 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 3232 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 3233 vp->v_bufobj.bo_clean.bv_cnt == 0, 3234 ("vp %p bufobj not invalidated", vp)); 3235 3236 /* 3237 * For VMIO bufobj, BO_DEAD is set in vm_object_terminate() 3238 * after the object's page queue is flushed. 
3239 */ 3240 if (vp->v_bufobj.bo_object == NULL) 3241 vp->v_bufobj.bo_flag |= BO_DEAD; 3242 BO_UNLOCK(&vp->v_bufobj); 3243 3244 /* 3245 * Reclaim the vnode. 3246 */ 3247 if (VOP_RECLAIM(vp, td)) 3248 panic("vgone: cannot reclaim"); 3249 if (mp != NULL) 3250 vn_finished_secondary_write(mp); 3251 VNASSERT(vp->v_object == NULL, vp, 3252 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); 3253 /* 3254 * Clear the advisory locks and wake up waiting threads. 3255 */ 3256 (void)VOP_ADVLOCKPURGE(vp); 3257 vp->v_lockf = NULL; 3258 /* 3259 * Delete from old mount point vnode list. 3260 */ 3261 delmntque(vp); 3262 cache_purge(vp); 3263 /* 3264 * Done with purge, reset to the standard lock and invalidate 3265 * the vnode. 3266 */ 3267 VI_LOCK(vp); 3268 vp->v_vnlock = &vp->v_lock; 3269 vp->v_op = &dead_vnodeops; 3270 vp->v_tag = "none"; 3271 vp->v_type = VBAD; 3272 } 3273 3274 /* 3275 * Calculate the total number of references to a special device. 3276 */ 3277 int 3278 vcount(struct vnode *vp) 3279 { 3280 int count; 3281 3282 dev_lock(); 3283 count = vp->v_rdev->si_usecount; 3284 dev_unlock(); 3285 return (count); 3286 } 3287 3288 /* 3289 * Same as above, but using the struct cdev *as argument 3290 */ 3291 int 3292 count_dev(struct cdev *dev) 3293 { 3294 int count; 3295 3296 dev_lock(); 3297 count = dev->si_usecount; 3298 dev_unlock(); 3299 return(count); 3300 } 3301 3302 /* 3303 * Print out a description of a vnode. 3304 */ 3305 static char *typename[] = 3306 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", 3307 "VMARKER"}; 3308 3309 void 3310 vn_printf(struct vnode *vp, const char *fmt, ...) 3311 { 3312 va_list ap; 3313 char buf[256], buf2[16]; 3314 u_long flags; 3315 3316 va_start(ap, fmt); 3317 vprintf(fmt, ap); 3318 va_end(ap); 3319 printf("%p: ", (void *)vp); 3320 printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); 3321 printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", 3322 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); 3323 buf[0] = '\0'; 3324 buf[1] = '\0'; 3325 if (vp->v_vflag & VV_ROOT) 3326 strlcat(buf, "|VV_ROOT", sizeof(buf)); 3327 if (vp->v_vflag & VV_ISTTY) 3328 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 3329 if (vp->v_vflag & VV_NOSYNC) 3330 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 3331 if (vp->v_vflag & VV_ETERNALDEV) 3332 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 3333 if (vp->v_vflag & VV_CACHEDLABEL) 3334 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 3335 if (vp->v_vflag & VV_TEXT) 3336 strlcat(buf, "|VV_TEXT", sizeof(buf)); 3337 if (vp->v_vflag & VV_COPYONWRITE) 3338 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 3339 if (vp->v_vflag & VV_SYSTEM) 3340 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 3341 if (vp->v_vflag & VV_PROCDEP) 3342 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 3343 if (vp->v_vflag & VV_NOKNOTE) 3344 strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); 3345 if (vp->v_vflag & VV_DELETED) 3346 strlcat(buf, "|VV_DELETED", sizeof(buf)); 3347 if (vp->v_vflag & VV_MD) 3348 strlcat(buf, "|VV_MD", sizeof(buf)); 3349 if (vp->v_vflag & VV_FORCEINSMQ) 3350 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 3351 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 3352 VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | 3353 VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); 3354 if (flags != 0) { 3355 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 3356 strlcat(buf, buf2, sizeof(buf)); 3357 } 3358 if (vp->v_iflag & VI_MOUNT) 3359 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 3360 if 
(vp->v_iflag & VI_DOOMED) 3361 strlcat(buf, "|VI_DOOMED", sizeof(buf)); 3362 if (vp->v_iflag & VI_FREE) 3363 strlcat(buf, "|VI_FREE", sizeof(buf)); 3364 if (vp->v_iflag & VI_ACTIVE) 3365 strlcat(buf, "|VI_ACTIVE", sizeof(buf)); 3366 if (vp->v_iflag & VI_DOINGINACT) 3367 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 3368 if (vp->v_iflag & VI_OWEINACT) 3369 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 3370 flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE | 3371 VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT); 3372 if (flags != 0) { 3373 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 3374 strlcat(buf, buf2, sizeof(buf)); 3375 } 3376 printf(" flags (%s)\n", buf + 1); 3377 if (mtx_owned(VI_MTX(vp))) 3378 printf(" VI_LOCKed"); 3379 if (vp->v_object != NULL) 3380 printf(" v_object %p ref %d pages %d " 3381 "cleanbuf %d dirtybuf %d\n", 3382 vp->v_object, vp->v_object->ref_count, 3383 vp->v_object->resident_page_count, 3384 vp->v_bufobj.bo_clean.bv_cnt, 3385 vp->v_bufobj.bo_dirty.bv_cnt); 3386 printf(" "); 3387 lockmgr_printinfo(vp->v_vnlock); 3388 if (vp->v_data != NULL) 3389 VOP_PRINT(vp); 3390 } 3391 3392 #ifdef DDB 3393 /* 3394 * List all of the locked vnodes in the system. 3395 * Called when debugging the kernel. 3396 */ 3397 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 3398 { 3399 struct mount *mp; 3400 struct vnode *vp; 3401 3402 /* 3403 * Note: because this is DDB, we can't obey the locking semantics 3404 * for these structures, which means we could catch an inconsistent 3405 * state and dereference a nasty pointer. Not much to be done 3406 * about that. 3407 */ 3408 db_printf("Locked vnodes\n"); 3409 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3410 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3411 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 3412 vn_printf(vp, "vnode "); 3413 } 3414 } 3415 } 3416 3417 /* 3418 * Show details about the given vnode. 3419 */ 3420 DB_SHOW_COMMAND(vnode, db_show_vnode) 3421 { 3422 struct vnode *vp; 3423 3424 if (!have_addr) 3425 return; 3426 vp = (struct vnode *)addr; 3427 vn_printf(vp, "vnode "); 3428 } 3429 3430 /* 3431 * Show details about the given mount point. 3432 */ 3433 DB_SHOW_COMMAND(mount, db_show_mount) 3434 { 3435 struct mount *mp; 3436 struct vfsopt *opt; 3437 struct statfs *sp; 3438 struct vnode *vp; 3439 char buf[512]; 3440 uint64_t mflags; 3441 u_int flags; 3442 3443 if (!have_addr) { 3444 /* No address given, print short info about all mount points. 
*/ 3445 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3446 db_printf("%p %s on %s (%s)\n", mp, 3447 mp->mnt_stat.f_mntfromname, 3448 mp->mnt_stat.f_mntonname, 3449 mp->mnt_stat.f_fstypename); 3450 if (db_pager_quit) 3451 break; 3452 } 3453 db_printf("\nMore info: show mount <addr>\n"); 3454 return; 3455 } 3456 3457 mp = (struct mount *)addr; 3458 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 3459 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 3460 3461 buf[0] = '\0'; 3462 mflags = mp->mnt_flag; 3463 #define MNT_FLAG(flag) do { \ 3464 if (mflags & (flag)) { \ 3465 if (buf[0] != '\0') \ 3466 strlcat(buf, ", ", sizeof(buf)); \ 3467 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 3468 mflags &= ~(flag); \ 3469 } \ 3470 } while (0) 3471 MNT_FLAG(MNT_RDONLY); 3472 MNT_FLAG(MNT_SYNCHRONOUS); 3473 MNT_FLAG(MNT_NOEXEC); 3474 MNT_FLAG(MNT_NOSUID); 3475 MNT_FLAG(MNT_NFS4ACLS); 3476 MNT_FLAG(MNT_UNION); 3477 MNT_FLAG(MNT_ASYNC); 3478 MNT_FLAG(MNT_SUIDDIR); 3479 MNT_FLAG(MNT_SOFTDEP); 3480 MNT_FLAG(MNT_NOSYMFOLLOW); 3481 MNT_FLAG(MNT_GJOURNAL); 3482 MNT_FLAG(MNT_MULTILABEL); 3483 MNT_FLAG(MNT_ACLS); 3484 MNT_FLAG(MNT_NOATIME); 3485 MNT_FLAG(MNT_NOCLUSTERR); 3486 MNT_FLAG(MNT_NOCLUSTERW); 3487 MNT_FLAG(MNT_SUJ); 3488 MNT_FLAG(MNT_EXRDONLY); 3489 MNT_FLAG(MNT_EXPORTED); 3490 MNT_FLAG(MNT_DEFEXPORTED); 3491 MNT_FLAG(MNT_EXPORTANON); 3492 MNT_FLAG(MNT_EXKERB); 3493 MNT_FLAG(MNT_EXPUBLIC); 3494 MNT_FLAG(MNT_LOCAL); 3495 MNT_FLAG(MNT_QUOTA); 3496 MNT_FLAG(MNT_ROOTFS); 3497 MNT_FLAG(MNT_USER); 3498 MNT_FLAG(MNT_IGNORE); 3499 MNT_FLAG(MNT_UPDATE); 3500 MNT_FLAG(MNT_DELEXPORT); 3501 MNT_FLAG(MNT_RELOAD); 3502 MNT_FLAG(MNT_FORCE); 3503 MNT_FLAG(MNT_SNAPSHOT); 3504 MNT_FLAG(MNT_BYFSID); 3505 #undef MNT_FLAG 3506 if (mflags != 0) { 3507 if (buf[0] != '\0') 3508 strlcat(buf, ", ", sizeof(buf)); 3509 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3510 "0x%016jx", mflags); 3511 } 3512 db_printf(" mnt_flag = %s\n", buf); 3513 3514 buf[0] = '\0'; 3515 flags = mp->mnt_kern_flag; 3516 #define MNT_KERN_FLAG(flag) do { \ 3517 if (flags & (flag)) { \ 3518 if (buf[0] != '\0') \ 3519 strlcat(buf, ", ", sizeof(buf)); \ 3520 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 3521 flags &= ~(flag); \ 3522 } \ 3523 } while (0) 3524 MNT_KERN_FLAG(MNTK_UNMOUNTF); 3525 MNT_KERN_FLAG(MNTK_ASYNC); 3526 MNT_KERN_FLAG(MNTK_SOFTDEP); 3527 MNT_KERN_FLAG(MNTK_NOINSMNTQ); 3528 MNT_KERN_FLAG(MNTK_DRAINING); 3529 MNT_KERN_FLAG(MNTK_REFEXPIRE); 3530 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 3531 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 3532 MNT_KERN_FLAG(MNTK_NO_IOPF); 3533 MNT_KERN_FLAG(MNTK_VGONE_UPPER); 3534 MNT_KERN_FLAG(MNTK_VGONE_WAITER); 3535 MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); 3536 MNT_KERN_FLAG(MNTK_MARKER); 3537 MNT_KERN_FLAG(MNTK_USES_BCACHE); 3538 MNT_KERN_FLAG(MNTK_NOASYNC); 3539 MNT_KERN_FLAG(MNTK_UNMOUNT); 3540 MNT_KERN_FLAG(MNTK_MWAIT); 3541 MNT_KERN_FLAG(MNTK_SUSPEND); 3542 MNT_KERN_FLAG(MNTK_SUSPEND2); 3543 MNT_KERN_FLAG(MNTK_SUSPENDED); 3544 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 3545 MNT_KERN_FLAG(MNTK_NOKNOTE); 3546 #undef MNT_KERN_FLAG 3547 if (flags != 0) { 3548 if (buf[0] != '\0') 3549 strlcat(buf, ", ", sizeof(buf)); 3550 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3551 "0x%08x", flags); 3552 } 3553 db_printf(" mnt_kern_flag = %s\n", buf); 3554 3555 db_printf(" mnt_opt = "); 3556 opt = TAILQ_FIRST(mp->mnt_opt); 3557 if (opt != NULL) { 3558 db_printf("%s", opt->name); 3559 opt = TAILQ_NEXT(opt, link); 3560 while (opt != NULL) { 3561 db_printf(", %s", opt->name); 3562 opt = TAILQ_NEXT(opt, link); 3563 } 
3564 } 3565 db_printf("\n"); 3566 3567 sp = &mp->mnt_stat; 3568 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 3569 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 3570 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 3571 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 3572 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 3573 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 3574 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 3575 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 3576 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 3577 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 3578 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 3579 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 3580 3581 db_printf(" mnt_cred = { uid=%u ruid=%u", 3582 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 3583 if (jailed(mp->mnt_cred)) 3584 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 3585 db_printf(" }\n"); 3586 db_printf(" mnt_ref = %d\n", mp->mnt_ref); 3587 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 3588 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 3589 db_printf(" mnt_activevnodelistsize = %d\n", 3590 mp->mnt_activevnodelistsize); 3591 db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); 3592 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); 3593 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 3594 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 3595 db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); 3596 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 3597 db_printf(" mnt_secondary_accwrites = %d\n", 3598 mp->mnt_secondary_accwrites); 3599 db_printf(" mnt_gjprovider = %s\n", 3600 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 3601 3602 db_printf("\n\nList of active vnodes\n"); 3603 TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { 3604 if (vp->v_type != VMARKER) { 3605 vn_printf(vp, "vnode "); 3606 if (db_pager_quit) 3607 break; 3608 } 3609 } 3610 db_printf("\n\nList of inactive vnodes\n"); 3611 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3612 if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { 3613 vn_printf(vp, "vnode "); 3614 if (db_pager_quit) 3615 break; 3616 } 3617 } 3618 } 3619 #endif /* DDB */ 3620 3621 /* 3622 * Fill in a struct xvfsconf based on a struct vfsconf. 3623 */ 3624 static int 3625 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 3626 { 3627 struct xvfsconf xvfsp; 3628 3629 bzero(&xvfsp, sizeof(xvfsp)); 3630 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3631 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3632 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3633 xvfsp.vfc_flags = vfsp->vfc_flags; 3634 /* 3635 * These are unused in userland, we keep them 3636 * to not break binary compatibility. 
3637 */ 3638 xvfsp.vfc_vfsops = NULL; 3639 xvfsp.vfc_next = NULL; 3640 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3641 } 3642 3643 #ifdef COMPAT_FREEBSD32 3644 struct xvfsconf32 { 3645 uint32_t vfc_vfsops; 3646 char vfc_name[MFSNAMELEN]; 3647 int32_t vfc_typenum; 3648 int32_t vfc_refcount; 3649 int32_t vfc_flags; 3650 uint32_t vfc_next; 3651 }; 3652 3653 static int 3654 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 3655 { 3656 struct xvfsconf32 xvfsp; 3657 3658 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3659 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3660 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3661 xvfsp.vfc_flags = vfsp->vfc_flags; 3662 xvfsp.vfc_vfsops = 0; 3663 xvfsp.vfc_next = 0; 3664 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3665 } 3666 #endif 3667 3668 /* 3669 * Top level filesystem related information gathering. 3670 */ 3671 static int 3672 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 3673 { 3674 struct vfsconf *vfsp; 3675 int error; 3676 3677 error = 0; 3678 vfsconf_slock(); 3679 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3680 #ifdef COMPAT_FREEBSD32 3681 if (req->flags & SCTL_MASK32) 3682 error = vfsconf2x32(req, vfsp); 3683 else 3684 #endif 3685 error = vfsconf2x(req, vfsp); 3686 if (error) 3687 break; 3688 } 3689 vfsconf_sunlock(); 3690 return (error); 3691 } 3692 3693 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 3694 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 3695 "S,xvfsconf", "List of all configured filesystems"); 3696 3697 #ifndef BURN_BRIDGES 3698 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 3699 3700 static int 3701 vfs_sysctl(SYSCTL_HANDLER_ARGS) 3702 { 3703 int *name = (int *)arg1 - 1; /* XXX */ 3704 u_int namelen = arg2 + 1; /* XXX */ 3705 struct vfsconf *vfsp; 3706 3707 log(LOG_WARNING, "userland calling deprecated sysctl, " 3708 "please rebuild world\n"); 3709 3710 #if 1 || defined(COMPAT_PRELITE2) 3711 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 3712 if (namelen == 1) 3713 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 3714 #endif 3715 3716 switch (name[1]) { 3717 case VFS_MAXTYPENUM: 3718 if (namelen != 2) 3719 return (ENOTDIR); 3720 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 3721 case VFS_CONF: 3722 if (namelen != 3) 3723 return (ENOTDIR); /* overloaded */ 3724 vfsconf_slock(); 3725 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3726 if (vfsp->vfc_typenum == name[2]) 3727 break; 3728 } 3729 vfsconf_sunlock(); 3730 if (vfsp == NULL) 3731 return (EOPNOTSUPP); 3732 #ifdef COMPAT_FREEBSD32 3733 if (req->flags & SCTL_MASK32) 3734 return (vfsconf2x32(req, vfsp)); 3735 else 3736 #endif 3737 return (vfsconf2x(req, vfsp)); 3738 } 3739 return (EOPNOTSUPP); 3740 } 3741 3742 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 3743 CTLFLAG_MPSAFE, vfs_sysctl, 3744 "Generic filesystem"); 3745 3746 #if 1 || defined(COMPAT_PRELITE2) 3747 3748 static int 3749 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 3750 { 3751 int error; 3752 struct vfsconf *vfsp; 3753 struct ovfsconf ovfs; 3754 3755 vfsconf_slock(); 3756 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3757 bzero(&ovfs, sizeof(ovfs)); 3758 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 3759 strcpy(ovfs.vfc_name, vfsp->vfc_name); 3760 ovfs.vfc_index = vfsp->vfc_typenum; 3761 ovfs.vfc_refcount = vfsp->vfc_refcount; 3762 ovfs.vfc_flags = vfsp->vfc_flags; 3763 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 3764 if (error != 0) { 3765 vfsconf_sunlock(); 3766 return (error); 3767 } 3768 } 3769 vfsconf_sunlock(); 3770 return (0); 3771 } 3772 3773 #endif /* 1 || COMPAT_PRELITE2 */ 3774 #endif /* !BURN_BRIDGES */ 3775 3776 #define KINFO_VNODESLOP 10 3777 #ifdef notyet 3778 /* 3779 * Dump vnode list (via sysctl). 3780 */ 3781 /* ARGSUSED */ 3782 static int 3783 sysctl_vnode(SYSCTL_HANDLER_ARGS) 3784 { 3785 struct xvnode *xvn; 3786 struct mount *mp; 3787 struct vnode *vp; 3788 int error, len, n; 3789 3790 /* 3791 * Stale numvnodes access is not fatal here. 3792 */ 3793 req->lock = 0; 3794 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 3795 if (!req->oldptr) 3796 /* Make an estimate */ 3797 return (SYSCTL_OUT(req, 0, len)); 3798 3799 error = sysctl_wire_old_buffer(req, 0); 3800 if (error != 0) 3801 return (error); 3802 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 3803 n = 0; 3804 mtx_lock(&mountlist_mtx); 3805 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3806 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 3807 continue; 3808 MNT_ILOCK(mp); 3809 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3810 if (n == len) 3811 break; 3812 vref(vp); 3813 xvn[n].xv_size = sizeof *xvn; 3814 xvn[n].xv_vnode = vp; 3815 xvn[n].xv_id = 0; /* XXX compat */ 3816 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 3817 XV_COPY(usecount); 3818 XV_COPY(writecount); 3819 XV_COPY(holdcnt); 3820 XV_COPY(mount); 3821 XV_COPY(numoutput); 3822 XV_COPY(type); 3823 #undef XV_COPY 3824 xvn[n].xv_flag = vp->v_vflag; 3825 3826 switch (vp->v_type) { 3827 case VREG: 3828 case VDIR: 3829 case VLNK: 3830 break; 3831 case VBLK: 3832 case VCHR: 3833 if (vp->v_rdev == NULL) { 3834 vrele(vp); 3835 continue; 3836 } 3837 xvn[n].xv_dev = dev2udev(vp->v_rdev); 3838 break; 3839 case VSOCK: 3840 xvn[n].xv_socket = vp->v_socket; 3841 break; 3842 case VFIFO: 3843 xvn[n].xv_fifo = vp->v_fifoinfo; 3844 break; 3845 case VNON: 3846 case VBAD: 3847 default: 3848 /* shouldn't happen? 
*/ 3849 vrele(vp); 3850 continue; 3851 } 3852 vrele(vp); 3853 ++n; 3854 } 3855 MNT_IUNLOCK(mp); 3856 mtx_lock(&mountlist_mtx); 3857 vfs_unbusy(mp); 3858 if (n == len) 3859 break; 3860 } 3861 mtx_unlock(&mountlist_mtx); 3862 3863 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 3864 free(xvn, M_TEMP); 3865 return (error); 3866 } 3867 3868 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | 3869 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", 3870 ""); 3871 #endif 3872 3873 static void 3874 unmount_or_warn(struct mount *mp) 3875 { 3876 int error; 3877 3878 error = dounmount(mp, MNT_FORCE, curthread); 3879 if (error != 0) { 3880 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 3881 if (error == EBUSY) 3882 printf("BUSY)\n"); 3883 else 3884 printf("%d)\n", error); 3885 } 3886 } 3887 3888 /* 3889 * Unmount all filesystems. The list is traversed in reverse order 3890 * of mounting to avoid dependencies. 3891 */ 3892 void 3893 vfs_unmountall(void) 3894 { 3895 struct mount *mp, *tmp; 3896 3897 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 3898 3899 /* 3900 * Since this only runs when rebooting, it is not interlocked. 3901 */ 3902 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 3903 vfs_ref(mp); 3904 3905 /* 3906 * Forcibly unmounting "/dev" before "/" would prevent clean 3907 * unmount of the latter. 3908 */ 3909 if (mp == rootdevmp) 3910 continue; 3911 3912 unmount_or_warn(mp); 3913 } 3914 3915 if (rootdevmp != NULL) 3916 unmount_or_warn(rootdevmp); 3917 } 3918 3919 /* 3920 * perform msync on all vnodes under a mount point 3921 * the mount point must be locked. 3922 */ 3923 void 3924 vfs_msync(struct mount *mp, int flags) 3925 { 3926 struct vnode *vp, *mvp; 3927 struct vm_object *obj; 3928 3929 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 3930 MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { 3931 obj = vp->v_object; 3932 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && 3933 (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { 3934 if (!vget(vp, 3935 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, 3936 curthread)) { 3937 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ 3938 vput(vp); 3939 continue; 3940 } 3941 3942 obj = vp->v_object; 3943 if (obj != NULL) { 3944 VM_OBJECT_WLOCK(obj); 3945 vm_object_page_clean(obj, 0, 0, 3946 flags == MNT_WAIT ? 3947 OBJPC_SYNC : OBJPC_NOSYNC); 3948 VM_OBJECT_WUNLOCK(obj); 3949 } 3950 vput(vp); 3951 } 3952 } else 3953 VI_UNLOCK(vp); 3954 } 3955 } 3956 3957 static void 3958 destroy_vpollinfo_free(struct vpollinfo *vi) 3959 { 3960 3961 knlist_destroy(&vi->vpi_selinfo.si_note); 3962 mtx_destroy(&vi->vpi_lock); 3963 uma_zfree(vnodepoll_zone, vi); 3964 } 3965 3966 static void 3967 destroy_vpollinfo(struct vpollinfo *vi) 3968 { 3969 3970 knlist_clear(&vi->vpi_selinfo.si_note, 1); 3971 seldrain(&vi->vpi_selinfo); 3972 destroy_vpollinfo_free(vi); 3973 } 3974 3975 /* 3976 * Initialize per-vnode helper structure to hold poll-related state. 
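 *
 * A minimal usage sketch (illustrative only; it simply mirrors
 * vn_pollrecord() below):
 *
 *	v_addpollinfo(vp);
 *	mtx_lock(&vp->v_pollinfo->vpi_lock);
 *	... inspect or update vpi_events / vpi_revents ...
 *	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 *
 * The structure is allocated before the vnode interlock is taken and the
 * assignment is rechecked under it, so a racing caller's spare vpollinfo
 * is simply freed again.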
3977 */ 3978 void 3979 v_addpollinfo(struct vnode *vp) 3980 { 3981 struct vpollinfo *vi; 3982 3983 if (vp->v_pollinfo != NULL) 3984 return; 3985 vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); 3986 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 3987 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 3988 vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); 3989 VI_LOCK(vp); 3990 if (vp->v_pollinfo != NULL) { 3991 VI_UNLOCK(vp); 3992 destroy_vpollinfo_free(vi); 3993 return; 3994 } 3995 vp->v_pollinfo = vi; 3996 VI_UNLOCK(vp); 3997 } 3998 3999 /* 4000 * Record a process's interest in events which might happen to 4001 * a vnode. Because poll uses the historic select-style interface 4002 * internally, this routine serves as both the ``check for any 4003 * pending events'' and the ``record my interest in future events'' 4004 * functions. (These are done together, while the lock is held, 4005 * to avoid race conditions.) 4006 */ 4007 int 4008 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 4009 { 4010 4011 v_addpollinfo(vp); 4012 mtx_lock(&vp->v_pollinfo->vpi_lock); 4013 if (vp->v_pollinfo->vpi_revents & events) { 4014 /* 4015 * This leaves events we are not interested 4016 * in available for the other process which 4017 * presumably had requested them 4018 * (otherwise they would never have been 4019 * recorded). 4020 */ 4021 events &= vp->v_pollinfo->vpi_revents; 4022 vp->v_pollinfo->vpi_revents &= ~events; 4023 4024 mtx_unlock(&vp->v_pollinfo->vpi_lock); 4025 return (events); 4026 } 4027 vp->v_pollinfo->vpi_events |= events; 4028 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 4029 mtx_unlock(&vp->v_pollinfo->vpi_lock); 4030 return (0); 4031 } 4032 4033 /* 4034 * Routine to create and manage a filesystem syncer vnode. 4035 */ 4036 #define sync_close ((int (*)(struct vop_close_args *))nullop) 4037 static int sync_fsync(struct vop_fsync_args *); 4038 static int sync_inactive(struct vop_inactive_args *); 4039 static int sync_reclaim(struct vop_reclaim_args *); 4040 4041 static struct vop_vector sync_vnodeops = { 4042 .vop_bypass = VOP_EOPNOTSUPP, 4043 .vop_close = sync_close, /* close */ 4044 .vop_fsync = sync_fsync, /* fsync */ 4045 .vop_inactive = sync_inactive, /* inactive */ 4046 .vop_reclaim = sync_reclaim, /* reclaim */ 4047 .vop_lock1 = vop_stdlock, /* lock */ 4048 .vop_unlock = vop_stdunlock, /* unlock */ 4049 .vop_islocked = vop_stdislocked, /* islocked */ 4050 }; 4051 4052 /* 4053 * Create a new filesystem syncer vnode for the specified mount point. 4054 */ 4055 void 4056 vfs_allocate_syncvnode(struct mount *mp) 4057 { 4058 struct vnode *vp; 4059 struct bufobj *bo; 4060 static long start, incr, next; 4061 int error; 4062 4063 /* Allocate a new vnode */ 4064 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 4065 if (error != 0) 4066 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 4067 vp->v_type = VNON; 4068 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4069 vp->v_vflag |= VV_FORCEINSMQ; 4070 error = insmntque(vp, mp); 4071 if (error != 0) 4072 panic("vfs_allocate_syncvnode: insmntque() failed"); 4073 vp->v_vflag &= ~VV_FORCEINSMQ; 4074 VOP_UNLOCK(vp, 0); 4075 /* 4076 * Place the vnode onto the syncer worklist. We attempt to 4077 * scatter them about on the list so that they will go off 4078 * at evenly distributed times even if all the filesystems 4079 * are mounted at once.
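 *
 * For illustration only (assuming the usual syncer_maxdelay of 32):
 * the static (start, incr, next) state makes successive calls pick
 * next = 16, 8, 24, 4, 12, 20, 28, 2, 6, ..., so the chosen worklist
 * slots (next % syncdelay) stay spread across the delay range instead
 * of clustering in a single second.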
4080 */ 4081 next += incr; 4082 if (next == 0 || next > syncer_maxdelay) { 4083 start /= 2; 4084 incr /= 2; 4085 if (start == 0) { 4086 start = syncer_maxdelay / 2; 4087 incr = syncer_maxdelay; 4088 } 4089 next = start; 4090 } 4091 bo = &vp->v_bufobj; 4092 BO_LOCK(bo); 4093 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 4094 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 4095 mtx_lock(&sync_mtx); 4096 sync_vnode_count++; 4097 if (mp->mnt_syncer == NULL) { 4098 mp->mnt_syncer = vp; 4099 vp = NULL; 4100 } 4101 mtx_unlock(&sync_mtx); 4102 BO_UNLOCK(bo); 4103 if (vp != NULL) { 4104 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4105 vgone(vp); 4106 vput(vp); 4107 } 4108 } 4109 4110 void 4111 vfs_deallocate_syncvnode(struct mount *mp) 4112 { 4113 struct vnode *vp; 4114 4115 mtx_lock(&sync_mtx); 4116 vp = mp->mnt_syncer; 4117 if (vp != NULL) 4118 mp->mnt_syncer = NULL; 4119 mtx_unlock(&sync_mtx); 4120 if (vp != NULL) 4121 vrele(vp); 4122 } 4123 4124 /* 4125 * Do a lazy sync of the filesystem. 4126 */ 4127 static int 4128 sync_fsync(struct vop_fsync_args *ap) 4129 { 4130 struct vnode *syncvp = ap->a_vp; 4131 struct mount *mp = syncvp->v_mount; 4132 int error, save; 4133 struct bufobj *bo; 4134 4135 /* 4136 * We only need to do something if this is a lazy evaluation. 4137 */ 4138 if (ap->a_waitfor != MNT_LAZY) 4139 return (0); 4140 4141 /* 4142 * Move ourselves to the back of the sync list. 4143 */ 4144 bo = &syncvp->v_bufobj; 4145 BO_LOCK(bo); 4146 vn_syncer_add_to_worklist(bo, syncdelay); 4147 BO_UNLOCK(bo); 4148 4149 /* 4150 * Walk the list of vnodes pushing all that are dirty and 4151 * not already on the sync list. 4152 */ 4153 if (vfs_busy(mp, MBF_NOWAIT) != 0) 4154 return (0); 4155 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 4156 vfs_unbusy(mp); 4157 return (0); 4158 } 4159 save = curthread_pflags_set(TDP_SYNCIO); 4160 vfs_msync(mp, MNT_NOWAIT); 4161 error = VFS_SYNC(mp, MNT_LAZY); 4162 curthread_pflags_restore(save); 4163 vn_finished_write(mp); 4164 vfs_unbusy(mp); 4165 return (error); 4166 } 4167 4168 /* 4169 * The syncer vnode is no longer referenced. 4170 */ 4171 static int 4172 sync_inactive(struct vop_inactive_args *ap) 4173 { 4174 4175 vgone(ap->a_vp); 4176 return (0); 4177 } 4178 4179 /* 4180 * The syncer vnode is no longer needed and is being decommissioned. 4181 * 4182 * Modifications to the worklist must be protected by sync_mtx.
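 *
 * The body below follows the same lock order as the rest of the syncer
 * code; as a rough outline:
 *
 *	BO_LOCK(bo);
 *	mtx_lock(&sync_mtx);
 *	... detach mnt_syncer and unlink bo from the worklist ...
 *	mtx_unlock(&sync_mtx);
 *	BO_UNLOCK(bo);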
4183 */ 4184 static int 4185 sync_reclaim(struct vop_reclaim_args *ap) 4186 { 4187 struct vnode *vp = ap->a_vp; 4188 struct bufobj *bo; 4189 4190 bo = &vp->v_bufobj; 4191 BO_LOCK(bo); 4192 mtx_lock(&sync_mtx); 4193 if (vp->v_mount->mnt_syncer == vp) 4194 vp->v_mount->mnt_syncer = NULL; 4195 if (bo->bo_flag & BO_ONWORKLST) { 4196 LIST_REMOVE(bo, bo_synclist); 4197 syncer_worklist_len--; 4198 sync_vnode_count--; 4199 bo->bo_flag &= ~BO_ONWORKLST; 4200 } 4201 mtx_unlock(&sync_mtx); 4202 BO_UNLOCK(bo); 4203 4204 return (0); 4205 } 4206 4207 /* 4208 * Check if vnode represents a disk device 4209 */ 4210 int 4211 vn_isdisk(struct vnode *vp, int *errp) 4212 { 4213 int error; 4214 4215 if (vp->v_type != VCHR) { 4216 error = ENOTBLK; 4217 goto out; 4218 } 4219 error = 0; 4220 dev_lock(); 4221 if (vp->v_rdev == NULL) 4222 error = ENXIO; 4223 else if (vp->v_rdev->si_devsw == NULL) 4224 error = ENXIO; 4225 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 4226 error = ENOTBLK; 4227 dev_unlock(); 4228 out: 4229 if (errp != NULL) 4230 *errp = error; 4231 return (error == 0); 4232 } 4233 4234 /* 4235 * Common filesystem object access control check routine. Accepts a 4236 * vnode's type, "mode", uid and gid, requested access mode, credentials, 4237 * and optional call-by-reference privused argument allowing vaccess() 4238 * to indicate to the caller whether privilege was used to satisfy the 4239 * request (obsoleted). Returns 0 on success, or an errno on failure. 4240 */ 4241 int 4242 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 4243 accmode_t accmode, struct ucred *cred, int *privused) 4244 { 4245 accmode_t dac_granted; 4246 accmode_t priv_granted; 4247 4248 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 4249 ("invalid bit in accmode")); 4250 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 4251 ("VAPPEND without VWRITE")); 4252 4253 /* 4254 * Look for a normal, non-privileged way to access the file/directory 4255 * as requested. If it exists, go with that. 4256 */ 4257 4258 if (privused != NULL) 4259 *privused = 0; 4260 4261 dac_granted = 0; 4262 4263 /* Check the owner. */ 4264 if (cred->cr_uid == file_uid) { 4265 dac_granted |= VADMIN; 4266 if (file_mode & S_IXUSR) 4267 dac_granted |= VEXEC; 4268 if (file_mode & S_IRUSR) 4269 dac_granted |= VREAD; 4270 if (file_mode & S_IWUSR) 4271 dac_granted |= (VWRITE | VAPPEND); 4272 4273 if ((accmode & dac_granted) == accmode) 4274 return (0); 4275 4276 goto privcheck; 4277 } 4278 4279 /* Otherwise, check the groups (first match) */ 4280 if (groupmember(file_gid, cred)) { 4281 if (file_mode & S_IXGRP) 4282 dac_granted |= VEXEC; 4283 if (file_mode & S_IRGRP) 4284 dac_granted |= VREAD; 4285 if (file_mode & S_IWGRP) 4286 dac_granted |= (VWRITE | VAPPEND); 4287 4288 if ((accmode & dac_granted) == accmode) 4289 return (0); 4290 4291 goto privcheck; 4292 } 4293 4294 /* Otherwise, check everyone else. */ 4295 if (file_mode & S_IXOTH) 4296 dac_granted |= VEXEC; 4297 if (file_mode & S_IROTH) 4298 dac_granted |= VREAD; 4299 if (file_mode & S_IWOTH) 4300 dac_granted |= (VWRITE | VAPPEND); 4301 if ((accmode & dac_granted) == accmode) 4302 return (0); 4303 4304 privcheck: 4305 /* 4306 * Build a privilege mask to determine if the set of privileges 4307 * satisfies the requirements when combined with the granted mask 4308 * from above. For each privilege, if the privilege is required, 4309 * bitwise or the request type onto the priv_granted mask. 
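 *
 * Worked example (illustrative): for a regular file with file_mode 0640,
 * file_uid == cred->cr_uid and accmode = VEXEC | VREAD | VWRITE, the
 * owner branch above grants dac_granted = VADMIN | VREAD | VWRITE |
 * VAPPEND, leaving only VEXEC to be satisfied here.  Since 0640 has no
 * execute bit set, PRIV_VFS_EXEC is never consulted and the call ends
 * up returning EACCES.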
4310 */ 4311 priv_granted = 0; 4312 4313 if (type == VDIR) { 4314 /* 4315 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 4316 * requests, instead of PRIV_VFS_EXEC. 4317 */ 4318 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 4319 !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) 4320 priv_granted |= VEXEC; 4321 } else { 4322 /* 4323 * Ensure that at least one execute bit is on. Otherwise, 4324 * a privileged user will always succeed, and we don't want 4325 * this to happen unless the file really is executable. 4326 */ 4327 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 4328 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 4329 !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) 4330 priv_granted |= VEXEC; 4331 } 4332 4333 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 4334 !priv_check_cred(cred, PRIV_VFS_READ, 0)) 4335 priv_granted |= VREAD; 4336 4337 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 4338 !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) 4339 priv_granted |= (VWRITE | VAPPEND); 4340 4341 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 4342 !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) 4343 priv_granted |= VADMIN; 4344 4345 if ((accmode & (priv_granted | dac_granted)) == accmode) { 4346 /* XXX audit: privilege used */ 4347 if (privused != NULL) 4348 *privused = 1; 4349 return (0); 4350 } 4351 4352 return ((accmode & VADMIN) ? EPERM : EACCES); 4353 } 4354 4355 /* 4356 * Credential check based on process requesting service, and per-attribute 4357 * permissions. 4358 */ 4359 int 4360 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 4361 struct thread *td, accmode_t accmode) 4362 { 4363 4364 /* 4365 * Kernel-invoked always succeeds. 4366 */ 4367 if (cred == NOCRED) 4368 return (0); 4369 4370 /* 4371 * Do not allow privileged processes in jail to directly manipulate 4372 * system attributes. 4373 */ 4374 switch (attrnamespace) { 4375 case EXTATTR_NAMESPACE_SYSTEM: 4376 /* Potentially should be: return (EPERM); */ 4377 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); 4378 case EXTATTR_NAMESPACE_USER: 4379 return (VOP_ACCESS(vp, accmode, cred, td)); 4380 default: 4381 return (EPERM); 4382 } 4383 } 4384 4385 #ifdef DEBUG_VFS_LOCKS 4386 /* 4387 * This only exists to suppress warnings from unlocked specfs accesses. It is 4388 * no longer ok to have an unlocked VFS. 4389 */ 4390 #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ 4391 (vp)->v_type == VCHR || (vp)->v_type == VBAD) 4392 4393 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 4394 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 4395 "Drop into debugger on lock violation"); 4396 4397 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 4398 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 4399 0, "Check for interlock across VOPs"); 4400 4401 int vfs_badlock_print = 1; /* Print lock violations. */ 4402 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 4403 0, "Print lock violations"); 4404 4405 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 4406 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 4407 0, "Print vnode details on lock violations"); 4408 4409 #ifdef KDB 4410 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
*/ 4411 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 4412 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 4413 #endif 4414 4415 static void 4416 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 4417 { 4418 4419 #ifdef KDB 4420 if (vfs_badlock_backtrace) 4421 kdb_backtrace(); 4422 #endif 4423 if (vfs_badlock_vnode) 4424 vn_printf(vp, "vnode "); 4425 if (vfs_badlock_print) 4426 printf("%s: %p %s\n", str, (void *)vp, msg); 4427 if (vfs_badlock_ddb) 4428 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4429 } 4430 4431 void 4432 assert_vi_locked(struct vnode *vp, const char *str) 4433 { 4434 4435 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 4436 vfs_badlock("interlock is not locked but should be", str, vp); 4437 } 4438 4439 void 4440 assert_vi_unlocked(struct vnode *vp, const char *str) 4441 { 4442 4443 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 4444 vfs_badlock("interlock is locked but should not be", str, vp); 4445 } 4446 4447 void 4448 assert_vop_locked(struct vnode *vp, const char *str) 4449 { 4450 int locked; 4451 4452 if (!IGNORE_LOCK(vp)) { 4453 locked = VOP_ISLOCKED(vp); 4454 if (locked == 0 || locked == LK_EXCLOTHER) 4455 vfs_badlock("is not locked but should be", str, vp); 4456 } 4457 } 4458 4459 void 4460 assert_vop_unlocked(struct vnode *vp, const char *str) 4461 { 4462 4463 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 4464 vfs_badlock("is locked but should not be", str, vp); 4465 } 4466 4467 void 4468 assert_vop_elocked(struct vnode *vp, const char *str) 4469 { 4470 4471 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 4472 vfs_badlock("is not exclusive locked but should be", str, vp); 4473 } 4474 #endif /* DEBUG_VFS_LOCKS */ 4475 4476 void 4477 vop_rename_fail(struct vop_rename_args *ap) 4478 { 4479 4480 if (ap->a_tvp != NULL) 4481 vput(ap->a_tvp); 4482 if (ap->a_tdvp == ap->a_tvp) 4483 vrele(ap->a_tdvp); 4484 else 4485 vput(ap->a_tdvp); 4486 vrele(ap->a_fdvp); 4487 vrele(ap->a_fvp); 4488 } 4489 4490 void 4491 vop_rename_pre(void *ap) 4492 { 4493 struct vop_rename_args *a = ap; 4494 4495 #ifdef DEBUG_VFS_LOCKS 4496 if (a->a_tvp) 4497 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 4498 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 4499 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 4500 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 4501 4502 /* Check the source (from). */ 4503 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 4504 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 4505 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 4506 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 4507 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 4508 4509 /* Check the target. */ 4510 if (a->a_tvp) 4511 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 4512 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 4513 #endif 4514 if (a->a_tdvp != a->a_fdvp) 4515 vhold(a->a_fdvp); 4516 if (a->a_tvp != a->a_fvp) 4517 vhold(a->a_fvp); 4518 vhold(a->a_tdvp); 4519 if (a->a_tvp) 4520 vhold(a->a_tvp); 4521 } 4522 4523 #ifdef DEBUG_VFS_LOCKS 4524 void 4525 vop_strategy_pre(void *ap) 4526 { 4527 struct vop_strategy_args *a; 4528 struct buf *bp; 4529 4530 a = ap; 4531 bp = a->a_bp; 4532 4533 /* 4534 * Cluster ops lock their component buffers but not the IO container. 
4535 */ 4536 if ((bp->b_flags & B_CLUSTER) != 0) 4537 return; 4538 4539 if (panicstr == NULL && !BUF_ISLOCKED(bp)) { 4540 if (vfs_badlock_print) 4541 printf( 4542 "VOP_STRATEGY: bp is not locked but should be\n"); 4543 if (vfs_badlock_ddb) 4544 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4545 } 4546 } 4547 4548 void 4549 vop_lock_pre(void *ap) 4550 { 4551 struct vop_lock1_args *a = ap; 4552 4553 if ((a->a_flags & LK_INTERLOCK) == 0) 4554 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4555 else 4556 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 4557 } 4558 4559 void 4560 vop_lock_post(void *ap, int rc) 4561 { 4562 struct vop_lock1_args *a = ap; 4563 4564 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4565 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 4566 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 4567 } 4568 4569 void 4570 vop_unlock_pre(void *ap) 4571 { 4572 struct vop_unlock_args *a = ap; 4573 4574 if (a->a_flags & LK_INTERLOCK) 4575 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); 4576 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 4577 } 4578 4579 void 4580 vop_unlock_post(void *ap, int rc) 4581 { 4582 struct vop_unlock_args *a = ap; 4583 4584 if (a->a_flags & LK_INTERLOCK) 4585 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); 4586 } 4587 #endif 4588 4589 void 4590 vop_create_post(void *ap, int rc) 4591 { 4592 struct vop_create_args *a = ap; 4593 4594 if (!rc) 4595 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4596 } 4597 4598 void 4599 vop_deleteextattr_post(void *ap, int rc) 4600 { 4601 struct vop_deleteextattr_args *a = ap; 4602 4603 if (!rc) 4604 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4605 } 4606 4607 void 4608 vop_link_post(void *ap, int rc) 4609 { 4610 struct vop_link_args *a = ap; 4611 4612 if (!rc) { 4613 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); 4614 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); 4615 } 4616 } 4617 4618 void 4619 vop_mkdir_post(void *ap, int rc) 4620 { 4621 struct vop_mkdir_args *a = ap; 4622 4623 if (!rc) 4624 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4625 } 4626 4627 void 4628 vop_mknod_post(void *ap, int rc) 4629 { 4630 struct vop_mknod_args *a = ap; 4631 4632 if (!rc) 4633 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4634 } 4635 4636 void 4637 vop_reclaim_post(void *ap, int rc) 4638 { 4639 struct vop_reclaim_args *a = ap; 4640 4641 if (!rc) 4642 VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE); 4643 } 4644 4645 void 4646 vop_remove_post(void *ap, int rc) 4647 { 4648 struct vop_remove_args *a = ap; 4649 4650 if (!rc) { 4651 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4652 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4653 } 4654 } 4655 4656 void 4657 vop_rename_post(void *ap, int rc) 4658 { 4659 struct vop_rename_args *a = ap; 4660 long hint; 4661 4662 if (!rc) { 4663 hint = NOTE_WRITE; 4664 if (a->a_fdvp == a->a_tdvp) { 4665 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 4666 hint |= NOTE_LINK; 4667 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 4668 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 4669 } else { 4670 hint |= NOTE_EXTEND; 4671 if (a->a_fvp->v_type == VDIR) 4672 hint |= NOTE_LINK; 4673 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 4674 4675 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 4676 a->a_tvp->v_type == VDIR) 4677 hint &= ~NOTE_LINK; 4678 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 4679 } 4680 4681 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 4682 if (a->a_tvp) 4683 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 4684 } 4685 if (a->a_tdvp != a->a_fdvp) 4686 vdrop(a->a_fdvp); 4687 if (a->a_tvp != a->a_fvp) 4688 vdrop(a->a_fvp); 4689 vdrop(a->a_tdvp); 4690 if (a->a_tvp) 4691 vdrop(a->a_tvp); 4692 } 4693 4694 void 4695 
vop_rmdir_post(void *ap, int rc) 4696 { 4697 struct vop_rmdir_args *a = ap; 4698 4699 if (!rc) { 4700 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4701 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4702 } 4703 } 4704 4705 void 4706 vop_setattr_post(void *ap, int rc) 4707 { 4708 struct vop_setattr_args *a = ap; 4709 4710 if (!rc) 4711 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4712 } 4713 4714 void 4715 vop_setextattr_post(void *ap, int rc) 4716 { 4717 struct vop_setextattr_args *a = ap; 4718 4719 if (!rc) 4720 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4721 } 4722 4723 void 4724 vop_symlink_post(void *ap, int rc) 4725 { 4726 struct vop_symlink_args *a = ap; 4727 4728 if (!rc) 4729 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4730 } 4731 4732 void 4733 vop_open_post(void *ap, int rc) 4734 { 4735 struct vop_open_args *a = ap; 4736 4737 if (!rc) 4738 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 4739 } 4740 4741 void 4742 vop_close_post(void *ap, int rc) 4743 { 4744 struct vop_close_args *a = ap; 4745 4746 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 4747 (a->a_vp->v_iflag & VI_DOOMED) == 0)) { 4748 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 4749 NOTE_CLOSE_WRITE : NOTE_CLOSE); 4750 } 4751 } 4752 4753 void 4754 vop_read_post(void *ap, int rc) 4755 { 4756 struct vop_read_args *a = ap; 4757 4758 if (!rc) 4759 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 4760 } 4761 4762 void 4763 vop_readdir_post(void *ap, int rc) 4764 { 4765 struct vop_readdir_args *a = ap; 4766 4767 if (!rc) 4768 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 4769 } 4770 4771 static struct knlist fs_knlist; 4772 4773 static void 4774 vfs_event_init(void *arg) 4775 { 4776 knlist_init_mtx(&fs_knlist, NULL); 4777 } 4778 /* XXX - correct order? */ 4779 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 4780 4781 void 4782 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 4783 { 4784 4785 KNOTE_UNLOCKED(&fs_knlist, event); 4786 } 4787 4788 static int filt_fsattach(struct knote *kn); 4789 static void filt_fsdetach(struct knote *kn); 4790 static int filt_fsevent(struct knote *kn, long hint); 4791 4792 struct filterops fs_filtops = { 4793 .f_isfd = 0, 4794 .f_attach = filt_fsattach, 4795 .f_detach = filt_fsdetach, 4796 .f_event = filt_fsevent 4797 }; 4798 4799 static int 4800 filt_fsattach(struct knote *kn) 4801 { 4802 4803 kn->kn_flags |= EV_CLEAR; 4804 knlist_add(&fs_knlist, kn, 0); 4805 return (0); 4806 } 4807 4808 static void 4809 filt_fsdetach(struct knote *kn) 4810 { 4811 4812 knlist_remove(&fs_knlist, kn, 0); 4813 } 4814 4815 static int 4816 filt_fsevent(struct knote *kn, long hint) 4817 { 4818 4819 kn->kn_fflags |= hint; 4820 return (kn->kn_fflags != 0); 4821 } 4822 4823 static int 4824 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 4825 { 4826 struct vfsidctl vc; 4827 int error; 4828 struct mount *mp; 4829 4830 error = SYSCTL_IN(req, &vc, sizeof(vc)); 4831 if (error) 4832 return (error); 4833 if (vc.vc_vers != VFS_CTL_VERS1) 4834 return (EINVAL); 4835 mp = vfs_getvfs(&vc.vc_fsid); 4836 if (mp == NULL) 4837 return (ENOENT); 4838 /* ensure that a specific sysctl goes to the right filesystem. 
*/ 4839 if (strcmp(vc.vc_fstypename, "*") != 0 && 4840 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 4841 vfs_rel(mp); 4842 return (EINVAL); 4843 } 4844 VCTLTOREQ(&vc, req); 4845 error = VFS_SYSCTL(mp, vc.vc_op, req); 4846 vfs_rel(mp); 4847 return (error); 4848 } 4849 4850 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, 4851 NULL, 0, sysctl_vfs_ctl, "", 4852 "Sysctl by fsid"); 4853 4854 /* 4855 * Function to initialize a va_filerev field sensibly. 4856 * XXX: Wouldn't a random number make a lot more sense ?? 4857 */ 4858 u_quad_t 4859 init_va_filerev(void) 4860 { 4861 struct bintime bt; 4862 4863 getbinuptime(&bt); 4864 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 4865 } 4866 4867 static int filt_vfsread(struct knote *kn, long hint); 4868 static int filt_vfswrite(struct knote *kn, long hint); 4869 static int filt_vfsvnode(struct knote *kn, long hint); 4870 static void filt_vfsdetach(struct knote *kn); 4871 static struct filterops vfsread_filtops = { 4872 .f_isfd = 1, 4873 .f_detach = filt_vfsdetach, 4874 .f_event = filt_vfsread 4875 }; 4876 static struct filterops vfswrite_filtops = { 4877 .f_isfd = 1, 4878 .f_detach = filt_vfsdetach, 4879 .f_event = filt_vfswrite 4880 }; 4881 static struct filterops vfsvnode_filtops = { 4882 .f_isfd = 1, 4883 .f_detach = filt_vfsdetach, 4884 .f_event = filt_vfsvnode 4885 }; 4886 4887 static void 4888 vfs_knllock(void *arg) 4889 { 4890 struct vnode *vp = arg; 4891 4892 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4893 } 4894 4895 static void 4896 vfs_knlunlock(void *arg) 4897 { 4898 struct vnode *vp = arg; 4899 4900 VOP_UNLOCK(vp, 0); 4901 } 4902 4903 static void 4904 vfs_knl_assert_locked(void *arg) 4905 { 4906 #ifdef DEBUG_VFS_LOCKS 4907 struct vnode *vp = arg; 4908 4909 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 4910 #endif 4911 } 4912 4913 static void 4914 vfs_knl_assert_unlocked(void *arg) 4915 { 4916 #ifdef DEBUG_VFS_LOCKS 4917 struct vnode *vp = arg; 4918 4919 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 4920 #endif 4921 } 4922 4923 int 4924 vfs_kqfilter(struct vop_kqfilter_args *ap) 4925 { 4926 struct vnode *vp = ap->a_vp; 4927 struct knote *kn = ap->a_kn; 4928 struct knlist *knl; 4929 4930 switch (kn->kn_filter) { 4931 case EVFILT_READ: 4932 kn->kn_fop = &vfsread_filtops; 4933 break; 4934 case EVFILT_WRITE: 4935 kn->kn_fop = &vfswrite_filtops; 4936 break; 4937 case EVFILT_VNODE: 4938 kn->kn_fop = &vfsvnode_filtops; 4939 break; 4940 default: 4941 return (EINVAL); 4942 } 4943 4944 kn->kn_hook = (caddr_t)vp; 4945 4946 v_addpollinfo(vp); 4947 if (vp->v_pollinfo == NULL) 4948 return (ENOMEM); 4949 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 4950 vhold(vp); 4951 knlist_add(knl, kn, 0); 4952 4953 return (0); 4954 } 4955 4956 /* 4957 * Detach knote from vnode 4958 */ 4959 static void 4960 filt_vfsdetach(struct knote *kn) 4961 { 4962 struct vnode *vp = (struct vnode *)kn->kn_hook; 4963 4964 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 4965 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 4966 vdrop(vp); 4967 } 4968 4969 /*ARGSUSED*/ 4970 static int 4971 filt_vfsread(struct knote *kn, long hint) 4972 { 4973 struct vnode *vp = (struct vnode *)kn->kn_hook; 4974 struct vattr va; 4975 int res; 4976 4977 /* 4978 * filesystem is gone, so set the EOF flag and schedule 4979 * the knote for deletion. 
4980 */ 4981 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 4982 VI_LOCK(vp); 4983 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 4984 VI_UNLOCK(vp); 4985 return (1); 4986 } 4987 4988 if (VOP_GETATTR(vp, &va, curthread->td_ucred)) 4989 return (0); 4990 4991 VI_LOCK(vp); 4992 kn->kn_data = va.va_size - kn->kn_fp->f_offset; 4993 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 4994 VI_UNLOCK(vp); 4995 return (res); 4996 } 4997 4998 /*ARGSUSED*/ 4999 static int 5000 filt_vfswrite(struct knote *kn, long hint) 5001 { 5002 struct vnode *vp = (struct vnode *)kn->kn_hook; 5003 5004 VI_LOCK(vp); 5005 5006 /* 5007 * filesystem is gone, so set the EOF flag and schedule 5008 * the knote for deletion. 5009 */ 5010 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 5011 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 5012 5013 kn->kn_data = 0; 5014 VI_UNLOCK(vp); 5015 return (1); 5016 } 5017 5018 static int 5019 filt_vfsvnode(struct knote *kn, long hint) 5020 { 5021 struct vnode *vp = (struct vnode *)kn->kn_hook; 5022 int res; 5023 5024 VI_LOCK(vp); 5025 if (kn->kn_sfflags & hint) 5026 kn->kn_fflags |= hint; 5027 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 5028 kn->kn_flags |= EV_EOF; 5029 VI_UNLOCK(vp); 5030 return (1); 5031 } 5032 res = (kn->kn_fflags != 0); 5033 VI_UNLOCK(vp); 5034 return (res); 5035 } 5036 5037 int 5038 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 5039 { 5040 int error; 5041 5042 if (dp->d_reclen > ap->a_uio->uio_resid) 5043 return (ENAMETOOLONG); 5044 error = uiomove(dp, dp->d_reclen, ap->a_uio); 5045 if (error) { 5046 if (ap->a_ncookies != NULL) { 5047 if (ap->a_cookies != NULL) 5048 free(ap->a_cookies, M_TEMP); 5049 ap->a_cookies = NULL; 5050 *ap->a_ncookies = 0; 5051 } 5052 return (error); 5053 } 5054 if (ap->a_ncookies == NULL) 5055 return (0); 5056 5057 KASSERT(ap->a_cookies, 5058 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 5059 5060 *ap->a_cookies = realloc(*ap->a_cookies, 5061 (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); 5062 (*ap->a_cookies)[*ap->a_ncookies] = off; 5063 *ap->a_ncookies += 1; 5064 return (0); 5065 } 5066 5067 /* 5068 * Mark for update the access time of the file if the filesystem 5069 * supports VOP_MARKATIME. This functionality is used by execve and 5070 * mmap, so we want to avoid the I/O implied by directly setting 5071 * va_atime for the sake of efficiency. 5072 */ 5073 void 5074 vfs_mark_atime(struct vnode *vp, struct ucred *cred) 5075 { 5076 struct mount *mp; 5077 5078 mp = vp->v_mount; 5079 ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); 5080 if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) 5081 (void)VOP_MARKATIME(vp); 5082 } 5083 5084 /* 5085 * The purpose of this routine is to remove granularity from accmode_t, 5086 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 5087 * VADMIN and VAPPEND. 5088 * 5089 * If it returns 0, the caller is supposed to continue with the usual 5090 * access checks using 'accmode' as modified by this routine. If it 5091 * returns nonzero value, the caller is supposed to return that value 5092 * as errno. 5093 * 5094 * Note that after this routine runs, accmode may be zero. 5095 */ 5096 int 5097 vfs_unixify_accmode(accmode_t *accmode) 5098 { 5099 /* 5100 * There is no way to specify explicit "deny" rule using 5101 * file mode or POSIX.1e ACLs. 
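 *
 * For example (illustrative): a request of VREAD | VREAD_ATTRIBUTES |
 * VSYNCHRONIZE is reduced by the code below to plain VREAD, whereas a
 * request containing VDELETE is rejected outright with EPERM so that
 * the caller checks for VWRITE on the containing directory instead.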
5102 */ 5103 if (*accmode & VEXPLICIT_DENY) { 5104 *accmode = 0; 5105 return (0); 5106 } 5107 5108 /* 5109 * None of these can be translated into usual access bits. 5110 * Also, the common case for NFSv4 ACLs is to not contain 5111 * either of these bits. Caller should check for VWRITE 5112 * on the containing directory instead. 5113 */ 5114 if (*accmode & (VDELETE_CHILD | VDELETE)) 5115 return (EPERM); 5116 5117 if (*accmode & VADMIN_PERMS) { 5118 *accmode &= ~VADMIN_PERMS; 5119 *accmode |= VADMIN; 5120 } 5121 5122 /* 5123 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 5124 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 5125 */ 5126 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 5127 5128 return (0); 5129 } 5130 5131 /* 5132 * These are helper functions for filesystems to traverse all 5133 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 5134 * 5135 * This interface replaces MNT_VNODE_FOREACH. 5136 */ 5137 5138 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 5139 5140 struct vnode * 5141 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 5142 { 5143 struct vnode *vp; 5144 5145 if (should_yield()) 5146 kern_yield(PRI_USER); 5147 MNT_ILOCK(mp); 5148 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5149 vp = TAILQ_NEXT(*mvp, v_nmntvnodes); 5150 while (vp != NULL && (vp->v_type == VMARKER || 5151 (vp->v_iflag & VI_DOOMED) != 0)) 5152 vp = TAILQ_NEXT(vp, v_nmntvnodes); 5153 5154 /* Check if we are done */ 5155 if (vp == NULL) { 5156 __mnt_vnode_markerfree_all(mvp, mp); 5157 /* MNT_IUNLOCK(mp); -- done in above function */ 5158 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 5159 return (NULL); 5160 } 5161 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 5162 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 5163 VI_LOCK(vp); 5164 MNT_IUNLOCK(mp); 5165 return (vp); 5166 } 5167 5168 struct vnode * 5169 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 5170 { 5171 struct vnode *vp; 5172 5173 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 5174 MNT_ILOCK(mp); 5175 MNT_REF(mp); 5176 (*mvp)->v_type = VMARKER; 5177 5178 vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 5179 while (vp != NULL && (vp->v_type == VMARKER || 5180 (vp->v_iflag & VI_DOOMED) != 0)) 5181 vp = TAILQ_NEXT(vp, v_nmntvnodes); 5182 5183 /* Check if we are done */ 5184 if (vp == NULL) { 5185 MNT_REL(mp); 5186 MNT_IUNLOCK(mp); 5187 free(*mvp, M_VNODE_MARKER); 5188 *mvp = NULL; 5189 return (NULL); 5190 } 5191 (*mvp)->v_mount = mp; 5192 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 5193 VI_LOCK(vp); 5194 MNT_IUNLOCK(mp); 5195 return (vp); 5196 } 5197 5198 5199 void 5200 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 5201 { 5202 5203 if (*mvp == NULL) { 5204 MNT_IUNLOCK(mp); 5205 return; 5206 } 5207 5208 mtx_assert(MNT_MTX(mp), MA_OWNED); 5209 5210 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5211 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 5212 MNT_REL(mp); 5213 MNT_IUNLOCK(mp); 5214 free(*mvp, M_VNODE_MARKER); 5215 *mvp = NULL; 5216 } 5217 5218 /* 5219 * These are helper functions for filesystems to traverse their 5220 * active vnodes. 
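 *
 * A typical consumer (an illustrative sketch modeled on vfs_msync()
 * above; "interesting()" is a made-up predicate) is handed each vnode
 * with its interlock held and must release it one way or another:
 *
 *	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
 *		if (!interesting(vp)) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) != 0)
 *			continue;
 *		... per-vnode work ...
 *		vput(vp);
 *	}
 *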
See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h 5221 */ 5222 static void 5223 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 5224 { 5225 5226 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5227 5228 MNT_ILOCK(mp); 5229 MNT_REL(mp); 5230 MNT_IUNLOCK(mp); 5231 free(*mvp, M_VNODE_MARKER); 5232 *mvp = NULL; 5233 } 5234 5235 static struct vnode * 5236 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 5237 { 5238 struct vnode *vp, *nvp; 5239 5240 mtx_assert(&vnode_free_list_mtx, MA_OWNED); 5241 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5242 restart: 5243 vp = TAILQ_NEXT(*mvp, v_actfreelist); 5244 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 5245 while (vp != NULL) { 5246 if (vp->v_type == VMARKER) { 5247 vp = TAILQ_NEXT(vp, v_actfreelist); 5248 continue; 5249 } 5250 if (!VI_TRYLOCK(vp)) { 5251 if (mp_ncpus == 1 || should_yield()) { 5252 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); 5253 mtx_unlock(&vnode_free_list_mtx); 5254 pause("vnacti", 1); 5255 mtx_lock(&vnode_free_list_mtx); 5256 goto restart; 5257 } 5258 continue; 5259 } 5260 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 5261 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 5262 ("alien vnode on the active list %p %p", vp, mp)); 5263 if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) 5264 break; 5265 nvp = TAILQ_NEXT(vp, v_actfreelist); 5266 VI_UNLOCK(vp); 5267 vp = nvp; 5268 } 5269 5270 /* Check if we are done */ 5271 if (vp == NULL) { 5272 mtx_unlock(&vnode_free_list_mtx); 5273 mnt_vnode_markerfree_active(mvp, mp); 5274 return (NULL); 5275 } 5276 TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); 5277 mtx_unlock(&vnode_free_list_mtx); 5278 ASSERT_VI_LOCKED(vp, "active iter"); 5279 KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); 5280 return (vp); 5281 } 5282 5283 struct vnode * 5284 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 5285 { 5286 5287 if (should_yield()) 5288 kern_yield(PRI_USER); 5289 mtx_lock(&vnode_free_list_mtx); 5290 return (mnt_vnode_next_active(mvp, mp)); 5291 } 5292 5293 struct vnode * 5294 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) 5295 { 5296 struct vnode *vp; 5297 5298 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 5299 MNT_ILOCK(mp); 5300 MNT_REF(mp); 5301 MNT_IUNLOCK(mp); 5302 (*mvp)->v_type = VMARKER; 5303 (*mvp)->v_mount = mp; 5304 5305 mtx_lock(&vnode_free_list_mtx); 5306 vp = TAILQ_FIRST(&mp->mnt_activevnodelist); 5307 if (vp == NULL) { 5308 mtx_unlock(&vnode_free_list_mtx); 5309 mnt_vnode_markerfree_active(mvp, mp); 5310 return (NULL); 5311 } 5312 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); 5313 return (mnt_vnode_next_active(mvp, mp)); 5314 } 5315 5316 void 5317 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 5318 { 5319 5320 if (*mvp == NULL) 5321 return; 5322 5323 mtx_lock(&vnode_free_list_mtx); 5324 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 5325 mtx_unlock(&vnode_free_list_mtx); 5326 mnt_vnode_markerfree_active(mvp, mp); 5327 } 5328
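
/*
 * Illustrative sketch (never compiled) of how a filesystem might use the
 * full-list iterator implemented above; the function name and the VREG
 * filter are made up for the example.
 */
#if 0
static int
example_count_regular_vnodes(struct mount *mp)
{
	struct vnode *vp, *mvp;
	int count;

	count = 0;
	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		/* The iterator returns each vnode with its interlock held. */
		if (vp->v_type == VREG)
			count++;
		VI_UNLOCK(vp);
	}
	return (count);
}
#endif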