1 /*- 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 35 */ 36 37 /* 38 * External virtual filesystem routines 39 */ 40 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 #include "opt_compat.h" 45 #include "opt_ddb.h" 46 #include "opt_watchdog.h" 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/bio.h> 51 #include <sys/buf.h> 52 #include <sys/condvar.h> 53 #include <sys/conf.h> 54 #include <sys/dirent.h> 55 #include <sys/event.h> 56 #include <sys/eventhandler.h> 57 #include <sys/extattr.h> 58 #include <sys/file.h> 59 #include <sys/fcntl.h> 60 #include <sys/jail.h> 61 #include <sys/kdb.h> 62 #include <sys/kernel.h> 63 #include <sys/kthread.h> 64 #include <sys/lockf.h> 65 #include <sys/malloc.h> 66 #include <sys/mount.h> 67 #include <sys/namei.h> 68 #include <sys/pctrie.h> 69 #include <sys/priv.h> 70 #include <sys/reboot.h> 71 #include <sys/refcount.h> 72 #include <sys/rwlock.h> 73 #include <sys/sched.h> 74 #include <sys/sleepqueue.h> 75 #include <sys/smp.h> 76 #include <sys/stat.h> 77 #include <sys/sysctl.h> 78 #include <sys/syslog.h> 79 #include <sys/vmmeter.h> 80 #include <sys/vnode.h> 81 #include <sys/watchdog.h> 82 83 #include <machine/stdarg.h> 84 85 #include <security/mac/mac_framework.h> 86 87 #include <vm/vm.h> 88 #include <vm/vm_object.h> 89 #include <vm/vm_extern.h> 90 #include <vm/pmap.h> 91 #include <vm/vm_map.h> 92 #include <vm/vm_page.h> 93 #include <vm/vm_kern.h> 94 #include <vm/uma.h> 95 96 #ifdef DDB 97 #include <ddb/ddb.h> 98 #endif 99 100 static void delmntque(struct vnode *vp); 101 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 102 int slpflag, int slptimeo); 103 static void syncer_shutdown(void *arg, int howto); 104 static int vtryrecycle(struct vnode *vp); 105 static void v_init_counters(struct vnode *); 106 static void v_incr_usecount(struct vnode *); 107 static void v_incr_devcount(struct vnode *); 108 static void v_decr_devcount(struct vnode *); 109 static void vnlru_free(int); 110 static void vgonel(struct vnode *); 111 static void vfs_knllock(void *arg); 112 static void vfs_knlunlock(void *arg); 113 static void vfs_knl_assert_locked(void *arg); 114 static void vfs_knl_assert_unlocked(void *arg); 115 static void destroy_vpollinfo(struct vpollinfo *vi); 116 117 /* 118 * Number of vnodes in existence. Increased whenever getnewvnode() 119 * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode. 120 */ 121 static unsigned long numvnodes; 122 123 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, 124 "Number of vnodes in existence"); 125 126 static u_long vnodes_created; 127 SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 128 0, "Number of vnodes created by getnewvnode"); 129 130 /* 131 * Conversion tables for conversion from vnode types to inode formats 132 * and back. 133 */ 134 enum vtype iftovt_tab[16] = { 135 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 136 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 137 }; 138 int vttoif_tab[10] = { 139 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 140 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT 141 }; 142 143 /* 144 * List of vnodes that are ready for recycling. 145 */ 146 static TAILQ_HEAD(freelst, vnode) vnode_free_list; 147 148 /* 149 * Free vnode target. Free vnodes may simply be files which have been stat'd 150 * but not read. This is somewhat common, and a small cache of such files 151 * should be kept to avoid recreation costs. 
152 */ 153 static u_long wantfreevnodes; 154 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); 155 /* Number of vnodes in the free list. */ 156 static u_long freevnodes; 157 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, 158 "Number of vnodes in the free list"); 159 160 static int vlru_allow_cache_src; 161 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW, 162 &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode"); 163 164 static u_long recycles_count; 165 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0, 166 "Number of vnodes recycled to avoid exceding kern.maxvnodes"); 167 168 /* 169 * Various variables used for debugging the new implementation of 170 * reassignbuf(). 171 * XXX these are probably of (very) limited utility now. 172 */ 173 static int reassignbufcalls; 174 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, 175 "Number of calls to reassignbuf"); 176 177 static u_long free_owe_inact; 178 SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0, 179 "Number of times free vnodes kept on active list due to VFS " 180 "owing inactivation"); 181 182 /* To keep more than one thread at a time from running vfs_getnewfsid */ 183 static struct mtx mntid_mtx; 184 185 /* 186 * Lock for any access to the following: 187 * vnode_free_list 188 * numvnodes 189 * freevnodes 190 */ 191 static struct mtx vnode_free_list_mtx; 192 193 /* Publicly exported FS */ 194 struct nfs_public nfs_pub; 195 196 static uma_zone_t buf_trie_zone; 197 198 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 199 static uma_zone_t vnode_zone; 200 static uma_zone_t vnodepoll_zone; 201 202 /* 203 * The workitem queue. 204 * 205 * It is useful to delay writes of file data and filesystem metadata 206 * for tens of seconds so that quickly created and deleted files need 207 * not waste disk bandwidth being created and removed. To realize this, 208 * we append vnodes to a "workitem" queue. When running with a soft 209 * updates implementation, most pending metadata dependencies should 210 * not wait for more than a few seconds. Thus, mounted on block devices 211 * are delayed only about a half the time that file data is delayed. 212 * Similarly, directory updates are more critical, so are only delayed 213 * about a third the time that file data is delayed. Thus, there are 214 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 215 * one each second (driven off the filesystem syncer process). The 216 * syncer_delayno variable indicates the next queue that is to be processed. 
217 * Items that need to be processed soon are placed in this queue: 218 * 219 * syncer_workitem_pending[syncer_delayno] 220 * 221 * A delay of fifteen seconds is done by placing the request fifteen 222 * entries later in the queue: 223 * 224 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 225 * 226 */ 227 static int syncer_delayno; 228 static long syncer_mask; 229 LIST_HEAD(synclist, bufobj); 230 static struct synclist *syncer_workitem_pending; 231 /* 232 * The sync_mtx protects: 233 * bo->bo_synclist 234 * sync_vnode_count 235 * syncer_delayno 236 * syncer_state 237 * syncer_workitem_pending 238 * syncer_worklist_len 239 * rushjob 240 */ 241 static struct mtx sync_mtx; 242 static struct cv sync_wakeup; 243 244 #define SYNCER_MAXDELAY 32 245 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 246 static int syncdelay = 30; /* max time to delay syncing data */ 247 static int filedelay = 30; /* time to delay syncing files */ 248 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 249 "Time to delay syncing files (in seconds)"); 250 static int dirdelay = 29; /* time to delay syncing directories */ 251 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 252 "Time to delay syncing directories (in seconds)"); 253 static int metadelay = 28; /* time to delay syncing metadata */ 254 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 255 "Time to delay syncing metadata (in seconds)"); 256 static int rushjob; /* number of slots to run ASAP */ 257 static int stat_rush_requests; /* number of times I/O speeded up */ 258 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 259 "Number of times I/O speeded up (rush requests)"); 260 261 /* 262 * When shutting down the syncer, run it at four times normal speed. 263 */ 264 #define SYNCER_SHUTDOWN_SPEEDUP 4 265 static int sync_vnode_count; 266 static int syncer_worklist_len; 267 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 268 syncer_state; 269 270 /* 271 * Number of vnodes we want to exist at any one time. This is mostly used 272 * to size hash tables in vnode-related code. It is normally not used in 273 * getnewvnode(), as wantfreevnodes is normally nonzero.) 274 * 275 * XXX desiredvnodes is historical cruft and should not exist. 276 */ 277 int desiredvnodes; 278 279 static int 280 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) 281 { 282 int error, old_desiredvnodes; 283 284 old_desiredvnodes = desiredvnodes; 285 if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) 286 return (error); 287 if (old_desiredvnodes != desiredvnodes) { 288 vfs_hash_changesize(desiredvnodes); 289 cache_changesize(desiredvnodes); 290 } 291 return (0); 292 } 293 294 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 295 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, 296 sysctl_update_desiredvnodes, "I", "Maximum number of vnodes"); 297 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 298 &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); 299 static int vnlru_nowhere; 300 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, 301 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 302 303 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 304 static int vnsz2log; 305 306 /* 307 * Support for the bufobj clean & dirty pctrie. 
308 */ 309 static void * 310 buf_trie_alloc(struct pctrie *ptree) 311 { 312 313 return uma_zalloc(buf_trie_zone, M_NOWAIT); 314 } 315 316 static void 317 buf_trie_free(struct pctrie *ptree, void *node) 318 { 319 320 uma_zfree(buf_trie_zone, node); 321 } 322 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); 323 324 /* 325 * Initialize the vnode management data structures. 326 * 327 * Reevaluate the following cap on the number of vnodes after the physical 328 * memory size exceeds 512GB. In the limit, as the physical memory size 329 * grows, the ratio of physical pages to vnodes approaches sixteen to one. 330 */ 331 #ifndef MAXVNODES_MAX 332 #define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16)) 333 #endif 334 static void 335 vntblinit(void *dummy __unused) 336 { 337 u_int i; 338 int physvnodes, virtvnodes; 339 340 /* 341 * Desiredvnodes is a function of the physical memory size and the 342 * kernel's heap size. Generally speaking, it scales with the 343 * physical memory size. The ratio of desiredvnodes to physical pages 344 * is one to four until desiredvnodes exceeds 98,304. Thereafter, the 345 * marginal ratio of desiredvnodes to physical pages is one to 346 * sixteen. However, desiredvnodes is limited by the kernel's heap 347 * size. The memory required by desiredvnodes vnodes and vm objects 348 * may not exceed one seventh of the kernel's heap size. 349 */ 350 physvnodes = maxproc + vm_cnt.v_page_count / 16 + 3 * min(98304 * 4, 351 vm_cnt.v_page_count) / 16; 352 virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) + 353 sizeof(struct vnode))); 354 desiredvnodes = min(physvnodes, virtvnodes); 355 if (desiredvnodes > MAXVNODES_MAX) { 356 if (bootverbose) 357 printf("Reducing kern.maxvnodes %d -> %d\n", 358 desiredvnodes, MAXVNODES_MAX); 359 desiredvnodes = MAXVNODES_MAX; 360 } 361 wantfreevnodes = desiredvnodes / 4; 362 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 363 TAILQ_INIT(&vnode_free_list); 364 mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); 365 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, 366 NULL, NULL, UMA_ALIGN_PTR, 0); 367 vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), 368 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 369 /* 370 * Preallocate enough nodes to support one-per buf so that 371 * we can not fail an insert. reassignbuf() callers can not 372 * tolerate the insertion failure. 373 */ 374 buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), 375 NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 376 UMA_ZONE_NOFREE | UMA_ZONE_VM); 377 uma_prealloc(buf_trie_zone, nbuf); 378 /* 379 * Initialize the filesystem syncer. 380 */ 381 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 382 &syncer_mask); 383 syncer_maxdelay = syncer_mask + 1; 384 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); 385 cv_init(&sync_wakeup, "syncer"); 386 for (i = 1; i <= sizeof(struct vnode); i <<= 1) 387 vnsz2log++; 388 vnsz2log--; 389 } 390 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); 391 392 393 /* 394 * Mark a mount point as busy. Used to synchronize access and to delay 395 * unmounting. Eventually, mountlist_mtx is not released on failure. 396 * 397 * vfs_busy() is a custom lock, it can block the caller. 398 * vfs_busy() only sleeps if the unmount is active on the mount point. 399 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any 400 * vnode belonging to mp. 401 * 402 * Lookup uses vfs_busy() to traverse mount points. 
403 * root fs var fs 404 * / vnode lock A / vnode lock (/var) D 405 * /var vnode lock B /log vnode lock(/var/log) E 406 * vfs_busy lock C vfs_busy lock F 407 * 408 * Within each file system, the lock order is C->A->B and F->D->E. 409 * 410 * When traversing across mounts, the system follows that lock order: 411 * 412 * C->A->B 413 * | 414 * +->F->D->E 415 * 416 * The lookup() process for namei("/var") illustrates the process: 417 * VOP_LOOKUP() obtains B while A is held 418 * vfs_busy() obtains a shared lock on F while A and B are held 419 * vput() releases lock on B 420 * vput() releases lock on A 421 * VFS_ROOT() obtains lock on D while shared lock on F is held 422 * vfs_unbusy() releases shared lock on F 423 * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. 424 * Attempt to lock A (instead of vp_crossmp) while D is held would 425 * violate the global order, causing deadlocks. 426 * 427 * dounmount() locks B while F is drained. 428 */ 429 int 430 vfs_busy(struct mount *mp, int flags) 431 { 432 433 MPASS((flags & ~MBF_MASK) == 0); 434 CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); 435 436 MNT_ILOCK(mp); 437 MNT_REF(mp); 438 /* 439 * If mount point is currenly being unmounted, sleep until the 440 * mount point fate is decided. If thread doing the unmounting fails, 441 * it will clear MNTK_UNMOUNT flag before waking us up, indicating 442 * that this mount point has survived the unmount attempt and vfs_busy 443 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE 444 * flag in addition to MNTK_UNMOUNT, indicating that mount point is 445 * about to be really destroyed. vfs_busy needs to release its 446 * reference on the mount point in this case and return with ENOENT, 447 * telling the caller that mount mount it tried to busy is no longer 448 * valid. 449 */ 450 while (mp->mnt_kern_flag & MNTK_UNMOUNT) { 451 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { 452 MNT_REL(mp); 453 MNT_IUNLOCK(mp); 454 CTR1(KTR_VFS, "%s: failed busying before sleeping", 455 __func__); 456 return (ENOENT); 457 } 458 if (flags & MBF_MNTLSTLOCK) 459 mtx_unlock(&mountlist_mtx); 460 mp->mnt_kern_flag |= MNTK_MWAIT; 461 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); 462 if (flags & MBF_MNTLSTLOCK) 463 mtx_lock(&mountlist_mtx); 464 MNT_ILOCK(mp); 465 } 466 if (flags & MBF_MNTLSTLOCK) 467 mtx_unlock(&mountlist_mtx); 468 mp->mnt_lockref++; 469 MNT_IUNLOCK(mp); 470 return (0); 471 } 472 473 /* 474 * Free a busy filesystem. 475 */ 476 void 477 vfs_unbusy(struct mount *mp) 478 { 479 480 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 481 MNT_ILOCK(mp); 482 MNT_REL(mp); 483 KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref")); 484 mp->mnt_lockref--; 485 if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { 486 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); 487 CTR1(KTR_VFS, "%s: waking up waiters", __func__); 488 mp->mnt_kern_flag &= ~MNTK_DRAINING; 489 wakeup(&mp->mnt_lockref); 490 } 491 MNT_IUNLOCK(mp); 492 } 493 494 /* 495 * Lookup a mount point by filesystem identifier. 
496 */ 497 struct mount * 498 vfs_getvfs(fsid_t *fsid) 499 { 500 struct mount *mp; 501 502 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 503 mtx_lock(&mountlist_mtx); 504 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 505 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 506 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { 507 vfs_ref(mp); 508 mtx_unlock(&mountlist_mtx); 509 return (mp); 510 } 511 } 512 mtx_unlock(&mountlist_mtx); 513 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 514 return ((struct mount *) 0); 515 } 516 517 /* 518 * Lookup a mount point by filesystem identifier, busying it before 519 * returning. 520 * 521 * To avoid congestion on mountlist_mtx, implement simple direct-mapped 522 * cache for popular filesystem identifiers. The cache is lockess, using 523 * the fact that struct mount's are never freed. In worst case we may 524 * get pointer to unmounted or even different filesystem, so we have to 525 * check what we got, and go slow way if so. 526 */ 527 struct mount * 528 vfs_busyfs(fsid_t *fsid) 529 { 530 #define FSID_CACHE_SIZE 256 531 typedef struct mount * volatile vmp_t; 532 static vmp_t cache[FSID_CACHE_SIZE]; 533 struct mount *mp; 534 int error; 535 uint32_t hash; 536 537 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 538 hash = fsid->val[0] ^ fsid->val[1]; 539 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 540 mp = cache[hash]; 541 if (mp == NULL || 542 mp->mnt_stat.f_fsid.val[0] != fsid->val[0] || 543 mp->mnt_stat.f_fsid.val[1] != fsid->val[1]) 544 goto slow; 545 if (vfs_busy(mp, 0) != 0) { 546 cache[hash] = NULL; 547 goto slow; 548 } 549 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 550 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) 551 return (mp); 552 else 553 vfs_unbusy(mp); 554 555 slow: 556 mtx_lock(&mountlist_mtx); 557 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 558 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 559 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { 560 error = vfs_busy(mp, MBF_MNTLSTLOCK); 561 if (error) { 562 cache[hash] = NULL; 563 mtx_unlock(&mountlist_mtx); 564 return (NULL); 565 } 566 cache[hash] = mp; 567 return (mp); 568 } 569 } 570 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 571 mtx_unlock(&mountlist_mtx); 572 return ((struct mount *) 0); 573 } 574 575 /* 576 * Check if a user can access privileged mount options. 577 */ 578 int 579 vfs_suser(struct mount *mp, struct thread *td) 580 { 581 int error; 582 583 /* 584 * If the thread is jailed, but this is not a jail-friendly file 585 * system, deny immediately. 586 */ 587 if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred)) 588 return (EPERM); 589 590 /* 591 * If the file system was mounted outside the jail of the calling 592 * thread, deny immediately. 593 */ 594 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 595 return (EPERM); 596 597 /* 598 * If file system supports delegated administration, we don't check 599 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 600 * by the file system itself. 601 * If this is not the user that did original mount, we check for 602 * the PRIV_VFS_MOUNT_OWNER privilege. 603 */ 604 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 605 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 606 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 607 return (error); 608 } 609 return (0); 610 } 611 612 /* 613 * Get a new unique fsid. Try to make its val[0] unique, since this value 614 * will be used to create fake device numbers for stat(). 
Also try (but 615 * not so hard) make its val[0] unique mod 2^16, since some emulators only 616 * support 16-bit device numbers. We end up with unique val[0]'s for the 617 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 618 * 619 * Keep in mind that several mounts may be running in parallel. Starting 620 * the search one past where the previous search terminated is both a 621 * micro-optimization and a defense against returning the same fsid to 622 * different mounts. 623 */ 624 void 625 vfs_getnewfsid(struct mount *mp) 626 { 627 static uint16_t mntid_base; 628 struct mount *nmp; 629 fsid_t tfsid; 630 int mtype; 631 632 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 633 mtx_lock(&mntid_mtx); 634 mtype = mp->mnt_vfc->vfc_typenum; 635 tfsid.val[1] = mtype; 636 mtype = (mtype & 0xFF) << 24; 637 for (;;) { 638 tfsid.val[0] = makedev(255, 639 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 640 mntid_base++; 641 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 642 break; 643 vfs_rel(nmp); 644 } 645 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 646 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 647 mtx_unlock(&mntid_mtx); 648 } 649 650 /* 651 * Knob to control the precision of file timestamps: 652 * 653 * 0 = seconds only; nanoseconds zeroed. 654 * 1 = seconds and nanoseconds, accurate within 1/HZ. 655 * 2 = seconds and nanoseconds, truncated to microseconds. 656 * >=3 = seconds and nanoseconds, maximum precision. 657 */ 658 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 659 660 static int timestamp_precision = TSP_USEC; 661 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 662 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 663 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, " 664 "3+: sec + ns (max. precision))"); 665 666 /* 667 * Get a current timestamp. 668 */ 669 void 670 vfs_timestamp(struct timespec *tsp) 671 { 672 struct timeval tv; 673 674 switch (timestamp_precision) { 675 case TSP_SEC: 676 tsp->tv_sec = time_second; 677 tsp->tv_nsec = 0; 678 break; 679 case TSP_HZ: 680 getnanotime(tsp); 681 break; 682 case TSP_USEC: 683 microtime(&tv); 684 TIMEVAL_TO_TIMESPEC(&tv, tsp); 685 break; 686 case TSP_NSEC: 687 default: 688 nanotime(tsp); 689 break; 690 } 691 } 692 693 /* 694 * Set vnode attributes to VNOVAL 695 */ 696 void 697 vattr_null(struct vattr *vap) 698 { 699 700 vap->va_type = VNON; 701 vap->va_size = VNOVAL; 702 vap->va_bytes = VNOVAL; 703 vap->va_mode = VNOVAL; 704 vap->va_nlink = VNOVAL; 705 vap->va_uid = VNOVAL; 706 vap->va_gid = VNOVAL; 707 vap->va_fsid = VNOVAL; 708 vap->va_fileid = VNOVAL; 709 vap->va_blocksize = VNOVAL; 710 vap->va_rdev = VNOVAL; 711 vap->va_atime.tv_sec = VNOVAL; 712 vap->va_atime.tv_nsec = VNOVAL; 713 vap->va_mtime.tv_sec = VNOVAL; 714 vap->va_mtime.tv_nsec = VNOVAL; 715 vap->va_ctime.tv_sec = VNOVAL; 716 vap->va_ctime.tv_nsec = VNOVAL; 717 vap->va_birthtime.tv_sec = VNOVAL; 718 vap->va_birthtime.tv_nsec = VNOVAL; 719 vap->va_flags = VNOVAL; 720 vap->va_gen = VNOVAL; 721 vap->va_vaflags = 0; 722 } 723 724 /* 725 * This routine is called when we have too many vnodes. It attempts 726 * to free <count> vnodes and will potentially free vnodes that still 727 * have VM backing store (VM backing store is typically the cause 728 * of a vnode blowout so we want to do this). Therefore, this operation 729 * is not considered cheap. 730 * 731 * A number of conditions may prevent a vnode from being reclaimed. 
732 * the buffer cache may have references on the vnode, a directory 733 * vnode may still have references due to the namei cache representing 734 * underlying files, or the vnode may be in active use. It is not 735 * desireable to reuse such vnodes. These conditions may cause the 736 * number of vnodes to reach some minimum value regardless of what 737 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. 738 */ 739 static int 740 vlrureclaim(struct mount *mp) 741 { 742 struct vnode *vp; 743 int done; 744 int trigger; 745 int usevnodes; 746 int count; 747 748 /* 749 * Calculate the trigger point, don't allow user 750 * screwups to blow us up. This prevents us from 751 * recycling vnodes with lots of resident pages. We 752 * aren't trying to free memory, we are trying to 753 * free vnodes. 754 */ 755 usevnodes = desiredvnodes; 756 if (usevnodes <= 0) 757 usevnodes = 1; 758 trigger = vm_cnt.v_page_count * 2 / usevnodes; 759 done = 0; 760 vn_start_write(NULL, &mp, V_WAIT); 761 MNT_ILOCK(mp); 762 count = mp->mnt_nvnodelistsize / 10 + 1; 763 while (count != 0) { 764 vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 765 while (vp != NULL && vp->v_type == VMARKER) 766 vp = TAILQ_NEXT(vp, v_nmntvnodes); 767 if (vp == NULL) 768 break; 769 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 770 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 771 --count; 772 if (!VI_TRYLOCK(vp)) 773 goto next_iter; 774 /* 775 * If it's been deconstructed already, it's still 776 * referenced, or it exceeds the trigger, skip it. 777 */ 778 if (vp->v_usecount || 779 (!vlru_allow_cache_src && 780 !LIST_EMPTY(&(vp)->v_cache_src)) || 781 (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && 782 vp->v_object->resident_page_count > trigger)) { 783 VI_UNLOCK(vp); 784 goto next_iter; 785 } 786 MNT_IUNLOCK(mp); 787 vholdl(vp); 788 if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { 789 vdrop(vp); 790 goto next_iter_mntunlocked; 791 } 792 VI_LOCK(vp); 793 /* 794 * v_usecount may have been bumped after VOP_LOCK() dropped 795 * the vnode interlock and before it was locked again. 796 * 797 * It is not necessary to recheck VI_DOOMED because it can 798 * only be set by another thread that holds both the vnode 799 * lock and vnode interlock. If another thread has the 800 * vnode lock before we get to VOP_LOCK() and obtains the 801 * vnode interlock after VOP_LOCK() drops the vnode 802 * interlock, the other thread will be unable to drop the 803 * vnode lock before our VOP_LOCK() call fails. 804 */ 805 if (vp->v_usecount || 806 (!vlru_allow_cache_src && 807 !LIST_EMPTY(&(vp)->v_cache_src)) || 808 (vp->v_object != NULL && 809 vp->v_object->resident_page_count > trigger)) { 810 VOP_UNLOCK(vp, LK_INTERLOCK); 811 vdrop(vp); 812 goto next_iter_mntunlocked; 813 } 814 KASSERT((vp->v_iflag & VI_DOOMED) == 0, 815 ("VI_DOOMED unexpectedly detected in vlrureclaim()")); 816 atomic_add_long(&recycles_count, 1); 817 vgonel(vp); 818 VOP_UNLOCK(vp, 0); 819 vdropl(vp); 820 done++; 821 next_iter_mntunlocked: 822 if (!should_yield()) 823 goto relock_mnt; 824 goto yield; 825 next_iter: 826 if (!should_yield()) 827 continue; 828 MNT_IUNLOCK(mp); 829 yield: 830 kern_yield(PRI_USER); 831 relock_mnt: 832 MNT_ILOCK(mp); 833 } 834 MNT_IUNLOCK(mp); 835 vn_finished_write(mp); 836 return done; 837 } 838 839 /* 840 * Attempt to keep the free list at wantfreevnodes length. 
841 */ 842 static void 843 vnlru_free(int count) 844 { 845 struct vnode *vp; 846 847 mtx_assert(&vnode_free_list_mtx, MA_OWNED); 848 for (; count > 0; count--) { 849 vp = TAILQ_FIRST(&vnode_free_list); 850 /* 851 * The list can be modified while the free_list_mtx 852 * has been dropped and vp could be NULL here. 853 */ 854 if (!vp) 855 break; 856 VNASSERT(vp->v_op != NULL, vp, 857 ("vnlru_free: vnode already reclaimed.")); 858 KASSERT((vp->v_iflag & VI_FREE) != 0, 859 ("Removing vnode not on freelist")); 860 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 861 ("Mangling active vnode")); 862 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 863 /* 864 * Don't recycle if we can't get the interlock. 865 */ 866 if (!VI_TRYLOCK(vp)) { 867 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); 868 continue; 869 } 870 VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0, 871 vp, ("vp inconsistent on freelist")); 872 873 /* 874 * The clear of VI_FREE prevents activation of the 875 * vnode. There is no sense in putting the vnode on 876 * the mount point active list, only to remove it 877 * later during recycling. Inline the relevant part 878 * of vholdl(), to avoid triggering assertions or 879 * activating. 880 */ 881 freevnodes--; 882 vp->v_iflag &= ~VI_FREE; 883 refcount_acquire(&vp->v_holdcnt); 884 885 mtx_unlock(&vnode_free_list_mtx); 886 VI_UNLOCK(vp); 887 vtryrecycle(vp); 888 /* 889 * If the recycled succeeded this vdrop will actually free 890 * the vnode. If not it will simply place it back on 891 * the free list. 892 */ 893 vdrop(vp); 894 mtx_lock(&vnode_free_list_mtx); 895 } 896 } 897 /* 898 * Attempt to recycle vnodes in a context that is always safe to block. 899 * Calling vlrurecycle() from the bowels of filesystem code has some 900 * interesting deadlock problems. 901 */ 902 static struct proc *vnlruproc; 903 static int vnlruproc_sig; 904 905 static void 906 vnlru_proc(void) 907 { 908 struct mount *mp, *nmp; 909 int done; 910 struct proc *p = vnlruproc; 911 912 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, 913 SHUTDOWN_PRI_FIRST); 914 915 for (;;) { 916 kproc_suspend_check(p); 917 mtx_lock(&vnode_free_list_mtx); 918 if (freevnodes > wantfreevnodes) 919 vnlru_free(freevnodes - wantfreevnodes); 920 if (numvnodes <= desiredvnodes * 9 / 10) { 921 vnlruproc_sig = 0; 922 wakeup(&vnlruproc_sig); 923 msleep(vnlruproc, &vnode_free_list_mtx, 924 PVFS|PDROP, "vlruwt", hz); 925 continue; 926 } 927 mtx_unlock(&vnode_free_list_mtx); 928 done = 0; 929 mtx_lock(&mountlist_mtx); 930 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 931 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { 932 nmp = TAILQ_NEXT(mp, mnt_list); 933 continue; 934 } 935 done += vlrureclaim(mp); 936 mtx_lock(&mountlist_mtx); 937 nmp = TAILQ_NEXT(mp, mnt_list); 938 vfs_unbusy(mp); 939 } 940 mtx_unlock(&mountlist_mtx); 941 if (done == 0) { 942 #if 0 943 /* These messages are temporary debugging aids */ 944 if (vnlru_nowhere < 5) 945 printf("vnlru process getting nowhere..\n"); 946 else if (vnlru_nowhere == 5) 947 printf("vnlru process messages stopped.\n"); 948 #endif 949 vnlru_nowhere++; 950 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 951 } else 952 kern_yield(PRI_USER); 953 } 954 } 955 956 static struct kproc_desc vnlru_kp = { 957 "vnlru", 958 vnlru_proc, 959 &vnlruproc 960 }; 961 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 962 &vnlru_kp); 963 964 /* 965 * Routines having to do with the management of the vnode table. 966 */ 967 968 /* 969 * Try to recycle a freed vnode. 
We abort if anyone picks up a reference 970 * before we actually vgone(). This function must be called with the vnode 971 * held to prevent the vnode from being returned to the free list midway 972 * through vgone(). 973 */ 974 static int 975 vtryrecycle(struct vnode *vp) 976 { 977 struct mount *vnmp; 978 979 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 980 VNASSERT(vp->v_holdcnt, vp, 981 ("vtryrecycle: Recycling vp %p without a reference.", vp)); 982 /* 983 * This vnode may found and locked via some other list, if so we 984 * can't recycle it yet. 985 */ 986 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 987 CTR2(KTR_VFS, 988 "%s: impossible to recycle, vp %p lock is already held", 989 __func__, vp); 990 return (EWOULDBLOCK); 991 } 992 /* 993 * Don't recycle if its filesystem is being suspended. 994 */ 995 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 996 VOP_UNLOCK(vp, 0); 997 CTR2(KTR_VFS, 998 "%s: impossible to recycle, cannot start the write for %p", 999 __func__, vp); 1000 return (EBUSY); 1001 } 1002 /* 1003 * If we got this far, we need to acquire the interlock and see if 1004 * anyone picked up this vnode from another list. If not, we will 1005 * mark it with DOOMED via vgonel() so that anyone who does find it 1006 * will skip over it. 1007 */ 1008 VI_LOCK(vp); 1009 if (vp->v_usecount) { 1010 VOP_UNLOCK(vp, LK_INTERLOCK); 1011 vn_finished_write(vnmp); 1012 CTR2(KTR_VFS, 1013 "%s: impossible to recycle, %p is already referenced", 1014 __func__, vp); 1015 return (EBUSY); 1016 } 1017 if ((vp->v_iflag & VI_DOOMED) == 0) { 1018 atomic_add_long(&recycles_count, 1); 1019 vgonel(vp); 1020 } 1021 VOP_UNLOCK(vp, LK_INTERLOCK); 1022 vn_finished_write(vnmp); 1023 return (0); 1024 } 1025 1026 /* 1027 * Wait for available vnodes. 1028 */ 1029 static int 1030 getnewvnode_wait(int suspended) 1031 { 1032 1033 mtx_assert(&vnode_free_list_mtx, MA_OWNED); 1034 if (numvnodes > desiredvnodes) { 1035 if (suspended) { 1036 /* 1037 * File system is beeing suspended, we cannot risk a 1038 * deadlock here, so allocate new vnode anyway. 1039 */ 1040 if (freevnodes > wantfreevnodes) 1041 vnlru_free(freevnodes - wantfreevnodes); 1042 return (0); 1043 } 1044 if (vnlruproc_sig == 0) { 1045 vnlruproc_sig = 1; /* avoid unnecessary wakeups */ 1046 wakeup(vnlruproc); 1047 } 1048 msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, 1049 "vlruwk", hz); 1050 } 1051 return (numvnodes > desiredvnodes ? ENFILE : 0); 1052 } 1053 1054 void 1055 getnewvnode_reserve(u_int count) 1056 { 1057 struct thread *td; 1058 1059 td = curthread; 1060 /* First try to be quick and racy. */ 1061 if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { 1062 td->td_vp_reserv += count; 1063 return; 1064 } else 1065 atomic_subtract_long(&numvnodes, count); 1066 1067 mtx_lock(&vnode_free_list_mtx); 1068 while (count > 0) { 1069 if (getnewvnode_wait(0) == 0) { 1070 count--; 1071 td->td_vp_reserv++; 1072 atomic_add_long(&numvnodes, 1); 1073 } 1074 } 1075 mtx_unlock(&vnode_free_list_mtx); 1076 } 1077 1078 void 1079 getnewvnode_drop_reserve(void) 1080 { 1081 struct thread *td; 1082 1083 td = curthread; 1084 atomic_subtract_long(&numvnodes, td->td_vp_reserv); 1085 td->td_vp_reserv = 0; 1086 } 1087 1088 /* 1089 * Return the next vnode from the free list. 
1090 */ 1091 int 1092 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 1093 struct vnode **vpp) 1094 { 1095 struct vnode *vp; 1096 struct bufobj *bo; 1097 struct thread *td; 1098 int error; 1099 1100 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 1101 vp = NULL; 1102 td = curthread; 1103 if (td->td_vp_reserv > 0) { 1104 td->td_vp_reserv -= 1; 1105 goto alloc; 1106 } 1107 mtx_lock(&vnode_free_list_mtx); 1108 /* 1109 * Lend our context to reclaim vnodes if they've exceeded the max. 1110 */ 1111 if (freevnodes > wantfreevnodes) 1112 vnlru_free(1); 1113 error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & 1114 MNTK_SUSPEND)); 1115 #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ 1116 if (error != 0) { 1117 mtx_unlock(&vnode_free_list_mtx); 1118 return (error); 1119 } 1120 #endif 1121 atomic_add_long(&numvnodes, 1); 1122 mtx_unlock(&vnode_free_list_mtx); 1123 alloc: 1124 atomic_add_long(&vnodes_created, 1); 1125 vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); 1126 /* 1127 * Setup locks. 1128 */ 1129 vp->v_vnlock = &vp->v_lock; 1130 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 1131 /* 1132 * By default, don't allow shared locks unless filesystems 1133 * opt-in. 1134 */ 1135 lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); 1136 /* 1137 * Initialize bufobj. 1138 */ 1139 bo = &vp->v_bufobj; 1140 bo->__bo_vnode = vp; 1141 rw_init(BO_LOCKPTR(bo), "bufobj interlock"); 1142 bo->bo_ops = &buf_ops_bio; 1143 bo->bo_private = vp; 1144 TAILQ_INIT(&bo->bo_clean.bv_hd); 1145 TAILQ_INIT(&bo->bo_dirty.bv_hd); 1146 /* 1147 * Initialize namecache. 1148 */ 1149 LIST_INIT(&vp->v_cache_src); 1150 TAILQ_INIT(&vp->v_cache_dst); 1151 /* 1152 * Finalize various vnode identity bits. 1153 */ 1154 vp->v_type = VNON; 1155 vp->v_tag = tag; 1156 vp->v_op = vops; 1157 v_init_counters(vp); 1158 vp->v_data = NULL; 1159 #ifdef MAC 1160 mac_vnode_init(vp); 1161 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 1162 mac_vnode_associate_singlelabel(mp, vp); 1163 else if (mp == NULL && vops != &dead_vnodeops) 1164 printf("NULL mp in getnewvnode()\n"); 1165 #endif 1166 if (mp != NULL) { 1167 bo->bo_bsize = mp->mnt_stat.f_iosize; 1168 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) 1169 vp->v_vflag |= VV_NOKNOTE; 1170 } 1171 rangelock_init(&vp->v_rl); 1172 1173 /* 1174 * For the filesystems which do not use vfs_hash_insert(), 1175 * still initialize v_hash to have vfs_hash_index() useful. 1176 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 1177 * its own hashing. 1178 */ 1179 vp->v_hash = (uintptr_t)vp >> vnsz2log; 1180 1181 *vpp = vp; 1182 return (0); 1183 } 1184 1185 /* 1186 * Delete from old mount point vnode list, if on one. 
1187 */ 1188 static void 1189 delmntque(struct vnode *vp) 1190 { 1191 struct mount *mp; 1192 int active; 1193 1194 mp = vp->v_mount; 1195 if (mp == NULL) 1196 return; 1197 MNT_ILOCK(mp); 1198 VI_LOCK(vp); 1199 KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, 1200 ("Active vnode list size %d > Vnode list size %d", 1201 mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); 1202 active = vp->v_iflag & VI_ACTIVE; 1203 vp->v_iflag &= ~VI_ACTIVE; 1204 if (active) { 1205 mtx_lock(&vnode_free_list_mtx); 1206 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); 1207 mp->mnt_activevnodelistsize--; 1208 mtx_unlock(&vnode_free_list_mtx); 1209 } 1210 vp->v_mount = NULL; 1211 VI_UNLOCK(vp); 1212 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1213 ("bad mount point vnode list size")); 1214 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1215 mp->mnt_nvnodelistsize--; 1216 MNT_REL(mp); 1217 MNT_IUNLOCK(mp); 1218 } 1219 1220 static void 1221 insmntque_stddtr(struct vnode *vp, void *dtr_arg) 1222 { 1223 1224 vp->v_data = NULL; 1225 vp->v_op = &dead_vnodeops; 1226 vgone(vp); 1227 vput(vp); 1228 } 1229 1230 /* 1231 * Insert into list of vnodes for the new mount point, if available. 1232 */ 1233 int 1234 insmntque1(struct vnode *vp, struct mount *mp, 1235 void (*dtr)(struct vnode *, void *), void *dtr_arg) 1236 { 1237 1238 KASSERT(vp->v_mount == NULL, 1239 ("insmntque: vnode already on per mount vnode list")); 1240 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1241 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 1242 1243 /* 1244 * We acquire the vnode interlock early to ensure that the 1245 * vnode cannot be recycled by another process releasing a 1246 * holdcnt on it before we get it on both the vnode list 1247 * and the active vnode list. The mount mutex protects only 1248 * manipulation of the vnode list and the vnode freelist 1249 * mutex protects only manipulation of the active vnode list. 1250 * Hence the need to hold the vnode interlock throughout. 1251 */ 1252 MNT_ILOCK(mp); 1253 VI_LOCK(vp); 1254 if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && 1255 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 1256 mp->mnt_nvnodelistsize == 0)) && 1257 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 1258 VI_UNLOCK(vp); 1259 MNT_IUNLOCK(mp); 1260 if (dtr != NULL) 1261 dtr(vp, dtr_arg); 1262 return (EBUSY); 1263 } 1264 vp->v_mount = mp; 1265 MNT_REF(mp); 1266 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1267 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 1268 ("neg mount point vnode list size")); 1269 mp->mnt_nvnodelistsize++; 1270 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 1271 ("Activating already active vnode")); 1272 vp->v_iflag |= VI_ACTIVE; 1273 mtx_lock(&vnode_free_list_mtx); 1274 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 1275 mp->mnt_activevnodelistsize++; 1276 mtx_unlock(&vnode_free_list_mtx); 1277 VI_UNLOCK(vp); 1278 MNT_IUNLOCK(mp); 1279 return (0); 1280 } 1281 1282 int 1283 insmntque(struct vnode *vp, struct mount *mp) 1284 { 1285 1286 return (insmntque1(vp, mp, insmntque_stddtr, NULL)); 1287 } 1288 1289 /* 1290 * Flush out and invalidate all buffers associated with a bufobj 1291 * Called with the underlying object locked. 
1292 */ 1293 int 1294 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 1295 { 1296 int error; 1297 1298 BO_LOCK(bo); 1299 if (flags & V_SAVE) { 1300 error = bufobj_wwait(bo, slpflag, slptimeo); 1301 if (error) { 1302 BO_UNLOCK(bo); 1303 return (error); 1304 } 1305 if (bo->bo_dirty.bv_cnt > 0) { 1306 BO_UNLOCK(bo); 1307 if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) 1308 return (error); 1309 /* 1310 * XXX We could save a lock/unlock if this was only 1311 * enabled under INVARIANTS 1312 */ 1313 BO_LOCK(bo); 1314 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 1315 panic("vinvalbuf: dirty bufs"); 1316 } 1317 } 1318 /* 1319 * If you alter this loop please notice that interlock is dropped and 1320 * reacquired in flushbuflist. Special care is needed to ensure that 1321 * no race conditions occur from this. 1322 */ 1323 do { 1324 error = flushbuflist(&bo->bo_clean, 1325 flags, bo, slpflag, slptimeo); 1326 if (error == 0 && !(flags & V_CLEANONLY)) 1327 error = flushbuflist(&bo->bo_dirty, 1328 flags, bo, slpflag, slptimeo); 1329 if (error != 0 && error != EAGAIN) { 1330 BO_UNLOCK(bo); 1331 return (error); 1332 } 1333 } while (error != 0); 1334 1335 /* 1336 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 1337 * have write I/O in-progress but if there is a VM object then the 1338 * VM object can also have read-I/O in-progress. 1339 */ 1340 do { 1341 bufobj_wwait(bo, 0, 0); 1342 BO_UNLOCK(bo); 1343 if (bo->bo_object != NULL) { 1344 VM_OBJECT_WLOCK(bo->bo_object); 1345 vm_object_pip_wait(bo->bo_object, "bovlbx"); 1346 VM_OBJECT_WUNLOCK(bo->bo_object); 1347 } 1348 BO_LOCK(bo); 1349 } while (bo->bo_numoutput > 0); 1350 BO_UNLOCK(bo); 1351 1352 /* 1353 * Destroy the copy in the VM cache, too. 1354 */ 1355 if (bo->bo_object != NULL && 1356 (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) { 1357 VM_OBJECT_WLOCK(bo->bo_object); 1358 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 1359 OBJPR_CLEANONLY : 0); 1360 VM_OBJECT_WUNLOCK(bo->bo_object); 1361 } 1362 1363 #ifdef INVARIANTS 1364 BO_LOCK(bo); 1365 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 && 1366 (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) 1367 panic("vinvalbuf: flush failed"); 1368 BO_UNLOCK(bo); 1369 #endif 1370 return (0); 1371 } 1372 1373 /* 1374 * Flush out and invalidate all buffers associated with a vnode. 1375 * Called with the underlying object locked. 1376 */ 1377 int 1378 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 1379 { 1380 1381 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 1382 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 1383 if (vp->v_object != NULL && vp->v_object->handle != vp) 1384 return (0); 1385 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 1386 } 1387 1388 /* 1389 * Flush out buffers on the specified list. 
1390 * 1391 */ 1392 static int 1393 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 1394 int slptimeo) 1395 { 1396 struct buf *bp, *nbp; 1397 int retval, error; 1398 daddr_t lblkno; 1399 b_xflags_t xflags; 1400 1401 ASSERT_BO_WLOCKED(bo); 1402 1403 retval = 0; 1404 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 1405 if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || 1406 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { 1407 continue; 1408 } 1409 lblkno = 0; 1410 xflags = 0; 1411 if (nbp != NULL) { 1412 lblkno = nbp->b_lblkno; 1413 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 1414 } 1415 retval = EAGAIN; 1416 error = BUF_TIMELOCK(bp, 1417 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 1418 "flushbuf", slpflag, slptimeo); 1419 if (error) { 1420 BO_LOCK(bo); 1421 return (error != ENOLCK ? error : EAGAIN); 1422 } 1423 KASSERT(bp->b_bufobj == bo, 1424 ("bp %p wrong b_bufobj %p should be %p", 1425 bp, bp->b_bufobj, bo)); 1426 /* 1427 * XXX Since there are no node locks for NFS, I 1428 * believe there is a slight chance that a delayed 1429 * write will occur while sleeping just above, so 1430 * check for it. 1431 */ 1432 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 1433 (flags & V_SAVE)) { 1434 bremfree(bp); 1435 bp->b_flags |= B_ASYNC; 1436 bwrite(bp); 1437 BO_LOCK(bo); 1438 return (EAGAIN); /* XXX: why not loop ? */ 1439 } 1440 bremfree(bp); 1441 bp->b_flags |= (B_INVAL | B_RELBUF); 1442 bp->b_flags &= ~B_ASYNC; 1443 brelse(bp); 1444 BO_LOCK(bo); 1445 if (nbp != NULL && 1446 (nbp->b_bufobj != bo || 1447 nbp->b_lblkno != lblkno || 1448 (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags)) 1449 break; /* nbp invalid */ 1450 } 1451 return (retval); 1452 } 1453 1454 /* 1455 * Truncate a file's buffer and pages to a specified length. This 1456 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 1457 * sync activity. 1458 */ 1459 int 1460 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize) 1461 { 1462 struct buf *bp, *nbp; 1463 int anyfreed; 1464 int trunclbn; 1465 struct bufobj *bo; 1466 1467 CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__, 1468 vp, cred, blksize, (uintmax_t)length); 1469 1470 /* 1471 * Round up to the *next* lbn. 
1472 */ 1473 trunclbn = (length + blksize - 1) / blksize; 1474 1475 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 1476 restart: 1477 bo = &vp->v_bufobj; 1478 BO_LOCK(bo); 1479 anyfreed = 1; 1480 for (;anyfreed;) { 1481 anyfreed = 0; 1482 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 1483 if (bp->b_lblkno < trunclbn) 1484 continue; 1485 if (BUF_LOCK(bp, 1486 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1487 BO_LOCKPTR(bo)) == ENOLCK) 1488 goto restart; 1489 1490 bremfree(bp); 1491 bp->b_flags |= (B_INVAL | B_RELBUF); 1492 bp->b_flags &= ~B_ASYNC; 1493 brelse(bp); 1494 anyfreed = 1; 1495 1496 BO_LOCK(bo); 1497 if (nbp != NULL && 1498 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 1499 (nbp->b_vp != vp) || 1500 (nbp->b_flags & B_DELWRI))) { 1501 BO_UNLOCK(bo); 1502 goto restart; 1503 } 1504 } 1505 1506 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1507 if (bp->b_lblkno < trunclbn) 1508 continue; 1509 if (BUF_LOCK(bp, 1510 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1511 BO_LOCKPTR(bo)) == ENOLCK) 1512 goto restart; 1513 bremfree(bp); 1514 bp->b_flags |= (B_INVAL | B_RELBUF); 1515 bp->b_flags &= ~B_ASYNC; 1516 brelse(bp); 1517 anyfreed = 1; 1518 1519 BO_LOCK(bo); 1520 if (nbp != NULL && 1521 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 1522 (nbp->b_vp != vp) || 1523 (nbp->b_flags & B_DELWRI) == 0)) { 1524 BO_UNLOCK(bo); 1525 goto restart; 1526 } 1527 } 1528 } 1529 1530 if (length > 0) { 1531 restartsync: 1532 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1533 if (bp->b_lblkno > 0) 1534 continue; 1535 /* 1536 * Since we hold the vnode lock this should only 1537 * fail if we're racing with the buf daemon. 1538 */ 1539 if (BUF_LOCK(bp, 1540 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1541 BO_LOCKPTR(bo)) == ENOLCK) { 1542 goto restart; 1543 } 1544 VNASSERT((bp->b_flags & B_DELWRI), vp, 1545 ("buf(%p) on dirty queue without DELWRI", bp)); 1546 1547 bremfree(bp); 1548 bawrite(bp); 1549 BO_LOCK(bo); 1550 goto restartsync; 1551 } 1552 } 1553 1554 bufobj_wwait(bo, 0, 0); 1555 BO_UNLOCK(bo); 1556 vnode_pager_setsize(vp, length); 1557 1558 return (0); 1559 } 1560 1561 static void 1562 buf_vlist_remove(struct buf *bp) 1563 { 1564 struct bufv *bv; 1565 1566 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 1567 ASSERT_BO_WLOCKED(bp->b_bufobj); 1568 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != 1569 (BX_VNDIRTY|BX_VNCLEAN), 1570 ("buf_vlist_remove: Buf %p is on two lists", bp)); 1571 if (bp->b_xflags & BX_VNDIRTY) 1572 bv = &bp->b_bufobj->bo_dirty; 1573 else 1574 bv = &bp->b_bufobj->bo_clean; 1575 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 1576 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 1577 bv->bv_cnt--; 1578 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1579 } 1580 1581 /* 1582 * Add the buffer to the sorted clean or dirty block list. 1583 * 1584 * NOTE: xflags is passed as a constant, optimizing this inline function! 1585 */ 1586 static void 1587 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 1588 { 1589 struct bufv *bv; 1590 struct buf *n; 1591 int error; 1592 1593 ASSERT_BO_WLOCKED(bo); 1594 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 1595 ("dead bo %p", bo)); 1596 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 1597 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 1598 bp->b_xflags |= xflags; 1599 if (xflags & BX_VNDIRTY) 1600 bv = &bo->bo_dirty; 1601 else 1602 bv = &bo->bo_clean; 1603 1604 /* 1605 * Keep the list ordered. Optimize empty list insertion. 
Assume 1606 * we tend to grow at the tail so lookup_le should usually be cheaper 1607 * than _ge. 1608 */ 1609 if (bv->bv_cnt == 0 || 1610 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 1611 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 1612 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 1613 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 1614 else 1615 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 1616 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 1617 if (error) 1618 panic("buf_vlist_add: Preallocated nodes insufficient."); 1619 bv->bv_cnt++; 1620 } 1621 1622 /* 1623 * Look up a buffer using the buffer tries. 1624 */ 1625 struct buf * 1626 gbincore(struct bufobj *bo, daddr_t lblkno) 1627 { 1628 struct buf *bp; 1629 1630 ASSERT_BO_LOCKED(bo); 1631 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 1632 if (bp != NULL) 1633 return (bp); 1634 return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); 1635 } 1636 1637 /* 1638 * Associate a buffer with a vnode. 1639 */ 1640 void 1641 bgetvp(struct vnode *vp, struct buf *bp) 1642 { 1643 struct bufobj *bo; 1644 1645 bo = &vp->v_bufobj; 1646 ASSERT_BO_WLOCKED(bo); 1647 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 1648 1649 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 1650 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 1651 ("bgetvp: bp already attached! %p", bp)); 1652 1653 vhold(vp); 1654 bp->b_vp = vp; 1655 bp->b_bufobj = bo; 1656 /* 1657 * Insert onto list for new vnode. 1658 */ 1659 buf_vlist_add(bp, bo, BX_VNCLEAN); 1660 } 1661 1662 /* 1663 * Disassociate a buffer from a vnode. 1664 */ 1665 void 1666 brelvp(struct buf *bp) 1667 { 1668 struct bufobj *bo; 1669 struct vnode *vp; 1670 1671 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1672 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 1673 1674 /* 1675 * Delete from old vnode list, if on one. 1676 */ 1677 vp = bp->b_vp; /* XXX */ 1678 bo = bp->b_bufobj; 1679 BO_LOCK(bo); 1680 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 1681 buf_vlist_remove(bp); 1682 else 1683 panic("brelvp: Buffer %p not on queue.", bp); 1684 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 1685 bo->bo_flag &= ~BO_ONWORKLST; 1686 mtx_lock(&sync_mtx); 1687 LIST_REMOVE(bo, bo_synclist); 1688 syncer_worklist_len--; 1689 mtx_unlock(&sync_mtx); 1690 } 1691 bp->b_vp = NULL; 1692 bp->b_bufobj = NULL; 1693 BO_UNLOCK(bo); 1694 vdrop(vp); 1695 } 1696 1697 /* 1698 * Add an item to the syncer work queue. 
1699 */ 1700 static void 1701 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 1702 { 1703 int slot; 1704 1705 ASSERT_BO_WLOCKED(bo); 1706 1707 mtx_lock(&sync_mtx); 1708 if (bo->bo_flag & BO_ONWORKLST) 1709 LIST_REMOVE(bo, bo_synclist); 1710 else { 1711 bo->bo_flag |= BO_ONWORKLST; 1712 syncer_worklist_len++; 1713 } 1714 1715 if (delay > syncer_maxdelay - 2) 1716 delay = syncer_maxdelay - 2; 1717 slot = (syncer_delayno + delay) & syncer_mask; 1718 1719 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 1720 mtx_unlock(&sync_mtx); 1721 } 1722 1723 static int 1724 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 1725 { 1726 int error, len; 1727 1728 mtx_lock(&sync_mtx); 1729 len = syncer_worklist_len - sync_vnode_count; 1730 mtx_unlock(&sync_mtx); 1731 error = SYSCTL_OUT(req, &len, sizeof(len)); 1732 return (error); 1733 } 1734 1735 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, 1736 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 1737 1738 static struct proc *updateproc; 1739 static void sched_sync(void); 1740 static struct kproc_desc up_kp = { 1741 "syncer", 1742 sched_sync, 1743 &updateproc 1744 }; 1745 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 1746 1747 static int 1748 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 1749 { 1750 struct vnode *vp; 1751 struct mount *mp; 1752 1753 *bo = LIST_FIRST(slp); 1754 if (*bo == NULL) 1755 return (0); 1756 vp = (*bo)->__bo_vnode; /* XXX */ 1757 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 1758 return (1); 1759 /* 1760 * We use vhold in case the vnode does not 1761 * successfully sync. vhold prevents the vnode from 1762 * going away when we unlock the sync_mtx so that 1763 * we can acquire the vnode interlock. 1764 */ 1765 vholdl(vp); 1766 mtx_unlock(&sync_mtx); 1767 VI_UNLOCK(vp); 1768 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1769 vdrop(vp); 1770 mtx_lock(&sync_mtx); 1771 return (*bo == LIST_FIRST(slp)); 1772 } 1773 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1774 (void) VOP_FSYNC(vp, MNT_LAZY, td); 1775 VOP_UNLOCK(vp, 0); 1776 vn_finished_write(mp); 1777 BO_LOCK(*bo); 1778 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 1779 /* 1780 * Put us back on the worklist. The worklist 1781 * routine will remove us from our current 1782 * position and then add us back in at a later 1783 * position. 1784 */ 1785 vn_syncer_add_to_worklist(*bo, syncdelay); 1786 } 1787 BO_UNLOCK(*bo); 1788 vdrop(vp); 1789 mtx_lock(&sync_mtx); 1790 return (0); 1791 } 1792 1793 static int first_printf = 1; 1794 1795 /* 1796 * System filesystem synchronizer daemon. 
1797 */ 1798 static void 1799 sched_sync(void) 1800 { 1801 struct synclist *next, *slp; 1802 struct bufobj *bo; 1803 long starttime; 1804 struct thread *td = curthread; 1805 int last_work_seen; 1806 int net_worklist_len; 1807 int syncer_final_iter; 1808 int error; 1809 1810 last_work_seen = 0; 1811 syncer_final_iter = 0; 1812 syncer_state = SYNCER_RUNNING; 1813 starttime = time_uptime; 1814 td->td_pflags |= TDP_NORUNNINGBUF; 1815 1816 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 1817 SHUTDOWN_PRI_LAST); 1818 1819 mtx_lock(&sync_mtx); 1820 for (;;) { 1821 if (syncer_state == SYNCER_FINAL_DELAY && 1822 syncer_final_iter == 0) { 1823 mtx_unlock(&sync_mtx); 1824 kproc_suspend_check(td->td_proc); 1825 mtx_lock(&sync_mtx); 1826 } 1827 net_worklist_len = syncer_worklist_len - sync_vnode_count; 1828 if (syncer_state != SYNCER_RUNNING && 1829 starttime != time_uptime) { 1830 if (first_printf) { 1831 printf("\nSyncing disks, vnodes remaining..."); 1832 first_printf = 0; 1833 } 1834 printf("%d ", net_worklist_len); 1835 } 1836 starttime = time_uptime; 1837 1838 /* 1839 * Push files whose dirty time has expired. Be careful 1840 * of interrupt race on slp queue. 1841 * 1842 * Skip over empty worklist slots when shutting down. 1843 */ 1844 do { 1845 slp = &syncer_workitem_pending[syncer_delayno]; 1846 syncer_delayno += 1; 1847 if (syncer_delayno == syncer_maxdelay) 1848 syncer_delayno = 0; 1849 next = &syncer_workitem_pending[syncer_delayno]; 1850 /* 1851 * If the worklist has wrapped since the 1852 * it was emptied of all but syncer vnodes, 1853 * switch to the FINAL_DELAY state and run 1854 * for one more second. 1855 */ 1856 if (syncer_state == SYNCER_SHUTTING_DOWN && 1857 net_worklist_len == 0 && 1858 last_work_seen == syncer_delayno) { 1859 syncer_state = SYNCER_FINAL_DELAY; 1860 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 1861 } 1862 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 1863 syncer_worklist_len > 0); 1864 1865 /* 1866 * Keep track of the last time there was anything 1867 * on the worklist other than syncer vnodes. 1868 * Return to the SHUTTING_DOWN state if any 1869 * new work appears. 1870 */ 1871 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 1872 last_work_seen = syncer_delayno; 1873 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 1874 syncer_state = SYNCER_SHUTTING_DOWN; 1875 while (!LIST_EMPTY(slp)) { 1876 error = sync_vnode(slp, &bo, td); 1877 if (error == 1) { 1878 LIST_REMOVE(bo, bo_synclist); 1879 LIST_INSERT_HEAD(next, bo, bo_synclist); 1880 continue; 1881 } 1882 1883 if (first_printf == 0) { 1884 /* 1885 * Drop the sync mutex, because some watchdog 1886 * drivers need to sleep while patting 1887 */ 1888 mtx_unlock(&sync_mtx); 1889 wdog_kern_pat(WD_LASTVAL); 1890 mtx_lock(&sync_mtx); 1891 } 1892 1893 } 1894 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 1895 syncer_final_iter--; 1896 /* 1897 * The variable rushjob allows the kernel to speed up the 1898 * processing of the filesystem syncer process. A rushjob 1899 * value of N tells the filesystem syncer to process the next 1900 * N seconds worth of work on its queue ASAP. Currently rushjob 1901 * is used by the soft update code to speed up the filesystem 1902 * syncer process when the incore state is getting so far 1903 * ahead of the disk that the kernel memory pool is being 1904 * threatened with exhaustion. 
1905 */ 1906 if (rushjob > 0) { 1907 rushjob -= 1; 1908 continue; 1909 } 1910 /* 1911 * Just sleep for a short period of time between 1912 * iterations when shutting down to allow some I/O 1913 * to happen. 1914 * 1915 * If it has taken us less than a second to process the 1916 * current work, then wait. Otherwise start right over 1917 * again. We can still lose time if any single round 1918 * takes more than two seconds, but it does not really 1919 * matter as we are just trying to generally pace the 1920 * filesystem activity. 1921 */ 1922 if (syncer_state != SYNCER_RUNNING || 1923 time_uptime == starttime) { 1924 thread_lock(td); 1925 sched_prio(td, PPAUSE); 1926 thread_unlock(td); 1927 } 1928 if (syncer_state != SYNCER_RUNNING) 1929 cv_timedwait(&sync_wakeup, &sync_mtx, 1930 hz / SYNCER_SHUTDOWN_SPEEDUP); 1931 else if (time_uptime == starttime) 1932 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 1933 } 1934 } 1935 1936 /* 1937 * Request the syncer daemon to speed up its work. 1938 * We never push it to speed up more than half of its 1939 * normal turn time, otherwise it could take over the cpu. 1940 */ 1941 int 1942 speedup_syncer(void) 1943 { 1944 int ret = 0; 1945 1946 mtx_lock(&sync_mtx); 1947 if (rushjob < syncdelay / 2) { 1948 rushjob += 1; 1949 stat_rush_requests += 1; 1950 ret = 1; 1951 } 1952 mtx_unlock(&sync_mtx); 1953 cv_broadcast(&sync_wakeup); 1954 return (ret); 1955 } 1956 1957 /* 1958 * Tell the syncer to speed up its work and run though its work 1959 * list several times, then tell it to shut down. 1960 */ 1961 static void 1962 syncer_shutdown(void *arg, int howto) 1963 { 1964 1965 if (howto & RB_NOSYNC) 1966 return; 1967 mtx_lock(&sync_mtx); 1968 syncer_state = SYNCER_SHUTTING_DOWN; 1969 rushjob = 0; 1970 mtx_unlock(&sync_mtx); 1971 cv_broadcast(&sync_wakeup); 1972 kproc_shutdown(arg, howto); 1973 } 1974 1975 void 1976 syncer_suspend(void) 1977 { 1978 1979 syncer_shutdown(updateproc, 0); 1980 } 1981 1982 void 1983 syncer_resume(void) 1984 { 1985 1986 mtx_lock(&sync_mtx); 1987 first_printf = 1; 1988 syncer_state = SYNCER_RUNNING; 1989 mtx_unlock(&sync_mtx); 1990 cv_broadcast(&sync_wakeup); 1991 kproc_resume(updateproc); 1992 } 1993 1994 /* 1995 * Reassign a buffer from one vnode to another. 1996 * Used to assign file specific control information 1997 * (indirect blocks) to the vnode to which they belong. 1998 */ 1999 void 2000 reassignbuf(struct buf *bp) 2001 { 2002 struct vnode *vp; 2003 struct bufobj *bo; 2004 int delay; 2005 #ifdef INVARIANTS 2006 struct bufv *bv; 2007 #endif 2008 2009 vp = bp->b_vp; 2010 bo = bp->b_bufobj; 2011 ++reassignbufcalls; 2012 2013 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2014 bp, bp->b_vp, bp->b_flags); 2015 /* 2016 * B_PAGING flagged buffers cannot be reassigned because their vp 2017 * is not fully linked in. 2018 */ 2019 if (bp->b_flags & B_PAGING) 2020 panic("cannot reassign paging buffer"); 2021 2022 /* 2023 * Delete from old vnode list, if on one. 2024 */ 2025 BO_LOCK(bo); 2026 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2027 buf_vlist_remove(bp); 2028 else 2029 panic("reassignbuf: Buffer %p not on queue.", bp); 2030 /* 2031 * If dirty, put on list of dirty buffers; otherwise insert onto list 2032 * of clean buffers. 
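 *
 * For illustration (hedged; the callers live in vfs_bio.c), a delayed
 * write typically passes through here twice:
 *
 *	bdirty(bp):	B_DELWRI set, reassignbuf() moves bp to bo_dirty
 *			and, via vn_syncer_add_to_worklist() below, puts
 *			the bufobj on the syncer worklist;
 *	bundirty(bp):	B_DELWRI cleared, reassignbuf() moves bp back to
 *			bo_clean, and once bo_dirty is empty the bufobj is
 *			taken off the worklist again.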
2033 */ 2034 if (bp->b_flags & B_DELWRI) { 2035 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2036 switch (vp->v_type) { 2037 case VDIR: 2038 delay = dirdelay; 2039 break; 2040 case VCHR: 2041 delay = metadelay; 2042 break; 2043 default: 2044 delay = filedelay; 2045 } 2046 vn_syncer_add_to_worklist(bo, delay); 2047 } 2048 buf_vlist_add(bp, bo, BX_VNDIRTY); 2049 } else { 2050 buf_vlist_add(bp, bo, BX_VNCLEAN); 2051 2052 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2053 mtx_lock(&sync_mtx); 2054 LIST_REMOVE(bo, bo_synclist); 2055 syncer_worklist_len--; 2056 mtx_unlock(&sync_mtx); 2057 bo->bo_flag &= ~BO_ONWORKLST; 2058 } 2059 } 2060 #ifdef INVARIANTS 2061 bv = &bo->bo_clean; 2062 bp = TAILQ_FIRST(&bv->bv_hd); 2063 KASSERT(bp == NULL || bp->b_bufobj == bo, 2064 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2065 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2066 KASSERT(bp == NULL || bp->b_bufobj == bo, 2067 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2068 bv = &bo->bo_dirty; 2069 bp = TAILQ_FIRST(&bv->bv_hd); 2070 KASSERT(bp == NULL || bp->b_bufobj == bo, 2071 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2072 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2073 KASSERT(bp == NULL || bp->b_bufobj == bo, 2074 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2075 #endif 2076 BO_UNLOCK(bo); 2077 } 2078 2079 /* 2080 * A temporary hack until refcount_* APIs are sorted out. 2081 */ 2082 static __inline int 2083 vfs_refcount_acquire_if_not_zero(volatile u_int *count) 2084 { 2085 u_int old; 2086 2087 for (;;) { 2088 old = *count; 2089 if (old == 0) 2090 return (0); 2091 if (atomic_cmpset_int(count, old, old + 1)) 2092 return (1); 2093 } 2094 } 2095 2096 static __inline int 2097 vfs_refcount_release_if_not_last(volatile u_int *count) 2098 { 2099 u_int old; 2100 2101 for (;;) { 2102 old = *count; 2103 if (old == 1) 2104 return (0); 2105 if (atomic_cmpset_int(count, old, old - 1)) 2106 return (1); 2107 } 2108 } 2109 2110 static void 2111 v_init_counters(struct vnode *vp) 2112 { 2113 2114 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2115 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2116 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2117 2118 refcount_init(&vp->v_holdcnt, 1); 2119 refcount_init(&vp->v_usecount, 1); 2120 } 2121 2122 /* 2123 * Increment the use and hold counts on the vnode, taking care to reference 2124 * the driver's usecount if this is a chardev. The _vhold() will remove 2125 * the vnode from the free list if it is presently free. 2126 */ 2127 static void 2128 v_incr_usecount(struct vnode *vp) 2129 { 2130 2131 ASSERT_VI_UNLOCKED(vp, __func__); 2132 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2133 2134 if (vp->v_type == VCHR) { 2135 VI_LOCK(vp); 2136 _vhold(vp, true); 2137 if (vp->v_iflag & VI_OWEINACT) { 2138 VNASSERT(vp->v_usecount == 0, vp, 2139 ("vnode with usecount and VI_OWEINACT set")); 2140 vp->v_iflag &= ~VI_OWEINACT; 2141 } 2142 refcount_acquire(&vp->v_usecount); 2143 v_incr_devcount(vp); 2144 VI_UNLOCK(vp); 2145 return; 2146 } 2147 2148 _vhold(vp, false); 2149 if (vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) { 2150 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2151 ("vnode with usecount and VI_OWEINACT set")); 2152 } else { 2153 VI_LOCK(vp); 2154 if (vp->v_iflag & VI_OWEINACT) 2155 vp->v_iflag &= ~VI_OWEINACT; 2156 refcount_acquire(&vp->v_usecount); 2157 VI_UNLOCK(vp); 2158 } 2159 } 2160 2161 /* 2162 * Increment si_usecount of the associated device, if any. 
2163 */ 2164 static void 2165 v_incr_devcount(struct vnode *vp) 2166 { 2167 2168 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2169 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2170 dev_lock(); 2171 vp->v_rdev->si_usecount++; 2172 dev_unlock(); 2173 } 2174 } 2175 2176 /* 2177 * Decrement si_usecount of the associated device, if any. 2178 */ 2179 static void 2180 v_decr_devcount(struct vnode *vp) 2181 { 2182 2183 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2184 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2185 dev_lock(); 2186 vp->v_rdev->si_usecount--; 2187 dev_unlock(); 2188 } 2189 } 2190 2191 /* 2192 * Grab a particular vnode from the free list, increment its 2193 * reference count and lock it. VI_DOOMED is set if the vnode 2194 * is being destroyed. Only callers who specify LK_RETRY will 2195 * see doomed vnodes. If inactive processing was delayed in 2196 * vput, try to do it here. 2197 * 2198 * Notes on lockless counter manipulation: 2199 * _vhold, vputx and other routines make various decisions based 2200 * on either holdcnt or usecount being 0. As long as either counter 2201 * is not transitioning 0->1 nor 1->0, the manipulation can be done 2202 * with atomic operations. Otherwise the interlock is taken. 2203 */ 2204 int 2205 vget(struct vnode *vp, int flags, struct thread *td) 2206 { 2207 int error, oweinact; 2208 2209 VNASSERT((flags & LK_TYPE_MASK) != 0, vp, 2210 ("vget: invalid lock operation")); 2211 2212 if ((flags & LK_INTERLOCK) != 0) 2213 ASSERT_VI_LOCKED(vp, __func__); 2214 else 2215 ASSERT_VI_UNLOCKED(vp, __func__); 2216 if ((flags & LK_VNHELD) != 0) 2217 VNASSERT((vp->v_holdcnt > 0), vp, 2218 ("vget: LK_VNHELD passed but vnode not held")); 2219 2220 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2221 2222 if ((flags & LK_VNHELD) == 0) 2223 _vhold(vp, (flags & LK_INTERLOCK) != 0); 2224 2225 if ((error = vn_lock(vp, flags)) != 0) { 2226 vdrop(vp); 2227 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 2228 vp); 2229 return (error); 2230 } 2231 if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) 2232 panic("vget: vn_lock failed to return ENOENT\n"); 2233 /* 2234 * We don't guarantee that any particular close will 2235 * trigger inactive processing, so just make a best effort 2236 * here at preventing a reference to a removed file. If 2237 * we don't succeed, no harm is done. 2238 * 2239 * Upgrade our holdcnt to a usecount. 2240 */ 2241 if (vp->v_type != VCHR && 2242 vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) { 2243 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2244 ("vnode with usecount and VI_OWEINACT set")); 2245 } else { 2246 VI_LOCK(vp); 2247 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2248 oweinact = 0; 2249 } else { 2250 oweinact = 1; 2251 vp->v_iflag &= ~VI_OWEINACT; 2252 } 2253 refcount_acquire(&vp->v_usecount); 2254 v_incr_devcount(vp); 2255 if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE && 2256 (flags & LK_NOWAIT) == 0) 2257 vinactive(vp, td); 2258 VI_UNLOCK(vp); 2259 } 2260 return (0); 2261 } 2262 2263 /* 2264 * Increase the reference count of a vnode. 2265 */ 2266 void 2267 vref(struct vnode *vp) 2268 { 2269 2270 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2271 v_incr_usecount(vp); 2272 } 2273 2274 /* 2275 * Return reference count of a vnode. 2276 * 2277 * The results of this call are only guaranteed when some mechanism is used to 2278 * stop other processes from gaining references to the vnode. This may be the 2279 * case if the caller holds the only reference.
This is also useful when stale 2280 * data is acceptable as race conditions may be accounted for by some other 2281 * means. 2282 */ 2283 int 2284 vrefcnt(struct vnode *vp) 2285 { 2286 2287 return (vp->v_usecount); 2288 } 2289 2290 #define VPUTX_VRELE 1 2291 #define VPUTX_VPUT 2 2292 #define VPUTX_VUNREF 3 2293 2294 /* 2295 * Decrement the use and hold counts for a vnode. 2296 * 2297 * See an explanation near vget() as to why atomic operation is safe. 2298 */ 2299 static void 2300 vputx(struct vnode *vp, int func) 2301 { 2302 int error; 2303 2304 KASSERT(vp != NULL, ("vputx: null vp")); 2305 if (func == VPUTX_VUNREF) 2306 ASSERT_VOP_LOCKED(vp, "vunref"); 2307 else if (func == VPUTX_VPUT) 2308 ASSERT_VOP_LOCKED(vp, "vput"); 2309 else 2310 KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); 2311 ASSERT_VI_UNLOCKED(vp, __func__); 2312 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2313 2314 if (vp->v_type != VCHR && 2315 vfs_refcount_release_if_not_last(&vp->v_usecount)) { 2316 if (func == VPUTX_VPUT) 2317 VOP_UNLOCK(vp, 0); 2318 vdrop(vp); 2319 return; 2320 } 2321 2322 VI_LOCK(vp); 2323 2324 /* 2325 * We want to hold the vnode until the inactive finishes to 2326 * prevent vgone() races. We drop the use count here and the 2327 * hold count below when we're done. 2328 */ 2329 if (!refcount_release(&vp->v_usecount) || 2330 (vp->v_iflag & VI_DOINGINACT)) { 2331 if (func == VPUTX_VPUT) 2332 VOP_UNLOCK(vp, 0); 2333 v_decr_devcount(vp); 2334 vdropl(vp); 2335 return; 2336 } 2337 2338 v_decr_devcount(vp); 2339 2340 error = 0; 2341 2342 if (vp->v_usecount != 0) { 2343 vprint("vputx: usecount not zero", vp); 2344 panic("vputx: usecount not zero"); 2345 } 2346 2347 CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); 2348 2349 /* 2350 * We must call VOP_INACTIVE with the node locked. Mark 2351 * as VI_DOINGINACT to avoid recursion. 2352 */ 2353 vp->v_iflag |= VI_OWEINACT; 2354 switch (func) { 2355 case VPUTX_VRELE: 2356 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 2357 VI_LOCK(vp); 2358 break; 2359 case VPUTX_VPUT: 2360 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2361 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 2362 LK_NOWAIT); 2363 VI_LOCK(vp); 2364 } 2365 break; 2366 case VPUTX_VUNREF: 2367 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2368 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 2369 VI_LOCK(vp); 2370 } 2371 break; 2372 } 2373 VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp, 2374 ("vnode with usecount and VI_OWEINACT set")); 2375 if (error == 0) { 2376 if (vp->v_iflag & VI_OWEINACT) 2377 vinactive(vp, curthread); 2378 if (func != VPUTX_VUNREF) 2379 VOP_UNLOCK(vp, 0); 2380 } 2381 vdropl(vp); 2382 } 2383 2384 /* 2385 * Vnode put/release. 2386 * If count drops to zero, call inactive routine and return to freelist. 2387 */ 2388 void 2389 vrele(struct vnode *vp) 2390 { 2391 2392 vputx(vp, VPUTX_VRELE); 2393 } 2394 2395 /* 2396 * Release an already locked vnode. This give the same effects as 2397 * unlock+vrele(), but takes less time and avoids releasing and 2398 * re-aquiring the lock (as vrele() acquires the lock internally.) 2399 */ 2400 void 2401 vput(struct vnode *vp) 2402 { 2403 2404 vputx(vp, VPUTX_VPUT); 2405 } 2406 2407 /* 2408 * Release an exclusively locked vnode. Do not unlock the vnode lock. 2409 */ 2410 void 2411 vunref(struct vnode *vp) 2412 { 2413 2414 vputx(vp, VPUTX_VUNREF); 2415 } 2416 2417 /* 2418 * Increase the hold count and activate if this is the first reference. 
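 *
 * The lockless fast path below relies on vfs_refcount_acquire_if_not_zero()
 * above; as a hedged illustration (a userland sketch using C11 atomics
 * rather than the kernel's atomic_cmpset_int), the idea is simply:
 *
 *	int
 *	acquire_if_not_zero(_Atomic unsigned *cnt)
 *	{
 *		unsigned old = atomic_load(cnt);
 *
 *		while (old != 0)
 *			if (atomic_compare_exchange_weak(cnt, &old, old + 1))
 *				return (1);
 *		return (0);
 *	}
 *
 * i.e. the count is only bumped when doing so cannot be the 0->1
 * transition; that transition (pulling the vnode off the free list) is
 * done under the interlock below.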
2419 */ 2420 void 2421 _vhold(struct vnode *vp, bool locked) 2422 { 2423 struct mount *mp; 2424 2425 if (locked) 2426 ASSERT_VI_LOCKED(vp, __func__); 2427 else 2428 ASSERT_VI_UNLOCKED(vp, __func__); 2429 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2430 if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) { 2431 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2432 ("_vhold: vnode with holdcnt is free")); 2433 return; 2434 } 2435 2436 if (!locked) 2437 VI_LOCK(vp); 2438 if ((vp->v_iflag & VI_FREE) == 0) { 2439 refcount_acquire(&vp->v_holdcnt); 2440 if (!locked) 2441 VI_UNLOCK(vp); 2442 return; 2443 } 2444 VNASSERT(vp->v_holdcnt == 0, vp, 2445 ("%s: wrong hold count", __func__)); 2446 VNASSERT(vp->v_op != NULL, vp, 2447 ("%s: vnode already reclaimed.", __func__)); 2448 /* 2449 * Remove a vnode from the free list, mark it as in use, 2450 * and put it on the active list. 2451 */ 2452 mtx_lock(&vnode_free_list_mtx); 2453 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 2454 freevnodes--; 2455 vp->v_iflag &= ~(VI_FREE|VI_AGE); 2456 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 2457 ("Activating already active vnode")); 2458 vp->v_iflag |= VI_ACTIVE; 2459 mp = vp->v_mount; 2460 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 2461 mp->mnt_activevnodelistsize++; 2462 mtx_unlock(&vnode_free_list_mtx); 2463 refcount_acquire(&vp->v_holdcnt); 2464 if (!locked) 2465 VI_UNLOCK(vp); 2466 } 2467 2468 /* 2469 * Drop the hold count of the vnode. If this is the last reference to 2470 * the vnode we place it on the free list unless it has been vgone'd 2471 * (marked VI_DOOMED) in which case we will free it. 2472 */ 2473 void 2474 _vdrop(struct vnode *vp, bool locked) 2475 { 2476 struct bufobj *bo; 2477 struct mount *mp; 2478 int active; 2479 2480 if (locked) 2481 ASSERT_VI_LOCKED(vp, __func__); 2482 else 2483 ASSERT_VI_UNLOCKED(vp, __func__); 2484 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2485 if ((int)vp->v_holdcnt <= 0) 2486 panic("vdrop: holdcnt %d", vp->v_holdcnt); 2487 if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) { 2488 if (locked) 2489 VI_UNLOCK(vp); 2490 return; 2491 } 2492 2493 if (!locked) 2494 VI_LOCK(vp); 2495 if (refcount_release(&vp->v_holdcnt) == 0) { 2496 VI_UNLOCK(vp); 2497 return; 2498 } 2499 if ((vp->v_iflag & VI_DOOMED) == 0) { 2500 /* 2501 * Mark a vnode as free: remove it from its active list 2502 * and put it up for recycling on the freelist. 2503 */ 2504 VNASSERT(vp->v_op != NULL, vp, 2505 ("vdropl: vnode already reclaimed.")); 2506 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2507 ("vnode already free")); 2508 VNASSERT(vp->v_holdcnt == 0, vp, 2509 ("vdropl: freeing when we shouldn't")); 2510 active = vp->v_iflag & VI_ACTIVE; 2511 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2512 vp->v_iflag &= ~VI_ACTIVE; 2513 mp = vp->v_mount; 2514 mtx_lock(&vnode_free_list_mtx); 2515 if (active) { 2516 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, 2517 v_actfreelist); 2518 mp->mnt_activevnodelistsize--; 2519 } 2520 if (vp->v_iflag & VI_AGE) { 2521 TAILQ_INSERT_HEAD(&vnode_free_list, vp, 2522 v_actfreelist); 2523 } else { 2524 TAILQ_INSERT_TAIL(&vnode_free_list, vp, 2525 v_actfreelist); 2526 } 2527 freevnodes++; 2528 vp->v_iflag &= ~VI_AGE; 2529 vp->v_iflag |= VI_FREE; 2530 mtx_unlock(&vnode_free_list_mtx); 2531 } else { 2532 atomic_add_long(&free_owe_inact, 1); 2533 } 2534 VI_UNLOCK(vp); 2535 return; 2536 } 2537 /* 2538 * The vnode has been marked for destruction, so free it. 
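 *
 * (For illustration: the usual way a vnode gets here is
 *
 *	vgone(vp);	sets VI_DOOMED and reclaims the file's data
 *	vdrop(vp);	drops the final hold reference, 1 -> 0
 *
 * after which the assertions below verify that the vnode is completely
 * clean before its memory is returned to vnode_zone.)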
2539 */ 2540 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2541 atomic_subtract_long(&numvnodes, 1); 2542 bo = &vp->v_bufobj; 2543 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2544 ("cleaned vnode still on the free list.")); 2545 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2546 VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); 2547 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2548 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2549 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2550 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2551 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 2552 ("clean blk trie not empty")); 2553 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2554 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 2555 ("dirty blk trie not empty")); 2556 VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); 2557 VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); 2558 VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); 2559 VI_UNLOCK(vp); 2560 #ifdef MAC 2561 mac_vnode_destroy(vp); 2562 #endif 2563 if (vp->v_pollinfo != NULL) 2564 destroy_vpollinfo(vp->v_pollinfo); 2565 #ifdef INVARIANTS 2566 /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ 2567 vp->v_op = NULL; 2568 #endif 2569 rangelock_destroy(&vp->v_rl); 2570 lockdestroy(vp->v_vnlock); 2571 mtx_destroy(&vp->v_interlock); 2572 rw_destroy(BO_LOCKPTR(bo)); 2573 uma_zfree(vnode_zone, vp); 2574 } 2575 2576 /* 2577 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 2578 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 2579 * OWEINACT tracks whether a vnode missed a call to inactive due to a 2580 * failed lock upgrade. 2581 */ 2582 void 2583 vinactive(struct vnode *vp, struct thread *td) 2584 { 2585 struct vm_object *obj; 2586 2587 ASSERT_VOP_ELOCKED(vp, "vinactive"); 2588 ASSERT_VI_LOCKED(vp, "vinactive"); 2589 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 2590 ("vinactive: recursed on VI_DOINGINACT")); 2591 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2592 vp->v_iflag |= VI_DOINGINACT; 2593 vp->v_iflag &= ~VI_OWEINACT; 2594 VI_UNLOCK(vp); 2595 /* 2596 * Before moving off the active list, we must be sure that any 2597 * modified pages are on the vnode's dirty list since these will 2598 * no longer be checked once the vnode is on the inactive list. 2599 * Because the vnode vm object keeps a hold reference on the vnode 2600 * if there is at least one resident non-cached page, the vnode 2601 * cannot leave the active list without the page cleanup done. 2602 */ 2603 obj = vp->v_object; 2604 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { 2605 VM_OBJECT_WLOCK(obj); 2606 vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC); 2607 VM_OBJECT_WUNLOCK(obj); 2608 } 2609 VOP_INACTIVE(vp, td); 2610 VI_LOCK(vp); 2611 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 2612 ("vinactive: lost VI_DOINGINACT")); 2613 vp->v_iflag &= ~VI_DOINGINACT; 2614 } 2615 2616 /* 2617 * Remove any vnodes in the vnode table belonging to mount point mp. 2618 * 2619 * If FORCECLOSE is not specified, there should not be any active ones, 2620 * return error if any are found (nb: this is a user error, not a 2621 * system error). If FORCECLOSE is specified, detach any active vnodes 2622 * that are found. 2623 * 2624 * If WRITECLOSE is set, only flush out regular file vnodes open for 2625 * writing. 
2626 * 2627 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 2628 * 2629 * `rootrefs' specifies the base reference count for the root vnode 2630 * of this filesystem. The root vnode is considered busy if its 2631 * v_usecount exceeds this value. On a successful return, vflush(, td) 2632 * will call vrele() on the root vnode exactly rootrefs times. 2633 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 2634 * be zero. 2635 */ 2636 #ifdef DIAGNOSTIC 2637 static int busyprt = 0; /* print out busy vnodes */ 2638 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 2639 #endif 2640 2641 int 2642 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 2643 { 2644 struct vnode *vp, *mvp, *rootvp = NULL; 2645 struct vattr vattr; 2646 int busy = 0, error; 2647 2648 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 2649 rootrefs, flags); 2650 if (rootrefs > 0) { 2651 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 2652 ("vflush: bad args")); 2653 /* 2654 * Get the filesystem root vnode. We can vput() it 2655 * immediately, since with rootrefs > 0, it won't go away. 2656 */ 2657 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 2658 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 2659 __func__, error); 2660 return (error); 2661 } 2662 vput(rootvp); 2663 } 2664 loop: 2665 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2666 vholdl(vp); 2667 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 2668 if (error) { 2669 vdrop(vp); 2670 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2671 goto loop; 2672 } 2673 /* 2674 * Skip over a vnodes marked VV_SYSTEM. 2675 */ 2676 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 2677 VOP_UNLOCK(vp, 0); 2678 vdrop(vp); 2679 continue; 2680 } 2681 /* 2682 * If WRITECLOSE is set, flush out unlinked but still open 2683 * files (even if open only for reading) and regular file 2684 * vnodes open for writing. 2685 */ 2686 if (flags & WRITECLOSE) { 2687 if (vp->v_object != NULL) { 2688 VM_OBJECT_WLOCK(vp->v_object); 2689 vm_object_page_clean(vp->v_object, 0, 0, 0); 2690 VM_OBJECT_WUNLOCK(vp->v_object); 2691 } 2692 error = VOP_FSYNC(vp, MNT_WAIT, td); 2693 if (error != 0) { 2694 VOP_UNLOCK(vp, 0); 2695 vdrop(vp); 2696 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2697 return (error); 2698 } 2699 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 2700 VI_LOCK(vp); 2701 2702 if ((vp->v_type == VNON || 2703 (error == 0 && vattr.va_nlink > 0)) && 2704 (vp->v_writecount == 0 || vp->v_type != VREG)) { 2705 VOP_UNLOCK(vp, 0); 2706 vdropl(vp); 2707 continue; 2708 } 2709 } else 2710 VI_LOCK(vp); 2711 /* 2712 * With v_usecount == 0, all we need to do is clear out the 2713 * vnode data structures and we are done. 2714 * 2715 * If FORCECLOSE is set, forcibly close the vnode. 2716 */ 2717 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 2718 vgonel(vp); 2719 } else { 2720 busy++; 2721 #ifdef DIAGNOSTIC 2722 if (busyprt) 2723 vprint("vflush: busy vnode", vp); 2724 #endif 2725 } 2726 VOP_UNLOCK(vp, 0); 2727 vdropl(vp); 2728 } 2729 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 2730 /* 2731 * If just the root vnode is busy, and if its refcount 2732 * is equal to `rootrefs', then go ahead and kill it. 
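 *
 * (Worked example, for illustration: a filesystem that keeps one private
 * reference on its root vnode calls vflush() with rootrefs == 1; the loop
 * above then counts the root as the single busy vnode, v_usecount equals
 * rootrefs, so the root is vgone'd here and the vrele() loop at the end
 * releases the filesystem's reference, letting the flush succeed.)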
2733 */ 2734 VI_LOCK(rootvp); 2735 KASSERT(busy > 0, ("vflush: not busy")); 2736 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 2737 ("vflush: usecount %d < rootrefs %d", 2738 rootvp->v_usecount, rootrefs)); 2739 if (busy == 1 && rootvp->v_usecount == rootrefs) { 2740 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 2741 vgone(rootvp); 2742 VOP_UNLOCK(rootvp, 0); 2743 busy = 0; 2744 } else 2745 VI_UNLOCK(rootvp); 2746 } 2747 if (busy) { 2748 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 2749 busy); 2750 return (EBUSY); 2751 } 2752 for (; rootrefs > 0; rootrefs--) 2753 vrele(rootvp); 2754 return (0); 2755 } 2756 2757 /* 2758 * Recycle an unused vnode to the front of the free list. 2759 */ 2760 int 2761 vrecycle(struct vnode *vp) 2762 { 2763 int recycled; 2764 2765 ASSERT_VOP_ELOCKED(vp, "vrecycle"); 2766 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2767 recycled = 0; 2768 VI_LOCK(vp); 2769 if (vp->v_usecount == 0) { 2770 recycled = 1; 2771 vgonel(vp); 2772 } 2773 VI_UNLOCK(vp); 2774 return (recycled); 2775 } 2776 2777 /* 2778 * Eliminate all activity associated with a vnode 2779 * in preparation for reuse. 2780 */ 2781 void 2782 vgone(struct vnode *vp) 2783 { 2784 VI_LOCK(vp); 2785 vgonel(vp); 2786 VI_UNLOCK(vp); 2787 } 2788 2789 static void 2790 notify_lowervp_vfs_dummy(struct mount *mp __unused, 2791 struct vnode *lowervp __unused) 2792 { 2793 } 2794 2795 /* 2796 * Notify upper mounts about reclaimed or unlinked vnode. 2797 */ 2798 void 2799 vfs_notify_upper(struct vnode *vp, int event) 2800 { 2801 static struct vfsops vgonel_vfsops = { 2802 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, 2803 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, 2804 }; 2805 struct mount *mp, *ump, *mmp; 2806 2807 mp = vp->v_mount; 2808 if (mp == NULL) 2809 return; 2810 2811 MNT_ILOCK(mp); 2812 if (TAILQ_EMPTY(&mp->mnt_uppers)) 2813 goto unlock; 2814 MNT_IUNLOCK(mp); 2815 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); 2816 mmp->mnt_op = &vgonel_vfsops; 2817 mmp->mnt_kern_flag |= MNTK_MARKER; 2818 MNT_ILOCK(mp); 2819 mp->mnt_kern_flag |= MNTK_VGONE_UPPER; 2820 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { 2821 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { 2822 ump = TAILQ_NEXT(ump, mnt_upper_link); 2823 continue; 2824 } 2825 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); 2826 MNT_IUNLOCK(mp); 2827 switch (event) { 2828 case VFS_NOTIFY_UPPER_RECLAIM: 2829 VFS_RECLAIM_LOWERVP(ump, vp); 2830 break; 2831 case VFS_NOTIFY_UPPER_UNLINK: 2832 VFS_UNLINK_LOWERVP(ump, vp); 2833 break; 2834 default: 2835 KASSERT(0, ("invalid event %d", event)); 2836 break; 2837 } 2838 MNT_ILOCK(mp); 2839 ump = TAILQ_NEXT(mmp, mnt_upper_link); 2840 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); 2841 } 2842 free(mmp, M_TEMP); 2843 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; 2844 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { 2845 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; 2846 wakeup(&mp->mnt_uppers); 2847 } 2848 unlock: 2849 MNT_IUNLOCK(mp); 2850 } 2851 2852 /* 2853 * vgone, with the vp interlock held. 2854 */ 2855 static void 2856 vgonel(struct vnode *vp) 2857 { 2858 struct thread *td; 2859 int oweinact; 2860 int active; 2861 struct mount *mp; 2862 2863 ASSERT_VOP_ELOCKED(vp, "vgonel"); 2864 ASSERT_VI_LOCKED(vp, "vgonel"); 2865 VNASSERT(vp->v_holdcnt, vp, 2866 ("vgonel: vp %p has no reference.", vp)); 2867 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2868 td = curthread; 2869 2870 /* 2871 * Don't vgonel if we're already doomed. 
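 *
 * (Aside, for illustration: vfs_notify_upper() above survives dropping
 * the mount interlock while calling into an upper filesystem by keeping
 * its place in mnt_uppers with a marker entry:
 *
 *	insert marker after the current element;
 *	MNT_IUNLOCK(mp); notify the upper mount; MNT_ILOCK(mp);
 *	advance to the element after the marker; remove the marker;
 *
 * so concurrent list changes cannot leave the iterator dangling.  The
 * same idea underlies the MNT_VNODE_FOREACH_ALL() marker vnodes used by
 * vflush() above.)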
2872 */ 2873 if (vp->v_iflag & VI_DOOMED) 2874 return; 2875 vp->v_iflag |= VI_DOOMED; 2876 2877 /* 2878 * Check to see if the vnode is in use. If so, we have to call 2879 * VOP_CLOSE() and VOP_INACTIVE(). 2880 */ 2881 active = vp->v_usecount; 2882 oweinact = (vp->v_iflag & VI_OWEINACT); 2883 VI_UNLOCK(vp); 2884 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 2885 2886 /* 2887 * If purging an active vnode, it must be closed and 2888 * deactivated before being reclaimed. 2889 */ 2890 if (active) 2891 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 2892 if (oweinact || active) { 2893 VI_LOCK(vp); 2894 if ((vp->v_iflag & VI_DOINGINACT) == 0) 2895 vinactive(vp, td); 2896 VI_UNLOCK(vp); 2897 } 2898 if (vp->v_type == VSOCK) 2899 vfs_unp_reclaim(vp); 2900 2901 /* 2902 * Clean out any buffers associated with the vnode. 2903 * If the flush fails, just toss the buffers. 2904 */ 2905 mp = NULL; 2906 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 2907 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 2908 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 2909 while (vinvalbuf(vp, 0, 0, 0) != 0) 2910 ; 2911 } 2912 2913 BO_LOCK(&vp->v_bufobj); 2914 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 2915 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 2916 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 2917 vp->v_bufobj.bo_clean.bv_cnt == 0, 2918 ("vp %p bufobj not invalidated", vp)); 2919 vp->v_bufobj.bo_flag |= BO_DEAD; 2920 BO_UNLOCK(&vp->v_bufobj); 2921 2922 /* 2923 * Reclaim the vnode. 2924 */ 2925 if (VOP_RECLAIM(vp, td)) 2926 panic("vgone: cannot reclaim"); 2927 if (mp != NULL) 2928 vn_finished_secondary_write(mp); 2929 VNASSERT(vp->v_object == NULL, vp, 2930 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); 2931 /* 2932 * Clear the advisory locks and wake up waiting threads. 2933 */ 2934 (void)VOP_ADVLOCKPURGE(vp); 2935 /* 2936 * Delete from old mount point vnode list. 2937 */ 2938 delmntque(vp); 2939 cache_purge(vp); 2940 /* 2941 * Done with purge, reset to the standard lock and invalidate 2942 * the vnode. 2943 */ 2944 VI_LOCK(vp); 2945 vp->v_vnlock = &vp->v_lock; 2946 vp->v_op = &dead_vnodeops; 2947 vp->v_tag = "none"; 2948 vp->v_type = VBAD; 2949 } 2950 2951 /* 2952 * Calculate the total number of references to a special device. 2953 */ 2954 int 2955 vcount(struct vnode *vp) 2956 { 2957 int count; 2958 2959 dev_lock(); 2960 count = vp->v_rdev->si_usecount; 2961 dev_unlock(); 2962 return (count); 2963 } 2964 2965 /* 2966 * Same as above, but using the struct cdev *as argument 2967 */ 2968 int 2969 count_dev(struct cdev *dev) 2970 { 2971 int count; 2972 2973 dev_lock(); 2974 count = dev->si_usecount; 2975 dev_unlock(); 2976 return(count); 2977 } 2978 2979 /* 2980 * Print out a description of a vnode. 2981 */ 2982 static char *typename[] = 2983 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", 2984 "VMARKER"}; 2985 2986 void 2987 vn_printf(struct vnode *vp, const char *fmt, ...) 
2988 { 2989 va_list ap; 2990 char buf[256], buf2[16]; 2991 u_long flags; 2992 2993 va_start(ap, fmt); 2994 vprintf(fmt, ap); 2995 va_end(ap); 2996 printf("%p: ", (void *)vp); 2997 printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); 2998 printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", 2999 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); 3000 buf[0] = '\0'; 3001 buf[1] = '\0'; 3002 if (vp->v_vflag & VV_ROOT) 3003 strlcat(buf, "|VV_ROOT", sizeof(buf)); 3004 if (vp->v_vflag & VV_ISTTY) 3005 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 3006 if (vp->v_vflag & VV_NOSYNC) 3007 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 3008 if (vp->v_vflag & VV_ETERNALDEV) 3009 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 3010 if (vp->v_vflag & VV_CACHEDLABEL) 3011 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 3012 if (vp->v_vflag & VV_TEXT) 3013 strlcat(buf, "|VV_TEXT", sizeof(buf)); 3014 if (vp->v_vflag & VV_COPYONWRITE) 3015 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 3016 if (vp->v_vflag & VV_SYSTEM) 3017 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 3018 if (vp->v_vflag & VV_PROCDEP) 3019 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 3020 if (vp->v_vflag & VV_NOKNOTE) 3021 strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); 3022 if (vp->v_vflag & VV_DELETED) 3023 strlcat(buf, "|VV_DELETED", sizeof(buf)); 3024 if (vp->v_vflag & VV_MD) 3025 strlcat(buf, "|VV_MD", sizeof(buf)); 3026 if (vp->v_vflag & VV_FORCEINSMQ) 3027 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 3028 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 3029 VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | 3030 VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); 3031 if (flags != 0) { 3032 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 3033 strlcat(buf, buf2, sizeof(buf)); 3034 } 3035 if (vp->v_iflag & VI_MOUNT) 3036 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 3037 if (vp->v_iflag & VI_AGE) 3038 strlcat(buf, "|VI_AGE", sizeof(buf)); 3039 if (vp->v_iflag & VI_DOOMED) 3040 strlcat(buf, "|VI_DOOMED", sizeof(buf)); 3041 if (vp->v_iflag & VI_FREE) 3042 strlcat(buf, "|VI_FREE", sizeof(buf)); 3043 if (vp->v_iflag & VI_ACTIVE) 3044 strlcat(buf, "|VI_ACTIVE", sizeof(buf)); 3045 if (vp->v_iflag & VI_DOINGINACT) 3046 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 3047 if (vp->v_iflag & VI_OWEINACT) 3048 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 3049 flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE | 3050 VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT); 3051 if (flags != 0) { 3052 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 3053 strlcat(buf, buf2, sizeof(buf)); 3054 } 3055 printf(" flags (%s)\n", buf + 1); 3056 if (mtx_owned(VI_MTX(vp))) 3057 printf(" VI_LOCKed"); 3058 if (vp->v_object != NULL) 3059 printf(" v_object %p ref %d pages %d " 3060 "cleanbuf %d dirtybuf %d\n", 3061 vp->v_object, vp->v_object->ref_count, 3062 vp->v_object->resident_page_count, 3063 vp->v_bufobj.bo_clean.bv_cnt, 3064 vp->v_bufobj.bo_dirty.bv_cnt); 3065 printf(" "); 3066 lockmgr_printinfo(vp->v_vnlock); 3067 if (vp->v_data != NULL) 3068 VOP_PRINT(vp); 3069 } 3070 3071 #ifdef DDB 3072 /* 3073 * List all of the locked vnodes in the system. 3074 * Called when debugging the kernel. 3075 */ 3076 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 3077 { 3078 struct mount *mp; 3079 struct vnode *vp; 3080 3081 /* 3082 * Note: because this is DDB, we can't obey the locking semantics 3083 * for these structures, which means we could catch an inconsistent 3084 * state and dereference a nasty pointer. 
Not much to be done 3085 * about that. 3086 */ 3087 db_printf("Locked vnodes\n"); 3088 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3089 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3090 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 3091 vprint("", vp); 3092 } 3093 } 3094 } 3095 3096 /* 3097 * Show details about the given vnode. 3098 */ 3099 DB_SHOW_COMMAND(vnode, db_show_vnode) 3100 { 3101 struct vnode *vp; 3102 3103 if (!have_addr) 3104 return; 3105 vp = (struct vnode *)addr; 3106 vn_printf(vp, "vnode "); 3107 } 3108 3109 /* 3110 * Show details about the given mount point. 3111 */ 3112 DB_SHOW_COMMAND(mount, db_show_mount) 3113 { 3114 struct mount *mp; 3115 struct vfsopt *opt; 3116 struct statfs *sp; 3117 struct vnode *vp; 3118 char buf[512]; 3119 uint64_t mflags; 3120 u_int flags; 3121 3122 if (!have_addr) { 3123 /* No address given, print short info about all mount points. */ 3124 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3125 db_printf("%p %s on %s (%s)\n", mp, 3126 mp->mnt_stat.f_mntfromname, 3127 mp->mnt_stat.f_mntonname, 3128 mp->mnt_stat.f_fstypename); 3129 if (db_pager_quit) 3130 break; 3131 } 3132 db_printf("\nMore info: show mount <addr>\n"); 3133 return; 3134 } 3135 3136 mp = (struct mount *)addr; 3137 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 3138 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 3139 3140 buf[0] = '\0'; 3141 mflags = mp->mnt_flag; 3142 #define MNT_FLAG(flag) do { \ 3143 if (mflags & (flag)) { \ 3144 if (buf[0] != '\0') \ 3145 strlcat(buf, ", ", sizeof(buf)); \ 3146 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 3147 mflags &= ~(flag); \ 3148 } \ 3149 } while (0) 3150 MNT_FLAG(MNT_RDONLY); 3151 MNT_FLAG(MNT_SYNCHRONOUS); 3152 MNT_FLAG(MNT_NOEXEC); 3153 MNT_FLAG(MNT_NOSUID); 3154 MNT_FLAG(MNT_NFS4ACLS); 3155 MNT_FLAG(MNT_UNION); 3156 MNT_FLAG(MNT_ASYNC); 3157 MNT_FLAG(MNT_SUIDDIR); 3158 MNT_FLAG(MNT_SOFTDEP); 3159 MNT_FLAG(MNT_NOSYMFOLLOW); 3160 MNT_FLAG(MNT_GJOURNAL); 3161 MNT_FLAG(MNT_MULTILABEL); 3162 MNT_FLAG(MNT_ACLS); 3163 MNT_FLAG(MNT_NOATIME); 3164 MNT_FLAG(MNT_NOCLUSTERR); 3165 MNT_FLAG(MNT_NOCLUSTERW); 3166 MNT_FLAG(MNT_SUJ); 3167 MNT_FLAG(MNT_EXRDONLY); 3168 MNT_FLAG(MNT_EXPORTED); 3169 MNT_FLAG(MNT_DEFEXPORTED); 3170 MNT_FLAG(MNT_EXPORTANON); 3171 MNT_FLAG(MNT_EXKERB); 3172 MNT_FLAG(MNT_EXPUBLIC); 3173 MNT_FLAG(MNT_LOCAL); 3174 MNT_FLAG(MNT_QUOTA); 3175 MNT_FLAG(MNT_ROOTFS); 3176 MNT_FLAG(MNT_USER); 3177 MNT_FLAG(MNT_IGNORE); 3178 MNT_FLAG(MNT_UPDATE); 3179 MNT_FLAG(MNT_DELEXPORT); 3180 MNT_FLAG(MNT_RELOAD); 3181 MNT_FLAG(MNT_FORCE); 3182 MNT_FLAG(MNT_SNAPSHOT); 3183 MNT_FLAG(MNT_BYFSID); 3184 #undef MNT_FLAG 3185 if (mflags != 0) { 3186 if (buf[0] != '\0') 3187 strlcat(buf, ", ", sizeof(buf)); 3188 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3189 "0x%016jx", mflags); 3190 } 3191 db_printf(" mnt_flag = %s\n", buf); 3192 3193 buf[0] = '\0'; 3194 flags = mp->mnt_kern_flag; 3195 #define MNT_KERN_FLAG(flag) do { \ 3196 if (flags & (flag)) { \ 3197 if (buf[0] != '\0') \ 3198 strlcat(buf, ", ", sizeof(buf)); \ 3199 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 3200 flags &= ~(flag); \ 3201 } \ 3202 } while (0) 3203 MNT_KERN_FLAG(MNTK_UNMOUNTF); 3204 MNT_KERN_FLAG(MNTK_ASYNC); 3205 MNT_KERN_FLAG(MNTK_SOFTDEP); 3206 MNT_KERN_FLAG(MNTK_NOINSMNTQ); 3207 MNT_KERN_FLAG(MNTK_DRAINING); 3208 MNT_KERN_FLAG(MNTK_REFEXPIRE); 3209 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 3210 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 3211 MNT_KERN_FLAG(MNTK_NO_IOPF); 3212 MNT_KERN_FLAG(MNTK_VGONE_UPPER); 3213 MNT_KERN_FLAG(MNTK_VGONE_WAITER); 3214 
MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); 3215 MNT_KERN_FLAG(MNTK_MARKER); 3216 MNT_KERN_FLAG(MNTK_USES_BCACHE); 3217 MNT_KERN_FLAG(MNTK_NOASYNC); 3218 MNT_KERN_FLAG(MNTK_UNMOUNT); 3219 MNT_KERN_FLAG(MNTK_MWAIT); 3220 MNT_KERN_FLAG(MNTK_SUSPEND); 3221 MNT_KERN_FLAG(MNTK_SUSPEND2); 3222 MNT_KERN_FLAG(MNTK_SUSPENDED); 3223 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 3224 MNT_KERN_FLAG(MNTK_NOKNOTE); 3225 #undef MNT_KERN_FLAG 3226 if (flags != 0) { 3227 if (buf[0] != '\0') 3228 strlcat(buf, ", ", sizeof(buf)); 3229 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3230 "0x%08x", flags); 3231 } 3232 db_printf(" mnt_kern_flag = %s\n", buf); 3233 3234 db_printf(" mnt_opt = "); 3235 opt = TAILQ_FIRST(mp->mnt_opt); 3236 if (opt != NULL) { 3237 db_printf("%s", opt->name); 3238 opt = TAILQ_NEXT(opt, link); 3239 while (opt != NULL) { 3240 db_printf(", %s", opt->name); 3241 opt = TAILQ_NEXT(opt, link); 3242 } 3243 } 3244 db_printf("\n"); 3245 3246 sp = &mp->mnt_stat; 3247 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 3248 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 3249 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 3250 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 3251 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 3252 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 3253 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 3254 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 3255 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 3256 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 3257 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 3258 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 3259 3260 db_printf(" mnt_cred = { uid=%u ruid=%u", 3261 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 3262 if (jailed(mp->mnt_cred)) 3263 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 3264 db_printf(" }\n"); 3265 db_printf(" mnt_ref = %d\n", mp->mnt_ref); 3266 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 3267 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 3268 db_printf(" mnt_activevnodelistsize = %d\n", 3269 mp->mnt_activevnodelistsize); 3270 db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); 3271 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); 3272 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 3273 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 3274 db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); 3275 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 3276 db_printf(" mnt_secondary_accwrites = %d\n", 3277 mp->mnt_secondary_accwrites); 3278 db_printf(" mnt_gjprovider = %s\n", 3279 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 3280 3281 db_printf("\n\nList of active vnodes\n"); 3282 TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { 3283 if (vp->v_type != VMARKER) { 3284 vn_printf(vp, "vnode "); 3285 if (db_pager_quit) 3286 break; 3287 } 3288 } 3289 db_printf("\n\nList of inactive vnodes\n"); 3290 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3291 if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { 3292 vn_printf(vp, "vnode "); 3293 if (db_pager_quit) 3294 break; 3295 } 3296 } 3297 } 3298 #endif /* DDB */ 3299 3300 /* 3301 * Fill in a struct xvfsconf based on a struct vfsconf. 
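 *
 * For illustration, the translated records are what userland sees through
 * the vfs.conflist sysctl defined below (getvfsbyname(3) is the usual
 * consumer); a hedged sketch of reading it directly:
 *
 *	size_t len;
 *	struct xvfsconf *xv;
 *	int i;
 *
 *	sysctlbyname("vfs.conflist", NULL, &len, NULL, 0);
 *	xv = malloc(len);
 *	sysctlbyname("vfs.conflist", xv, &len, NULL, 0);
 *	for (i = 0; i < len / sizeof(*xv); i++)
 *		printf("%s type %d refs %d\n", xv[i].vfc_name,
 *		    xv[i].vfc_typenum, xv[i].vfc_refcount);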
3302 */ 3303 static int 3304 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 3305 { 3306 struct xvfsconf xvfsp; 3307 3308 bzero(&xvfsp, sizeof(xvfsp)); 3309 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3310 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3311 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3312 xvfsp.vfc_flags = vfsp->vfc_flags; 3313 /* 3314 * These are unused in userland, we keep them 3315 * to not break binary compatibility. 3316 */ 3317 xvfsp.vfc_vfsops = NULL; 3318 xvfsp.vfc_next = NULL; 3319 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3320 } 3321 3322 #ifdef COMPAT_FREEBSD32 3323 struct xvfsconf32 { 3324 uint32_t vfc_vfsops; 3325 char vfc_name[MFSNAMELEN]; 3326 int32_t vfc_typenum; 3327 int32_t vfc_refcount; 3328 int32_t vfc_flags; 3329 uint32_t vfc_next; 3330 }; 3331 3332 static int 3333 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 3334 { 3335 struct xvfsconf32 xvfsp; 3336 3337 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3338 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3339 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3340 xvfsp.vfc_flags = vfsp->vfc_flags; 3341 xvfsp.vfc_vfsops = 0; 3342 xvfsp.vfc_next = 0; 3343 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3344 } 3345 #endif 3346 3347 /* 3348 * Top level filesystem related information gathering. 3349 */ 3350 static int 3351 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 3352 { 3353 struct vfsconf *vfsp; 3354 int error; 3355 3356 error = 0; 3357 vfsconf_slock(); 3358 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3359 #ifdef COMPAT_FREEBSD32 3360 if (req->flags & SCTL_MASK32) 3361 error = vfsconf2x32(req, vfsp); 3362 else 3363 #endif 3364 error = vfsconf2x(req, vfsp); 3365 if (error) 3366 break; 3367 } 3368 vfsconf_sunlock(); 3369 return (error); 3370 } 3371 3372 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 3373 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 3374 "S,xvfsconf", "List of all configured filesystems"); 3375 3376 #ifndef BURN_BRIDGES 3377 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 3378 3379 static int 3380 vfs_sysctl(SYSCTL_HANDLER_ARGS) 3381 { 3382 int *name = (int *)arg1 - 1; /* XXX */ 3383 u_int namelen = arg2 + 1; /* XXX */ 3384 struct vfsconf *vfsp; 3385 3386 log(LOG_WARNING, "userland calling deprecated sysctl, " 3387 "please rebuild world\n"); 3388 3389 #if 1 || defined(COMPAT_PRELITE2) 3390 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 3391 if (namelen == 1) 3392 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 3393 #endif 3394 3395 switch (name[1]) { 3396 case VFS_MAXTYPENUM: 3397 if (namelen != 2) 3398 return (ENOTDIR); 3399 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 3400 case VFS_CONF: 3401 if (namelen != 3) 3402 return (ENOTDIR); /* overloaded */ 3403 vfsconf_slock(); 3404 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3405 if (vfsp->vfc_typenum == name[2]) 3406 break; 3407 } 3408 vfsconf_sunlock(); 3409 if (vfsp == NULL) 3410 return (EOPNOTSUPP); 3411 #ifdef COMPAT_FREEBSD32 3412 if (req->flags & SCTL_MASK32) 3413 return (vfsconf2x32(req, vfsp)); 3414 else 3415 #endif 3416 return (vfsconf2x(req, vfsp)); 3417 } 3418 return (EOPNOTSUPP); 3419 } 3420 3421 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 3422 CTLFLAG_MPSAFE, vfs_sysctl, 3423 "Generic filesystem"); 3424 3425 #if 1 || defined(COMPAT_PRELITE2) 3426 3427 static int 3428 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 3429 { 3430 int error; 3431 struct vfsconf *vfsp; 3432 struct ovfsconf ovfs; 3433 3434 vfsconf_slock(); 3435 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3436 bzero(&ovfs, sizeof(ovfs)); 3437 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 3438 strcpy(ovfs.vfc_name, vfsp->vfc_name); 3439 ovfs.vfc_index = vfsp->vfc_typenum; 3440 ovfs.vfc_refcount = vfsp->vfc_refcount; 3441 ovfs.vfc_flags = vfsp->vfc_flags; 3442 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 3443 if (error != 0) { 3444 vfsconf_sunlock(); 3445 return (error); 3446 } 3447 } 3448 vfsconf_sunlock(); 3449 return (0); 3450 } 3451 3452 #endif /* 1 || COMPAT_PRELITE2 */ 3453 #endif /* !BURN_BRIDGES */ 3454 3455 #define KINFO_VNODESLOP 10 3456 #ifdef notyet 3457 /* 3458 * Dump vnode list (via sysctl). 3459 */ 3460 /* ARGSUSED */ 3461 static int 3462 sysctl_vnode(SYSCTL_HANDLER_ARGS) 3463 { 3464 struct xvnode *xvn; 3465 struct mount *mp; 3466 struct vnode *vp; 3467 int error, len, n; 3468 3469 /* 3470 * Stale numvnodes access is not fatal here. 3471 */ 3472 req->lock = 0; 3473 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 3474 if (!req->oldptr) 3475 /* Make an estimate */ 3476 return (SYSCTL_OUT(req, 0, len)); 3477 3478 error = sysctl_wire_old_buffer(req, 0); 3479 if (error != 0) 3480 return (error); 3481 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 3482 n = 0; 3483 mtx_lock(&mountlist_mtx); 3484 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3485 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 3486 continue; 3487 MNT_ILOCK(mp); 3488 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3489 if (n == len) 3490 break; 3491 vref(vp); 3492 xvn[n].xv_size = sizeof *xvn; 3493 xvn[n].xv_vnode = vp; 3494 xvn[n].xv_id = 0; /* XXX compat */ 3495 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 3496 XV_COPY(usecount); 3497 XV_COPY(writecount); 3498 XV_COPY(holdcnt); 3499 XV_COPY(mount); 3500 XV_COPY(numoutput); 3501 XV_COPY(type); 3502 #undef XV_COPY 3503 xvn[n].xv_flag = vp->v_vflag; 3504 3505 switch (vp->v_type) { 3506 case VREG: 3507 case VDIR: 3508 case VLNK: 3509 break; 3510 case VBLK: 3511 case VCHR: 3512 if (vp->v_rdev == NULL) { 3513 vrele(vp); 3514 continue; 3515 } 3516 xvn[n].xv_dev = dev2udev(vp->v_rdev); 3517 break; 3518 case VSOCK: 3519 xvn[n].xv_socket = vp->v_socket; 3520 break; 3521 case VFIFO: 3522 xvn[n].xv_fifo = vp->v_fifoinfo; 3523 break; 3524 case VNON: 3525 case VBAD: 3526 default: 3527 /* shouldn't happen? 
*/ 3528 vrele(vp); 3529 continue; 3530 } 3531 vrele(vp); 3532 ++n; 3533 } 3534 MNT_IUNLOCK(mp); 3535 mtx_lock(&mountlist_mtx); 3536 vfs_unbusy(mp); 3537 if (n == len) 3538 break; 3539 } 3540 mtx_unlock(&mountlist_mtx); 3541 3542 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 3543 free(xvn, M_TEMP); 3544 return (error); 3545 } 3546 3547 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | 3548 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", 3549 ""); 3550 #endif 3551 3552 static void 3553 unmount_or_warn(struct mount *mp) 3554 { 3555 int error; 3556 3557 error = dounmount(mp, MNT_FORCE, curthread); 3558 if (error != 0) { 3559 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 3560 if (error == EBUSY) 3561 printf("BUSY)\n"); 3562 else 3563 printf("%d)\n", error); 3564 } 3565 } 3566 3567 /* 3568 * Unmount all filesystems. The list is traversed in reverse order 3569 * of mounting to avoid dependencies. 3570 */ 3571 void 3572 vfs_unmountall(void) 3573 { 3574 struct mount *mp, *tmp; 3575 3576 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 3577 3578 /* 3579 * Since this only runs when rebooting, it is not interlocked. 3580 */ 3581 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 3582 vfs_ref(mp); 3583 3584 /* 3585 * Forcibly unmounting "/dev" before "/" would prevent clean 3586 * unmount of the latter. 3587 */ 3588 if (mp == rootdevmp) 3589 continue; 3590 3591 unmount_or_warn(mp); 3592 } 3593 3594 if (rootdevmp != NULL) 3595 unmount_or_warn(rootdevmp); 3596 } 3597 3598 /* 3599 * Perform msync on all vnodes under a mount point. 3600 * The mount point must be locked. 3601 */ 3602 void 3603 vfs_msync(struct mount *mp, int flags) 3604 { 3605 struct vnode *vp, *mvp; 3606 struct vm_object *obj; 3607 3608 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 3609 MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { 3610 obj = vp->v_object; 3611 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && 3612 (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { 3613 if (!vget(vp, 3614 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, 3615 curthread)) { 3616 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ 3617 vput(vp); 3618 continue; 3619 } 3620 3621 obj = vp->v_object; 3622 if (obj != NULL) { 3623 VM_OBJECT_WLOCK(obj); 3624 vm_object_page_clean(obj, 0, 0, 3625 flags == MNT_WAIT ? 3626 OBJPC_SYNC : OBJPC_NOSYNC); 3627 VM_OBJECT_WUNLOCK(obj); 3628 } 3629 vput(vp); 3630 } 3631 } else 3632 VI_UNLOCK(vp); 3633 } 3634 } 3635 3636 static void 3637 destroy_vpollinfo_free(struct vpollinfo *vi) 3638 { 3639 3640 knlist_destroy(&vi->vpi_selinfo.si_note); 3641 mtx_destroy(&vi->vpi_lock); 3642 uma_zfree(vnodepoll_zone, vi); 3643 } 3644 3645 static void 3646 destroy_vpollinfo(struct vpollinfo *vi) 3647 { 3648 3649 knlist_clear(&vi->vpi_selinfo.si_note, 1); 3650 seldrain(&vi->vpi_selinfo); 3651 destroy_vpollinfo_free(vi); 3652 } 3653 3654 /* 3655 * Initialize per-vnode helper structure to hold poll-related state.
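 *
 * The allocation below follows the usual optimistic pattern for lazily
 * attached state; as a sketch:
 *
 *	vi = allocate (may sleep, no vnode locks held);
 *	VI_LOCK(vp);
 *	if (vp->v_pollinfo != NULL) {
 *		VI_UNLOCK(vp);
 *		free vi;		lost the race, somebody else won
 *		return;
 *	}
 *	vp->v_pollinfo = vi;
 *	VI_UNLOCK(vp);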
3656 */ 3657 void 3658 v_addpollinfo(struct vnode *vp) 3659 { 3660 struct vpollinfo *vi; 3661 3662 if (vp->v_pollinfo != NULL) 3663 return; 3664 vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); 3665 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 3666 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 3667 vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); 3668 VI_LOCK(vp); 3669 if (vp->v_pollinfo != NULL) { 3670 VI_UNLOCK(vp); 3671 destroy_vpollinfo_free(vi); 3672 return; 3673 } 3674 vp->v_pollinfo = vi; 3675 VI_UNLOCK(vp); 3676 } 3677 3678 /* 3679 * Record a process's interest in events which might happen to 3680 * a vnode. Because poll uses the historic select-style interface 3681 * internally, this routine serves as both the ``check for any 3682 * pending events'' and the ``record my interest in future events'' 3683 * functions. (These are done together, while the lock is held, 3684 * to avoid race conditions.) 3685 */ 3686 int 3687 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 3688 { 3689 3690 v_addpollinfo(vp); 3691 mtx_lock(&vp->v_pollinfo->vpi_lock); 3692 if (vp->v_pollinfo->vpi_revents & events) { 3693 /* 3694 * This leaves events we are not interested 3695 * in available for the other process which 3696 * which presumably had requested them 3697 * (otherwise they would never have been 3698 * recorded). 3699 */ 3700 events &= vp->v_pollinfo->vpi_revents; 3701 vp->v_pollinfo->vpi_revents &= ~events; 3702 3703 mtx_unlock(&vp->v_pollinfo->vpi_lock); 3704 return (events); 3705 } 3706 vp->v_pollinfo->vpi_events |= events; 3707 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 3708 mtx_unlock(&vp->v_pollinfo->vpi_lock); 3709 return (0); 3710 } 3711 3712 /* 3713 * Routine to create and manage a filesystem syncer vnode. 3714 */ 3715 #define sync_close ((int (*)(struct vop_close_args *))nullop) 3716 static int sync_fsync(struct vop_fsync_args *); 3717 static int sync_inactive(struct vop_inactive_args *); 3718 static int sync_reclaim(struct vop_reclaim_args *); 3719 3720 static struct vop_vector sync_vnodeops = { 3721 .vop_bypass = VOP_EOPNOTSUPP, 3722 .vop_close = sync_close, /* close */ 3723 .vop_fsync = sync_fsync, /* fsync */ 3724 .vop_inactive = sync_inactive, /* inactive */ 3725 .vop_reclaim = sync_reclaim, /* reclaim */ 3726 .vop_lock1 = vop_stdlock, /* lock */ 3727 .vop_unlock = vop_stdunlock, /* unlock */ 3728 .vop_islocked = vop_stdislocked, /* islocked */ 3729 }; 3730 3731 /* 3732 * Create a new filesystem syncer vnode for the specified mount point. 3733 */ 3734 void 3735 vfs_allocate_syncvnode(struct mount *mp) 3736 { 3737 struct vnode *vp; 3738 struct bufobj *bo; 3739 static long start, incr, next; 3740 int error; 3741 3742 /* Allocate a new vnode */ 3743 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 3744 if (error != 0) 3745 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 3746 vp->v_type = VNON; 3747 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 3748 vp->v_vflag |= VV_FORCEINSMQ; 3749 error = insmntque(vp, mp); 3750 if (error != 0) 3751 panic("vfs_allocate_syncvnode: insmntque() failed"); 3752 vp->v_vflag &= ~VV_FORCEINSMQ; 3753 VOP_UNLOCK(vp, 0); 3754 /* 3755 * Place the vnode onto the syncer worklist. We attempt to 3756 * scatter them about on the list so that they will go off 3757 * at evenly distributed times even if all the filesystems 3758 * are mounted at once. 
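 *
 * As a worked example (assuming the default power-of-two syncer_maxdelay
 * of 32): the static start/incr/next logic below produces 16, 8, 24, 4,
 * 12, 20, 28, 2, 6, 10, ... for successive calls; modulo syncdelay this
 * becomes the initial delay of each new syncer vnode, so each new mount
 * lands roughly halfway between slots that are already taken and their
 * once-per-syncdelay fsyncs do not all fire in the same second.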
3759 */ 3760 next += incr; 3761 if (next == 0 || next > syncer_maxdelay) { 3762 start /= 2; 3763 incr /= 2; 3764 if (start == 0) { 3765 start = syncer_maxdelay / 2; 3766 incr = syncer_maxdelay; 3767 } 3768 next = start; 3769 } 3770 bo = &vp->v_bufobj; 3771 BO_LOCK(bo); 3772 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 3773 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 3774 mtx_lock(&sync_mtx); 3775 sync_vnode_count++; 3776 if (mp->mnt_syncer == NULL) { 3777 mp->mnt_syncer = vp; 3778 vp = NULL; 3779 } 3780 mtx_unlock(&sync_mtx); 3781 BO_UNLOCK(bo); 3782 if (vp != NULL) { 3783 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 3784 vgone(vp); 3785 vput(vp); 3786 } 3787 } 3788 3789 void 3790 vfs_deallocate_syncvnode(struct mount *mp) 3791 { 3792 struct vnode *vp; 3793 3794 mtx_lock(&sync_mtx); 3795 vp = mp->mnt_syncer; 3796 if (vp != NULL) 3797 mp->mnt_syncer = NULL; 3798 mtx_unlock(&sync_mtx); 3799 if (vp != NULL) 3800 vrele(vp); 3801 } 3802 3803 /* 3804 * Do a lazy sync of the filesystem. 3805 */ 3806 static int 3807 sync_fsync(struct vop_fsync_args *ap) 3808 { 3809 struct vnode *syncvp = ap->a_vp; 3810 struct mount *mp = syncvp->v_mount; 3811 int error, save; 3812 struct bufobj *bo; 3813 3814 /* 3815 * We only need to do something if this is a lazy evaluation. 3816 */ 3817 if (ap->a_waitfor != MNT_LAZY) 3818 return (0); 3819 3820 /* 3821 * Move ourselves to the back of the sync list. 3822 */ 3823 bo = &syncvp->v_bufobj; 3824 BO_LOCK(bo); 3825 vn_syncer_add_to_worklist(bo, syncdelay); 3826 BO_UNLOCK(bo); 3827 3828 /* 3829 * Walk the list of vnodes pushing all that are dirty and 3830 * not already on the sync list. 3831 */ 3832 if (vfs_busy(mp, MBF_NOWAIT) != 0) 3833 return (0); 3834 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 3835 vfs_unbusy(mp); 3836 return (0); 3837 } 3838 save = curthread_pflags_set(TDP_SYNCIO); 3839 vfs_msync(mp, MNT_NOWAIT); 3840 error = VFS_SYNC(mp, MNT_LAZY); 3841 curthread_pflags_restore(save); 3842 vn_finished_write(mp); 3843 vfs_unbusy(mp); 3844 return (error); 3845 } 3846 3847 /* 3848 * The syncer vnode is no longer referenced. 3849 */ 3850 static int 3851 sync_inactive(struct vop_inactive_args *ap) 3852 { 3853 3854 vgone(ap->a_vp); 3855 return (0); 3856 } 3857 3858 /* 3859 * The syncer vnode is no longer needed and is being decommissioned. 3860 * 3861 * Modifications to the worklist must be protected by sync_mtx.
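 *
 * For illustration (hedged; the callers live in the mount and unmount
 * paths outside this file), the life cycle of a syncer vnode is roughly:
 *
 *	vfs_allocate_syncvnode(mp)	at mount time, sets mp->mnt_syncer
 *	sync_fsync()			run every syncdelay seconds by the
 *					syncer daemon via the worklist
 *	vfs_deallocate_syncvnode(mp)	at unmount, vrele()s the vnode
 *	sync_inactive() -> vgone()	which ends up in sync_reclaim()
 *					below, pulling the bufobj off the
 *					worklist for the last time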
3862 */ 3863 static int 3864 sync_reclaim(struct vop_reclaim_args *ap) 3865 { 3866 struct vnode *vp = ap->a_vp; 3867 struct bufobj *bo; 3868 3869 bo = &vp->v_bufobj; 3870 BO_LOCK(bo); 3871 mtx_lock(&sync_mtx); 3872 if (vp->v_mount->mnt_syncer == vp) 3873 vp->v_mount->mnt_syncer = NULL; 3874 if (bo->bo_flag & BO_ONWORKLST) { 3875 LIST_REMOVE(bo, bo_synclist); 3876 syncer_worklist_len--; 3877 sync_vnode_count--; 3878 bo->bo_flag &= ~BO_ONWORKLST; 3879 } 3880 mtx_unlock(&sync_mtx); 3881 BO_UNLOCK(bo); 3882 3883 return (0); 3884 } 3885 3886 /* 3887 * Check if vnode represents a disk device 3888 */ 3889 int 3890 vn_isdisk(struct vnode *vp, int *errp) 3891 { 3892 int error; 3893 3894 if (vp->v_type != VCHR) { 3895 error = ENOTBLK; 3896 goto out; 3897 } 3898 error = 0; 3899 dev_lock(); 3900 if (vp->v_rdev == NULL) 3901 error = ENXIO; 3902 else if (vp->v_rdev->si_devsw == NULL) 3903 error = ENXIO; 3904 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 3905 error = ENOTBLK; 3906 dev_unlock(); 3907 out: 3908 if (errp != NULL) 3909 *errp = error; 3910 return (error == 0); 3911 } 3912 3913 /* 3914 * Common filesystem object access control check routine. Accepts a 3915 * vnode's type, "mode", uid and gid, requested access mode, credentials, 3916 * and optional call-by-reference privused argument allowing vaccess() 3917 * to indicate to the caller whether privilege was used to satisfy the 3918 * request (obsoleted). Returns 0 on success, or an errno on failure. 3919 */ 3920 int 3921 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 3922 accmode_t accmode, struct ucred *cred, int *privused) 3923 { 3924 accmode_t dac_granted; 3925 accmode_t priv_granted; 3926 3927 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 3928 ("invalid bit in accmode")); 3929 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 3930 ("VAPPEND without VWRITE")); 3931 3932 /* 3933 * Look for a normal, non-privileged way to access the file/directory 3934 * as requested. If it exists, go with that. 3935 */ 3936 3937 if (privused != NULL) 3938 *privused = 0; 3939 3940 dac_granted = 0; 3941 3942 /* Check the owner. */ 3943 if (cred->cr_uid == file_uid) { 3944 dac_granted |= VADMIN; 3945 if (file_mode & S_IXUSR) 3946 dac_granted |= VEXEC; 3947 if (file_mode & S_IRUSR) 3948 dac_granted |= VREAD; 3949 if (file_mode & S_IWUSR) 3950 dac_granted |= (VWRITE | VAPPEND); 3951 3952 if ((accmode & dac_granted) == accmode) 3953 return (0); 3954 3955 goto privcheck; 3956 } 3957 3958 /* Otherwise, check the groups (first match) */ 3959 if (groupmember(file_gid, cred)) { 3960 if (file_mode & S_IXGRP) 3961 dac_granted |= VEXEC; 3962 if (file_mode & S_IRGRP) 3963 dac_granted |= VREAD; 3964 if (file_mode & S_IWGRP) 3965 dac_granted |= (VWRITE | VAPPEND); 3966 3967 if ((accmode & dac_granted) == accmode) 3968 return (0); 3969 3970 goto privcheck; 3971 } 3972 3973 /* Otherwise, check everyone else. */ 3974 if (file_mode & S_IXOTH) 3975 dac_granted |= VEXEC; 3976 if (file_mode & S_IROTH) 3977 dac_granted |= VREAD; 3978 if (file_mode & S_IWOTH) 3979 dac_granted |= (VWRITE | VAPPEND); 3980 if ((accmode & dac_granted) == accmode) 3981 return (0); 3982 3983 privcheck: 3984 /* 3985 * Build a privilege mask to determine if the set of privileges 3986 * satisfies the requirements when combined with the granted mask 3987 * from above. For each privilege, if the privilege is required, 3988 * bitwise or the request type onto the priv_granted mask. 
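 *
 * Two worked examples (illustrative values only):
 *
 *	mode 0640, file_uid 1001, file_gid 20, caller uid 1001,
 *	accmode VREAD | VWRITE:
 *		the owner branch above grants VADMIN | VREAD | VWRITE |
 *		VAPPEND, the request is covered by dac_granted alone and
 *		vaccess() returns 0 without consulting priv_check_cred().
 *
 *	same file, caller uid 1002 with gid 20 among its groups,
 *	accmode VWRITE:
 *		the group branch grants only VREAD, so we fall through to
 *		the privilege check below; an unprivileged credential fails
 *		PRIV_VFS_WRITE and the result is EACCES (EPERM would be
 *		returned only if VADMIN had been requested).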
3989 */ 3990 priv_granted = 0; 3991 3992 if (type == VDIR) { 3993 /* 3994 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 3995 * requests, instead of PRIV_VFS_EXEC. 3996 */ 3997 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3998 !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) 3999 priv_granted |= VEXEC; 4000 } else { 4001 /* 4002 * Ensure that at least one execute bit is on. Otherwise, 4003 * a privileged user will always succeed, and we don't want 4004 * this to happen unless the file really is executable. 4005 */ 4006 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 4007 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 4008 !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) 4009 priv_granted |= VEXEC; 4010 } 4011 4012 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 4013 !priv_check_cred(cred, PRIV_VFS_READ, 0)) 4014 priv_granted |= VREAD; 4015 4016 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 4017 !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) 4018 priv_granted |= (VWRITE | VAPPEND); 4019 4020 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 4021 !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) 4022 priv_granted |= VADMIN; 4023 4024 if ((accmode & (priv_granted | dac_granted)) == accmode) { 4025 /* XXX audit: privilege used */ 4026 if (privused != NULL) 4027 *privused = 1; 4028 return (0); 4029 } 4030 4031 return ((accmode & VADMIN) ? EPERM : EACCES); 4032 } 4033 4034 /* 4035 * Credential check based on process requesting service, and per-attribute 4036 * permissions. 4037 */ 4038 int 4039 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 4040 struct thread *td, accmode_t accmode) 4041 { 4042 4043 /* 4044 * Kernel-invoked operations always succeed. 4045 */ 4046 if (cred == NOCRED) 4047 return (0); 4048 4049 /* 4050 * Do not allow privileged processes in jail to directly manipulate 4051 * system attributes. 4052 */ 4053 switch (attrnamespace) { 4054 case EXTATTR_NAMESPACE_SYSTEM: 4055 /* Potentially should be: return (EPERM); */ 4056 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); 4057 case EXTATTR_NAMESPACE_USER: 4058 return (VOP_ACCESS(vp, accmode, cred, td)); 4059 default: 4060 return (EPERM); 4061 } 4062 } 4063 4064 #ifdef DEBUG_VFS_LOCKS 4065 /* 4066 * This only exists to suppress warnings from unlocked specfs accesses. It is 4067 * no longer ok to have an unlocked VFS. 4068 */ 4069 #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ 4070 (vp)->v_type == VCHR || (vp)->v_type == VBAD) 4071 4072 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 4073 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 4074 "Drop into debugger on lock violation"); 4075 4076 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 4077 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 4078 0, "Check for interlock across VOPs"); 4079 4080 int vfs_badlock_print = 1; /* Print lock violations. */ 4081 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 4082 0, "Print lock violations"); 4083 4084 #ifdef KDB 4085 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations.
*/ 4086 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 4087 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 4088 #endif 4089 4090 static void 4091 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 4092 { 4093 4094 #ifdef KDB 4095 if (vfs_badlock_backtrace) 4096 kdb_backtrace(); 4097 #endif 4098 if (vfs_badlock_print) 4099 printf("%s: %p %s\n", str, (void *)vp, msg); 4100 if (vfs_badlock_ddb) 4101 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4102 } 4103 4104 void 4105 assert_vi_locked(struct vnode *vp, const char *str) 4106 { 4107 4108 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 4109 vfs_badlock("interlock is not locked but should be", str, vp); 4110 } 4111 4112 void 4113 assert_vi_unlocked(struct vnode *vp, const char *str) 4114 { 4115 4116 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 4117 vfs_badlock("interlock is locked but should not be", str, vp); 4118 } 4119 4120 void 4121 assert_vop_locked(struct vnode *vp, const char *str) 4122 { 4123 int locked; 4124 4125 if (!IGNORE_LOCK(vp)) { 4126 locked = VOP_ISLOCKED(vp); 4127 if (locked == 0 || locked == LK_EXCLOTHER) 4128 vfs_badlock("is not locked but should be", str, vp); 4129 } 4130 } 4131 4132 void 4133 assert_vop_unlocked(struct vnode *vp, const char *str) 4134 { 4135 4136 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 4137 vfs_badlock("is locked but should not be", str, vp); 4138 } 4139 4140 void 4141 assert_vop_elocked(struct vnode *vp, const char *str) 4142 { 4143 4144 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 4145 vfs_badlock("is not exclusive locked but should be", str, vp); 4146 } 4147 4148 #if 0 4149 void 4150 assert_vop_elocked_other(struct vnode *vp, const char *str) 4151 { 4152 4153 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER) 4154 vfs_badlock("is not exclusive locked by another thread", 4155 str, vp); 4156 } 4157 4158 void 4159 assert_vop_slocked(struct vnode *vp, const char *str) 4160 { 4161 4162 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED) 4163 vfs_badlock("is not locked shared but should be", str, vp); 4164 } 4165 #endif /* 0 */ 4166 #endif /* DEBUG_VFS_LOCKS */ 4167 4168 void 4169 vop_rename_fail(struct vop_rename_args *ap) 4170 { 4171 4172 if (ap->a_tvp != NULL) 4173 vput(ap->a_tvp); 4174 if (ap->a_tdvp == ap->a_tvp) 4175 vrele(ap->a_tdvp); 4176 else 4177 vput(ap->a_tdvp); 4178 vrele(ap->a_fdvp); 4179 vrele(ap->a_fvp); 4180 } 4181 4182 void 4183 vop_rename_pre(void *ap) 4184 { 4185 struct vop_rename_args *a = ap; 4186 4187 #ifdef DEBUG_VFS_LOCKS 4188 if (a->a_tvp) 4189 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 4190 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 4191 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 4192 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 4193 4194 /* Check the source (from). */ 4195 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 4196 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 4197 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 4198 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 4199 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 4200 4201 /* Check the target. 
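 * In the VOP_RENAME() protocol the target directory and the target
 * vnode (if one exists) are passed in locked, while the source
 * directory and source vnode are only referenced; the source pair is
 * therefore asserted unlocked above unless it shares its lock with one
 * of the target vnodes.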
*/ 4202 if (a->a_tvp) 4203 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 4204 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 4205 #endif 4206 if (a->a_tdvp != a->a_fdvp) 4207 vhold(a->a_fdvp); 4208 if (a->a_tvp != a->a_fvp) 4209 vhold(a->a_fvp); 4210 vhold(a->a_tdvp); 4211 if (a->a_tvp) 4212 vhold(a->a_tvp); 4213 } 4214 4215 void 4216 vop_strategy_pre(void *ap) 4217 { 4218 #ifdef DEBUG_VFS_LOCKS 4219 struct vop_strategy_args *a; 4220 struct buf *bp; 4221 4222 a = ap; 4223 bp = a->a_bp; 4224 4225 /* 4226 * Cluster ops lock their component buffers but not the IO container. 4227 */ 4228 if ((bp->b_flags & B_CLUSTER) != 0) 4229 return; 4230 4231 if (panicstr == NULL && !BUF_ISLOCKED(bp)) { 4232 if (vfs_badlock_print) 4233 printf( 4234 "VOP_STRATEGY: bp is not locked but should be\n"); 4235 if (vfs_badlock_ddb) 4236 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4237 } 4238 #endif 4239 } 4240 4241 void 4242 vop_lock_pre(void *ap) 4243 { 4244 #ifdef DEBUG_VFS_LOCKS 4245 struct vop_lock1_args *a = ap; 4246 4247 if ((a->a_flags & LK_INTERLOCK) == 0) 4248 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4249 else 4250 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 4251 #endif 4252 } 4253 4254 void 4255 vop_lock_post(void *ap, int rc) 4256 { 4257 #ifdef DEBUG_VFS_LOCKS 4258 struct vop_lock1_args *a = ap; 4259 4260 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4261 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 4262 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 4263 #endif 4264 } 4265 4266 void 4267 vop_unlock_pre(void *ap) 4268 { 4269 #ifdef DEBUG_VFS_LOCKS 4270 struct vop_unlock_args *a = ap; 4271 4272 if (a->a_flags & LK_INTERLOCK) 4273 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); 4274 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 4275 #endif 4276 } 4277 4278 void 4279 vop_unlock_post(void *ap, int rc) 4280 { 4281 #ifdef DEBUG_VFS_LOCKS 4282 struct vop_unlock_args *a = ap; 4283 4284 if (a->a_flags & LK_INTERLOCK) 4285 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); 4286 #endif 4287 } 4288 4289 void 4290 vop_create_post(void *ap, int rc) 4291 { 4292 struct vop_create_args *a = ap; 4293 4294 if (!rc) 4295 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4296 } 4297 4298 void 4299 vop_deleteextattr_post(void *ap, int rc) 4300 { 4301 struct vop_deleteextattr_args *a = ap; 4302 4303 if (!rc) 4304 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4305 } 4306 4307 void 4308 vop_link_post(void *ap, int rc) 4309 { 4310 struct vop_link_args *a = ap; 4311 4312 if (!rc) { 4313 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); 4314 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); 4315 } 4316 } 4317 4318 void 4319 vop_mkdir_post(void *ap, int rc) 4320 { 4321 struct vop_mkdir_args *a = ap; 4322 4323 if (!rc) 4324 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4325 } 4326 4327 void 4328 vop_mknod_post(void *ap, int rc) 4329 { 4330 struct vop_mknod_args *a = ap; 4331 4332 if (!rc) 4333 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4334 } 4335 4336 void 4337 vop_reclaim_post(void *ap, int rc) 4338 { 4339 struct vop_reclaim_args *a = ap; 4340 4341 if (!rc) 4342 VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE); 4343 } 4344 4345 void 4346 vop_remove_post(void *ap, int rc) 4347 { 4348 struct vop_remove_args *a = ap; 4349 4350 if (!rc) { 4351 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4352 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4353 } 4354 } 4355 4356 void 4357 vop_rename_post(void *ap, int rc) 4358 { 4359 struct vop_rename_args *a = ap; 4360 4361 if (!rc) { 4362 VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); 4363 VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); 4364 
VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 4365 if (a->a_tvp) 4366 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 4367 } 4368 if (a->a_tdvp != a->a_fdvp) 4369 vdrop(a->a_fdvp); 4370 if (a->a_tvp != a->a_fvp) 4371 vdrop(a->a_fvp); 4372 vdrop(a->a_tdvp); 4373 if (a->a_tvp) 4374 vdrop(a->a_tvp); 4375 } 4376 4377 void 4378 vop_rmdir_post(void *ap, int rc) 4379 { 4380 struct vop_rmdir_args *a = ap; 4381 4382 if (!rc) { 4383 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4384 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4385 } 4386 } 4387 4388 void 4389 vop_setattr_post(void *ap, int rc) 4390 { 4391 struct vop_setattr_args *a = ap; 4392 4393 if (!rc) 4394 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4395 } 4396 4397 void 4398 vop_setextattr_post(void *ap, int rc) 4399 { 4400 struct vop_setextattr_args *a = ap; 4401 4402 if (!rc) 4403 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4404 } 4405 4406 void 4407 vop_symlink_post(void *ap, int rc) 4408 { 4409 struct vop_symlink_args *a = ap; 4410 4411 if (!rc) 4412 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4413 } 4414 4415 static struct knlist fs_knlist; 4416 4417 static void 4418 vfs_event_init(void *arg) 4419 { 4420 knlist_init_mtx(&fs_knlist, NULL); 4421 } 4422 /* XXX - correct order? */ 4423 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 4424 4425 void 4426 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 4427 { 4428 4429 KNOTE_UNLOCKED(&fs_knlist, event); 4430 } 4431 4432 static int filt_fsattach(struct knote *kn); 4433 static void filt_fsdetach(struct knote *kn); 4434 static int filt_fsevent(struct knote *kn, long hint); 4435 4436 struct filterops fs_filtops = { 4437 .f_isfd = 0, 4438 .f_attach = filt_fsattach, 4439 .f_detach = filt_fsdetach, 4440 .f_event = filt_fsevent 4441 }; 4442 4443 static int 4444 filt_fsattach(struct knote *kn) 4445 { 4446 4447 kn->kn_flags |= EV_CLEAR; 4448 knlist_add(&fs_knlist, kn, 0); 4449 return (0); 4450 } 4451 4452 static void 4453 filt_fsdetach(struct knote *kn) 4454 { 4455 4456 knlist_remove(&fs_knlist, kn, 0); 4457 } 4458 4459 static int 4460 filt_fsevent(struct knote *kn, long hint) 4461 { 4462 4463 kn->kn_fflags |= hint; 4464 return (kn->kn_fflags != 0); 4465 } 4466 4467 static int 4468 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 4469 { 4470 struct vfsidctl vc; 4471 int error; 4472 struct mount *mp; 4473 4474 error = SYSCTL_IN(req, &vc, sizeof(vc)); 4475 if (error) 4476 return (error); 4477 if (vc.vc_vers != VFS_CTL_VERS1) 4478 return (EINVAL); 4479 mp = vfs_getvfs(&vc.vc_fsid); 4480 if (mp == NULL) 4481 return (ENOENT); 4482 /* ensure that a specific sysctl goes to the right filesystem. */ 4483 if (strcmp(vc.vc_fstypename, "*") != 0 && 4484 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 4485 vfs_rel(mp); 4486 return (EINVAL); 4487 } 4488 VCTLTOREQ(&vc, req); 4489 error = VFS_SYSCTL(mp, vc.vc_op, req); 4490 vfs_rel(mp); 4491 return (error); 4492 } 4493 4494 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, 4495 NULL, 0, sysctl_vfs_ctl, "", 4496 "Sysctl by fsid"); 4497 4498 /* 4499 * Function to initialize a va_filerev field sensibly. 4500 * XXX: Wouldn't a random number make a lot more sense ?? 
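 *
 * The value built below packs the boot-relative uptime into a 64-bit
 * revision: the bintime seconds land in the upper 32 bits and the top
 * 32 bits of the fraction in the lower half, i.e.
 *
 *	filerev = ((u_quad_t)bt.sec << 32) | (bt.frac >> 32);
 *
 * which is monotonically non-decreasing across calls within one boot.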
4501 */ 4502 u_quad_t 4503 init_va_filerev(void) 4504 { 4505 struct bintime bt; 4506 4507 getbinuptime(&bt); 4508 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 4509 } 4510 4511 static int filt_vfsread(struct knote *kn, long hint); 4512 static int filt_vfswrite(struct knote *kn, long hint); 4513 static int filt_vfsvnode(struct knote *kn, long hint); 4514 static void filt_vfsdetach(struct knote *kn); 4515 static struct filterops vfsread_filtops = { 4516 .f_isfd = 1, 4517 .f_detach = filt_vfsdetach, 4518 .f_event = filt_vfsread 4519 }; 4520 static struct filterops vfswrite_filtops = { 4521 .f_isfd = 1, 4522 .f_detach = filt_vfsdetach, 4523 .f_event = filt_vfswrite 4524 }; 4525 static struct filterops vfsvnode_filtops = { 4526 .f_isfd = 1, 4527 .f_detach = filt_vfsdetach, 4528 .f_event = filt_vfsvnode 4529 }; 4530 4531 static void 4532 vfs_knllock(void *arg) 4533 { 4534 struct vnode *vp = arg; 4535 4536 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4537 } 4538 4539 static void 4540 vfs_knlunlock(void *arg) 4541 { 4542 struct vnode *vp = arg; 4543 4544 VOP_UNLOCK(vp, 0); 4545 } 4546 4547 static void 4548 vfs_knl_assert_locked(void *arg) 4549 { 4550 #ifdef DEBUG_VFS_LOCKS 4551 struct vnode *vp = arg; 4552 4553 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 4554 #endif 4555 } 4556 4557 static void 4558 vfs_knl_assert_unlocked(void *arg) 4559 { 4560 #ifdef DEBUG_VFS_LOCKS 4561 struct vnode *vp = arg; 4562 4563 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 4564 #endif 4565 } 4566 4567 int 4568 vfs_kqfilter(struct vop_kqfilter_args *ap) 4569 { 4570 struct vnode *vp = ap->a_vp; 4571 struct knote *kn = ap->a_kn; 4572 struct knlist *knl; 4573 4574 switch (kn->kn_filter) { 4575 case EVFILT_READ: 4576 kn->kn_fop = &vfsread_filtops; 4577 break; 4578 case EVFILT_WRITE: 4579 kn->kn_fop = &vfswrite_filtops; 4580 break; 4581 case EVFILT_VNODE: 4582 kn->kn_fop = &vfsvnode_filtops; 4583 break; 4584 default: 4585 return (EINVAL); 4586 } 4587 4588 kn->kn_hook = (caddr_t)vp; 4589 4590 v_addpollinfo(vp); 4591 if (vp->v_pollinfo == NULL) 4592 return (ENOMEM); 4593 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 4594 vhold(vp); 4595 knlist_add(knl, kn, 0); 4596 4597 return (0); 4598 } 4599 4600 /* 4601 * Detach knote from vnode 4602 */ 4603 static void 4604 filt_vfsdetach(struct knote *kn) 4605 { 4606 struct vnode *vp = (struct vnode *)kn->kn_hook; 4607 4608 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 4609 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 4610 vdrop(vp); 4611 } 4612 4613 /*ARGSUSED*/ 4614 static int 4615 filt_vfsread(struct knote *kn, long hint) 4616 { 4617 struct vnode *vp = (struct vnode *)kn->kn_hook; 4618 struct vattr va; 4619 int res; 4620 4621 /* 4622 * filesystem is gone, so set the EOF flag and schedule 4623 * the knote for deletion. 4624 */ 4625 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 4626 VI_LOCK(vp); 4627 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 4628 VI_UNLOCK(vp); 4629 return (1); 4630 } 4631 4632 if (VOP_GETATTR(vp, &va, curthread->td_ucred)) 4633 return (0); 4634 4635 VI_LOCK(vp); 4636 kn->kn_data = va.va_size - kn->kn_fp->f_offset; 4637 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 4638 VI_UNLOCK(vp); 4639 return (res); 4640 } 4641 4642 /*ARGSUSED*/ 4643 static int 4644 filt_vfswrite(struct knote *kn, long hint) 4645 { 4646 struct vnode *vp = (struct vnode *)kn->kn_hook; 4647 4648 VI_LOCK(vp); 4649 4650 /* 4651 * filesystem is gone, so set the EOF flag and schedule 4652 * the knote for deletion. 
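 * A hint of 0 with a VBAD vnode covers the case where the filter is
 * run from a kqueue scan after the vnode has already been reclaimed,
 * rather than from KNOTE() with an explicit NOTE_REVOKE.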
4653 */ 4654 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 4655 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 4656 4657 kn->kn_data = 0; 4658 VI_UNLOCK(vp); 4659 return (1); 4660 } 4661 4662 static int 4663 filt_vfsvnode(struct knote *kn, long hint) 4664 { 4665 struct vnode *vp = (struct vnode *)kn->kn_hook; 4666 int res; 4667 4668 VI_LOCK(vp); 4669 if (kn->kn_sfflags & hint) 4670 kn->kn_fflags |= hint; 4671 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 4672 kn->kn_flags |= EV_EOF; 4673 VI_UNLOCK(vp); 4674 return (1); 4675 } 4676 res = (kn->kn_fflags != 0); 4677 VI_UNLOCK(vp); 4678 return (res); 4679 } 4680 4681 int 4682 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 4683 { 4684 int error; 4685 4686 if (dp->d_reclen > ap->a_uio->uio_resid) 4687 return (ENAMETOOLONG); 4688 error = uiomove(dp, dp->d_reclen, ap->a_uio); 4689 if (error) { 4690 if (ap->a_ncookies != NULL) { 4691 if (ap->a_cookies != NULL) 4692 free(ap->a_cookies, M_TEMP); 4693 ap->a_cookies = NULL; 4694 *ap->a_ncookies = 0; 4695 } 4696 return (error); 4697 } 4698 if (ap->a_ncookies == NULL) 4699 return (0); 4700 4701 KASSERT(ap->a_cookies, 4702 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 4703 4704 *ap->a_cookies = realloc(*ap->a_cookies, 4705 (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); 4706 (*ap->a_cookies)[*ap->a_ncookies] = off; 4707 return (0); 4708 } 4709 4710 /* 4711 * Mark for update the access time of the file if the filesystem 4712 * supports VOP_MARKATIME. This functionality is used by execve and 4713 * mmap, so we want to avoid the I/O implied by directly setting 4714 * va_atime for the sake of efficiency. 4715 */ 4716 void 4717 vfs_mark_atime(struct vnode *vp, struct ucred *cred) 4718 { 4719 struct mount *mp; 4720 4721 mp = vp->v_mount; 4722 ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); 4723 if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) 4724 (void)VOP_MARKATIME(vp); 4725 } 4726 4727 /* 4728 * The purpose of this routine is to remove granularity from accmode_t, 4729 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 4730 * VADMIN and VAPPEND. 4731 * 4732 * If it returns 0, the caller is supposed to continue with the usual 4733 * access checks using 'accmode' as modified by this routine. If it 4734 * returns nonzero value, the caller is supposed to return that value 4735 * as errno. 4736 * 4737 * Note that after this routine runs, accmode may be zero. 4738 */ 4739 int 4740 vfs_unixify_accmode(accmode_t *accmode) 4741 { 4742 /* 4743 * There is no way to specify explicit "deny" rule using 4744 * file mode or POSIX.1e ACLs. 4745 */ 4746 if (*accmode & VEXPLICIT_DENY) { 4747 *accmode = 0; 4748 return (0); 4749 } 4750 4751 /* 4752 * None of these can be translated into usual access bits. 4753 * Also, the common case for NFSv4 ACLs is to not contain 4754 * either of these bits. Caller should check for VWRITE 4755 * on the containing directory instead. 4756 */ 4757 if (*accmode & (VDELETE_CHILD | VDELETE)) 4758 return (EPERM); 4759 4760 if (*accmode & VADMIN_PERMS) { 4761 *accmode &= ~VADMIN_PERMS; 4762 *accmode |= VADMIN; 4763 } 4764 4765 /* 4766 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 4767 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 4768 */ 4769 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 4770 4771 return (0); 4772 } 4773 4774 /* 4775 * These are helper functions for filesystems to traverse all 4776 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 
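 *
 * A typical consumer looks roughly like the sketch below (illustrative
 * only; the per-vnode work, the error policy and the "loop" restart
 * label belong to the caller):
 *
 *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) {
 *			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 *			goto loop;
 *		}
 *		... per-vnode work ...
 *		vput(vp);
 *	}
 *
 * Each vnode is handed to the loop body with its interlock held; the
 * LK_INTERLOCK flag above passes that lock on to vget().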
4777 * 4778 * This interface replaces MNT_VNODE_FOREACH. 4779 */ 4780 4781 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 4782 4783 struct vnode * 4784 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 4785 { 4786 struct vnode *vp; 4787 4788 if (should_yield()) 4789 kern_yield(PRI_USER); 4790 MNT_ILOCK(mp); 4791 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 4792 vp = TAILQ_NEXT(*mvp, v_nmntvnodes); 4793 while (vp != NULL && (vp->v_type == VMARKER || 4794 (vp->v_iflag & VI_DOOMED) != 0)) 4795 vp = TAILQ_NEXT(vp, v_nmntvnodes); 4796 4797 /* Check if we are done */ 4798 if (vp == NULL) { 4799 __mnt_vnode_markerfree_all(mvp, mp); 4800 /* MNT_IUNLOCK(mp); -- done in above function */ 4801 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 4802 return (NULL); 4803 } 4804 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 4805 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 4806 VI_LOCK(vp); 4807 MNT_IUNLOCK(mp); 4808 return (vp); 4809 } 4810 4811 struct vnode * 4812 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 4813 { 4814 struct vnode *vp; 4815 4816 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 4817 MNT_ILOCK(mp); 4818 MNT_REF(mp); 4819 (*mvp)->v_type = VMARKER; 4820 4821 vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 4822 while (vp != NULL && (vp->v_type == VMARKER || 4823 (vp->v_iflag & VI_DOOMED) != 0)) 4824 vp = TAILQ_NEXT(vp, v_nmntvnodes); 4825 4826 /* Check if we are done */ 4827 if (vp == NULL) { 4828 MNT_REL(mp); 4829 MNT_IUNLOCK(mp); 4830 free(*mvp, M_VNODE_MARKER); 4831 *mvp = NULL; 4832 return (NULL); 4833 } 4834 (*mvp)->v_mount = mp; 4835 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 4836 VI_LOCK(vp); 4837 MNT_IUNLOCK(mp); 4838 return (vp); 4839 } 4840 4841 4842 void 4843 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 4844 { 4845 4846 if (*mvp == NULL) { 4847 MNT_IUNLOCK(mp); 4848 return; 4849 } 4850 4851 mtx_assert(MNT_MTX(mp), MA_OWNED); 4852 4853 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 4854 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 4855 MNT_REL(mp); 4856 MNT_IUNLOCK(mp); 4857 free(*mvp, M_VNODE_MARKER); 4858 *mvp = NULL; 4859 } 4860 4861 /* 4862 * These are helper functions for filesystems to traverse their 4863 * active vnodes. 
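 * Only vnodes marked VI_ACTIVE, i.e. those on the mount's
 * mnt_activevnodelist, are visited, letting a filesystem skip the
 * usually much larger population of cached but otherwise idle vnodes.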
See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h 4864 */ 4865 static void 4866 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 4867 { 4868 4869 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 4870 4871 MNT_ILOCK(mp); 4872 MNT_REL(mp); 4873 MNT_IUNLOCK(mp); 4874 free(*mvp, M_VNODE_MARKER); 4875 *mvp = NULL; 4876 } 4877 4878 static struct vnode * 4879 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 4880 { 4881 struct vnode *vp, *nvp; 4882 4883 mtx_assert(&vnode_free_list_mtx, MA_OWNED); 4884 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 4885 restart: 4886 vp = TAILQ_NEXT(*mvp, v_actfreelist); 4887 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 4888 while (vp != NULL) { 4889 if (vp->v_type == VMARKER) { 4890 vp = TAILQ_NEXT(vp, v_actfreelist); 4891 continue; 4892 } 4893 if (!VI_TRYLOCK(vp)) { 4894 if (mp_ncpus == 1 || should_yield()) { 4895 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); 4896 mtx_unlock(&vnode_free_list_mtx); 4897 pause("vnacti", 1); 4898 mtx_lock(&vnode_free_list_mtx); 4899 goto restart; 4900 } 4901 continue; 4902 } 4903 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 4904 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 4905 ("alien vnode on the active list %p %p", vp, mp)); 4906 if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) 4907 break; 4908 nvp = TAILQ_NEXT(vp, v_actfreelist); 4909 VI_UNLOCK(vp); 4910 vp = nvp; 4911 } 4912 4913 /* Check if we are done */ 4914 if (vp == NULL) { 4915 mtx_unlock(&vnode_free_list_mtx); 4916 mnt_vnode_markerfree_active(mvp, mp); 4917 return (NULL); 4918 } 4919 TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); 4920 mtx_unlock(&vnode_free_list_mtx); 4921 ASSERT_VI_LOCKED(vp, "active iter"); 4922 KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); 4923 return (vp); 4924 } 4925 4926 struct vnode * 4927 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 4928 { 4929 4930 if (should_yield()) 4931 kern_yield(PRI_USER); 4932 mtx_lock(&vnode_free_list_mtx); 4933 return (mnt_vnode_next_active(mvp, mp)); 4934 } 4935 4936 struct vnode * 4937 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) 4938 { 4939 struct vnode *vp; 4940 4941 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 4942 MNT_ILOCK(mp); 4943 MNT_REF(mp); 4944 MNT_IUNLOCK(mp); 4945 (*mvp)->v_type = VMARKER; 4946 (*mvp)->v_mount = mp; 4947 4948 mtx_lock(&vnode_free_list_mtx); 4949 vp = TAILQ_FIRST(&mp->mnt_activevnodelist); 4950 if (vp == NULL) { 4951 mtx_unlock(&vnode_free_list_mtx); 4952 mnt_vnode_markerfree_active(mvp, mp); 4953 return (NULL); 4954 } 4955 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); 4956 return (mnt_vnode_next_active(mvp, mp)); 4957 } 4958 4959 void 4960 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 4961 { 4962 4963 if (*mvp == NULL) 4964 return; 4965 4966 mtx_lock(&vnode_free_list_mtx); 4967 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 4968 mtx_unlock(&vnode_free_list_mtx); 4969 mnt_vnode_markerfree_active(mvp, mp); 4970 } 4971
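
/*
 * Usage note for the active-vnode iterator above: each vnode is handed
 * back with its interlock held, so the loop body must either pass that
 * lock to vget() via LK_INTERLOCK or drop it with VI_UNLOCK() before
 * continuing.  An illustrative skeleton (the filtering predicate and
 * the per-vnode work are hypothetical, not part of this interface):
 *
 *	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
 *		if (!wants_this_vnode(vp)) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		if (vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
 *		    curthread) == 0) {
 *			... per-vnode work ...
 *			vput(vp);
 *		}
 *	}
 */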