/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>

#include <machine/stdarg.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

static void	delmntque(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	v_init_counters(struct vnode *);
static void	v_incr_usecount(struct vnode *);
static void	v_incr_devcount(struct vnode *);
static void	v_decr_devcount(struct vnode *);
static void	vnlru_free(int);
static void	vgonel(struct vnode *);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static void	vfs_knl_assert_locked(void *arg);
static void	vfs_knl_assert_unlocked(void *arg);
static void	destroy_vpollinfo(struct vpollinfo *vi);

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
 */
static unsigned long	numvnodes;

SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

static u_long vnodes_created;
SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    0, "Number of vnodes created by getnewvnode");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * "Free" vnode target.  Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle.  Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them.  This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files.  The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive.  The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.  In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states.  The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher.  These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
 */
static u_long wantfreevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
static u_long freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
    &freevnodes, 0, "Number of \"free\" vnodes");

static u_long recycles_count;
SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
    "Number of vnodes recycled to meet vnode cache targets");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
    "Number of calls to reassignbuf");

static u_long free_owe_inact;
SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0,
    "Number of times free vnodes kept on active list due to VFS "
    "owing inactivation");

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx vnode_free_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

static uma_zone_t buf_trie_zone;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, filesystem metadata is
 * delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed.
 * Thus, there are SYNCER_MAXDELAY queues that are processed round-robin
 * at a rate of one each second (driven off the filesystem syncer
 * process).  The syncer_delayno variable indicates the next queue that
 * is to be processed.  Items that need to be processed soon are placed
 * in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;
static struct cv sync_wakeup;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
    "Time to delay syncing files (in seconds)");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
    "Time to delay syncing directories (in seconds)");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
    "Time to delay syncing metadata (in seconds)");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
    "Number of times I/O speeded up (rush requests)");

/*
 * When shutting down the syncer, run it at four times normal speed.
 */
#define SYNCER_SHUTDOWN_SPEEDUP		4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/* Target for maximum number of vnodes. */
int desiredvnodes;
static int gapvnodes;		/* gap between wanted and desired */
static int vhiwat;		/* enough extras after expansion */
static int vlowat;		/* minimal extras before expansion */
static int vstir;		/* nonzero to stir non-free vnodes */
static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */

static int
sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
{
	int error, old_desiredvnodes;

	old_desiredvnodes = desiredvnodes;
	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
		return (error);
	if (old_desiredvnodes != desiredvnodes) {
		wantfreevnodes = desiredvnodes / 4;
		/* XXX locking seems to be incomplete. */
		vfs_hash_changesize(desiredvnodes);
		cache_changesize(desiredvnodes);
	}
	return (0);
}

SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
    sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");

/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
static int vnsz2log;

/*
 * Support for the bufobj clean & dirty pctrie.
 */
static void *
buf_trie_alloc(struct pctrie *ptree)
{

	return uma_zalloc(buf_trie_zone, M_NOWAIT);
}

static void
buf_trie_free(struct pctrie *ptree, void *node)
{

	uma_zfree(buf_trie_zone, node);
}
PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);

/*
 * Initialize the vnode management data structures.
 *
 * Reevaluate the following cap on the number of vnodes after the physical
 * memory size exceeds 512GB.  In the limit, as the physical memory size
 * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
 */
#ifndef	MAXVNODES_MAX
#define	MAXVNODES_MAX	(512 * 1024 * 1024 / 64)	/* 8M */
#endif

/*
 * Initialize a vnode as it first enters the zone.
 */
static int
vnode_init(void *mem, int size, int flags)
{
	struct vnode *vp;
	struct bufobj *bo;

	vp = mem;
	bzero(vp, size);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
	    LK_NOSHARE | LK_IS_VNODE);
	/*
	 * Initialize bufobj.
	 */
	bo = &vp->v_bufobj;
	bo->__bo_vnode = vp;
	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
	bo->bo_private = vp;
	TAILQ_INIT(&bo->bo_clean.bv_hd);
	TAILQ_INIT(&bo->bo_dirty.bv_hd);
	/*
	 * Initialize namecache.
	 */
	LIST_INIT(&vp->v_cache_src);
	TAILQ_INIT(&vp->v_cache_dst);
	/*
	 * Initialize rangelocks.
	 */
	rangelock_init(&vp->v_rl);
	return (0);
}

/*
 * Free a vnode when it is cleared from the zone.
 */
static void
vnode_fini(void *mem, int size)
{
	struct vnode *vp;
	struct bufobj *bo;

	vp = mem;
	rangelock_destroy(&vp->v_rl);
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	bo = &vp->v_bufobj;
	rw_destroy(BO_LOCKPTR(bo));
}

static void
vntblinit(void *dummy __unused)
{
	u_int i;
	int physvnodes, virtvnodes;

	/*
	 * Desiredvnodes is a function of the physical memory size and the
	 * kernel's heap size.  Generally speaking, it scales with the
	 * physical memory size.  The ratio of desiredvnodes to the physical
	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
	 * Thereafter, the marginal ratio of desiredvnodes to the physical
	 * memory size is 1:64.  However, desiredvnodes is limited by the
	 * kernel's heap size.  The memory required by desiredvnodes vnodes
	 * and vm objects must not exceed 1/7th of the kernel's heap size.
	 */
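	/*
	 * Worked example (illustrative figures, not taken from the code
	 * above or below): on a machine with about 4GB of RAM,
	 * pgtok(vm_cnt.v_page_count) is roughly 4,194,304 (KB), so the
	 * physical-memory estimate is about maxproc + 4194304 / 64 +
	 * 3 * 1572864 / 64, i.e. maxproc + 65536 + 73728 vnodes; the
	 * smaller of that and the kernel-heap estimate is used.
	 */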
	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
	    3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
	    sizeof(struct vnode)));
	desiredvnodes = min(physvnodes, virtvnodes);
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %d -> %d\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * Preallocate enough nodes to support one-per buf so that
	 * we can not fail an insert.  reassignbuf() callers can not
	 * tolerate the insertion failure.
	 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
	uma_prealloc(buf_trie_zone, nbuf);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");
	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
		vnsz2log++;
	vnsz2log--;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);


/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Note that mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock, it can block the caller.
 * vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
 * lock of any vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 * root fs                      var fs
 * / vnode lock         A       / vnode lock (/var)             D
 * /var vnode lock      B       /log vnode lock(/var/log)       E
 * vfs_busy lock        C       vfs_busy lock                   F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *   C->A->B
 *         |
 *         +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates the process:
 *  VOP_LOOKUP() obtains B while A is held
 *  vfs_busy() obtains a shared lock on F while A and B are held
 *  vput() releases lock on B
 *  vput() releases lock on A
 *  VFS_ROOT() obtains lock on D while shared lock on F is held
 *  vfs_unbusy() releases shared lock on F
 *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *    Attempt to lock A (instead of vp_crossmp) while D is held would
 *    violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained.
 */
int
vfs_busy(struct mount *mp, int flags)
{

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	MNT_ILOCK(mp);
	MNT_REF(mp);
	/*
	 * If the mount point is currently being unmounted, sleep until the
	 * mount point's fate is decided.
	 * If the thread doing the unmounting fails, it will clear the
	 * MNTK_UNMOUNT flag before waking us up, indicating that this mount
	 * point has survived the unmount attempt and vfs_busy should retry.
	 * Otherwise the unmounter thread will set the MNTK_REFEXPIRE flag in
	 * addition to MNTK_UNMOUNT, indicating that the mount point is about
	 * to be really destroyed.  vfs_busy needs to release its reference on
	 * the mount point in this case and return with ENOENT, telling the
	 * caller that the mount it tried to busy is no longer valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	MNT_ILOCK(mp);
	MNT_REL(mp);
	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
	mp->mnt_lockref--;
	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}

/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 *
 * To avoid congestion on mountlist_mtx, implement simple direct-mapped
 * cache for popular filesystem identifiers.  The cache is lockless, using
 * the fact that struct mount's are never freed.  In the worst case we may
 * get a pointer to an unmounted or even a different filesystem, so we
 * have to check what we got, and go the slow way if so.
 */
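/*
 * Illustrative example of the hashing done below (made-up fsid values):
 * for val[0] = 0x12345678 and val[1] = 0x9a, the xor gives 0x123456e2,
 * folding in the upper 16 bits gives 0x123444d6, and masking with
 * FSID_CACHE_SIZE - 1 selects slot 0xd6 (214) of the cache array.
 */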
struct mount *
vfs_busyfs(fsid_t *fsid)
{
#define	FSID_CACHE_SIZE	256
	typedef struct mount * volatile vmp_t;
	static vmp_t cache[FSID_CACHE_SIZE];
	struct mount *mp;
	int error;
	uint32_t hash;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	hash = fsid->val[0] ^ fsid->val[1];
	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
	mp = cache[hash];
	if (mp == NULL ||
	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
		goto slow;
	if (vfs_busy(mp, 0) != 0) {
		cache[hash] = NULL;
		goto slow;
	}
	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
		return (mp);
	else
		vfs_unbusy(mp);

slow:
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			error = vfs_busy(mp, MBF_MNTLSTLOCK);
			if (error) {
				cache[hash] = NULL;
				mtx_unlock(&mountlist_mtx);
				return (NULL);
			}
			cache[hash] = mp;
			return (mp);
		}
	}
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	/*
	 * If the thread is jailed, but this is not a jail-friendly file
	 * system, deny immediately.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
		return (EPERM);

	/*
	 * If the file system was mounted outside the jail of the calling
	 * thread, deny immediately.
	 */
	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
		return (EPERM);

	/*
	 * If the file system supports delegated administration, we don't
	 * check for the PRIV_VFS_MOUNT_OWNER privilege - it will be better
	 * verified by the file system itself.
	 * If this is not the user that did the original mount, we check for
	 * the PRIV_VFS_MOUNT_OWNER privilege.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers.  We end up with unique val[0]'s for
 * the first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8
 * calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
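/*
 * Sketch of the encoding used below (descriptive only): the minor number
 * handed to makedev() carries the filesystem type in its top byte, the
 * high byte of mntid_base in a middle byte and the low byte of mntid_base
 * in the low byte, so successive calls vary the low bits of val[0] while
 * different filesystem types differ in the high bits.
 */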
void
vfs_getnewfsid(struct mount *mp)
{
	static uint16_t mntid_base;
	struct mount *nmp;
	fsid_t tfsid;
	int mtype;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
			break;
		vfs_rel(nmp);
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL.
 */
void
vattr_null(struct vattr *vap)
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */
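/*
 * Rough scale of a single pass (illustrative numbers): with gapvnodes
 * close to desiredvnodes, the target computed below works out to about
 * one tenth of the mount's vnode list plus one, e.g. roughly 1001
 * vnodes for a mount with 10000 vnodes on its list.
 */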
static int
vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
{
	struct vnode *vp;
	int count, done, target;

	done = 0;
	vn_start_write(NULL, &mp, V_WAIT);
	MNT_ILOCK(mp);
	count = mp->mnt_nvnodelistsize;
	target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
	target = target / 10 + 1;
	while (count != 0 && done < target) {
		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
		while (vp != NULL && vp->v_type == VMARKER)
			vp = TAILQ_NEXT(vp, v_nmntvnodes);
		if (vp == NULL)
			break;
		/*
		 * XXX LRU is completely broken for non-free vnodes.  First
		 * by calling here in mountpoint order, then by moving
		 * unselected vnodes to the end here, and most grossly by
		 * removing the vlruvp() function that was supposed to
		 * maintain the order.  (This function was born broken
		 * since syncer problems prevented it doing anything.)  The
		 * order is closer to LRC (C = Created).
		 *
		 * LRU reclaiming of vnodes seems to have last worked in
		 * FreeBSD-3 where LRU wasn't mentioned under any spelling.
		 * Then there was no hold count, and inactive vnodes were
		 * simply put on the free list in LRU order.  The separate
		 * lists also break LRU.  We prefer to reclaim from the
		 * free list for technical reasons.  This tends to thrash
		 * the free list to keep very unrecently used held vnodes.
		 * The problem is mitigated by keeping the free list large.
		 */
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		--count;
		if (!VI_TRYLOCK(vp))
			goto next_iter;
		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes.  We are trying to make space
		 * to expand the free list, not reduce it.
		 */
		if (vp->v_usecount ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    ((vp->v_iflag & VI_FREE) != 0) ||
		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		MNT_IUNLOCK(mp);
		vholdl(vp);
		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
			vdrop(vp);
			goto next_iter_mntunlocked;
		}
		VI_LOCK(vp);
		/*
		 * v_usecount may have been bumped after VOP_LOCK() dropped
		 * the vnode interlock and before it was locked again.
		 *
		 * It is not necessary to recheck VI_DOOMED because it can
		 * only be set by another thread that holds both the vnode
		 * lock and vnode interlock.  If another thread has the
		 * vnode lock before we get to VOP_LOCK() and obtains the
		 * vnode interlock after VOP_LOCK() drops the vnode
		 * interlock, the other thread will be unable to drop the
		 * vnode lock before our VOP_LOCK() call fails.
		 */
		if (vp->v_usecount ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    (vp->v_iflag & VI_FREE) != 0 ||
		    (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp, LK_INTERLOCK);
			vdrop(vp);
			goto next_iter_mntunlocked;
		}
		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
		atomic_add_long(&recycles_count, 1);
		vgonel(vp);
		VOP_UNLOCK(vp, 0);
		vdropl(vp);
		done++;
next_iter_mntunlocked:
		if (!should_yield())
			goto relock_mnt;
		goto yield;
next_iter:
		if (!should_yield())
			continue;
		MNT_IUNLOCK(mp);
yield:
		kern_yield(PRI_USER);
relock_mnt:
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vn_finished_write(mp);
	return done;
}

/*
 * Attempt to reduce the free list by the requested amount.
 */
static void
vnlru_free(int count)
{
	struct vnode *vp;

	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	for (; count > 0; count--) {
		vp = TAILQ_FIRST(&vnode_free_list);
		/*
		 * The list can be modified while the free_list_mtx
		 * has been dropped and vp could be NULL here.
		 */
		if (!vp)
			break;
		VNASSERT(vp->v_op != NULL, vp,
		    ("vnlru_free: vnode already reclaimed."));
		KASSERT((vp->v_iflag & VI_FREE) != 0,
		    ("Removing vnode not on freelist"));
		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
		    ("Mangling active vnode"));
		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
		/*
		 * Don't recycle if we can't get the interlock.
		 */
		if (!VI_TRYLOCK(vp)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
			continue;
		}
		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
		    vp, ("vp inconsistent on freelist"));

		/*
		 * The clear of VI_FREE prevents activation of the
		 * vnode.  There is no sense in putting the vnode on
		 * the mount point active list, only to remove it
		 * later during recycling.  Inline the relevant part
		 * of vholdl(), to avoid triggering assertions or
		 * activating.
		 */
		freevnodes--;
		vp->v_iflag &= ~VI_FREE;
		refcount_acquire(&vp->v_holdcnt);

		mtx_unlock(&vnode_free_list_mtx);
		VI_UNLOCK(vp);
		vtryrecycle(vp);
		/*
		 * If the recycle succeeded, this vdrop will actually free
		 * the vnode.  If not it will simply place it back on
		 * the free list.
		 */
		vdrop(vp);
		mtx_lock(&vnode_free_list_mtx);
	}
}

/* XXX some names and initialization are bad for limits and watermarks. */
static int
vspace(void)
{
	int space;

	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
	vlowat = vhiwat / 2;
	if (numvnodes > desiredvnodes)
		return (0);
	space = desiredvnodes - numvnodes;
	if (freevnodes > wantfreevnodes)
		space += freevnodes - wantfreevnodes;
	return (space);
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
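/*
 * To make the watermarks concrete (illustrative numbers only): with
 * desiredvnodes = 100000 and wantfreevnodes = 25000, vspace() yields
 * gapvnodes = 75000, vhiwat = 6818 and vlowat = 3409, so the vnlru
 * process is woken once less than about 3409 vnodes worth of space
 * remains and keeps working until roughly 6818 are available again.
 */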
static struct proc *vnlruproc;
static int vnlruproc_sig;

static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	unsigned long ofreevnodes, onumvnodes;
	int done, force, reclaim_nc_src, trigger, usevnodes;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
	    SHUTDOWN_PRI_FIRST);

	force = 0;
	for (;;) {
		kproc_suspend_check(vnlruproc);
		mtx_lock(&vnode_free_list_mtx);
		/*
		 * If numvnodes is too large (due to desiredvnodes being
		 * adjusted using its sysctl, or emergency growth), first
		 * try to reduce it by discarding from the free list.
		 */
		if (numvnodes > desiredvnodes && freevnodes > 0)
			vnlru_free(ulmin(numvnodes - desiredvnodes,
			    freevnodes));
		/*
		 * Sleep if the vnode cache is in a good state.  This is
		 * when it is not over-full and has space for about a 4%
		 * or 9% expansion (by growing its size or inexcessively
		 * reducing its free list).  Otherwise, try to reclaim
		 * space for a 10% expansion.
		 */
		if (vstir && force == 0) {
			force = 1;
			vstir = 0;
		}
		if (vspace() >= vlowat && force == 0) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_free_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		mtx_unlock(&vnode_free_list_mtx);
		done = 0;
		ofreevnodes = freevnodes;
		onumvnodes = numvnodes;
		/*
		 * Calculate parameters for recycling.  These are the same
		 * throughout the loop to give some semblance of fairness.
		 * The trigger point is to avoid recycling vnodes with lots
		 * of resident pages.  We aren't trying to free memory; we
		 * are trying to recycle or at least free vnodes.
		 */
		if (numvnodes <= desiredvnodes)
			usevnodes = numvnodes - freevnodes;
		else
			usevnodes = numvnodes;
		if (usevnodes <= 0)
			usevnodes = 1;
		/*
		 * The trigger value is chosen to give a conservatively
		 * large value to ensure that it alone doesn't prevent
		 * making progress.  The value can easily be so large that
		 * it is effectively infinite in some congested and
		 * misconfigured cases, and this is necessary.  Normally
		 * it is about 8 to 100 (pages), which is quite large.
		 */
		trigger = vm_cnt.v_page_count * 2 / usevnodes;
		if (force < 2)
			trigger = vsmalltrigger;
		reclaim_nc_src = force >= 3;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			done += vlrureclaim(mp, reclaim_nc_src, trigger);
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp);
		}
		mtx_unlock(&mountlist_mtx);
		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
			uma_reclaim();
		if (done == 0) {
			if (force == 0 || force == 1) {
				force = 2;
				continue;
			}
			if (force == 2) {
				force = 3;
				continue;
			}
			force = 0;
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		} else
			kern_yield(PRI_USER);
		/*
		 * After becoming active to expand above low water, keep
		 * active until above high water.
		 */
		force = vspace() < vhiwat;
	}
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
    &vnlru_kp);

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 * before we actually vgone().  This function must be called with the vnode
 * held to prevent the vnode from being returned to the free list midway
 * through vgone().
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct mount *vnmp;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	VNASSERT(vp->v_holdcnt, vp,
	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list; if so we
	 * can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, vp %p lock is already held",
		    __func__, vp);
		return (EWOULDBLOCK);
	}
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp, 0);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, cannot start the write for %p",
		    __func__, vp);
		return (EBUSY);
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with DOOMED via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (vp->v_usecount) {
		VOP_UNLOCK(vp, LK_INTERLOCK);
		vn_finished_write(vnmp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, %p is already referenced",
		    __func__, vp);
		return (EBUSY);
	}
	if ((vp->v_iflag & VI_DOOMED) == 0) {
		atomic_add_long(&recycles_count, 1);
		vgonel(vp);
	}
	VOP_UNLOCK(vp, LK_INTERLOCK);
	vn_finished_write(vnmp);
	return (0);
}

static void
vcheckspace(void)
{

	if (vspace() < vlowat && vnlruproc_sig == 0) {
		vnlruproc_sig = 1;
		wakeup(vnlruproc);
	}
}

/*
 * Wait if necessary for space for a new vnode.
 */
static int
getnewvnode_wait(int suspended)
{

	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	if (numvnodes >= desiredvnodes) {
		if (suspended) {
			/*
			 * The file system is being suspended.  We cannot
			 * risk a deadlock here, so allow allocation of
			 * another vnode even if this would give too many.
			 */
			return (0);
		}
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
			wakeup(vnlruproc);
		}
		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
		    "vlruwk", hz);
	}
	/* Post-adjust like the pre-adjust in getnewvnode(). */
	if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
		vnlru_free(1);
	return (numvnodes >= desiredvnodes ? ENFILE : 0);
}

/*
 * This hack is fragile, and probably not needed any more now that the
 * watermark handling works.
 */
void
getnewvnode_reserve(u_int count)
{
	struct thread *td;

	/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
	/* XXX no longer so quick, but this part is not racy. */
	mtx_lock(&vnode_free_list_mtx);
	if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
		vnlru_free(ulmin(numvnodes + count - desiredvnodes,
		    freevnodes - wantfreevnodes));
	mtx_unlock(&vnode_free_list_mtx);

	td = curthread;
	/* First try to be quick and racy. */
	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
		td->td_vp_reserv += count;
		vcheckspace();	/* XXX no longer so quick, but more racy */
		return;
	} else
		atomic_subtract_long(&numvnodes, count);

	mtx_lock(&vnode_free_list_mtx);
	while (count > 0) {
		if (getnewvnode_wait(0) == 0) {
			count--;
			td->td_vp_reserv++;
			atomic_add_long(&numvnodes, 1);
		}
	}
	vcheckspace();
	mtx_unlock(&vnode_free_list_mtx);
}

/*
 * This hack is fragile, especially if desiredvnodes or wantfreevnodes are
 * misconfigured or changed significantly.  Reducing desiredvnodes below
 * the reserved amount should cause bizarre behaviour like reducing it
 * below the number of active vnodes -- the system will try to reduce
 * numvnodes to match, but should fail, so the subtraction below should
 * not overflow.
 */
void
getnewvnode_drop_reserve(void)
{
	struct thread *td;

	td = curthread;
	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
	td->td_vp_reserv = 0;
}

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp;
	struct thread *td;
	struct lock_object *lo;
	static int cyclecount;
	int error;

	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
	vp = NULL;
	td = curthread;
	if (td->td_vp_reserv > 0) {
		td->td_vp_reserv -= 1;
		goto alloc;
	}
	mtx_lock(&vnode_free_list_mtx);
	if (numvnodes < desiredvnodes)
		cyclecount = 0;
	else if (cyclecount++ >= freevnodes) {
		cyclecount = 0;
		vstir = 1;
	}
	/*
	 * Grow the vnode cache if it will not be above its target max
	 * after growing.  Otherwise, if the free list is nonempty, try
	 * to reclaim 1 item from it before growing the cache (possibly
	 * above its target max if the reclamation failed or is delayed).
	 * Otherwise, wait for some space.  In all cases, schedule
	 * vnlru_proc() if we are getting short of space.  The watermarks
	 * should be chosen so that we never wait or even reclaim from
	 * the free list to below its target minimum.
	 */
	if (numvnodes + 1 <= desiredvnodes)
		;
	else if (freevnodes > 0)
		vnlru_free(1);
	else {
		error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
		    MNTK_SUSPEND));
#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
		if (error != 0) {
			mtx_unlock(&vnode_free_list_mtx);
			return (error);
		}
#endif
	}
	vcheckspace();
	atomic_add_long(&numvnodes, 1);
	mtx_unlock(&vnode_free_list_mtx);
alloc:
	atomic_add_long(&vnodes_created, 1);
	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
	/*
	 * Locks are given the generic name "vnode" when created.
	 * Follow the historic practice of using the filesystem
	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
	 *
	 * Locks live in a witness group keyed on their name.
	 * Thus, when a lock is renamed, it must also move from the witness
	 * group of its old name to the witness group of its new name.
	 *
	 * The change only needs to be made when the vnode moves
	 * from one filesystem type to another.  We ensure that each
	 * filesystem uses a single static name pointer for its tag so
	 * that we can compare pointers rather than doing a strcmp().
	 */
	lo = &vp->v_vnlock->lock_object;
	if (lo->lo_name != tag) {
		lo->lo_name = tag;
		WITNESS_DESTROY(lo);
		WITNESS_INIT(lo, tag);
	}
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
	/*
	 * Finalize various vnode identity bits.
	 */
	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	v_init_counters(vp);
	vp->v_bufobj.bo_ops = &buf_ops_bio;
#ifdef MAC
	mac_vnode_init(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_vnode_associate_singlelabel(mp, vp);
	else if (mp == NULL && vops != &dead_vnodeops)
		printf("NULL mp in getnewvnode()\n");
#endif
	if (mp != NULL) {
		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
			vp->v_vflag |= VV_NOKNOTE;
	}

	/*
	 * For the filesystems which do not use vfs_hash_insert(),
	 * still initialize v_hash to have vfs_hash_index() useful.
	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
	 * its own hashing.
	 */
	vp->v_hash = (uintptr_t)vp >> vnsz2log;

	*vpp = vp;
	return (0);
}

/*
 * Delete from old mount point vnode list, if on one.
 */
static void
delmntque(struct vnode *vp)
{
	struct mount *mp;
	int active;

	mp = vp->v_mount;
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	VI_LOCK(vp);
	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
	    ("Active vnode list size %d > Vnode list size %d",
	    mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
	active = vp->v_iflag & VI_ACTIVE;
	vp->v_iflag &= ~VI_ACTIVE;
	if (active) {
		mtx_lock(&vnode_free_list_mtx);
		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
		mp->mnt_activevnodelistsize--;
		mtx_unlock(&vnode_free_list_mtx);
	}
	vp->v_mount = NULL;
	VI_UNLOCK(vp);
	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
	    ("bad mount point vnode list size"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize--;
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
}

static void
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
{

	vp->v_data = NULL;
	vp->v_op = &dead_vnodeops;
	vgone(vp);
	vput(vp);
}

/*
 * Insert into list of vnodes for the new mount point, if available.
 */
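/*
 * Typical call sequence in a filesystem's vget routine (a sketch only;
 * "myfs" and myfs_vnodeops are placeholders, not names from this file):
 *
 *	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
 *	if (error != 0)
 *		return (error);
 *	...
 *	error = insmntque(vp, mp);
 *	if (error != 0)
 *		return (error);
 *
 * On failure insmntque() has already run the default destructor, which
 * vgone()s and vput()s the vnode, so the caller must not touch vp again.
 */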
int
insmntque1(struct vnode *vp, struct mount *mp,
	void (*dtr)(struct vnode *, void *), void *dtr_arg)
{

	KASSERT(vp->v_mount == NULL,
		("insmntque: vnode already on per mount vnode list"));
	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");

	/*
	 * We acquire the vnode interlock early to ensure that the
	 * vnode cannot be recycled by another process releasing a
	 * holdcnt on it before we get it on both the vnode list
	 * and the active vnode list.  The mount mutex protects only
	 * manipulation of the vnode list and the vnode freelist
	 * mutex protects only manipulation of the active vnode list.
	 * Hence the need to hold the vnode interlock throughout.
	 */
	MNT_ILOCK(mp);
	VI_LOCK(vp);
	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
	    mp->mnt_nvnodelistsize == 0)) &&
	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
		VI_UNLOCK(vp);
		MNT_IUNLOCK(mp);
		if (dtr != NULL)
			dtr(vp, dtr_arg);
		return (EBUSY);
	}
	vp->v_mount = mp;
	MNT_REF(mp);
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
	    ("neg mount point vnode list size"));
	mp->mnt_nvnodelistsize++;
	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
	    ("Activating already active vnode"));
	vp->v_iflag |= VI_ACTIVE;
	mtx_lock(&vnode_free_list_mtx);
	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
	mp->mnt_activevnodelistsize++;
	mtx_unlock(&vnode_free_list_mtx);
	VI_UNLOCK(vp);
	MNT_IUNLOCK(mp);
	return (0);
}

int
insmntque(struct vnode *vp, struct mount *mp)
{

	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
}

/*
 * Flush out and invalidate all buffers associated with a bufobj.
 * Called with the underlying object locked.
 */
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
{
	int error;

	BO_LOCK(bo);
	if (flags & V_SAVE) {
		error = bufobj_wwait(bo, slpflag, slptimeo);
		if (error) {
			BO_UNLOCK(bo);
			return (error);
		}
		if (bo->bo_dirty.bv_cnt > 0) {
			BO_UNLOCK(bo);
			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
				return (error);
			/*
			 * XXX We could save a lock/unlock if this was only
			 * enabled under INVARIANTS
			 */
			BO_LOCK(bo);
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("vinvalbuf: dirty bufs");
		}
	}
	/*
	 * If you alter this loop please notice that interlock is dropped and
	 * reacquired in flushbuflist.  Special care is needed to ensure that
	 * no race conditions occur from this.
	 */
	do {
		error = flushbuflist(&bo->bo_clean,
		    flags, bo, slpflag, slptimeo);
		if (error == 0 && !(flags & V_CLEANONLY))
			error = flushbuflist(&bo->bo_dirty,
			    flags, bo, slpflag, slptimeo);
		if (error != 0 && error != EAGAIN) {
			BO_UNLOCK(bo);
			return (error);
		}
	} while (error != 0);

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		bufobj_wwait(bo, 0, 0);
		BO_UNLOCK(bo);
		if (bo->bo_object != NULL) {
			VM_OBJECT_WLOCK(bo->bo_object);
			vm_object_pip_wait(bo->bo_object, "bovlbx");
			VM_OBJECT_WUNLOCK(bo->bo_object);
		}
		BO_LOCK(bo);
	} while (bo->bo_numoutput > 0);
	BO_UNLOCK(bo);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if (bo->bo_object != NULL &&
	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
		VM_OBJECT_WLOCK(bo->bo_object);
		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
		    OBJPR_CLEANONLY : 0);
		VM_OBJECT_WUNLOCK(bo->bo_object);
	}

#ifdef INVARIANTS
	BO_LOCK(bo);
	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
		panic("vinvalbuf: flush failed");
	BO_UNLOCK(bo);
#endif
	return (0);
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
{

	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
	if (vp->v_object != NULL && vp->v_object->handle != vp)
		return (0);
	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
}

/*
 * Flush out buffers on the specified list.
 */
static int
flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
    int slptimeo)
{
	struct buf *bp, *nbp;
	int retval, error;
	daddr_t lblkno;
	b_xflags_t xflags;

	ASSERT_BO_WLOCKED(bo);

	retval = 0;
	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
			continue;
		}
		lblkno = 0;
		xflags = 0;
		if (nbp != NULL) {
			lblkno = nbp->b_lblkno;
			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
		}
		retval = EAGAIN;
		error = BUF_TIMELOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
		    "flushbuf", slpflag, slptimeo);
		if (error) {
			BO_LOCK(bo);
			return (error != ENOLCK ? error : EAGAIN);
		}
		KASSERT(bp->b_bufobj == bo,
		    ("bp %p wrong b_bufobj %p should be %p",
		    bp, bp->b_bufobj, bo));
		/*
		 * XXX Since there are no node locks for NFS, I
		 * believe there is a slight chance that a delayed
		 * write will occur while sleeping just above, so
		 * check for it.
		 */
		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
		    (flags & V_SAVE)) {
			bremfree(bp);
			bp->b_flags |= B_ASYNC;
			bwrite(bp);
			BO_LOCK(bo);
			return (EAGAIN);	/* XXX: why not loop ? */
		}
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
		BO_LOCK(bo);
		nbp = gbincore(bo, lblkno);
		if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
		    != xflags)
			break;			/* nbp invalid */
	}
	return (retval);
}

int
bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
{
	struct buf *bp;
	int error;
	daddr_t lblkno;

	ASSERT_BO_LOCKED(bo);

	for (lblkno = startn;; lblkno++) {
		bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
		if (bp == NULL || bp->b_lblkno >= endn)
			break;
		error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
		    LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
		if (error != 0) {
			BO_RLOCK(bo);
			return (error != ENOLCK ? error : EAGAIN);
		}
		KASSERT(bp->b_bufobj == bo,
		    ("bp %p wrong b_bufobj %p should be %p",
		    bp, bp->b_bufobj, bo));
		if ((bp->b_flags & B_MANAGED) == 0)
			bremfree(bp);
		bp->b_flags |= B_RELBUF;
		/*
		 * In the VMIO case, use the B_NOREUSE flag to hint that the
		 * pages backing each buffer in the range are unlikely to be
		 * reused.  Dirty buffers will have the hint applied once
		 * they've been written.
		 */
		if (bp->b_vp->v_object != NULL)
			bp->b_flags |= B_NOREUSE;
		brelse(bp);
		BO_RLOCK(bo);
	}
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
{
	struct buf *bp, *nbp;
	int anyfreed;
	int trunclbn;
	struct bufobj *bo;

	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
	    vp, cred, blksize, (uintmax_t)length);

	/*
	 * Round up to the *next* lbn.
	 */
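	/*
	 * For example (illustrative numbers): with length 10000 and
	 * blksize 4096, trunclbn below becomes 3, so buffers for logical
	 * blocks 0-2 are kept and blocks 3 and beyond are invalidated.
	 */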
	trunclbn = (length + blksize - 1) / blksize;

	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
restart:
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < trunclbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) == ENOLCK)
				goto restart;

			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = 1;

			BO_LOCK(bo);
			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI))) {
				BO_UNLOCK(bo);
				goto restart;
			}
		}

		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < trunclbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) == ENOLCK)
				goto restart;
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = 1;

			BO_LOCK(bo);
			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI) == 0)) {
				BO_UNLOCK(bo);
				goto restart;
			}
		}
	}

	if (length > 0) {
restartsync:
		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno > 0)
				continue;
			/*
			 * Since we hold the vnode lock this should only
			 * fail if we're racing with the buf daemon.
			 */
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) == ENOLCK) {
				goto restart;
			}
			VNASSERT((bp->b_flags & B_DELWRI), vp,
			    ("buf(%p) on dirty queue without DELWRI", bp));

			bremfree(bp);
			bawrite(bp);
			BO_LOCK(bo);
			goto restartsync;
		}
	}

	bufobj_wwait(bo, 0, 0);
	BO_UNLOCK(bo);
	vnode_pager_setsize(vp, length);

	return (0);
}

static void
buf_vlist_remove(struct buf *bp)
{
	struct bufv *bv;

	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	ASSERT_BO_WLOCKED(bp->b_bufobj);
	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
	    (BX_VNDIRTY|BX_VNCLEAN),
	    ("buf_vlist_remove: Buf %p is on two lists", bp));
	if (bp->b_xflags & BX_VNDIRTY)
		bv = &bp->b_bufobj->bo_dirty;
	else
		bv = &bp->b_bufobj->bo_clean;
	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
	bv->bv_cnt--;
	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}

/*
 * Add the buffer to the sorted clean or dirty block list.
 *
 * NOTE: xflags is passed as a constant, optimizing this inline function!
 */
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
{
	struct bufv *bv;
	struct buf *n;
	int error;

	ASSERT_BO_WLOCKED(bo);
	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
	    ("dead bo %p", bo));
	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
	bp->b_xflags |= xflags;
	if (xflags & BX_VNDIRTY)
		bv = &bo->bo_dirty;
	else
		bv = &bo->bo_clean;

	/*
	 * Keep the list ordered.  Optimize empty list insertion.
Assume 1854 * we tend to grow at the tail so lookup_le should usually be cheaper 1855 * than _ge. 1856 */ 1857 if (bv->bv_cnt == 0 || 1858 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 1859 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 1860 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 1861 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 1862 else 1863 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 1864 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 1865 if (error) 1866 panic("buf_vlist_add: Preallocated nodes insufficient."); 1867 bv->bv_cnt++; 1868 } 1869 1870 /* 1871 * Look up a buffer using the buffer tries. 1872 */ 1873 struct buf * 1874 gbincore(struct bufobj *bo, daddr_t lblkno) 1875 { 1876 struct buf *bp; 1877 1878 ASSERT_BO_LOCKED(bo); 1879 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 1880 if (bp != NULL) 1881 return (bp); 1882 return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); 1883 } 1884 1885 /* 1886 * Associate a buffer with a vnode. 1887 */ 1888 void 1889 bgetvp(struct vnode *vp, struct buf *bp) 1890 { 1891 struct bufobj *bo; 1892 1893 bo = &vp->v_bufobj; 1894 ASSERT_BO_WLOCKED(bo); 1895 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 1896 1897 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 1898 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 1899 ("bgetvp: bp already attached! %p", bp)); 1900 1901 vhold(vp); 1902 bp->b_vp = vp; 1903 bp->b_bufobj = bo; 1904 /* 1905 * Insert onto list for new vnode. 1906 */ 1907 buf_vlist_add(bp, bo, BX_VNCLEAN); 1908 } 1909 1910 /* 1911 * Disassociate a buffer from a vnode. 1912 */ 1913 void 1914 brelvp(struct buf *bp) 1915 { 1916 struct bufobj *bo; 1917 struct vnode *vp; 1918 1919 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1920 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 1921 1922 /* 1923 * Delete from old vnode list, if on one. 1924 */ 1925 vp = bp->b_vp; /* XXX */ 1926 bo = bp->b_bufobj; 1927 BO_LOCK(bo); 1928 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 1929 buf_vlist_remove(bp); 1930 else 1931 panic("brelvp: Buffer %p not on queue.", bp); 1932 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 1933 bo->bo_flag &= ~BO_ONWORKLST; 1934 mtx_lock(&sync_mtx); 1935 LIST_REMOVE(bo, bo_synclist); 1936 syncer_worklist_len--; 1937 mtx_unlock(&sync_mtx); 1938 } 1939 bp->b_vp = NULL; 1940 bp->b_bufobj = NULL; 1941 BO_UNLOCK(bo); 1942 vdrop(vp); 1943 } 1944 1945 /* 1946 * Add an item to the syncer work queue. 
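 * The delay argument is the number of seconds before the syncer should
 * visit this bufobj again; it is clamped to syncer_maxdelay - 2 and the
 * bufobj is hashed onto the wheel at slot
 * (syncer_delayno + delay) & syncer_mask, i.e. roughly `delay' slots
 * ahead of the slot the syncer is currently draining.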
1947 */ 1948 static void 1949 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 1950 { 1951 int slot; 1952 1953 ASSERT_BO_WLOCKED(bo); 1954 1955 mtx_lock(&sync_mtx); 1956 if (bo->bo_flag & BO_ONWORKLST) 1957 LIST_REMOVE(bo, bo_synclist); 1958 else { 1959 bo->bo_flag |= BO_ONWORKLST; 1960 syncer_worklist_len++; 1961 } 1962 1963 if (delay > syncer_maxdelay - 2) 1964 delay = syncer_maxdelay - 2; 1965 slot = (syncer_delayno + delay) & syncer_mask; 1966 1967 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 1968 mtx_unlock(&sync_mtx); 1969 } 1970 1971 static int 1972 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 1973 { 1974 int error, len; 1975 1976 mtx_lock(&sync_mtx); 1977 len = syncer_worklist_len - sync_vnode_count; 1978 mtx_unlock(&sync_mtx); 1979 error = SYSCTL_OUT(req, &len, sizeof(len)); 1980 return (error); 1981 } 1982 1983 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, 1984 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 1985 1986 static struct proc *updateproc; 1987 static void sched_sync(void); 1988 static struct kproc_desc up_kp = { 1989 "syncer", 1990 sched_sync, 1991 &updateproc 1992 }; 1993 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 1994 1995 static int 1996 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 1997 { 1998 struct vnode *vp; 1999 struct mount *mp; 2000 2001 *bo = LIST_FIRST(slp); 2002 if (*bo == NULL) 2003 return (0); 2004 vp = (*bo)->__bo_vnode; /* XXX */ 2005 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2006 return (1); 2007 /* 2008 * We use vhold in case the vnode does not 2009 * successfully sync. vhold prevents the vnode from 2010 * going away when we unlock the sync_mtx so that 2011 * we can acquire the vnode interlock. 2012 */ 2013 vholdl(vp); 2014 mtx_unlock(&sync_mtx); 2015 VI_UNLOCK(vp); 2016 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2017 vdrop(vp); 2018 mtx_lock(&sync_mtx); 2019 return (*bo == LIST_FIRST(slp)); 2020 } 2021 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2022 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2023 VOP_UNLOCK(vp, 0); 2024 vn_finished_write(mp); 2025 BO_LOCK(*bo); 2026 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2027 /* 2028 * Put us back on the worklist. The worklist 2029 * routine will remove us from our current 2030 * position and then add us back in at a later 2031 * position. 2032 */ 2033 vn_syncer_add_to_worklist(*bo, syncdelay); 2034 } 2035 BO_UNLOCK(*bo); 2036 vdrop(vp); 2037 mtx_lock(&sync_mtx); 2038 return (0); 2039 } 2040 2041 static int first_printf = 1; 2042 2043 /* 2044 * System filesystem synchronizer daemon. 
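 * Roughly speaking, the daemon advances one slot of the worklist wheel
 * per second and lazily fsyncs every vnode found in that slot; a vnode
 * that still has dirty buffers afterwards is simply rehashed syncdelay
 * seconds further along. On shutdown it steps through the
 * SYNCER_RUNNING -> SYNCER_SHUTTING_DOWN -> SYNCER_FINAL_DELAY states
 * to drain the remaining work before the machine goes down.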
2045 */ 2046 static void 2047 sched_sync(void) 2048 { 2049 struct synclist *next, *slp; 2050 struct bufobj *bo; 2051 long starttime; 2052 struct thread *td = curthread; 2053 int last_work_seen; 2054 int net_worklist_len; 2055 int syncer_final_iter; 2056 int error; 2057 2058 last_work_seen = 0; 2059 syncer_final_iter = 0; 2060 syncer_state = SYNCER_RUNNING; 2061 starttime = time_uptime; 2062 td->td_pflags |= TDP_NORUNNINGBUF; 2063 2064 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2065 SHUTDOWN_PRI_LAST); 2066 2067 mtx_lock(&sync_mtx); 2068 for (;;) { 2069 if (syncer_state == SYNCER_FINAL_DELAY && 2070 syncer_final_iter == 0) { 2071 mtx_unlock(&sync_mtx); 2072 kproc_suspend_check(td->td_proc); 2073 mtx_lock(&sync_mtx); 2074 } 2075 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2076 if (syncer_state != SYNCER_RUNNING && 2077 starttime != time_uptime) { 2078 if (first_printf) { 2079 printf("\nSyncing disks, vnodes remaining..."); 2080 first_printf = 0; 2081 } 2082 printf("%d ", net_worklist_len); 2083 } 2084 starttime = time_uptime; 2085 2086 /* 2087 * Push files whose dirty time has expired. Be careful 2088 * of interrupt race on slp queue. 2089 * 2090 * Skip over empty worklist slots when shutting down. 2091 */ 2092 do { 2093 slp = &syncer_workitem_pending[syncer_delayno]; 2094 syncer_delayno += 1; 2095 if (syncer_delayno == syncer_maxdelay) 2096 syncer_delayno = 0; 2097 next = &syncer_workitem_pending[syncer_delayno]; 2098 /* 2099 * If the worklist has wrapped since the 2100 * it was emptied of all but syncer vnodes, 2101 * switch to the FINAL_DELAY state and run 2102 * for one more second. 2103 */ 2104 if (syncer_state == SYNCER_SHUTTING_DOWN && 2105 net_worklist_len == 0 && 2106 last_work_seen == syncer_delayno) { 2107 syncer_state = SYNCER_FINAL_DELAY; 2108 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2109 } 2110 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2111 syncer_worklist_len > 0); 2112 2113 /* 2114 * Keep track of the last time there was anything 2115 * on the worklist other than syncer vnodes. 2116 * Return to the SHUTTING_DOWN state if any 2117 * new work appears. 2118 */ 2119 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 2120 last_work_seen = syncer_delayno; 2121 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 2122 syncer_state = SYNCER_SHUTTING_DOWN; 2123 while (!LIST_EMPTY(slp)) { 2124 error = sync_vnode(slp, &bo, td); 2125 if (error == 1) { 2126 LIST_REMOVE(bo, bo_synclist); 2127 LIST_INSERT_HEAD(next, bo, bo_synclist); 2128 continue; 2129 } 2130 2131 if (first_printf == 0) { 2132 /* 2133 * Drop the sync mutex, because some watchdog 2134 * drivers need to sleep while patting 2135 */ 2136 mtx_unlock(&sync_mtx); 2137 wdog_kern_pat(WD_LASTVAL); 2138 mtx_lock(&sync_mtx); 2139 } 2140 2141 } 2142 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 2143 syncer_final_iter--; 2144 /* 2145 * The variable rushjob allows the kernel to speed up the 2146 * processing of the filesystem syncer process. A rushjob 2147 * value of N tells the filesystem syncer to process the next 2148 * N seconds worth of work on its queue ASAP. Currently rushjob 2149 * is used by the soft update code to speed up the filesystem 2150 * syncer process when the incore state is getting so far 2151 * ahead of the disk that the kernel memory pool is being 2152 * threatened with exhaustion. 
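 * (For illustration: speedup_syncer() below bumps rushjob by one, capped
 * at syncdelay / 2, and every pass through this loop consumes one unit
 * while skipping the one-second sleep, so several slots of the wheel are
 * processed back to back.)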
2153 */ 2154 if (rushjob > 0) { 2155 rushjob -= 1; 2156 continue; 2157 } 2158 /* 2159 * Just sleep for a short period of time between 2160 * iterations when shutting down to allow some I/O 2161 * to happen. 2162 * 2163 * If it has taken us less than a second to process the 2164 * current work, then wait. Otherwise start right over 2165 * again. We can still lose time if any single round 2166 * takes more than two seconds, but it does not really 2167 * matter as we are just trying to generally pace the 2168 * filesystem activity. 2169 */ 2170 if (syncer_state != SYNCER_RUNNING || 2171 time_uptime == starttime) { 2172 thread_lock(td); 2173 sched_prio(td, PPAUSE); 2174 thread_unlock(td); 2175 } 2176 if (syncer_state != SYNCER_RUNNING) 2177 cv_timedwait(&sync_wakeup, &sync_mtx, 2178 hz / SYNCER_SHUTDOWN_SPEEDUP); 2179 else if (time_uptime == starttime) 2180 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 2181 } 2182 } 2183 2184 /* 2185 * Request the syncer daemon to speed up its work. 2186 * We never push it to speed up more than half of its 2187 * normal turn time, otherwise it could take over the cpu. 2188 */ 2189 int 2190 speedup_syncer(void) 2191 { 2192 int ret = 0; 2193 2194 mtx_lock(&sync_mtx); 2195 if (rushjob < syncdelay / 2) { 2196 rushjob += 1; 2197 stat_rush_requests += 1; 2198 ret = 1; 2199 } 2200 mtx_unlock(&sync_mtx); 2201 cv_broadcast(&sync_wakeup); 2202 return (ret); 2203 } 2204 2205 /* 2206 * Tell the syncer to speed up its work and run though its work 2207 * list several times, then tell it to shut down. 2208 */ 2209 static void 2210 syncer_shutdown(void *arg, int howto) 2211 { 2212 2213 if (howto & RB_NOSYNC) 2214 return; 2215 mtx_lock(&sync_mtx); 2216 syncer_state = SYNCER_SHUTTING_DOWN; 2217 rushjob = 0; 2218 mtx_unlock(&sync_mtx); 2219 cv_broadcast(&sync_wakeup); 2220 kproc_shutdown(arg, howto); 2221 } 2222 2223 void 2224 syncer_suspend(void) 2225 { 2226 2227 syncer_shutdown(updateproc, 0); 2228 } 2229 2230 void 2231 syncer_resume(void) 2232 { 2233 2234 mtx_lock(&sync_mtx); 2235 first_printf = 1; 2236 syncer_state = SYNCER_RUNNING; 2237 mtx_unlock(&sync_mtx); 2238 cv_broadcast(&sync_wakeup); 2239 kproc_resume(updateproc); 2240 } 2241 2242 /* 2243 * Reassign a buffer from one vnode to another. 2244 * Used to assign file specific control information 2245 * (indirect blocks) to the vnode to which they belong. 2246 */ 2247 void 2248 reassignbuf(struct buf *bp) 2249 { 2250 struct vnode *vp; 2251 struct bufobj *bo; 2252 int delay; 2253 #ifdef INVARIANTS 2254 struct bufv *bv; 2255 #endif 2256 2257 vp = bp->b_vp; 2258 bo = bp->b_bufobj; 2259 ++reassignbufcalls; 2260 2261 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2262 bp, bp->b_vp, bp->b_flags); 2263 /* 2264 * B_PAGING flagged buffers cannot be reassigned because their vp 2265 * is not fully linked in. 2266 */ 2267 if (bp->b_flags & B_PAGING) 2268 panic("cannot reassign paging buffer"); 2269 2270 /* 2271 * Delete from old vnode list, if on one. 2272 */ 2273 BO_LOCK(bo); 2274 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2275 buf_vlist_remove(bp); 2276 else 2277 panic("reassignbuf: Buffer %p not on queue.", bp); 2278 /* 2279 * If dirty, put on list of dirty buffers; otherwise insert onto list 2280 * of clean buffers. 
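 * A dirty buffer also places the bufobj on the syncer worklist if it is
 * not there yet; the delay chosen below depends on the vnode type
 * (dirdelay for directories, metadelay for VCHR vnodes carrying
 * metadata, filedelay for everything else), which with the stock
 * tunables staggers metadata flushes slightly ahead of file data.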
2281 */ 2282 if (bp->b_flags & B_DELWRI) { 2283 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2284 switch (vp->v_type) { 2285 case VDIR: 2286 delay = dirdelay; 2287 break; 2288 case VCHR: 2289 delay = metadelay; 2290 break; 2291 default: 2292 delay = filedelay; 2293 } 2294 vn_syncer_add_to_worklist(bo, delay); 2295 } 2296 buf_vlist_add(bp, bo, BX_VNDIRTY); 2297 } else { 2298 buf_vlist_add(bp, bo, BX_VNCLEAN); 2299 2300 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2301 mtx_lock(&sync_mtx); 2302 LIST_REMOVE(bo, bo_synclist); 2303 syncer_worklist_len--; 2304 mtx_unlock(&sync_mtx); 2305 bo->bo_flag &= ~BO_ONWORKLST; 2306 } 2307 } 2308 #ifdef INVARIANTS 2309 bv = &bo->bo_clean; 2310 bp = TAILQ_FIRST(&bv->bv_hd); 2311 KASSERT(bp == NULL || bp->b_bufobj == bo, 2312 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2313 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2314 KASSERT(bp == NULL || bp->b_bufobj == bo, 2315 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2316 bv = &bo->bo_dirty; 2317 bp = TAILQ_FIRST(&bv->bv_hd); 2318 KASSERT(bp == NULL || bp->b_bufobj == bo, 2319 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2320 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2321 KASSERT(bp == NULL || bp->b_bufobj == bo, 2322 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2323 #endif 2324 BO_UNLOCK(bo); 2325 } 2326 2327 /* 2328 * A temporary hack until refcount_* APIs are sorted out. 2329 */ 2330 static __inline int 2331 vfs_refcount_acquire_if_not_zero(volatile u_int *count) 2332 { 2333 u_int old; 2334 2335 for (;;) { 2336 old = *count; 2337 if (old == 0) 2338 return (0); 2339 if (atomic_cmpset_int(count, old, old + 1)) 2340 return (1); 2341 } 2342 } 2343 2344 static __inline int 2345 vfs_refcount_release_if_not_last(volatile u_int *count) 2346 { 2347 u_int old; 2348 2349 for (;;) { 2350 old = *count; 2351 if (old == 1) 2352 return (0); 2353 if (atomic_cmpset_int(count, old, old - 1)) 2354 return (1); 2355 } 2356 } 2357 2358 static void 2359 v_init_counters(struct vnode *vp) 2360 { 2361 2362 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2363 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2364 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2365 2366 refcount_init(&vp->v_holdcnt, 1); 2367 refcount_init(&vp->v_usecount, 1); 2368 } 2369 2370 /* 2371 * Increment the use and hold counts on the vnode, taking care to reference 2372 * the driver's usecount if this is a chardev. The _vhold() will remove 2373 * the vnode from the free list if it is presently free. 2374 */ 2375 static void 2376 v_incr_usecount(struct vnode *vp) 2377 { 2378 2379 ASSERT_VI_UNLOCKED(vp, __func__); 2380 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2381 2382 if (vp->v_type == VCHR) { 2383 VI_LOCK(vp); 2384 _vhold(vp, true); 2385 if (vp->v_iflag & VI_OWEINACT) { 2386 VNASSERT(vp->v_usecount == 0, vp, 2387 ("vnode with usecount and VI_OWEINACT set")); 2388 vp->v_iflag &= ~VI_OWEINACT; 2389 } 2390 refcount_acquire(&vp->v_usecount); 2391 v_incr_devcount(vp); 2392 VI_UNLOCK(vp); 2393 return; 2394 } 2395 2396 _vhold(vp, false); 2397 if (vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) { 2398 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2399 ("vnode with usecount and VI_OWEINACT set")); 2400 } else { 2401 VI_LOCK(vp); 2402 if (vp->v_iflag & VI_OWEINACT) 2403 vp->v_iflag &= ~VI_OWEINACT; 2404 refcount_acquire(&vp->v_usecount); 2405 VI_UNLOCK(vp); 2406 } 2407 } 2408 2409 /* 2410 * Increment si_usecount of the associated device, if any. 
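 * The per-cdev si_usecount aggregates the references held through every
 * vnode alias of the device, which is what vcount() and count_dev()
 * report further below. The vnode interlock keeps it in step with
 * v_usecount; dev_lock() protects the field itself.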
2411 */ 2412 static void 2413 v_incr_devcount(struct vnode *vp) 2414 { 2415 2416 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2417 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2418 dev_lock(); 2419 vp->v_rdev->si_usecount++; 2420 dev_unlock(); 2421 } 2422 } 2423 2424 /* 2425 * Decrement si_usecount of the associated device, if any. 2426 */ 2427 static void 2428 v_decr_devcount(struct vnode *vp) 2429 { 2430 2431 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2432 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2433 dev_lock(); 2434 vp->v_rdev->si_usecount--; 2435 dev_unlock(); 2436 } 2437 } 2438 2439 /* 2440 * Grab a particular vnode from the free list, increment its 2441 * reference count and lock it. VI_DOOMED is set if the vnode 2442 * is being destroyed. Only callers who specify LK_RETRY will 2443 * see doomed vnodes. If inactive processing was delayed in 2444 * vput, try to do it here. 2445 * 2446 * Notes on lockless counter manipulation: 2447 * _vhold, vputx and other routines make various decisions based 2448 * on either holdcnt or usecount being 0. As long as either counter 2449 * is not transitioning 0->1 or 1->0, the manipulation can be done 2450 * with atomic operations. Otherwise the interlock is taken. 2451 */ 2452 int 2453 vget(struct vnode *vp, int flags, struct thread *td) 2454 { 2455 int error, oweinact; 2456 2457 VNASSERT((flags & LK_TYPE_MASK) != 0, vp, 2458 ("vget: invalid lock operation")); 2459 2460 if ((flags & LK_INTERLOCK) != 0) 2461 ASSERT_VI_LOCKED(vp, __func__); 2462 else 2463 ASSERT_VI_UNLOCKED(vp, __func__); 2464 if ((flags & LK_VNHELD) != 0) 2465 VNASSERT((vp->v_holdcnt > 0), vp, 2466 ("vget: LK_VNHELD passed but vnode not held")); 2467 2468 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2469 2470 if ((flags & LK_VNHELD) == 0) 2471 _vhold(vp, (flags & LK_INTERLOCK) != 0); 2472 2473 if ((error = vn_lock(vp, flags)) != 0) { 2474 vdrop(vp); 2475 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 2476 vp); 2477 return (error); 2478 } 2479 if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) 2480 panic("vget: vn_lock failed to return ENOENT\n"); 2481 /* 2482 * We don't guarantee that any particular close will 2483 * trigger inactive processing so just make a best effort 2484 * here at preventing a reference to a removed file. If 2485 * we don't succeed, no harm is done. 2486 * 2487 * Upgrade our holdcnt to a usecount. 2488 */ 2489 if (vp->v_type != VCHR && 2490 vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) { 2491 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2492 ("vnode with usecount and VI_OWEINACT set")); 2493 } else { 2494 VI_LOCK(vp); 2495 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2496 oweinact = 0; 2497 } else { 2498 oweinact = 1; 2499 vp->v_iflag &= ~VI_OWEINACT; 2500 } 2501 refcount_acquire(&vp->v_usecount); 2502 v_incr_devcount(vp); 2503 if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE && 2504 (flags & LK_NOWAIT) == 0) 2505 vinactive(vp, td); 2506 VI_UNLOCK(vp); 2507 } 2508 return (0); 2509 } 2510 2511 /* 2512 * Increase the reference count of a vnode. 2513 */ 2514 void 2515 vref(struct vnode *vp) 2516 { 2517 2518 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2519 v_incr_usecount(vp); 2520 } 2521 2522 /* 2523 * Return reference count of a vnode. 2524 * 2525 * The results of this call are only guaranteed when some mechanism is used to 2526 * stop other processes from gaining references to the vnode. This may be the 2527 * case if the caller holds the only reference.
This is also useful when stale 2528 * data is acceptable, as race conditions may be accounted for by some other 2529 * means. 2530 */ 2531 int 2532 vrefcnt(struct vnode *vp) 2533 { 2534 2535 return (vp->v_usecount); 2536 } 2537 2538 #define VPUTX_VRELE 1 2539 #define VPUTX_VPUT 2 2540 #define VPUTX_VUNREF 3 2541 2542 /* 2543 * Decrement the use and hold counts for a vnode. 2544 * 2545 * See the explanation near vget() as to why the atomic operation is safe. 2546 */ 2547 static void 2548 vputx(struct vnode *vp, int func) 2549 { 2550 int error; 2551 2552 KASSERT(vp != NULL, ("vputx: null vp")); 2553 if (func == VPUTX_VUNREF) 2554 ASSERT_VOP_LOCKED(vp, "vunref"); 2555 else if (func == VPUTX_VPUT) 2556 ASSERT_VOP_LOCKED(vp, "vput"); 2557 else 2558 KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); 2559 ASSERT_VI_UNLOCKED(vp, __func__); 2560 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2561 2562 if (vp->v_type != VCHR && 2563 vfs_refcount_release_if_not_last(&vp->v_usecount)) { 2564 if (func == VPUTX_VPUT) 2565 VOP_UNLOCK(vp, 0); 2566 vdrop(vp); 2567 return; 2568 } 2569 2570 VI_LOCK(vp); 2571 2572 /* 2573 * We want to hold the vnode until the inactive finishes to 2574 * prevent vgone() races. We drop the use count here and the 2575 * hold count below when we're done. 2576 */ 2577 if (!refcount_release(&vp->v_usecount) || 2578 (vp->v_iflag & VI_DOINGINACT)) { 2579 if (func == VPUTX_VPUT) 2580 VOP_UNLOCK(vp, 0); 2581 v_decr_devcount(vp); 2582 vdropl(vp); 2583 return; 2584 } 2585 2586 v_decr_devcount(vp); 2587 2588 error = 0; 2589 2590 if (vp->v_usecount != 0) { 2591 vprint("vputx: usecount not zero", vp); 2592 panic("vputx: usecount not zero"); 2593 } 2594 2595 CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); 2596 2597 /* 2598 * We must call VOP_INACTIVE with the node locked. Mark 2599 * as VI_DOINGINACT to avoid recursion. 2600 */ 2601 vp->v_iflag |= VI_OWEINACT; 2602 switch (func) { 2603 case VPUTX_VRELE: 2604 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 2605 VI_LOCK(vp); 2606 break; 2607 case VPUTX_VPUT: 2608 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2609 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 2610 LK_NOWAIT); 2611 VI_LOCK(vp); 2612 } 2613 break; 2614 case VPUTX_VUNREF: 2615 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2616 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 2617 VI_LOCK(vp); 2618 } 2619 break; 2620 } 2621 VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp, 2622 ("vnode with usecount and VI_OWEINACT set")); 2623 if (error == 0) { 2624 if (vp->v_iflag & VI_OWEINACT) 2625 vinactive(vp, curthread); 2626 if (func != VPUTX_VUNREF) 2627 VOP_UNLOCK(vp, 0); 2628 } 2629 vdropl(vp); 2630 } 2631 2632 /* 2633 * Vnode put/release. 2634 * If count drops to zero, call inactive routine and return to freelist. 2635 */ 2636 void 2637 vrele(struct vnode *vp) 2638 { 2639 2640 vputx(vp, VPUTX_VRELE); 2641 } 2642 2643 /* 2644 * Release an already locked vnode. This gives the same effect as 2645 * unlock+vrele(), but takes less time and avoids releasing and 2646 * re-acquiring the lock (as vrele() acquires the lock internally). 2647 */ 2648 void 2649 vput(struct vnode *vp) 2650 { 2651 2652 vputx(vp, VPUTX_VPUT); 2653 } 2654 2655 /* 2656 * Release an exclusively locked vnode. Do not unlock the vnode lock. 2657 */ 2658 void 2659 vunref(struct vnode *vp) 2660 { 2661 2662 vputx(vp, VPUTX_VUNREF); 2663 } 2664 2665 /* 2666 * Increase the hold count and activate if this is the first reference.
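 * The common case, a vnode that is already held or active, is a single
 * atomic increment of v_holdcnt; only the 0->1 transition needs the
 * interlock, because that is when the vnode has to be pulled off the
 * free list and onto its mount's active list. Callers normally use the
 * vhold()/vdrop() wrappers, e.g. (illustrative sketch only):
 *
 *	vhold(vp);	// cheap atomic when vp is already held
 *	...inspect vp without it being recycled...
 *	vdrop(vp);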
2667 */ 2668 void 2669 _vhold(struct vnode *vp, bool locked) 2670 { 2671 struct mount *mp; 2672 2673 if (locked) 2674 ASSERT_VI_LOCKED(vp, __func__); 2675 else 2676 ASSERT_VI_UNLOCKED(vp, __func__); 2677 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2678 if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) { 2679 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2680 ("_vhold: vnode with holdcnt is free")); 2681 return; 2682 } 2683 2684 if (!locked) 2685 VI_LOCK(vp); 2686 if ((vp->v_iflag & VI_FREE) == 0) { 2687 refcount_acquire(&vp->v_holdcnt); 2688 if (!locked) 2689 VI_UNLOCK(vp); 2690 return; 2691 } 2692 VNASSERT(vp->v_holdcnt == 0, vp, 2693 ("%s: wrong hold count", __func__)); 2694 VNASSERT(vp->v_op != NULL, vp, 2695 ("%s: vnode already reclaimed.", __func__)); 2696 /* 2697 * Remove a vnode from the free list, mark it as in use, 2698 * and put it on the active list. 2699 */ 2700 mtx_lock(&vnode_free_list_mtx); 2701 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 2702 freevnodes--; 2703 vp->v_iflag &= ~VI_FREE; 2704 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 2705 ("Activating already active vnode")); 2706 vp->v_iflag |= VI_ACTIVE; 2707 mp = vp->v_mount; 2708 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 2709 mp->mnt_activevnodelistsize++; 2710 mtx_unlock(&vnode_free_list_mtx); 2711 refcount_acquire(&vp->v_holdcnt); 2712 if (!locked) 2713 VI_UNLOCK(vp); 2714 } 2715 2716 /* 2717 * Drop the hold count of the vnode. If this is the last reference to 2718 * the vnode we place it on the free list unless it has been vgone'd 2719 * (marked VI_DOOMED) in which case we will free it. 2720 * 2721 * Because the vnode vm object keeps a hold reference on the vnode if 2722 * there is at least one resident non-cached page, the vnode cannot 2723 * leave the active list without the page cleanup done. 2724 */ 2725 void 2726 _vdrop(struct vnode *vp, bool locked) 2727 { 2728 struct bufobj *bo; 2729 struct mount *mp; 2730 int active; 2731 2732 if (locked) 2733 ASSERT_VI_LOCKED(vp, __func__); 2734 else 2735 ASSERT_VI_UNLOCKED(vp, __func__); 2736 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2737 if ((int)vp->v_holdcnt <= 0) 2738 panic("vdrop: holdcnt %d", vp->v_holdcnt); 2739 if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) { 2740 if (locked) 2741 VI_UNLOCK(vp); 2742 return; 2743 } 2744 2745 if (!locked) 2746 VI_LOCK(vp); 2747 if (refcount_release(&vp->v_holdcnt) == 0) { 2748 VI_UNLOCK(vp); 2749 return; 2750 } 2751 if ((vp->v_iflag & VI_DOOMED) == 0) { 2752 /* 2753 * Mark a vnode as free: remove it from its active list 2754 * and put it up for recycling on the freelist. 2755 */ 2756 VNASSERT(vp->v_op != NULL, vp, 2757 ("vdropl: vnode already reclaimed.")); 2758 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2759 ("vnode already free")); 2760 VNASSERT(vp->v_holdcnt == 0, vp, 2761 ("vdropl: freeing when we shouldn't")); 2762 active = vp->v_iflag & VI_ACTIVE; 2763 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2764 vp->v_iflag &= ~VI_ACTIVE; 2765 mp = vp->v_mount; 2766 mtx_lock(&vnode_free_list_mtx); 2767 if (active) { 2768 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, 2769 v_actfreelist); 2770 mp->mnt_activevnodelistsize--; 2771 } 2772 TAILQ_INSERT_TAIL(&vnode_free_list, vp, 2773 v_actfreelist); 2774 freevnodes++; 2775 vp->v_iflag |= VI_FREE; 2776 mtx_unlock(&vnode_free_list_mtx); 2777 } else { 2778 atomic_add_long(&free_owe_inact, 1); 2779 } 2780 VI_UNLOCK(vp); 2781 return; 2782 } 2783 /* 2784 * The vnode has been marked for destruction, so free it. 
2785 * 2786 * The vnode will be returned to the zone where it will 2787 * normally remain until it is needed for another vnode. We 2788 * need to cleanup (or verify that the cleanup has already 2789 * been done) any residual data left from its current use 2790 * so as not to contaminate the freshly allocated vnode. 2791 */ 2792 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2793 atomic_subtract_long(&numvnodes, 1); 2794 bo = &vp->v_bufobj; 2795 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2796 ("cleaned vnode still on the free list.")); 2797 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2798 VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); 2799 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2800 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2801 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2802 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2803 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 2804 ("clean blk trie not empty")); 2805 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2806 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 2807 ("dirty blk trie not empty")); 2808 VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); 2809 VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); 2810 VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); 2811 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, 2812 ("Dangling rangelock waiters")); 2813 VI_UNLOCK(vp); 2814 #ifdef MAC 2815 mac_vnode_destroy(vp); 2816 #endif 2817 if (vp->v_pollinfo != NULL) { 2818 destroy_vpollinfo(vp->v_pollinfo); 2819 vp->v_pollinfo = NULL; 2820 } 2821 #ifdef INVARIANTS 2822 /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ 2823 vp->v_op = NULL; 2824 #endif 2825 bzero(&vp->v_un, sizeof(vp->v_un)); 2826 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 2827 vp->v_iflag = 0; 2828 vp->v_vflag = 0; 2829 bo->bo_flag = 0; 2830 uma_zfree(vnode_zone, vp); 2831 } 2832 2833 /* 2834 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 2835 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 2836 * OWEINACT tracks whether a vnode missed a call to inactive due to a 2837 * failed lock upgrade. 2838 */ 2839 void 2840 vinactive(struct vnode *vp, struct thread *td) 2841 { 2842 struct vm_object *obj; 2843 2844 ASSERT_VOP_ELOCKED(vp, "vinactive"); 2845 ASSERT_VI_LOCKED(vp, "vinactive"); 2846 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 2847 ("vinactive: recursed on VI_DOINGINACT")); 2848 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2849 vp->v_iflag |= VI_DOINGINACT; 2850 vp->v_iflag &= ~VI_OWEINACT; 2851 VI_UNLOCK(vp); 2852 /* 2853 * Before moving off the active list, we must be sure that any 2854 * modified pages are converted into the vnode's dirty 2855 * buffers, since these will no longer be checked once the 2856 * vnode is on the inactive list. 2857 * 2858 * The write-out of the dirty pages is asynchronous. At the 2859 * point that VOP_INACTIVE() is called, there could still be 2860 * pending I/O and dirty pages in the object. 
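 * (The OBJPC_NOSYNC flag passed below also asks vm_object_page_clean()
 * to leave pages dirtied through MAP_NOSYNC mappings alone; flushing
 * those is left to msync(2) or the pagedaemon rather than to this path.)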
2861 */ 2862 obj = vp->v_object; 2863 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { 2864 VM_OBJECT_WLOCK(obj); 2865 vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC); 2866 VM_OBJECT_WUNLOCK(obj); 2867 } 2868 VOP_INACTIVE(vp, td); 2869 VI_LOCK(vp); 2870 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 2871 ("vinactive: lost VI_DOINGINACT")); 2872 vp->v_iflag &= ~VI_DOINGINACT; 2873 } 2874 2875 /* 2876 * Remove any vnodes in the vnode table belonging to mount point mp. 2877 * 2878 * If FORCECLOSE is not specified, there should not be any active ones, 2879 * return error if any are found (nb: this is a user error, not a 2880 * system error). If FORCECLOSE is specified, detach any active vnodes 2881 * that are found. 2882 * 2883 * If WRITECLOSE is set, only flush out regular file vnodes open for 2884 * writing. 2885 * 2886 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 2887 * 2888 * `rootrefs' specifies the base reference count for the root vnode 2889 * of this filesystem. The root vnode is considered busy if its 2890 * v_usecount exceeds this value. On a successful return, vflush(, td) 2891 * will call vrele() on the root vnode exactly rootrefs times. 2892 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 2893 * be zero. 2894 */ 2895 #ifdef DIAGNOSTIC 2896 static int busyprt = 0; /* print out busy vnodes */ 2897 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 2898 #endif 2899 2900 int 2901 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 2902 { 2903 struct vnode *vp, *mvp, *rootvp = NULL; 2904 struct vattr vattr; 2905 int busy = 0, error; 2906 2907 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 2908 rootrefs, flags); 2909 if (rootrefs > 0) { 2910 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 2911 ("vflush: bad args")); 2912 /* 2913 * Get the filesystem root vnode. We can vput() it 2914 * immediately, since with rootrefs > 0, it won't go away. 2915 */ 2916 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 2917 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 2918 __func__, error); 2919 return (error); 2920 } 2921 vput(rootvp); 2922 } 2923 loop: 2924 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2925 vholdl(vp); 2926 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 2927 if (error) { 2928 vdrop(vp); 2929 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2930 goto loop; 2931 } 2932 /* 2933 * Skip over a vnodes marked VV_SYSTEM. 2934 */ 2935 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 2936 VOP_UNLOCK(vp, 0); 2937 vdrop(vp); 2938 continue; 2939 } 2940 /* 2941 * If WRITECLOSE is set, flush out unlinked but still open 2942 * files (even if open only for reading) and regular file 2943 * vnodes open for writing. 
2944 */ 2945 if (flags & WRITECLOSE) { 2946 if (vp->v_object != NULL) { 2947 VM_OBJECT_WLOCK(vp->v_object); 2948 vm_object_page_clean(vp->v_object, 0, 0, 0); 2949 VM_OBJECT_WUNLOCK(vp->v_object); 2950 } 2951 error = VOP_FSYNC(vp, MNT_WAIT, td); 2952 if (error != 0) { 2953 VOP_UNLOCK(vp, 0); 2954 vdrop(vp); 2955 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2956 return (error); 2957 } 2958 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 2959 VI_LOCK(vp); 2960 2961 if ((vp->v_type == VNON || 2962 (error == 0 && vattr.va_nlink > 0)) && 2963 (vp->v_writecount == 0 || vp->v_type != VREG)) { 2964 VOP_UNLOCK(vp, 0); 2965 vdropl(vp); 2966 continue; 2967 } 2968 } else 2969 VI_LOCK(vp); 2970 /* 2971 * With v_usecount == 0, all we need to do is clear out the 2972 * vnode data structures and we are done. 2973 * 2974 * If FORCECLOSE is set, forcibly close the vnode. 2975 */ 2976 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 2977 vgonel(vp); 2978 } else { 2979 busy++; 2980 #ifdef DIAGNOSTIC 2981 if (busyprt) 2982 vprint("vflush: busy vnode", vp); 2983 #endif 2984 } 2985 VOP_UNLOCK(vp, 0); 2986 vdropl(vp); 2987 } 2988 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 2989 /* 2990 * If just the root vnode is busy, and if its refcount 2991 * is equal to `rootrefs', then go ahead and kill it. 2992 */ 2993 VI_LOCK(rootvp); 2994 KASSERT(busy > 0, ("vflush: not busy")); 2995 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 2996 ("vflush: usecount %d < rootrefs %d", 2997 rootvp->v_usecount, rootrefs)); 2998 if (busy == 1 && rootvp->v_usecount == rootrefs) { 2999 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3000 vgone(rootvp); 3001 VOP_UNLOCK(rootvp, 0); 3002 busy = 0; 3003 } else 3004 VI_UNLOCK(rootvp); 3005 } 3006 if (busy) { 3007 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3008 busy); 3009 return (EBUSY); 3010 } 3011 for (; rootrefs > 0; rootrefs--) 3012 vrele(rootvp); 3013 return (0); 3014 } 3015 3016 /* 3017 * Recycle an unused vnode to the front of the free list. 3018 */ 3019 int 3020 vrecycle(struct vnode *vp) 3021 { 3022 int recycled; 3023 3024 ASSERT_VOP_ELOCKED(vp, "vrecycle"); 3025 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3026 recycled = 0; 3027 VI_LOCK(vp); 3028 if (vp->v_usecount == 0) { 3029 recycled = 1; 3030 vgonel(vp); 3031 } 3032 VI_UNLOCK(vp); 3033 return (recycled); 3034 } 3035 3036 /* 3037 * Eliminate all activity associated with a vnode 3038 * in preparation for reuse. 3039 */ 3040 void 3041 vgone(struct vnode *vp) 3042 { 3043 VI_LOCK(vp); 3044 vgonel(vp); 3045 VI_UNLOCK(vp); 3046 } 3047 3048 static void 3049 notify_lowervp_vfs_dummy(struct mount *mp __unused, 3050 struct vnode *lowervp __unused) 3051 { 3052 } 3053 3054 /* 3055 * Notify upper mounts about reclaimed or unlinked vnode. 
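 * The list of upper mounts can change while the mount interlock is
 * dropped to call into an upper filesystem, so a temporary marker entry
 * (MNTK_MARKER) is inserted after the current position; once the
 * callback returns, iteration resumes from the marker and the marker is
 * pulled back out. MNTK_VGONE_UPPER flags an iteration in progress and
 * MNTK_VGONE_WAITER lets a waiter block until it has finished (see the
 * wakeup at the end).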
3056 */ 3057 void 3058 vfs_notify_upper(struct vnode *vp, int event) 3059 { 3060 static struct vfsops vgonel_vfsops = { 3061 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, 3062 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, 3063 }; 3064 struct mount *mp, *ump, *mmp; 3065 3066 mp = vp->v_mount; 3067 if (mp == NULL) 3068 return; 3069 3070 MNT_ILOCK(mp); 3071 if (TAILQ_EMPTY(&mp->mnt_uppers)) 3072 goto unlock; 3073 MNT_IUNLOCK(mp); 3074 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); 3075 mmp->mnt_op = &vgonel_vfsops; 3076 mmp->mnt_kern_flag |= MNTK_MARKER; 3077 MNT_ILOCK(mp); 3078 mp->mnt_kern_flag |= MNTK_VGONE_UPPER; 3079 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { 3080 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { 3081 ump = TAILQ_NEXT(ump, mnt_upper_link); 3082 continue; 3083 } 3084 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); 3085 MNT_IUNLOCK(mp); 3086 switch (event) { 3087 case VFS_NOTIFY_UPPER_RECLAIM: 3088 VFS_RECLAIM_LOWERVP(ump, vp); 3089 break; 3090 case VFS_NOTIFY_UPPER_UNLINK: 3091 VFS_UNLINK_LOWERVP(ump, vp); 3092 break; 3093 default: 3094 KASSERT(0, ("invalid event %d", event)); 3095 break; 3096 } 3097 MNT_ILOCK(mp); 3098 ump = TAILQ_NEXT(mmp, mnt_upper_link); 3099 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); 3100 } 3101 free(mmp, M_TEMP); 3102 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; 3103 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { 3104 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; 3105 wakeup(&mp->mnt_uppers); 3106 } 3107 unlock: 3108 MNT_IUNLOCK(mp); 3109 } 3110 3111 /* 3112 * vgone, with the vp interlock held. 3113 */ 3114 static void 3115 vgonel(struct vnode *vp) 3116 { 3117 struct thread *td; 3118 int oweinact; 3119 int active; 3120 struct mount *mp; 3121 3122 ASSERT_VOP_ELOCKED(vp, "vgonel"); 3123 ASSERT_VI_LOCKED(vp, "vgonel"); 3124 VNASSERT(vp->v_holdcnt, vp, 3125 ("vgonel: vp %p has no reference.", vp)); 3126 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3127 td = curthread; 3128 3129 /* 3130 * Don't vgonel if we're already doomed. 3131 */ 3132 if (vp->v_iflag & VI_DOOMED) 3133 return; 3134 vp->v_iflag |= VI_DOOMED; 3135 3136 /* 3137 * Check to see if the vnode is in use. If so, we have to call 3138 * VOP_CLOSE() and VOP_INACTIVE(). 3139 */ 3140 active = vp->v_usecount; 3141 oweinact = (vp->v_iflag & VI_OWEINACT); 3142 VI_UNLOCK(vp); 3143 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 3144 3145 /* 3146 * If purging an active vnode, it must be closed and 3147 * deactivated before being reclaimed. 3148 */ 3149 if (active) 3150 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 3151 if (oweinact || active) { 3152 VI_LOCK(vp); 3153 if ((vp->v_iflag & VI_DOINGINACT) == 0) 3154 vinactive(vp, td); 3155 VI_UNLOCK(vp); 3156 } 3157 if (vp->v_type == VSOCK) 3158 vfs_unp_reclaim(vp); 3159 3160 /* 3161 * Clean out any buffers associated with the vnode. 3162 * If the flush fails, just toss the buffers. 3163 */ 3164 mp = NULL; 3165 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 3166 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 3167 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 3168 while (vinvalbuf(vp, 0, 0, 0) != 0) 3169 ; 3170 } 3171 3172 BO_LOCK(&vp->v_bufobj); 3173 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 3174 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 3175 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 3176 vp->v_bufobj.bo_clean.bv_cnt == 0, 3177 ("vp %p bufobj not invalidated", vp)); 3178 vp->v_bufobj.bo_flag |= BO_DEAD; 3179 BO_UNLOCK(&vp->v_bufobj); 3180 3181 /* 3182 * Reclaim the vnode. 
3183 */ 3184 if (VOP_RECLAIM(vp, td)) 3185 panic("vgone: cannot reclaim"); 3186 if (mp != NULL) 3187 vn_finished_secondary_write(mp); 3188 VNASSERT(vp->v_object == NULL, vp, 3189 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); 3190 /* 3191 * Clear the advisory locks and wake up waiting threads. 3192 */ 3193 (void)VOP_ADVLOCKPURGE(vp); 3194 vp->v_lockf = NULL; 3195 /* 3196 * Delete from old mount point vnode list. 3197 */ 3198 delmntque(vp); 3199 cache_purge(vp); 3200 /* 3201 * Done with purge, reset to the standard lock and invalidate 3202 * the vnode. 3203 */ 3204 VI_LOCK(vp); 3205 vp->v_vnlock = &vp->v_lock; 3206 vp->v_op = &dead_vnodeops; 3207 vp->v_tag = "none"; 3208 vp->v_type = VBAD; 3209 } 3210 3211 /* 3212 * Calculate the total number of references to a special device. 3213 */ 3214 int 3215 vcount(struct vnode *vp) 3216 { 3217 int count; 3218 3219 dev_lock(); 3220 count = vp->v_rdev->si_usecount; 3221 dev_unlock(); 3222 return (count); 3223 } 3224 3225 /* 3226 * Same as above, but using the struct cdev *as argument 3227 */ 3228 int 3229 count_dev(struct cdev *dev) 3230 { 3231 int count; 3232 3233 dev_lock(); 3234 count = dev->si_usecount; 3235 dev_unlock(); 3236 return(count); 3237 } 3238 3239 /* 3240 * Print out a description of a vnode. 3241 */ 3242 static char *typename[] = 3243 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", 3244 "VMARKER"}; 3245 3246 void 3247 vn_printf(struct vnode *vp, const char *fmt, ...) 3248 { 3249 va_list ap; 3250 char buf[256], buf2[16]; 3251 u_long flags; 3252 3253 va_start(ap, fmt); 3254 vprintf(fmt, ap); 3255 va_end(ap); 3256 printf("%p: ", (void *)vp); 3257 printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); 3258 printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", 3259 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); 3260 buf[0] = '\0'; 3261 buf[1] = '\0'; 3262 if (vp->v_vflag & VV_ROOT) 3263 strlcat(buf, "|VV_ROOT", sizeof(buf)); 3264 if (vp->v_vflag & VV_ISTTY) 3265 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 3266 if (vp->v_vflag & VV_NOSYNC) 3267 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 3268 if (vp->v_vflag & VV_ETERNALDEV) 3269 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 3270 if (vp->v_vflag & VV_CACHEDLABEL) 3271 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 3272 if (vp->v_vflag & VV_TEXT) 3273 strlcat(buf, "|VV_TEXT", sizeof(buf)); 3274 if (vp->v_vflag & VV_COPYONWRITE) 3275 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 3276 if (vp->v_vflag & VV_SYSTEM) 3277 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 3278 if (vp->v_vflag & VV_PROCDEP) 3279 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 3280 if (vp->v_vflag & VV_NOKNOTE) 3281 strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); 3282 if (vp->v_vflag & VV_DELETED) 3283 strlcat(buf, "|VV_DELETED", sizeof(buf)); 3284 if (vp->v_vflag & VV_MD) 3285 strlcat(buf, "|VV_MD", sizeof(buf)); 3286 if (vp->v_vflag & VV_FORCEINSMQ) 3287 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 3288 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 3289 VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | 3290 VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); 3291 if (flags != 0) { 3292 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 3293 strlcat(buf, buf2, sizeof(buf)); 3294 } 3295 if (vp->v_iflag & VI_MOUNT) 3296 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 3297 if (vp->v_iflag & VI_DOOMED) 3298 strlcat(buf, "|VI_DOOMED", sizeof(buf)); 3299 if (vp->v_iflag & VI_FREE) 3300 strlcat(buf, "|VI_FREE", sizeof(buf)); 3301 if 
(vp->v_iflag & VI_ACTIVE) 3302 strlcat(buf, "|VI_ACTIVE", sizeof(buf)); 3303 if (vp->v_iflag & VI_DOINGINACT) 3304 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 3305 if (vp->v_iflag & VI_OWEINACT) 3306 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 3307 flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE | 3308 VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT); 3309 if (flags != 0) { 3310 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 3311 strlcat(buf, buf2, sizeof(buf)); 3312 } 3313 printf(" flags (%s)\n", buf + 1); 3314 if (mtx_owned(VI_MTX(vp))) 3315 printf(" VI_LOCKed"); 3316 if (vp->v_object != NULL) 3317 printf(" v_object %p ref %d pages %d " 3318 "cleanbuf %d dirtybuf %d\n", 3319 vp->v_object, vp->v_object->ref_count, 3320 vp->v_object->resident_page_count, 3321 vp->v_bufobj.bo_clean.bv_cnt, 3322 vp->v_bufobj.bo_dirty.bv_cnt); 3323 printf(" "); 3324 lockmgr_printinfo(vp->v_vnlock); 3325 if (vp->v_data != NULL) 3326 VOP_PRINT(vp); 3327 } 3328 3329 #ifdef DDB 3330 /* 3331 * List all of the locked vnodes in the system. 3332 * Called when debugging the kernel. 3333 */ 3334 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 3335 { 3336 struct mount *mp; 3337 struct vnode *vp; 3338 3339 /* 3340 * Note: because this is DDB, we can't obey the locking semantics 3341 * for these structures, which means we could catch an inconsistent 3342 * state and dereference a nasty pointer. Not much to be done 3343 * about that. 3344 */ 3345 db_printf("Locked vnodes\n"); 3346 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3347 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3348 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 3349 vprint("", vp); 3350 } 3351 } 3352 } 3353 3354 /* 3355 * Show details about the given vnode. 3356 */ 3357 DB_SHOW_COMMAND(vnode, db_show_vnode) 3358 { 3359 struct vnode *vp; 3360 3361 if (!have_addr) 3362 return; 3363 vp = (struct vnode *)addr; 3364 vn_printf(vp, "vnode "); 3365 } 3366 3367 /* 3368 * Show details about the given mount point. 3369 */ 3370 DB_SHOW_COMMAND(mount, db_show_mount) 3371 { 3372 struct mount *mp; 3373 struct vfsopt *opt; 3374 struct statfs *sp; 3375 struct vnode *vp; 3376 char buf[512]; 3377 uint64_t mflags; 3378 u_int flags; 3379 3380 if (!have_addr) { 3381 /* No address given, print short info about all mount points. 
*/ 3382 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3383 db_printf("%p %s on %s (%s)\n", mp, 3384 mp->mnt_stat.f_mntfromname, 3385 mp->mnt_stat.f_mntonname, 3386 mp->mnt_stat.f_fstypename); 3387 if (db_pager_quit) 3388 break; 3389 } 3390 db_printf("\nMore info: show mount <addr>\n"); 3391 return; 3392 } 3393 3394 mp = (struct mount *)addr; 3395 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 3396 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 3397 3398 buf[0] = '\0'; 3399 mflags = mp->mnt_flag; 3400 #define MNT_FLAG(flag) do { \ 3401 if (mflags & (flag)) { \ 3402 if (buf[0] != '\0') \ 3403 strlcat(buf, ", ", sizeof(buf)); \ 3404 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 3405 mflags &= ~(flag); \ 3406 } \ 3407 } while (0) 3408 MNT_FLAG(MNT_RDONLY); 3409 MNT_FLAG(MNT_SYNCHRONOUS); 3410 MNT_FLAG(MNT_NOEXEC); 3411 MNT_FLAG(MNT_NOSUID); 3412 MNT_FLAG(MNT_NFS4ACLS); 3413 MNT_FLAG(MNT_UNION); 3414 MNT_FLAG(MNT_ASYNC); 3415 MNT_FLAG(MNT_SUIDDIR); 3416 MNT_FLAG(MNT_SOFTDEP); 3417 MNT_FLAG(MNT_NOSYMFOLLOW); 3418 MNT_FLAG(MNT_GJOURNAL); 3419 MNT_FLAG(MNT_MULTILABEL); 3420 MNT_FLAG(MNT_ACLS); 3421 MNT_FLAG(MNT_NOATIME); 3422 MNT_FLAG(MNT_NOCLUSTERR); 3423 MNT_FLAG(MNT_NOCLUSTERW); 3424 MNT_FLAG(MNT_SUJ); 3425 MNT_FLAG(MNT_EXRDONLY); 3426 MNT_FLAG(MNT_EXPORTED); 3427 MNT_FLAG(MNT_DEFEXPORTED); 3428 MNT_FLAG(MNT_EXPORTANON); 3429 MNT_FLAG(MNT_EXKERB); 3430 MNT_FLAG(MNT_EXPUBLIC); 3431 MNT_FLAG(MNT_LOCAL); 3432 MNT_FLAG(MNT_QUOTA); 3433 MNT_FLAG(MNT_ROOTFS); 3434 MNT_FLAG(MNT_USER); 3435 MNT_FLAG(MNT_IGNORE); 3436 MNT_FLAG(MNT_UPDATE); 3437 MNT_FLAG(MNT_DELEXPORT); 3438 MNT_FLAG(MNT_RELOAD); 3439 MNT_FLAG(MNT_FORCE); 3440 MNT_FLAG(MNT_SNAPSHOT); 3441 MNT_FLAG(MNT_BYFSID); 3442 #undef MNT_FLAG 3443 if (mflags != 0) { 3444 if (buf[0] != '\0') 3445 strlcat(buf, ", ", sizeof(buf)); 3446 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3447 "0x%016jx", mflags); 3448 } 3449 db_printf(" mnt_flag = %s\n", buf); 3450 3451 buf[0] = '\0'; 3452 flags = mp->mnt_kern_flag; 3453 #define MNT_KERN_FLAG(flag) do { \ 3454 if (flags & (flag)) { \ 3455 if (buf[0] != '\0') \ 3456 strlcat(buf, ", ", sizeof(buf)); \ 3457 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 3458 flags &= ~(flag); \ 3459 } \ 3460 } while (0) 3461 MNT_KERN_FLAG(MNTK_UNMOUNTF); 3462 MNT_KERN_FLAG(MNTK_ASYNC); 3463 MNT_KERN_FLAG(MNTK_SOFTDEP); 3464 MNT_KERN_FLAG(MNTK_NOINSMNTQ); 3465 MNT_KERN_FLAG(MNTK_DRAINING); 3466 MNT_KERN_FLAG(MNTK_REFEXPIRE); 3467 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 3468 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 3469 MNT_KERN_FLAG(MNTK_NO_IOPF); 3470 MNT_KERN_FLAG(MNTK_VGONE_UPPER); 3471 MNT_KERN_FLAG(MNTK_VGONE_WAITER); 3472 MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); 3473 MNT_KERN_FLAG(MNTK_MARKER); 3474 MNT_KERN_FLAG(MNTK_USES_BCACHE); 3475 MNT_KERN_FLAG(MNTK_NOASYNC); 3476 MNT_KERN_FLAG(MNTK_UNMOUNT); 3477 MNT_KERN_FLAG(MNTK_MWAIT); 3478 MNT_KERN_FLAG(MNTK_SUSPEND); 3479 MNT_KERN_FLAG(MNTK_SUSPEND2); 3480 MNT_KERN_FLAG(MNTK_SUSPENDED); 3481 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 3482 MNT_KERN_FLAG(MNTK_NOKNOTE); 3483 #undef MNT_KERN_FLAG 3484 if (flags != 0) { 3485 if (buf[0] != '\0') 3486 strlcat(buf, ", ", sizeof(buf)); 3487 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3488 "0x%08x", flags); 3489 } 3490 db_printf(" mnt_kern_flag = %s\n", buf); 3491 3492 db_printf(" mnt_opt = "); 3493 opt = TAILQ_FIRST(mp->mnt_opt); 3494 if (opt != NULL) { 3495 db_printf("%s", opt->name); 3496 opt = TAILQ_NEXT(opt, link); 3497 while (opt != NULL) { 3498 db_printf(", %s", opt->name); 3499 opt = TAILQ_NEXT(opt, link); 3500 } 
3501 } 3502 db_printf("\n"); 3503 3504 sp = &mp->mnt_stat; 3505 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 3506 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 3507 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 3508 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 3509 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 3510 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 3511 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 3512 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 3513 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 3514 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 3515 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 3516 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 3517 3518 db_printf(" mnt_cred = { uid=%u ruid=%u", 3519 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 3520 if (jailed(mp->mnt_cred)) 3521 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 3522 db_printf(" }\n"); 3523 db_printf(" mnt_ref = %d\n", mp->mnt_ref); 3524 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 3525 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 3526 db_printf(" mnt_activevnodelistsize = %d\n", 3527 mp->mnt_activevnodelistsize); 3528 db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); 3529 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); 3530 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 3531 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 3532 db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); 3533 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 3534 db_printf(" mnt_secondary_accwrites = %d\n", 3535 mp->mnt_secondary_accwrites); 3536 db_printf(" mnt_gjprovider = %s\n", 3537 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 3538 3539 db_printf("\n\nList of active vnodes\n"); 3540 TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { 3541 if (vp->v_type != VMARKER) { 3542 vn_printf(vp, "vnode "); 3543 if (db_pager_quit) 3544 break; 3545 } 3546 } 3547 db_printf("\n\nList of inactive vnodes\n"); 3548 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3549 if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { 3550 vn_printf(vp, "vnode "); 3551 if (db_pager_quit) 3552 break; 3553 } 3554 } 3555 } 3556 #endif /* DDB */ 3557 3558 /* 3559 * Fill in a struct xvfsconf based on a struct vfsconf. 3560 */ 3561 static int 3562 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 3563 { 3564 struct xvfsconf xvfsp; 3565 3566 bzero(&xvfsp, sizeof(xvfsp)); 3567 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3568 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3569 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3570 xvfsp.vfc_flags = vfsp->vfc_flags; 3571 /* 3572 * These are unused in userland, we keep them 3573 * to not break binary compatibility. 
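 * (They are kernel pointers, so they are deliberately zeroed rather than
 * copied out; userland consumers such as getvfsbyname(3) only need the
 * name, type number, flags and reference count.)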
3574 */ 3575 xvfsp.vfc_vfsops = NULL; 3576 xvfsp.vfc_next = NULL; 3577 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3578 } 3579 3580 #ifdef COMPAT_FREEBSD32 3581 struct xvfsconf32 { 3582 uint32_t vfc_vfsops; 3583 char vfc_name[MFSNAMELEN]; 3584 int32_t vfc_typenum; 3585 int32_t vfc_refcount; 3586 int32_t vfc_flags; 3587 uint32_t vfc_next; 3588 }; 3589 3590 static int 3591 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 3592 { 3593 struct xvfsconf32 xvfsp; 3594 3595 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3596 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3597 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3598 xvfsp.vfc_flags = vfsp->vfc_flags; 3599 xvfsp.vfc_vfsops = 0; 3600 xvfsp.vfc_next = 0; 3601 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3602 } 3603 #endif 3604 3605 /* 3606 * Top level filesystem related information gathering. 3607 */ 3608 static int 3609 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 3610 { 3611 struct vfsconf *vfsp; 3612 int error; 3613 3614 error = 0; 3615 vfsconf_slock(); 3616 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3617 #ifdef COMPAT_FREEBSD32 3618 if (req->flags & SCTL_MASK32) 3619 error = vfsconf2x32(req, vfsp); 3620 else 3621 #endif 3622 error = vfsconf2x(req, vfsp); 3623 if (error) 3624 break; 3625 } 3626 vfsconf_sunlock(); 3627 return (error); 3628 } 3629 3630 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 3631 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 3632 "S,xvfsconf", "List of all configured filesystems"); 3633 3634 #ifndef BURN_BRIDGES 3635 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 3636 3637 static int 3638 vfs_sysctl(SYSCTL_HANDLER_ARGS) 3639 { 3640 int *name = (int *)arg1 - 1; /* XXX */ 3641 u_int namelen = arg2 + 1; /* XXX */ 3642 struct vfsconf *vfsp; 3643 3644 log(LOG_WARNING, "userland calling deprecated sysctl, " 3645 "please rebuild world\n"); 3646 3647 #if 1 || defined(COMPAT_PRELITE2) 3648 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 3649 if (namelen == 1) 3650 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 3651 #endif 3652 3653 switch (name[1]) { 3654 case VFS_MAXTYPENUM: 3655 if (namelen != 2) 3656 return (ENOTDIR); 3657 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 3658 case VFS_CONF: 3659 if (namelen != 3) 3660 return (ENOTDIR); /* overloaded */ 3661 vfsconf_slock(); 3662 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3663 if (vfsp->vfc_typenum == name[2]) 3664 break; 3665 } 3666 vfsconf_sunlock(); 3667 if (vfsp == NULL) 3668 return (EOPNOTSUPP); 3669 #ifdef COMPAT_FREEBSD32 3670 if (req->flags & SCTL_MASK32) 3671 return (vfsconf2x32(req, vfsp)); 3672 else 3673 #endif 3674 return (vfsconf2x(req, vfsp)); 3675 } 3676 return (EOPNOTSUPP); 3677 } 3678 3679 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 3680 CTLFLAG_MPSAFE, vfs_sysctl, 3681 "Generic filesystem"); 3682 3683 #if 1 || defined(COMPAT_PRELITE2) 3684 3685 static int 3686 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 3687 { 3688 int error; 3689 struct vfsconf *vfsp; 3690 struct ovfsconf ovfs; 3691 3692 vfsconf_slock(); 3693 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3694 bzero(&ovfs, sizeof(ovfs)); 3695 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 3696 strcpy(ovfs.vfc_name, vfsp->vfc_name); 3697 ovfs.vfc_index = vfsp->vfc_typenum; 3698 ovfs.vfc_refcount = vfsp->vfc_refcount; 3699 ovfs.vfc_flags = vfsp->vfc_flags; 3700 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 3701 if (error != 0) { 3702 vfsconf_sunlock(); 3703 return (error); 3704 } 3705 } 3706 vfsconf_sunlock(); 3707 return (0); 3708 } 3709 3710 #endif /* 1 || COMPAT_PRELITE2 */ 3711 #endif /* !BURN_BRIDGES */ 3712 3713 #define KINFO_VNODESLOP 10 3714 #ifdef notyet 3715 /* 3716 * Dump vnode list (via sysctl). 3717 */ 3718 /* ARGSUSED */ 3719 static int 3720 sysctl_vnode(SYSCTL_HANDLER_ARGS) 3721 { 3722 struct xvnode *xvn; 3723 struct mount *mp; 3724 struct vnode *vp; 3725 int error, len, n; 3726 3727 /* 3728 * Stale numvnodes access is not fatal here. 3729 */ 3730 req->lock = 0; 3731 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 3732 if (!req->oldptr) 3733 /* Make an estimate */ 3734 return (SYSCTL_OUT(req, 0, len)); 3735 3736 error = sysctl_wire_old_buffer(req, 0); 3737 if (error != 0) 3738 return (error); 3739 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 3740 n = 0; 3741 mtx_lock(&mountlist_mtx); 3742 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3743 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 3744 continue; 3745 MNT_ILOCK(mp); 3746 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3747 if (n == len) 3748 break; 3749 vref(vp); 3750 xvn[n].xv_size = sizeof *xvn; 3751 xvn[n].xv_vnode = vp; 3752 xvn[n].xv_id = 0; /* XXX compat */ 3753 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 3754 XV_COPY(usecount); 3755 XV_COPY(writecount); 3756 XV_COPY(holdcnt); 3757 XV_COPY(mount); 3758 XV_COPY(numoutput); 3759 XV_COPY(type); 3760 #undef XV_COPY 3761 xvn[n].xv_flag = vp->v_vflag; 3762 3763 switch (vp->v_type) { 3764 case VREG: 3765 case VDIR: 3766 case VLNK: 3767 break; 3768 case VBLK: 3769 case VCHR: 3770 if (vp->v_rdev == NULL) { 3771 vrele(vp); 3772 continue; 3773 } 3774 xvn[n].xv_dev = dev2udev(vp->v_rdev); 3775 break; 3776 case VSOCK: 3777 xvn[n].xv_socket = vp->v_socket; 3778 break; 3779 case VFIFO: 3780 xvn[n].xv_fifo = vp->v_fifoinfo; 3781 break; 3782 case VNON: 3783 case VBAD: 3784 default: 3785 /* shouldn't happen? 
*/ 3786 vrele(vp); 3787 continue; 3788 } 3789 vrele(vp); 3790 ++n; 3791 } 3792 MNT_IUNLOCK(mp); 3793 mtx_lock(&mountlist_mtx); 3794 vfs_unbusy(mp); 3795 if (n == len) 3796 break; 3797 } 3798 mtx_unlock(&mountlist_mtx); 3799 3800 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 3801 free(xvn, M_TEMP); 3802 return (error); 3803 } 3804 3805 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | 3806 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", 3807 ""); 3808 #endif 3809 3810 static void 3811 unmount_or_warn(struct mount *mp) 3812 { 3813 int error; 3814 3815 error = dounmount(mp, MNT_FORCE, curthread); 3816 if (error != 0) { 3817 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 3818 if (error == EBUSY) 3819 printf("BUSY)\n"); 3820 else 3821 printf("%d)\n", error); 3822 } 3823 } 3824 3825 /* 3826 * Unmount all filesystems. The list is traversed in reverse order 3827 * of mounting to avoid dependencies. 3828 */ 3829 void 3830 vfs_unmountall(void) 3831 { 3832 struct mount *mp, *tmp; 3833 3834 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 3835 3836 /* 3837 * Since this only runs when rebooting, it is not interlocked. 3838 */ 3839 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 3840 vfs_ref(mp); 3841 3842 /* 3843 * Forcibly unmounting "/dev" before "/" would prevent clean 3844 * unmount of the latter. 3845 */ 3846 if (mp == rootdevmp) 3847 continue; 3848 3849 unmount_or_warn(mp); 3850 } 3851 3852 if (rootdevmp != NULL) 3853 unmount_or_warn(rootdevmp); 3854 } 3855 3856 /* 3857 * perform msync on all vnodes under a mount point 3858 * the mount point must be locked. 3859 */ 3860 void 3861 vfs_msync(struct mount *mp, int flags) 3862 { 3863 struct vnode *vp, *mvp; 3864 struct vm_object *obj; 3865 3866 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 3867 MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { 3868 obj = vp->v_object; 3869 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && 3870 (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { 3871 if (!vget(vp, 3872 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, 3873 curthread)) { 3874 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ 3875 vput(vp); 3876 continue; 3877 } 3878 3879 obj = vp->v_object; 3880 if (obj != NULL) { 3881 VM_OBJECT_WLOCK(obj); 3882 vm_object_page_clean(obj, 0, 0, 3883 flags == MNT_WAIT ? 3884 OBJPC_SYNC : OBJPC_NOSYNC); 3885 VM_OBJECT_WUNLOCK(obj); 3886 } 3887 vput(vp); 3888 } 3889 } else 3890 VI_UNLOCK(vp); 3891 } 3892 } 3893 3894 static void 3895 destroy_vpollinfo_free(struct vpollinfo *vi) 3896 { 3897 3898 knlist_destroy(&vi->vpi_selinfo.si_note); 3899 mtx_destroy(&vi->vpi_lock); 3900 uma_zfree(vnodepoll_zone, vi); 3901 } 3902 3903 static void 3904 destroy_vpollinfo(struct vpollinfo *vi) 3905 { 3906 3907 knlist_clear(&vi->vpi_selinfo.si_note, 1); 3908 seldrain(&vi->vpi_selinfo); 3909 destroy_vpollinfo_free(vi); 3910 } 3911 3912 /* 3913 * Initalize per-vnode helper structure to hold poll-related state. 
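 * The vpollinfo is allocated and fully constructed before the vnode
 * interlock is taken, since the M_WAITOK allocation may sleep; if another
 * thread installed v_pollinfo in the meantime, the freshly built copy is
 * torn down again with destroy_vpollinfo_free().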
3914 */ 3915 void 3916 v_addpollinfo(struct vnode *vp) 3917 { 3918 struct vpollinfo *vi; 3919 3920 if (vp->v_pollinfo != NULL) 3921 return; 3922 vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); 3923 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 3924 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 3925 vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); 3926 VI_LOCK(vp); 3927 if (vp->v_pollinfo != NULL) { 3928 VI_UNLOCK(vp); 3929 destroy_vpollinfo_free(vi); 3930 return; 3931 } 3932 vp->v_pollinfo = vi; 3933 VI_UNLOCK(vp); 3934 } 3935 3936 /* 3937 * Record a process's interest in events which might happen to 3938 * a vnode. Because poll uses the historic select-style interface 3939 * internally, this routine serves as both the ``check for any 3940 * pending events'' and the ``record my interest in future events'' 3941 * functions. (These are done together, while the lock is held, 3942 * to avoid race conditions.) 3943 */ 3944 int 3945 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 3946 { 3947 3948 v_addpollinfo(vp); 3949 mtx_lock(&vp->v_pollinfo->vpi_lock); 3950 if (vp->v_pollinfo->vpi_revents & events) { 3951 /* 3952 * This leaves events we are not interested 3953 * in available for the other process, which 3954 * presumably had requested them 3955 * (otherwise they would never have been 3956 * recorded). 3957 */ 3958 events &= vp->v_pollinfo->vpi_revents; 3959 vp->v_pollinfo->vpi_revents &= ~events; 3960 3961 mtx_unlock(&vp->v_pollinfo->vpi_lock); 3962 return (events); 3963 } 3964 vp->v_pollinfo->vpi_events |= events; 3965 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 3966 mtx_unlock(&vp->v_pollinfo->vpi_lock); 3967 return (0); 3968 } 3969 3970 /* 3971 * Routine to create and manage a filesystem syncer vnode. 3972 */ 3973 #define sync_close ((int (*)(struct vop_close_args *))nullop) 3974 static int sync_fsync(struct vop_fsync_args *); 3975 static int sync_inactive(struct vop_inactive_args *); 3976 static int sync_reclaim(struct vop_reclaim_args *); 3977 3978 static struct vop_vector sync_vnodeops = { 3979 .vop_bypass = VOP_EOPNOTSUPP, 3980 .vop_close = sync_close, /* close */ 3981 .vop_fsync = sync_fsync, /* fsync */ 3982 .vop_inactive = sync_inactive, /* inactive */ 3983 .vop_reclaim = sync_reclaim, /* reclaim */ 3984 .vop_lock1 = vop_stdlock, /* lock */ 3985 .vop_unlock = vop_stdunlock, /* unlock */ 3986 .vop_islocked = vop_stdislocked, /* islocked */ 3987 }; 3988 3989 /* 3990 * Create a new filesystem syncer vnode for the specified mount point. 3991 */ 3992 void 3993 vfs_allocate_syncvnode(struct mount *mp) 3994 { 3995 struct vnode *vp; 3996 struct bufobj *bo; 3997 static long start, incr, next; 3998 int error; 3999 4000 /* Allocate a new vnode */ 4001 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 4002 if (error != 0) 4003 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 4004 vp->v_type = VNON; 4005 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4006 vp->v_vflag |= VV_FORCEINSMQ; 4007 error = insmntque(vp, mp); 4008 if (error != 0) 4009 panic("vfs_allocate_syncvnode: insmntque() failed"); 4010 vp->v_vflag &= ~VV_FORCEINSMQ; 4011 VOP_UNLOCK(vp, 0); 4012 /* 4013 * Place the vnode onto the syncer worklist. We attempt to 4014 * scatter them about on the list so that they will go off 4015 * at evenly distributed times even if all the filesystems 4016 * are mounted at once.
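 *
 * The static (start, incr, next) triple implements the scatter: each new
 * syncer vnode is offset by "incr" slots from the previous one, and when
 * "next" wraps past syncer_maxdelay both "start" and "incr" are halved
 * (and re-seeded from syncer_maxdelay once they reach zero), so later
 * mounts land between the slots that are already taken.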
4017 */ 4018 next += incr; 4019 if (next == 0 || next > syncer_maxdelay) { 4020 start /= 2; 4021 incr /= 2; 4022 if (start == 0) { 4023 start = syncer_maxdelay / 2; 4024 incr = syncer_maxdelay; 4025 } 4026 next = start; 4027 } 4028 bo = &vp->v_bufobj; 4029 BO_LOCK(bo); 4030 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 4031 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 4032 mtx_lock(&sync_mtx); 4033 sync_vnode_count++; 4034 if (mp->mnt_syncer == NULL) { 4035 mp->mnt_syncer = vp; 4036 vp = NULL; 4037 } 4038 mtx_unlock(&sync_mtx); 4039 BO_UNLOCK(bo); 4040 if (vp != NULL) { 4041 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4042 vgone(vp); 4043 vput(vp); 4044 } 4045 } 4046 4047 void 4048 vfs_deallocate_syncvnode(struct mount *mp) 4049 { 4050 struct vnode *vp; 4051 4052 mtx_lock(&sync_mtx); 4053 vp = mp->mnt_syncer; 4054 if (vp != NULL) 4055 mp->mnt_syncer = NULL; 4056 mtx_unlock(&sync_mtx); 4057 if (vp != NULL) 4058 vrele(vp); 4059 } 4060 4061 /* 4062 * Do a lazy sync of the filesystem. 4063 */ 4064 static int 4065 sync_fsync(struct vop_fsync_args *ap) 4066 { 4067 struct vnode *syncvp = ap->a_vp; 4068 struct mount *mp = syncvp->v_mount; 4069 int error, save; 4070 struct bufobj *bo; 4071 4072 /* 4073 * We only need to do something if this is a lazy evaluation. 4074 */ 4075 if (ap->a_waitfor != MNT_LAZY) 4076 return (0); 4077 4078 /* 4079 * Move ourselves to the back of the sync list. 4080 */ 4081 bo = &syncvp->v_bufobj; 4082 BO_LOCK(bo); 4083 vn_syncer_add_to_worklist(bo, syncdelay); 4084 BO_UNLOCK(bo); 4085 4086 /* 4087 * Walk the list of vnodes pushing all that are dirty and 4088 * not already on the sync list. 4089 */ 4090 if (vfs_busy(mp, MBF_NOWAIT) != 0) 4091 return (0); 4092 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 4093 vfs_unbusy(mp); 4094 return (0); 4095 } 4096 save = curthread_pflags_set(TDP_SYNCIO); 4097 vfs_msync(mp, MNT_NOWAIT); 4098 error = VFS_SYNC(mp, MNT_LAZY); 4099 curthread_pflags_restore(save); 4100 vn_finished_write(mp); 4101 vfs_unbusy(mp); 4102 return (error); 4103 } 4104 4105 /* 4106 * The syncer vnode is no longer referenced. 4107 */ 4108 static int 4109 sync_inactive(struct vop_inactive_args *ap) 4110 { 4111 4112 vgone(ap->a_vp); 4113 return (0); 4114 } 4115 4116 /* 4117 * The syncer vnode is no longer needed and is being decommissioned. 4118 * 4119 * Modifications to the worklist must be protected by sync_mtx.
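 *
 * sync_reclaim() holds both the bufobj lock and sync_mtx while it clears
 * the mount's mnt_syncer pointer, unlinks the bufobj from the syncer
 * worklist and adjusts the worklist counters, so the syncer never sees a
 * partial update.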
4120 */ 4121 static int 4122 sync_reclaim(struct vop_reclaim_args *ap) 4123 { 4124 struct vnode *vp = ap->a_vp; 4125 struct bufobj *bo; 4126 4127 bo = &vp->v_bufobj; 4128 BO_LOCK(bo); 4129 mtx_lock(&sync_mtx); 4130 if (vp->v_mount->mnt_syncer == vp) 4131 vp->v_mount->mnt_syncer = NULL; 4132 if (bo->bo_flag & BO_ONWORKLST) { 4133 LIST_REMOVE(bo, bo_synclist); 4134 syncer_worklist_len--; 4135 sync_vnode_count--; 4136 bo->bo_flag &= ~BO_ONWORKLST; 4137 } 4138 mtx_unlock(&sync_mtx); 4139 BO_UNLOCK(bo); 4140 4141 return (0); 4142 } 4143 4144 /* 4145 * Check if vnode represents a disk device 4146 */ 4147 int 4148 vn_isdisk(struct vnode *vp, int *errp) 4149 { 4150 int error; 4151 4152 if (vp->v_type != VCHR) { 4153 error = ENOTBLK; 4154 goto out; 4155 } 4156 error = 0; 4157 dev_lock(); 4158 if (vp->v_rdev == NULL) 4159 error = ENXIO; 4160 else if (vp->v_rdev->si_devsw == NULL) 4161 error = ENXIO; 4162 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 4163 error = ENOTBLK; 4164 dev_unlock(); 4165 out: 4166 if (errp != NULL) 4167 *errp = error; 4168 return (error == 0); 4169 } 4170 4171 /* 4172 * Common filesystem object access control check routine. Accepts a 4173 * vnode's type, "mode", uid and gid, requested access mode, credentials, 4174 * and optional call-by-reference privused argument allowing vaccess() 4175 * to indicate to the caller whether privilege was used to satisfy the 4176 * request (obsoleted). Returns 0 on success, or an errno on failure. 4177 */ 4178 int 4179 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 4180 accmode_t accmode, struct ucred *cred, int *privused) 4181 { 4182 accmode_t dac_granted; 4183 accmode_t priv_granted; 4184 4185 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 4186 ("invalid bit in accmode")); 4187 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 4188 ("VAPPEND without VWRITE")); 4189 4190 /* 4191 * Look for a normal, non-privileged way to access the file/directory 4192 * as requested. If it exists, go with that. 4193 */ 4194 4195 if (privused != NULL) 4196 *privused = 0; 4197 4198 dac_granted = 0; 4199 4200 /* Check the owner. */ 4201 if (cred->cr_uid == file_uid) { 4202 dac_granted |= VADMIN; 4203 if (file_mode & S_IXUSR) 4204 dac_granted |= VEXEC; 4205 if (file_mode & S_IRUSR) 4206 dac_granted |= VREAD; 4207 if (file_mode & S_IWUSR) 4208 dac_granted |= (VWRITE | VAPPEND); 4209 4210 if ((accmode & dac_granted) == accmode) 4211 return (0); 4212 4213 goto privcheck; 4214 } 4215 4216 /* Otherwise, check the groups (first match) */ 4217 if (groupmember(file_gid, cred)) { 4218 if (file_mode & S_IXGRP) 4219 dac_granted |= VEXEC; 4220 if (file_mode & S_IRGRP) 4221 dac_granted |= VREAD; 4222 if (file_mode & S_IWGRP) 4223 dac_granted |= (VWRITE | VAPPEND); 4224 4225 if ((accmode & dac_granted) == accmode) 4226 return (0); 4227 4228 goto privcheck; 4229 } 4230 4231 /* Otherwise, check everyone else. */ 4232 if (file_mode & S_IXOTH) 4233 dac_granted |= VEXEC; 4234 if (file_mode & S_IROTH) 4235 dac_granted |= VREAD; 4236 if (file_mode & S_IWOTH) 4237 dac_granted |= (VWRITE | VAPPEND); 4238 if ((accmode & dac_granted) == accmode) 4239 return (0); 4240 4241 privcheck: 4242 /* 4243 * Build a privilege mask to determine if the set of privileges 4244 * satisfies the requirements when combined with the granted mask 4245 * from above. For each privilege, if the privilege is required, 4246 * bitwise or the request type onto the priv_granted mask. 
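 *
 * For example, a VREAD | VWRITE request by a user whose DAC bits grant
 * only VREAD needs PRIV_VFS_WRITE; if that privilege check succeeds,
 * VWRITE | VAPPEND are or'ed into priv_granted, the combined mask now
 * covers the request, and *privused (when supplied) is set to 1.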
4247 */ 4248 priv_granted = 0; 4249 4250 if (type == VDIR) { 4251 /* 4252 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 4253 * requests, instead of PRIV_VFS_EXEC. 4254 */ 4255 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 4256 !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) 4257 priv_granted |= VEXEC; 4258 } else { 4259 /* 4260 * Ensure that at least one execute bit is on. Otherwise, 4261 * a privileged user will always succeed, and we don't want 4262 * this to happen unless the file really is executable. 4263 */ 4264 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 4265 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 4266 !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) 4267 priv_granted |= VEXEC; 4268 } 4269 4270 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 4271 !priv_check_cred(cred, PRIV_VFS_READ, 0)) 4272 priv_granted |= VREAD; 4273 4274 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 4275 !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) 4276 priv_granted |= (VWRITE | VAPPEND); 4277 4278 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 4279 !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) 4280 priv_granted |= VADMIN; 4281 4282 if ((accmode & (priv_granted | dac_granted)) == accmode) { 4283 /* XXX audit: privilege used */ 4284 if (privused != NULL) 4285 *privused = 1; 4286 return (0); 4287 } 4288 4289 return ((accmode & VADMIN) ? EPERM : EACCES); 4290 } 4291 4292 /* 4293 * Credential check based on process requesting service, and per-attribute 4294 * permissions. 4295 */ 4296 int 4297 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 4298 struct thread *td, accmode_t accmode) 4299 { 4300 4301 /* 4302 * Kernel-invoked always succeeds. 4303 */ 4304 if (cred == NOCRED) 4305 return (0); 4306 4307 /* 4308 * Do not allow privileged processes in jail to directly manipulate 4309 * system attributes. 4310 */ 4311 switch (attrnamespace) { 4312 case EXTATTR_NAMESPACE_SYSTEM: 4313 /* Potentially should be: return (EPERM); */ 4314 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); 4315 case EXTATTR_NAMESPACE_USER: 4316 return (VOP_ACCESS(vp, accmode, cred, td)); 4317 default: 4318 return (EPERM); 4319 } 4320 } 4321 4322 #ifdef DEBUG_VFS_LOCKS 4323 /* 4324 * This only exists to supress warnings from unlocked specfs accesses. It is 4325 * no longer ok to have an unlocked VFS. 4326 */ 4327 #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ 4328 (vp)->v_type == VCHR || (vp)->v_type == VBAD) 4329 4330 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 4331 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 4332 "Drop into debugger on lock violation"); 4333 4334 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 4335 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 4336 0, "Check for interlock across VOPs"); 4337 4338 int vfs_badlock_print = 1; /* Print lock violations. */ 4339 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 4340 0, "Print lock violations"); 4341 4342 #ifdef KDB 4343 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
*/ 4344 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 4345 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 4346 #endif 4347 4348 static void 4349 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 4350 { 4351 4352 #ifdef KDB 4353 if (vfs_badlock_backtrace) 4354 kdb_backtrace(); 4355 #endif 4356 if (vfs_badlock_print) 4357 printf("%s: %p %s\n", str, (void *)vp, msg); 4358 if (vfs_badlock_ddb) 4359 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4360 } 4361 4362 void 4363 assert_vi_locked(struct vnode *vp, const char *str) 4364 { 4365 4366 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 4367 vfs_badlock("interlock is not locked but should be", str, vp); 4368 } 4369 4370 void 4371 assert_vi_unlocked(struct vnode *vp, const char *str) 4372 { 4373 4374 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 4375 vfs_badlock("interlock is locked but should not be", str, vp); 4376 } 4377 4378 void 4379 assert_vop_locked(struct vnode *vp, const char *str) 4380 { 4381 int locked; 4382 4383 if (!IGNORE_LOCK(vp)) { 4384 locked = VOP_ISLOCKED(vp); 4385 if (locked == 0 || locked == LK_EXCLOTHER) 4386 vfs_badlock("is not locked but should be", str, vp); 4387 } 4388 } 4389 4390 void 4391 assert_vop_unlocked(struct vnode *vp, const char *str) 4392 { 4393 4394 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 4395 vfs_badlock("is locked but should not be", str, vp); 4396 } 4397 4398 void 4399 assert_vop_elocked(struct vnode *vp, const char *str) 4400 { 4401 4402 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 4403 vfs_badlock("is not exclusive locked but should be", str, vp); 4404 } 4405 4406 #if 0 4407 void 4408 assert_vop_elocked_other(struct vnode *vp, const char *str) 4409 { 4410 4411 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER) 4412 vfs_badlock("is not exclusive locked by another thread", 4413 str, vp); 4414 } 4415 4416 void 4417 assert_vop_slocked(struct vnode *vp, const char *str) 4418 { 4419 4420 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED) 4421 vfs_badlock("is not locked shared but should be", str, vp); 4422 } 4423 #endif /* 0 */ 4424 #endif /* DEBUG_VFS_LOCKS */ 4425 4426 void 4427 vop_rename_fail(struct vop_rename_args *ap) 4428 { 4429 4430 if (ap->a_tvp != NULL) 4431 vput(ap->a_tvp); 4432 if (ap->a_tdvp == ap->a_tvp) 4433 vrele(ap->a_tdvp); 4434 else 4435 vput(ap->a_tdvp); 4436 vrele(ap->a_fdvp); 4437 vrele(ap->a_fvp); 4438 } 4439 4440 void 4441 vop_rename_pre(void *ap) 4442 { 4443 struct vop_rename_args *a = ap; 4444 4445 #ifdef DEBUG_VFS_LOCKS 4446 if (a->a_tvp) 4447 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 4448 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 4449 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 4450 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 4451 4452 /* Check the source (from). */ 4453 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 4454 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 4455 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 4456 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 4457 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 4458 4459 /* Check the target. 
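The target directory vnode, and the target vnode when it exists, must be locked on entry; holds are taken on the vnodes here and released again in vop_rename_post().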
*/ 4460 if (a->a_tvp) 4461 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 4462 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 4463 #endif 4464 if (a->a_tdvp != a->a_fdvp) 4465 vhold(a->a_fdvp); 4466 if (a->a_tvp != a->a_fvp) 4467 vhold(a->a_fvp); 4468 vhold(a->a_tdvp); 4469 if (a->a_tvp) 4470 vhold(a->a_tvp); 4471 } 4472 4473 void 4474 vop_strategy_pre(void *ap) 4475 { 4476 #ifdef DEBUG_VFS_LOCKS 4477 struct vop_strategy_args *a; 4478 struct buf *bp; 4479 4480 a = ap; 4481 bp = a->a_bp; 4482 4483 /* 4484 * Cluster ops lock their component buffers but not the IO container. 4485 */ 4486 if ((bp->b_flags & B_CLUSTER) != 0) 4487 return; 4488 4489 if (panicstr == NULL && !BUF_ISLOCKED(bp)) { 4490 if (vfs_badlock_print) 4491 printf( 4492 "VOP_STRATEGY: bp is not locked but should be\n"); 4493 if (vfs_badlock_ddb) 4494 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4495 } 4496 #endif 4497 } 4498 4499 void 4500 vop_lock_pre(void *ap) 4501 { 4502 #ifdef DEBUG_VFS_LOCKS 4503 struct vop_lock1_args *a = ap; 4504 4505 if ((a->a_flags & LK_INTERLOCK) == 0) 4506 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4507 else 4508 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 4509 #endif 4510 } 4511 4512 void 4513 vop_lock_post(void *ap, int rc) 4514 { 4515 #ifdef DEBUG_VFS_LOCKS 4516 struct vop_lock1_args *a = ap; 4517 4518 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4519 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 4520 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 4521 #endif 4522 } 4523 4524 void 4525 vop_unlock_pre(void *ap) 4526 { 4527 #ifdef DEBUG_VFS_LOCKS 4528 struct vop_unlock_args *a = ap; 4529 4530 if (a->a_flags & LK_INTERLOCK) 4531 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); 4532 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 4533 #endif 4534 } 4535 4536 void 4537 vop_unlock_post(void *ap, int rc) 4538 { 4539 #ifdef DEBUG_VFS_LOCKS 4540 struct vop_unlock_args *a = ap; 4541 4542 if (a->a_flags & LK_INTERLOCK) 4543 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); 4544 #endif 4545 } 4546 4547 void 4548 vop_create_post(void *ap, int rc) 4549 { 4550 struct vop_create_args *a = ap; 4551 4552 if (!rc) 4553 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4554 } 4555 4556 void 4557 vop_deleteextattr_post(void *ap, int rc) 4558 { 4559 struct vop_deleteextattr_args *a = ap; 4560 4561 if (!rc) 4562 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4563 } 4564 4565 void 4566 vop_link_post(void *ap, int rc) 4567 { 4568 struct vop_link_args *a = ap; 4569 4570 if (!rc) { 4571 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); 4572 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); 4573 } 4574 } 4575 4576 void 4577 vop_mkdir_post(void *ap, int rc) 4578 { 4579 struct vop_mkdir_args *a = ap; 4580 4581 if (!rc) 4582 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4583 } 4584 4585 void 4586 vop_mknod_post(void *ap, int rc) 4587 { 4588 struct vop_mknod_args *a = ap; 4589 4590 if (!rc) 4591 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4592 } 4593 4594 void 4595 vop_reclaim_post(void *ap, int rc) 4596 { 4597 struct vop_reclaim_args *a = ap; 4598 4599 if (!rc) 4600 VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE); 4601 } 4602 4603 void 4604 vop_remove_post(void *ap, int rc) 4605 { 4606 struct vop_remove_args *a = ap; 4607 4608 if (!rc) { 4609 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4610 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4611 } 4612 } 4613 4614 void 4615 vop_rename_post(void *ap, int rc) 4616 { 4617 struct vop_rename_args *a = ap; 4618 4619 if (!rc) { 4620 VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); 4621 VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); 4622 
VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 4623 if (a->a_tvp) 4624 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 4625 } 4626 if (a->a_tdvp != a->a_fdvp) 4627 vdrop(a->a_fdvp); 4628 if (a->a_tvp != a->a_fvp) 4629 vdrop(a->a_fvp); 4630 vdrop(a->a_tdvp); 4631 if (a->a_tvp) 4632 vdrop(a->a_tvp); 4633 } 4634 4635 void 4636 vop_rmdir_post(void *ap, int rc) 4637 { 4638 struct vop_rmdir_args *a = ap; 4639 4640 if (!rc) { 4641 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4642 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4643 } 4644 } 4645 4646 void 4647 vop_setattr_post(void *ap, int rc) 4648 { 4649 struct vop_setattr_args *a = ap; 4650 4651 if (!rc) 4652 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4653 } 4654 4655 void 4656 vop_setextattr_post(void *ap, int rc) 4657 { 4658 struct vop_setextattr_args *a = ap; 4659 4660 if (!rc) 4661 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4662 } 4663 4664 void 4665 vop_symlink_post(void *ap, int rc) 4666 { 4667 struct vop_symlink_args *a = ap; 4668 4669 if (!rc) 4670 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4671 } 4672 4673 static struct knlist fs_knlist; 4674 4675 static void 4676 vfs_event_init(void *arg) 4677 { 4678 knlist_init_mtx(&fs_knlist, NULL); 4679 } 4680 /* XXX - correct order? */ 4681 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 4682 4683 void 4684 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 4685 { 4686 4687 KNOTE_UNLOCKED(&fs_knlist, event); 4688 } 4689 4690 static int filt_fsattach(struct knote *kn); 4691 static void filt_fsdetach(struct knote *kn); 4692 static int filt_fsevent(struct knote *kn, long hint); 4693 4694 struct filterops fs_filtops = { 4695 .f_isfd = 0, 4696 .f_attach = filt_fsattach, 4697 .f_detach = filt_fsdetach, 4698 .f_event = filt_fsevent 4699 }; 4700 4701 static int 4702 filt_fsattach(struct knote *kn) 4703 { 4704 4705 kn->kn_flags |= EV_CLEAR; 4706 knlist_add(&fs_knlist, kn, 0); 4707 return (0); 4708 } 4709 4710 static void 4711 filt_fsdetach(struct knote *kn) 4712 { 4713 4714 knlist_remove(&fs_knlist, kn, 0); 4715 } 4716 4717 static int 4718 filt_fsevent(struct knote *kn, long hint) 4719 { 4720 4721 kn->kn_fflags |= hint; 4722 return (kn->kn_fflags != 0); 4723 } 4724 4725 static int 4726 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 4727 { 4728 struct vfsidctl vc; 4729 int error; 4730 struct mount *mp; 4731 4732 error = SYSCTL_IN(req, &vc, sizeof(vc)); 4733 if (error) 4734 return (error); 4735 if (vc.vc_vers != VFS_CTL_VERS1) 4736 return (EINVAL); 4737 mp = vfs_getvfs(&vc.vc_fsid); 4738 if (mp == NULL) 4739 return (ENOENT); 4740 /* ensure that a specific sysctl goes to the right filesystem. */ 4741 if (strcmp(vc.vc_fstypename, "*") != 0 && 4742 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 4743 vfs_rel(mp); 4744 return (EINVAL); 4745 } 4746 VCTLTOREQ(&vc, req); 4747 error = VFS_SYSCTL(mp, vc.vc_op, req); 4748 vfs_rel(mp); 4749 return (error); 4750 } 4751 4752 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, 4753 NULL, 0, sysctl_vfs_ctl, "", 4754 "Sysctl by fsid"); 4755 4756 /* 4757 * Function to initialize a va_filerev field sensibly. 4758 * XXX: Wouldn't a random number make a lot more sense ?? 
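 *
 * The value packs the seconds of getbinuptime() into the upper 32 bits
 * and the most significant half of the fraction into the lower 32 bits,
 * so it increases monotonically for the lifetime of a boot.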
4759 */ 4760 u_quad_t 4761 init_va_filerev(void) 4762 { 4763 struct bintime bt; 4764 4765 getbinuptime(&bt); 4766 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 4767 } 4768 4769 static int filt_vfsread(struct knote *kn, long hint); 4770 static int filt_vfswrite(struct knote *kn, long hint); 4771 static int filt_vfsvnode(struct knote *kn, long hint); 4772 static void filt_vfsdetach(struct knote *kn); 4773 static struct filterops vfsread_filtops = { 4774 .f_isfd = 1, 4775 .f_detach = filt_vfsdetach, 4776 .f_event = filt_vfsread 4777 }; 4778 static struct filterops vfswrite_filtops = { 4779 .f_isfd = 1, 4780 .f_detach = filt_vfsdetach, 4781 .f_event = filt_vfswrite 4782 }; 4783 static struct filterops vfsvnode_filtops = { 4784 .f_isfd = 1, 4785 .f_detach = filt_vfsdetach, 4786 .f_event = filt_vfsvnode 4787 }; 4788 4789 static void 4790 vfs_knllock(void *arg) 4791 { 4792 struct vnode *vp = arg; 4793 4794 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4795 } 4796 4797 static void 4798 vfs_knlunlock(void *arg) 4799 { 4800 struct vnode *vp = arg; 4801 4802 VOP_UNLOCK(vp, 0); 4803 } 4804 4805 static void 4806 vfs_knl_assert_locked(void *arg) 4807 { 4808 #ifdef DEBUG_VFS_LOCKS 4809 struct vnode *vp = arg; 4810 4811 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 4812 #endif 4813 } 4814 4815 static void 4816 vfs_knl_assert_unlocked(void *arg) 4817 { 4818 #ifdef DEBUG_VFS_LOCKS 4819 struct vnode *vp = arg; 4820 4821 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 4822 #endif 4823 } 4824 4825 int 4826 vfs_kqfilter(struct vop_kqfilter_args *ap) 4827 { 4828 struct vnode *vp = ap->a_vp; 4829 struct knote *kn = ap->a_kn; 4830 struct knlist *knl; 4831 4832 switch (kn->kn_filter) { 4833 case EVFILT_READ: 4834 kn->kn_fop = &vfsread_filtops; 4835 break; 4836 case EVFILT_WRITE: 4837 kn->kn_fop = &vfswrite_filtops; 4838 break; 4839 case EVFILT_VNODE: 4840 kn->kn_fop = &vfsvnode_filtops; 4841 break; 4842 default: 4843 return (EINVAL); 4844 } 4845 4846 kn->kn_hook = (caddr_t)vp; 4847 4848 v_addpollinfo(vp); 4849 if (vp->v_pollinfo == NULL) 4850 return (ENOMEM); 4851 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 4852 vhold(vp); 4853 knlist_add(knl, kn, 0); 4854 4855 return (0); 4856 } 4857 4858 /* 4859 * Detach knote from vnode 4860 */ 4861 static void 4862 filt_vfsdetach(struct knote *kn) 4863 { 4864 struct vnode *vp = (struct vnode *)kn->kn_hook; 4865 4866 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 4867 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 4868 vdrop(vp); 4869 } 4870 4871 /*ARGSUSED*/ 4872 static int 4873 filt_vfsread(struct knote *kn, long hint) 4874 { 4875 struct vnode *vp = (struct vnode *)kn->kn_hook; 4876 struct vattr va; 4877 int res; 4878 4879 /* 4880 * filesystem is gone, so set the EOF flag and schedule 4881 * the knote for deletion. 4882 */ 4883 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 4884 VI_LOCK(vp); 4885 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 4886 VI_UNLOCK(vp); 4887 return (1); 4888 } 4889 4890 if (VOP_GETATTR(vp, &va, curthread->td_ucred)) 4891 return (0); 4892 4893 VI_LOCK(vp); 4894 kn->kn_data = va.va_size - kn->kn_fp->f_offset; 4895 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 4896 VI_UNLOCK(vp); 4897 return (res); 4898 } 4899 4900 /*ARGSUSED*/ 4901 static int 4902 filt_vfswrite(struct knote *kn, long hint) 4903 { 4904 struct vnode *vp = (struct vnode *)kn->kn_hook; 4905 4906 VI_LOCK(vp); 4907 4908 /* 4909 * filesystem is gone, so set the EOF flag and schedule 4910 * the knote for deletion. 
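 * Otherwise a vnode is always reported as writable by this filter;
 * kn_data is simply left at zero.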
4911 */ 4912 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 4913 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 4914 4915 kn->kn_data = 0; 4916 VI_UNLOCK(vp); 4917 return (1); 4918 } 4919 4920 static int 4921 filt_vfsvnode(struct knote *kn, long hint) 4922 { 4923 struct vnode *vp = (struct vnode *)kn->kn_hook; 4924 int res; 4925 4926 VI_LOCK(vp); 4927 if (kn->kn_sfflags & hint) 4928 kn->kn_fflags |= hint; 4929 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 4930 kn->kn_flags |= EV_EOF; 4931 VI_UNLOCK(vp); 4932 return (1); 4933 } 4934 res = (kn->kn_fflags != 0); 4935 VI_UNLOCK(vp); 4936 return (res); 4937 } 4938 4939 int 4940 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 4941 { 4942 int error; 4943 4944 if (dp->d_reclen > ap->a_uio->uio_resid) 4945 return (ENAMETOOLONG); 4946 error = uiomove(dp, dp->d_reclen, ap->a_uio); 4947 if (error) { 4948 if (ap->a_ncookies != NULL) { 4949 if (ap->a_cookies != NULL) 4950 free(ap->a_cookies, M_TEMP); 4951 ap->a_cookies = NULL; 4952 *ap->a_ncookies = 0; 4953 } 4954 return (error); 4955 } 4956 if (ap->a_ncookies == NULL) 4957 return (0); 4958 4959 KASSERT(ap->a_cookies, 4960 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 4961 4962 *ap->a_cookies = realloc(*ap->a_cookies, 4963 (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); 4964 (*ap->a_cookies)[*ap->a_ncookies] = off; 4965 return (0); 4966 } 4967 4968 /* 4969 * Mark for update the access time of the file if the filesystem 4970 * supports VOP_MARKATIME. This functionality is used by execve and 4971 * mmap, so we want to avoid the I/O implied by directly setting 4972 * va_atime for the sake of efficiency. 4973 */ 4974 void 4975 vfs_mark_atime(struct vnode *vp, struct ucred *cred) 4976 { 4977 struct mount *mp; 4978 4979 mp = vp->v_mount; 4980 ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); 4981 if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) 4982 (void)VOP_MARKATIME(vp); 4983 } 4984 4985 /* 4986 * The purpose of this routine is to remove granularity from accmode_t, 4987 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 4988 * VADMIN and VAPPEND. 4989 * 4990 * If it returns 0, the caller is supposed to continue with the usual 4991 * access checks using 'accmode' as modified by this routine. If it 4992 * returns nonzero value, the caller is supposed to return that value 4993 * as errno. 4994 * 4995 * Note that after this routine runs, accmode may be zero. 4996 */ 4997 int 4998 vfs_unixify_accmode(accmode_t *accmode) 4999 { 5000 /* 5001 * There is no way to specify explicit "deny" rule using 5002 * file mode or POSIX.1e ACLs. 5003 */ 5004 if (*accmode & VEXPLICIT_DENY) { 5005 *accmode = 0; 5006 return (0); 5007 } 5008 5009 /* 5010 * None of these can be translated into usual access bits. 5011 * Also, the common case for NFSv4 ACLs is to not contain 5012 * either of these bits. Caller should check for VWRITE 5013 * on the containing directory instead. 5014 */ 5015 if (*accmode & (VDELETE_CHILD | VDELETE)) 5016 return (EPERM); 5017 5018 if (*accmode & VADMIN_PERMS) { 5019 *accmode &= ~VADMIN_PERMS; 5020 *accmode |= VADMIN; 5021 } 5022 5023 /* 5024 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 5025 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 5026 */ 5027 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 5028 5029 return (0); 5030 } 5031 5032 /* 5033 * These are helper functions for filesystems to traverse all 5034 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 
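 *
 * A minimal sketch of the usage pattern (the authoritative macro
 * definitions are in sys/mount.h, and the VNON check below is purely
 * illustrative); each vnode is handed back with its interlock held, and
 * MNT_VNODE_FOREACH_ALL_ABORT() must be used when breaking out of the
 * loop early so that the marker vnode is cleaned up:
 *
 *	struct vnode *vp, *mvp;
 *
 *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *		if (vp->v_type == VNON) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		... work on vp, then drop the interlock ...
 *		VI_UNLOCK(vp);
 *	}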
5035 * 5036 * This interface replaces MNT_VNODE_FOREACH. 5037 */ 5038 5039 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 5040 5041 struct vnode * 5042 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 5043 { 5044 struct vnode *vp; 5045 5046 if (should_yield()) 5047 kern_yield(PRI_USER); 5048 MNT_ILOCK(mp); 5049 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5050 vp = TAILQ_NEXT(*mvp, v_nmntvnodes); 5051 while (vp != NULL && (vp->v_type == VMARKER || 5052 (vp->v_iflag & VI_DOOMED) != 0)) 5053 vp = TAILQ_NEXT(vp, v_nmntvnodes); 5054 5055 /* Check if we are done */ 5056 if (vp == NULL) { 5057 __mnt_vnode_markerfree_all(mvp, mp); 5058 /* MNT_IUNLOCK(mp); -- done in above function */ 5059 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 5060 return (NULL); 5061 } 5062 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 5063 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 5064 VI_LOCK(vp); 5065 MNT_IUNLOCK(mp); 5066 return (vp); 5067 } 5068 5069 struct vnode * 5070 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 5071 { 5072 struct vnode *vp; 5073 5074 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 5075 MNT_ILOCK(mp); 5076 MNT_REF(mp); 5077 (*mvp)->v_type = VMARKER; 5078 5079 vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 5080 while (vp != NULL && (vp->v_type == VMARKER || 5081 (vp->v_iflag & VI_DOOMED) != 0)) 5082 vp = TAILQ_NEXT(vp, v_nmntvnodes); 5083 5084 /* Check if we are done */ 5085 if (vp == NULL) { 5086 MNT_REL(mp); 5087 MNT_IUNLOCK(mp); 5088 free(*mvp, M_VNODE_MARKER); 5089 *mvp = NULL; 5090 return (NULL); 5091 } 5092 (*mvp)->v_mount = mp; 5093 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 5094 VI_LOCK(vp); 5095 MNT_IUNLOCK(mp); 5096 return (vp); 5097 } 5098 5099 5100 void 5101 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 5102 { 5103 5104 if (*mvp == NULL) { 5105 MNT_IUNLOCK(mp); 5106 return; 5107 } 5108 5109 mtx_assert(MNT_MTX(mp), MA_OWNED); 5110 5111 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5112 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 5113 MNT_REL(mp); 5114 MNT_IUNLOCK(mp); 5115 free(*mvp, M_VNODE_MARKER); 5116 *mvp = NULL; 5117 } 5118 5119 /* 5120 * These are helper functions for filesystems to traverse their 5121 * active vnodes. 
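Only vnodes with VI_ACTIVE set appear on that list; the iterator hands each one back with its interlock held, taking it with VI_TRYLOCK() and retrying (with an occasional short pause) when the interlock is contested.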
See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h 5122 */ 5123 static void 5124 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 5125 { 5126 5127 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5128 5129 MNT_ILOCK(mp); 5130 MNT_REL(mp); 5131 MNT_IUNLOCK(mp); 5132 free(*mvp, M_VNODE_MARKER); 5133 *mvp = NULL; 5134 } 5135 5136 static struct vnode * 5137 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 5138 { 5139 struct vnode *vp, *nvp; 5140 5141 mtx_assert(&vnode_free_list_mtx, MA_OWNED); 5142 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5143 restart: 5144 vp = TAILQ_NEXT(*mvp, v_actfreelist); 5145 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 5146 while (vp != NULL) { 5147 if (vp->v_type == VMARKER) { 5148 vp = TAILQ_NEXT(vp, v_actfreelist); 5149 continue; 5150 } 5151 if (!VI_TRYLOCK(vp)) { 5152 if (mp_ncpus == 1 || should_yield()) { 5153 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); 5154 mtx_unlock(&vnode_free_list_mtx); 5155 pause("vnacti", 1); 5156 mtx_lock(&vnode_free_list_mtx); 5157 goto restart; 5158 } 5159 continue; 5160 } 5161 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 5162 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 5163 ("alien vnode on the active list %p %p", vp, mp)); 5164 if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) 5165 break; 5166 nvp = TAILQ_NEXT(vp, v_actfreelist); 5167 VI_UNLOCK(vp); 5168 vp = nvp; 5169 } 5170 5171 /* Check if we are done */ 5172 if (vp == NULL) { 5173 mtx_unlock(&vnode_free_list_mtx); 5174 mnt_vnode_markerfree_active(mvp, mp); 5175 return (NULL); 5176 } 5177 TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); 5178 mtx_unlock(&vnode_free_list_mtx); 5179 ASSERT_VI_LOCKED(vp, "active iter"); 5180 KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); 5181 return (vp); 5182 } 5183 5184 struct vnode * 5185 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 5186 { 5187 5188 if (should_yield()) 5189 kern_yield(PRI_USER); 5190 mtx_lock(&vnode_free_list_mtx); 5191 return (mnt_vnode_next_active(mvp, mp)); 5192 } 5193 5194 struct vnode * 5195 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) 5196 { 5197 struct vnode *vp; 5198 5199 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 5200 MNT_ILOCK(mp); 5201 MNT_REF(mp); 5202 MNT_IUNLOCK(mp); 5203 (*mvp)->v_type = VMARKER; 5204 (*mvp)->v_mount = mp; 5205 5206 mtx_lock(&vnode_free_list_mtx); 5207 vp = TAILQ_FIRST(&mp->mnt_activevnodelist); 5208 if (vp == NULL) { 5209 mtx_unlock(&vnode_free_list_mtx); 5210 mnt_vnode_markerfree_active(mvp, mp); 5211 return (NULL); 5212 } 5213 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); 5214 return (mnt_vnode_next_active(mvp, mp)); 5215 } 5216 5217 void 5218 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 5219 { 5220 5221 if (*mvp == NULL) 5222 return; 5223 5224 mtx_lock(&vnode_free_list_mtx); 5225 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 5226 mtx_unlock(&vnode_free_list_mtx); 5227 mnt_vnode_markerfree_active(mvp, mp); 5228 } 5229