xref: /freebsd/sys/kern/vfs_subr.c (revision 277fbb92d5e4cd0938c67f77b08d9ba4ac9d54a6)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35  */
36 
37 /*
38  * External virtual filesystem routines
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_compat.h"
45 #include "opt_ddb.h"
46 #include "opt_watchdog.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/bio.h>
51 #include <sys/buf.h>
52 #include <sys/condvar.h>
53 #include <sys/conf.h>
54 #include <sys/dirent.h>
55 #include <sys/event.h>
56 #include <sys/eventhandler.h>
57 #include <sys/extattr.h>
58 #include <sys/file.h>
59 #include <sys/fcntl.h>
60 #include <sys/jail.h>
61 #include <sys/kdb.h>
62 #include <sys/kernel.h>
63 #include <sys/kthread.h>
64 #include <sys/lockf.h>
65 #include <sys/malloc.h>
66 #include <sys/mount.h>
67 #include <sys/namei.h>
68 #include <sys/pctrie.h>
69 #include <sys/priv.h>
70 #include <sys/reboot.h>
71 #include <sys/refcount.h>
72 #include <sys/rwlock.h>
73 #include <sys/sched.h>
74 #include <sys/sleepqueue.h>
75 #include <sys/smp.h>
76 #include <sys/stat.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/vmmeter.h>
80 #include <sys/vnode.h>
81 #include <sys/watchdog.h>
82 
83 #include <machine/stdarg.h>
84 
85 #include <security/mac/mac_framework.h>
86 
87 #include <vm/vm.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_extern.h>
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_kern.h>
94 #include <vm/uma.h>
95 
96 #ifdef DDB
97 #include <ddb/ddb.h>
98 #endif
99 
100 static void	delmntque(struct vnode *vp);
101 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
102 		    int slpflag, int slptimeo);
103 static void	syncer_shutdown(void *arg, int howto);
104 static int	vtryrecycle(struct vnode *vp);
105 static void	v_init_counters(struct vnode *);
106 static void	v_incr_usecount(struct vnode *);
107 static void	v_incr_devcount(struct vnode *);
108 static void	v_decr_devcount(struct vnode *);
109 static void	vnlru_free(int);
110 static void	vgonel(struct vnode *);
111 static void	vfs_knllock(void *arg);
112 static void	vfs_knlunlock(void *arg);
113 static void	vfs_knl_assert_locked(void *arg);
114 static void	vfs_knl_assert_unlocked(void *arg);
115 static void	destroy_vpollinfo(struct vpollinfo *vi);
116 
117 /*
118  * Number of vnodes in existence.  Increased whenever getnewvnode()
119  * allocates a new vnode, decreased in vdropl() for a VI_DOOMED vnode.
120  */
121 static unsigned long	numvnodes;
122 
123 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
124     "Number of vnodes in existence");
125 
126 static u_long vnodes_created;
127 SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
128     0, "Number of vnodes created by getnewvnode");
129 
130 /*
131  * Conversion tables for conversion from vnode types to inode formats
132  * and back.
133  */
134 enum vtype iftovt_tab[16] = {
135 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
136 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
137 };
138 int vttoif_tab[10] = {
139 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
140 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
141 };
142 
143 /*
144  * List of vnodes that are ready for recycling.
145  */
146 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
147 
148 /*
149  * "Free" vnode target.  Free vnodes are rarely completely free, but are
150  * just ones that are cheap to recycle.  Usually they are for files which
151  * have been stat'd but not read; these usually have inode and namecache
152  * data attached to them.  This target is the preferred minimum size of a
153  * sub-cache consisting mostly of such files. The system balances the size
154  * of this sub-cache with its complement to try to prevent either from
155  * thrashing while the other is relatively inactive.  The targets express
156  * a preference for the best balance.
157  *
158  * "Above" this target there are 2 further targets (watermarks) related
159  * to recycling of free vnodes.  In the best-operating case, the cache is
160  * exactly full, the free list has size between vlowat and vhiwat above the
161  * free target, and recycling from it and normal use maintains this state.
162  * Sometimes the free list is below vlowat or even empty, but this state
163  * is even better for immediate use provided the cache is not full.
164  * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
165  * ones) to reach one of these states.  The watermarks are currently hard-
166  * coded as 4% and 9% of the available space higher.  These and the default
167  * of 25% for wantfreevnodes are too large if the memory size is large.
168  * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
169  * whenever vnlru_proc() becomes active.
170  */
171 static u_long wantfreevnodes;
172 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
173     &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
174 static u_long freevnodes;
175 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
176     &freevnodes, 0, "Number of \"free\" vnodes");
177 
178 static u_long recycles_count;
179 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
180     "Number of vnodes recycled to meet vnode cache targets");
181 
182 /*
183  * Various variables used for debugging the new implementation of
184  * reassignbuf().
185  * XXX these are probably of (very) limited utility now.
186  */
187 static int reassignbufcalls;
188 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
189     "Number of calls to reassignbuf");
190 
191 static u_long free_owe_inact;
192 SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0,
193     "Number of times free vnodes kept on active list due to VFS "
194     "owing inactivation");
195 
196 /* To keep more than one thread at a time from running vfs_getnewfsid */
197 static struct mtx mntid_mtx;
198 
199 /*
200  * Lock for any access to the following:
201  *	vnode_free_list
202  *	numvnodes
203  *	freevnodes
204  */
205 static struct mtx vnode_free_list_mtx;
206 
207 /* Publicly exported FS */
208 struct nfs_public nfs_pub;
209 
210 static uma_zone_t buf_trie_zone;
211 
212 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
213 static uma_zone_t vnode_zone;
214 static uma_zone_t vnodepoll_zone;
215 
216 /*
217  * The workitem queue.
218  *
219  * It is useful to delay writes of file data and filesystem metadata
220  * for tens of seconds so that quickly created and deleted files need
221  * not waste disk bandwidth being created and removed. To realize this,
222  * we append vnodes to a "workitem" queue. When running with a soft
223  * updates implementation, most pending metadata dependencies should
224  * not wait for more than a few seconds. Thus, mounted on block devices
225  * are delayed only about a half the time that file data is delayed.
226  * Similarly, directory updates are more critical, so are only delayed
227  * about a third the time that file data is delayed. Thus, there are
228  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
229  * one each second (driven off the filesystem syncer process). The
230  * syncer_delayno variable indicates the next queue that is to be processed.
231  * Items that need to be processed soon are placed in this queue:
232  *
233  *	syncer_workitem_pending[syncer_delayno]
234  *
235  * A delay of fifteen seconds is done by placing the request fifteen
236  * entries later in the queue:
237  *
238  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
239  *
240  */
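/*
 * Illustrative sketch (editorial addition, not compiled): with sync_mtx
 * held, a request to sync a bufobj about N seconds from now maps onto the
 * ring of queues roughly as
 *
 *	slot = (syncer_delayno + N) & syncer_mask;
 *	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 *
 * which is the indexing shown in the comment above; the real insertion
 * code lives with the syncer implementation below.
 */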
241 static int syncer_delayno;
242 static long syncer_mask;
243 LIST_HEAD(synclist, bufobj);
244 static struct synclist *syncer_workitem_pending;
245 /*
246  * The sync_mtx protects:
247  *	bo->bo_synclist
248  *	sync_vnode_count
249  *	syncer_delayno
250  *	syncer_state
251  *	syncer_workitem_pending
252  *	syncer_worklist_len
253  *	rushjob
254  */
255 static struct mtx sync_mtx;
256 static struct cv sync_wakeup;
257 
258 #define SYNCER_MAXDELAY		32
259 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
260 static int syncdelay = 30;		/* max time to delay syncing data */
261 static int filedelay = 30;		/* time to delay syncing files */
262 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
263     "Time to delay syncing files (in seconds)");
264 static int dirdelay = 29;		/* time to delay syncing directories */
265 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
266     "Time to delay syncing directories (in seconds)");
267 static int metadelay = 28;		/* time to delay syncing metadata */
268 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
269     "Time to delay syncing metadata (in seconds)");
270 static int rushjob;		/* number of slots to run ASAP */
271 static int stat_rush_requests;	/* number of times I/O speeded up */
272 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
273     "Number of times I/O speeded up (rush requests)");
274 
275 /*
276  * When shutting down the syncer, run it at four times normal speed.
277  */
278 #define SYNCER_SHUTDOWN_SPEEDUP		4
279 static int sync_vnode_count;
280 static int syncer_worklist_len;
281 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
282     syncer_state;
283 
284 /* Target for maximum number of vnodes. */
285 int desiredvnodes;
286 static int gapvnodes;		/* gap between wanted and desired */
287 static int vhiwat;		/* enough extras after expansion */
288 static int vlowat;		/* minimal extras before expansion */
289 static int vstir;		/* nonzero to stir non-free vnodes */
290 static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
291 
292 static int
293 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
294 {
295 	int error, old_desiredvnodes;
296 
297 	old_desiredvnodes = desiredvnodes;
298 	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
299 		return (error);
300 	if (old_desiredvnodes != desiredvnodes) {
301 		wantfreevnodes = desiredvnodes / 4;
302 		/* XXX locking seems to be incomplete. */
303 		vfs_hash_changesize(desiredvnodes);
304 		cache_changesize(desiredvnodes);
305 	}
306 	return (0);
307 }
308 
309 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
310     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
311     sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
312 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
313     &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
314 static int vnlru_nowhere;
315 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
316     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
317 
318 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
319 static int vnsz2log;
320 
321 /*
322  * Support for the bufobj clean & dirty pctrie.
323  */
324 static void *
325 buf_trie_alloc(struct pctrie *ptree)
326 {
327 
328 	return uma_zalloc(buf_trie_zone, M_NOWAIT);
329 }
330 
331 static void
332 buf_trie_free(struct pctrie *ptree, void *node)
333 {
334 
335 	uma_zfree(buf_trie_zone, node);
336 }
337 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
338 
339 /*
340  * Initialize the vnode management data structures.
341  *
342  * Reevaluate the following cap on the number of vnodes after the physical
343  * memory size exceeds 512GB.  In the limit, as the physical memory size
344  * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
345  */
346 #ifndef	MAXVNODES_MAX
347 #define	MAXVNODES_MAX	(512 * 1024 * 1024 / 64)	/* 8M */
348 #endif
349 static void
350 vntblinit(void *dummy __unused)
351 {
352 	u_int i;
353 	int physvnodes, virtvnodes;
354 
355 	/*
356 	 * Desiredvnodes is a function of the physical memory size and the
357 	 * kernel's heap size.  Generally speaking, it scales with the
358 	 * physical memory size.  The ratio of desiredvnodes to the physical
359 	 * memory size is 1:16 until desiredvnodes exceeds 98,304.  Thereafter,
360 	 * the marginal ratio of desiredvnodes to the physical memory size is
362 	 * 1:64.  However, desiredvnodes is limited by the kernel's heap
363 	 * size.  The memory required by desiredvnodes vnodes and vm objects
364 	 * must not exceed 1/7th of the kernel's heap size.
365 	 */
366 	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
367 	    3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
368 	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
369 	    sizeof(struct vnode)));
370 	desiredvnodes = min(physvnodes, virtvnodes);
371 	if (desiredvnodes > MAXVNODES_MAX) {
372 		if (bootverbose)
373 			printf("Reducing kern.maxvnodes %d -> %d\n",
374 			    desiredvnodes, MAXVNODES_MAX);
375 		desiredvnodes = MAXVNODES_MAX;
376 	}
377 	wantfreevnodes = desiredvnodes / 4;
378 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
379 	TAILQ_INIT(&vnode_free_list);
380 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
381 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
382 	    NULL, NULL, UMA_ALIGN_PTR, 0);
383 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
384 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
385 	/*
386 	 * Preallocate enough nodes to support one node per buf so that
387 	 * we cannot fail an insert.  reassignbuf() callers cannot
388 	 * tolerate the insertion failure.
389 	 */
390 	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
391 	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
392 	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
393 	uma_prealloc(buf_trie_zone, nbuf);
394 	/*
395 	 * Initialize the filesystem syncer.
396 	 */
397 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
398 	    &syncer_mask);
399 	syncer_maxdelay = syncer_mask + 1;
400 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
401 	cv_init(&sync_wakeup, "syncer");
402 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
403 		vnsz2log++;
404 	vnsz2log--;
405 }
406 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
407 
408 
409 /*
410  * Mark a mount point as busy. Used to synchronize access and to delay
411  * unmounting.  Note that mountlist_mtx is not released on failure.
412  *
413  * vfs_busy() is a custom lock, it can block the caller.
414  * vfs_busy() only sleeps if the unmount is active on the mount point.
415  * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
416  * vnode belonging to mp.
417  *
418  * Lookup uses vfs_busy() to traverse mount points.
419  * root fs			var fs
420  * / vnode lock		A	/ vnode lock (/var)		D
421  * /var vnode lock	B	/log vnode lock(/var/log)	E
422  * vfs_busy lock	C	vfs_busy lock			F
423  *
424  * Within each file system, the lock order is C->A->B and F->D->E.
425  *
426  * When traversing across mounts, the system follows that lock order:
427  *
428  *        C->A->B
429  *              |
430  *              +->F->D->E
431  *
432  * The lookup() process for namei("/var") illustrates the process:
433  *  VOP_LOOKUP() obtains B while A is held
434  *  vfs_busy() obtains a shared lock on F while A and B are held
435  *  vput() releases lock on B
436  *  vput() releases lock on A
437  *  VFS_ROOT() obtains lock on D while shared lock on F is held
438  *  vfs_unbusy() releases shared lock on F
439  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
440  *    An attempt to lock A (instead of vp_crossmp) while D is held would
441  *    violate the global order, causing deadlocks.
442  *
443  * dounmount() locks B while F is drained.
444  */
445 int
446 vfs_busy(struct mount *mp, int flags)
447 {
448 
449 	MPASS((flags & ~MBF_MASK) == 0);
450 	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
451 
452 	MNT_ILOCK(mp);
453 	MNT_REF(mp);
454 	/*
455 	 * If the mount point is currently being unmounted, sleep until the
456 	 * mount point's fate is decided.  If the thread doing the unmounting fails,
457 	 * it will clear MNTK_UNMOUNT flag before waking us up, indicating
458 	 * that this mount point has survived the unmount attempt and vfs_busy
459 	 * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
460 	 * flag in addition to MNTK_UNMOUNT, indicating that mount point is
461 	 * about to be really destroyed.  vfs_busy needs to release its
462 	 * reference on the mount point in this case and return with ENOENT,
463 	 * telling the caller that the mount point it tried to busy is no longer
464 	 * valid.
465 	 */
466 	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
467 		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
468 			MNT_REL(mp);
469 			MNT_IUNLOCK(mp);
470 			CTR1(KTR_VFS, "%s: failed busying before sleeping",
471 			    __func__);
472 			return (ENOENT);
473 		}
474 		if (flags & MBF_MNTLSTLOCK)
475 			mtx_unlock(&mountlist_mtx);
476 		mp->mnt_kern_flag |= MNTK_MWAIT;
477 		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
478 		if (flags & MBF_MNTLSTLOCK)
479 			mtx_lock(&mountlist_mtx);
480 		MNT_ILOCK(mp);
481 	}
482 	if (flags & MBF_MNTLSTLOCK)
483 		mtx_unlock(&mountlist_mtx);
484 	mp->mnt_lockref++;
485 	MNT_IUNLOCK(mp);
486 	return (0);
487 }
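/*
 * Illustrative usage sketch (editorial addition, not compiled): a typical
 * caller that must keep mp from being unmounted while it works does
 *
 *	error = vfs_busy(mp, MBF_NOWAIT);
 *	if (error != 0)
 *		return (error);
 *	... operate on vnodes belonging to mp ...
 *	vfs_unbusy(mp);
 *
 * MBF_NOWAIT turns the sleep on a mount point being unmounted into an
 * immediate ENOENT return, as described above.
 */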
488 
489 /*
490  * Free a busy filesystem.
491  */
492 void
493 vfs_unbusy(struct mount *mp)
494 {
495 
496 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
497 	MNT_ILOCK(mp);
498 	MNT_REL(mp);
499 	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
500 	mp->mnt_lockref--;
501 	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
502 		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
503 		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
504 		mp->mnt_kern_flag &= ~MNTK_DRAINING;
505 		wakeup(&mp->mnt_lockref);
506 	}
507 	MNT_IUNLOCK(mp);
508 }
509 
510 /*
511  * Lookup a mount point by filesystem identifier.
512  */
513 struct mount *
514 vfs_getvfs(fsid_t *fsid)
515 {
516 	struct mount *mp;
517 
518 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
519 	mtx_lock(&mountlist_mtx);
520 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
521 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
522 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
523 			vfs_ref(mp);
524 			mtx_unlock(&mountlist_mtx);
525 			return (mp);
526 		}
527 	}
528 	mtx_unlock(&mountlist_mtx);
529 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
530 	return ((struct mount *) 0);
531 }
532 
533 /*
534  * Lookup a mount point by filesystem identifier, busying it before
535  * returning.
536  *
537  * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
538  * cache for popular filesystem identifiers.  The cache is lockless, relying
539  * on the fact that struct mount's are never freed.  In the worst case we
540  * may get a pointer to an unmounted or even a different filesystem, so we
541  * have to check what we got, and fall back to the slow path if so.
542  */
543 struct mount *
544 vfs_busyfs(fsid_t *fsid)
545 {
546 #define	FSID_CACHE_SIZE	256
547 	typedef struct mount * volatile vmp_t;
548 	static vmp_t cache[FSID_CACHE_SIZE];
549 	struct mount *mp;
550 	int error;
551 	uint32_t hash;
552 
553 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
554 	hash = fsid->val[0] ^ fsid->val[1];
555 	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
556 	mp = cache[hash];
557 	if (mp == NULL ||
558 	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
559 	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
560 		goto slow;
561 	if (vfs_busy(mp, 0) != 0) {
562 		cache[hash] = NULL;
563 		goto slow;
564 	}
565 	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
566 	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
567 		return (mp);
568 	else
569 		vfs_unbusy(mp);
570 
571 slow:
572 	mtx_lock(&mountlist_mtx);
573 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
574 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
575 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
576 			error = vfs_busy(mp, MBF_MNTLSTLOCK);
577 			if (error) {
578 				cache[hash] = NULL;
579 				mtx_unlock(&mountlist_mtx);
580 				return (NULL);
581 			}
582 			cache[hash] = mp;
583 			return (mp);
584 		}
585 	}
586 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
587 	mtx_unlock(&mountlist_mtx);
588 	return ((struct mount *) 0);
589 }
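/*
 * Illustrative usage sketch (editorial addition, not compiled): converting
 * a file handle back to a vnode, as an NFS-style consumer might do
 *
 *	mp = vfs_busyfs(&fhp->fh_fsid);
 *	if (mp == NULL)
 *		return (ESTALE);
 *	error = VFS_FHTOVP(mp, &fhp->fh_fid, LK_EXCLUSIVE, &vp);
 *	vfs_unbusy(mp);
 *
 * The fast path above avoids mountlist_mtx entirely when the cached
 * fsid-to-mount mapping is still valid.
 */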
590 
591 /*
592  * Check if a user can access privileged mount options.
593  */
594 int
595 vfs_suser(struct mount *mp, struct thread *td)
596 {
597 	int error;
598 
599 	/*
600 	 * If the thread is jailed, but this is not a jail-friendly file
601 	 * system, deny immediately.
602 	 */
603 	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
604 		return (EPERM);
605 
606 	/*
607 	 * If the file system was mounted outside the jail of the calling
608 	 * thread, deny immediately.
609 	 */
610 	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
611 		return (EPERM);
612 
613 	 * If the file system supports delegated administration, we don't check
614 	 * If file system supports delegated administration, we don't check
615 	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
616 	 * by the file system itself.
617 	 * If this is not the user that did the original mount, we check for
618 	 * the PRIV_VFS_MOUNT_OWNER privilege.
619 	 */
620 	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
621 	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
622 		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
623 			return (error);
624 	}
625 	return (0);
626 }
627 
628 /*
629  * Get a new unique fsid.  Try to make its val[0] unique, since this value
630  * will be used to create fake device numbers for stat().  Also try (but
631  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
632  * support 16-bit device numbers.  We end up with unique val[0]'s for the
633  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
634  *
635  * Keep in mind that several mounts may be running in parallel.  Starting
636  * the search one past where the previous search terminated is both a
637  * micro-optimization and a defense against returning the same fsid to
638  * different mounts.
639  */
640 void
641 vfs_getnewfsid(struct mount *mp)
642 {
643 	static uint16_t mntid_base;
644 	struct mount *nmp;
645 	fsid_t tfsid;
646 	int mtype;
647 
648 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
649 	mtx_lock(&mntid_mtx);
650 	mtype = mp->mnt_vfc->vfc_typenum;
651 	tfsid.val[1] = mtype;
652 	mtype = (mtype & 0xFF) << 24;
653 	for (;;) {
654 		tfsid.val[0] = makedev(255,
655 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
656 		mntid_base++;
657 		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
658 			break;
659 		vfs_rel(nmp);
660 	}
661 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
662 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
663 	mtx_unlock(&mntid_mtx);
664 }
665 
666 /*
667  * Knob to control the precision of file timestamps:
668  *
669  *   0 = seconds only; nanoseconds zeroed.
670  *   1 = seconds and nanoseconds, accurate within 1/HZ.
671  *   2 = seconds and nanoseconds, truncated to microseconds.
672  * >=3 = seconds and nanoseconds, maximum precision.
673  */
674 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
675 
676 static int timestamp_precision = TSP_USEC;
677 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
678     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
679     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
680     "3+: sec + ns (max. precision))");
681 
682 /*
683  * Get a current timestamp.
684  */
685 void
686 vfs_timestamp(struct timespec *tsp)
687 {
688 	struct timeval tv;
689 
690 	switch (timestamp_precision) {
691 	case TSP_SEC:
692 		tsp->tv_sec = time_second;
693 		tsp->tv_nsec = 0;
694 		break;
695 	case TSP_HZ:
696 		getnanotime(tsp);
697 		break;
698 	case TSP_USEC:
699 		microtime(&tv);
700 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
701 		break;
702 	case TSP_NSEC:
703 	default:
704 		nanotime(tsp);
705 		break;
706 	}
707 }
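/*
 * Illustrative usage sketch (editorial addition, not compiled): filesystems
 * stamp times through this helper so that vfs.timestamp_precision is
 * honoured, e.g.
 *
 *	struct timespec ts;
 *
 *	vfs_timestamp(&ts);
 *	ip->i_mtime = ts.tv_sec;	(inode fields are fs-specific)
 *	ip->i_mtimensec = ts.tv_nsec;
 */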
708 
709 /*
710  * Set vnode attributes to VNOVAL
711  */
712 void
713 vattr_null(struct vattr *vap)
714 {
715 
716 	vap->va_type = VNON;
717 	vap->va_size = VNOVAL;
718 	vap->va_bytes = VNOVAL;
719 	vap->va_mode = VNOVAL;
720 	vap->va_nlink = VNOVAL;
721 	vap->va_uid = VNOVAL;
722 	vap->va_gid = VNOVAL;
723 	vap->va_fsid = VNOVAL;
724 	vap->va_fileid = VNOVAL;
725 	vap->va_blocksize = VNOVAL;
726 	vap->va_rdev = VNOVAL;
727 	vap->va_atime.tv_sec = VNOVAL;
728 	vap->va_atime.tv_nsec = VNOVAL;
729 	vap->va_mtime.tv_sec = VNOVAL;
730 	vap->va_mtime.tv_nsec = VNOVAL;
731 	vap->va_ctime.tv_sec = VNOVAL;
732 	vap->va_ctime.tv_nsec = VNOVAL;
733 	vap->va_birthtime.tv_sec = VNOVAL;
734 	vap->va_birthtime.tv_nsec = VNOVAL;
735 	vap->va_flags = VNOVAL;
736 	vap->va_gen = VNOVAL;
737 	vap->va_vaflags = 0;
738 }
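/*
 * Illustrative usage sketch (editorial addition, not compiled): callers
 * reset a vattr to "no change" and then set only the fields they want
 * VOP_SETATTR() to act on
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);
 *	va.va_size = newsize;
 *	error = VOP_SETATTR(vp, &va, td->td_ucred);
 */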
739 
740 /*
741  * This routine is called when we have too many vnodes.  It attempts
742  * to reclaim a fraction of the mount's vnodes and will potentially free vnodes that still
743  * have VM backing store (VM backing store is typically the cause
744  * of a vnode blowout so we want to do this).  Therefore, this operation
745  * is not considered cheap.
746  *
747  * A number of conditions may prevent a vnode from being reclaimed.
748  * the buffer cache may have references on the vnode, a directory
749  * vnode may still have references due to the namei cache representing
750  * underlying files, or the vnode may be in active use.   It is not
751  * desirable to reuse such vnodes.  These conditions may cause the
752  * number of vnodes to reach some minimum value regardless of what
753  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
754  */
755 static int
756 vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
757 {
758 	struct vnode *vp;
759 	int count, done, target;
760 
761 	done = 0;
762 	vn_start_write(NULL, &mp, V_WAIT);
763 	MNT_ILOCK(mp);
764 	count = mp->mnt_nvnodelistsize;
765 	target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
766 	target = target / 10 + 1;
767 	while (count != 0 && done < target) {
768 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
769 		while (vp != NULL && vp->v_type == VMARKER)
770 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
771 		if (vp == NULL)
772 			break;
773 		/*
774 		 * XXX LRU is completely broken for non-free vnodes.  First
775 		 * by calling here in mountpoint order, then by moving
776 		 * unselected vnodes to the end here, and most grossly by
777 		 * removing the vlruvp() function that was supposed to
778 		 * maintain the order.  (This function was born broken
779 		 * since syncer problems prevented it doing anything.)  The
780 		 * order is closer to LRC (C = Created).
781 		 *
782 		 * LRU reclaiming of vnodes seems to have last worked in
783 		 * FreeBSD-3 where LRU wasn't mentioned under any spelling.
784 		 * Then there was no hold count, and inactive vnodes were
785 		 * simply put on the free list in LRU order.  The separate
786 		 * lists also break LRU.  We prefer to reclaim from the
787 		 * free list for technical reasons.  This tends to thrash
788 		 * the free list to keep very unrecently used held vnodes.
789 		 * The problem is mitigated by keeping the free list large.
790 		 */
791 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
792 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
793 		--count;
794 		if (!VI_TRYLOCK(vp))
795 			goto next_iter;
796 		/*
797 		 * If it's been deconstructed already, it's still
798 		 * referenced, or it exceeds the trigger, skip it.
799 		 * Also skip free vnodes.  We are trying to make space
800 		 * to expand the free list, not reduce it.
801 		 */
802 		if (vp->v_usecount ||
803 		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
804 		    ((vp->v_iflag & VI_FREE) != 0) ||
805 		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
806 		    vp->v_object->resident_page_count > trigger)) {
807 			VI_UNLOCK(vp);
808 			goto next_iter;
809 		}
810 		MNT_IUNLOCK(mp);
811 		vholdl(vp);
812 		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
813 			vdrop(vp);
814 			goto next_iter_mntunlocked;
815 		}
816 		VI_LOCK(vp);
817 		/*
818 		 * v_usecount may have been bumped after VOP_LOCK() dropped
819 		 * the vnode interlock and before it was locked again.
820 		 *
821 		 * It is not necessary to recheck VI_DOOMED because it can
822 		 * only be set by another thread that holds both the vnode
823 		 * lock and vnode interlock.  If another thread has the
824 		 * vnode lock before we get to VOP_LOCK() and obtains the
825 		 * vnode interlock after VOP_LOCK() drops the vnode
826 		 * interlock, the other thread will be unable to drop the
827 		 * vnode lock before our VOP_LOCK() call fails.
828 		 */
829 		if (vp->v_usecount ||
830 		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
831 		    (vp->v_iflag & VI_FREE) != 0 ||
832 		    (vp->v_object != NULL &&
833 		    vp->v_object->resident_page_count > trigger)) {
834 			VOP_UNLOCK(vp, LK_INTERLOCK);
835 			vdrop(vp);
836 			goto next_iter_mntunlocked;
837 		}
838 		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
839 		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
840 		atomic_add_long(&recycles_count, 1);
841 		vgonel(vp);
842 		VOP_UNLOCK(vp, 0);
843 		vdropl(vp);
844 		done++;
845 next_iter_mntunlocked:
846 		if (!should_yield())
847 			goto relock_mnt;
848 		goto yield;
849 next_iter:
850 		if (!should_yield())
851 			continue;
852 		MNT_IUNLOCK(mp);
853 yield:
854 		kern_yield(PRI_USER);
855 relock_mnt:
856 		MNT_ILOCK(mp);
857 	}
858 	MNT_IUNLOCK(mp);
859 	vn_finished_write(mp);
860 	return done;
861 }
862 
863 /*
864  * Attempt to reduce the free list by the requested amount.
865  */
866 static void
867 vnlru_free(int count)
868 {
869 	struct vnode *vp;
870 
871 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
872 	for (; count > 0; count--) {
873 		vp = TAILQ_FIRST(&vnode_free_list);
874 		/*
875 		 * The list can be modified while the free_list_mtx
876 		 * The list can be modified while vnode_free_list_mtx
877 		 */
878 		if (!vp)
879 			break;
880 		VNASSERT(vp->v_op != NULL, vp,
881 		    ("vnlru_free: vnode already reclaimed."));
882 		KASSERT((vp->v_iflag & VI_FREE) != 0,
883 		    ("Removing vnode not on freelist"));
884 		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
885 		    ("Mangling active vnode"));
886 		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
887 		/*
888 		 * Don't recycle if we can't get the interlock.
889 		 */
890 		if (!VI_TRYLOCK(vp)) {
891 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
892 			continue;
893 		}
894 		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
895 		    vp, ("vp inconsistent on freelist"));
896 
897 		/*
898 		 * The clear of VI_FREE prevents activation of the
899 		 * vnode.  There is no sense in putting the vnode on
900 		 * the mount point active list, only to remove it
901 		 * later during recycling.  Inline the relevant part
902 		 * of vholdl(), to avoid triggering assertions or
903 		 * activating.
904 		 */
905 		freevnodes--;
906 		vp->v_iflag &= ~VI_FREE;
907 		refcount_acquire(&vp->v_holdcnt);
908 
909 		mtx_unlock(&vnode_free_list_mtx);
910 		VI_UNLOCK(vp);
911 		vtryrecycle(vp);
912 		/*
913 		 * If the recycle succeeded, this vdrop will actually free
914 		 * the vnode.  If not it will simply place it back on
915 		 * the free list.
916 		 */
917 		vdrop(vp);
918 		mtx_lock(&vnode_free_list_mtx);
919 	}
920 }
921 
922 /* XXX some names and initialization are bad for limits and watermarks. */
923 static int
924 vspace(void)
925 {
926 	int space;
927 
928 	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
929 	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
930 	vlowat = vhiwat / 2;
931 	if (numvnodes > desiredvnodes)
932 		return (0);
933 	space = desiredvnodes - numvnodes;
934 	if (freevnodes > wantfreevnodes)
935 		space += freevnodes - wantfreevnodes;
936 	return (space);
937 }
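/*
 * Worked numbers (editorial addition, assumed configuration): with
 * desiredvnodes = 100000 and wantfreevnodes = 25000, gapvnodes is 75000,
 * vhiwat is about 6800 (~9% of the gap) and vlowat about 3400 (~4.5%),
 * matching the watermark description near the top of this file.
 */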
938 
939 /*
940  * Attempt to recycle vnodes in a context that is always safe to block.
941  * Calling vlrurecycle() from the bowels of filesystem code has some
942  * interesting deadlock problems.
943  */
944 static struct proc *vnlruproc;
945 static int vnlruproc_sig;
946 
947 static void
948 vnlru_proc(void)
949 {
950 	struct mount *mp, *nmp;
951 	unsigned long ofreevnodes, onumvnodes;
952 	int done, force, reclaim_nc_src, trigger, usevnodes;
953 
954 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
955 	    SHUTDOWN_PRI_FIRST);
956 
957 	force = 0;
958 	for (;;) {
959 		kproc_suspend_check(vnlruproc);
960 		mtx_lock(&vnode_free_list_mtx);
961 		/*
962 		 * If numvnodes is too large (due to desiredvnodes being
963 		 * adjusted using its sysctl, or emergency growth), first
964 		 * try to reduce it by discarding from the free list.
965 		 */
966 		if (numvnodes > desiredvnodes && freevnodes > 0)
967 			vnlru_free(ulmin(numvnodes - desiredvnodes,
968 			    freevnodes));
969 		/*
970 		 * Sleep if the vnode cache is in a good state.  This is
971 		 * when it is not over-full and has space for about a 4%
972 		 * or 9% expansion (by growing its size or inexcessively
973 		 * or 9% expansion (by growing its size or modestly
974 		 * space for a 10% expansion.
975 		 */
976 		if (vstir && force == 0) {
977 			force = 1;
978 			vstir = 0;
979 		}
980 		if (vspace() >= vlowat && force == 0) {
981 			vnlruproc_sig = 0;
982 			wakeup(&vnlruproc_sig);
983 			msleep(vnlruproc, &vnode_free_list_mtx,
984 			    PVFS|PDROP, "vlruwt", hz);
985 			continue;
986 		}
987 		mtx_unlock(&vnode_free_list_mtx);
988 		done = 0;
989 		ofreevnodes = freevnodes;
990 		onumvnodes = numvnodes;
991 		/*
992 		 * Calculate parameters for recycling.  These are the same
993 		 * throughout the loop to give some semblance of fairness.
994 		 * The trigger point is to avoid recycling vnodes with lots
995 		 * of resident pages.  We aren't trying to free memory; we
996 		 * are trying to recycle or at least free vnodes.
997 		 */
998 		if (numvnodes <= desiredvnodes)
999 			usevnodes = numvnodes - freevnodes;
1000 		else
1001 			usevnodes = numvnodes;
1002 		if (usevnodes <= 0)
1003 			usevnodes = 1;
1004 		/*
1005 		 * The trigger value is chosen to give a conservatively
1006 		 * large value to ensure that it alone doesn't prevent
1007 		 * making progress.  The value can easily be so large that
1008 		 * it is effectively infinite in some congested and
1009 		 * misconfigured cases, and this is necessary.  Normally
1010 		 * it is about 8 to 100 (pages), which is quite large.
1011 		 */
1012 		trigger = vm_cnt.v_page_count * 2 / usevnodes;
1013 		if (force < 2)
1014 			trigger = vsmalltrigger;
1015 		reclaim_nc_src = force >= 3;
1016 		mtx_lock(&mountlist_mtx);
1017 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
1018 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
1019 				nmp = TAILQ_NEXT(mp, mnt_list);
1020 				continue;
1021 			}
1022 			done += vlrureclaim(mp, reclaim_nc_src, trigger);
1023 			mtx_lock(&mountlist_mtx);
1024 			nmp = TAILQ_NEXT(mp, mnt_list);
1025 			vfs_unbusy(mp);
1026 		}
1027 		mtx_unlock(&mountlist_mtx);
1028 		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
1029 			uma_reclaim();
1030 		if (done == 0) {
1031 			if (force == 0 || force == 1) {
1032 				force = 2;
1033 				continue;
1034 			}
1035 			if (force == 2) {
1036 				force = 3;
1037 				continue;
1038 			}
1039 			force = 0;
1040 			vnlru_nowhere++;
1041 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
1042 		} else
1043 			kern_yield(PRI_USER);
1044 		/*
1045 		 * After becoming active to expand above low water, keep
1046 		 * active until above high water.
1047 		 */
1048 		force = vspace() < vhiwat;
1049 	}
1050 }
1051 
1052 static struct kproc_desc vnlru_kp = {
1053 	"vnlru",
1054 	vnlru_proc,
1055 	&vnlruproc
1056 };
1057 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
1058     &vnlru_kp);
1059 
1060 /*
1061  * Routines having to do with the management of the vnode table.
1062  */
1063 
1064 /*
1065  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
1066  * before we actually vgone().  This function must be called with the vnode
1067  * held to prevent the vnode from being returned to the free list midway
1068  * through vgone().
1069  */
1070 static int
1071 vtryrecycle(struct vnode *vp)
1072 {
1073 	struct mount *vnmp;
1074 
1075 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
1076 	VNASSERT(vp->v_holdcnt, vp,
1077 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
1078 	/*
1079 	 * This vnode may be found and locked via some other list; if so we
1080 	 * can't recycle it yet.
1081 	 */
1082 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
1083 		CTR2(KTR_VFS,
1084 		    "%s: impossible to recycle, vp %p lock is already held",
1085 		    __func__, vp);
1086 		return (EWOULDBLOCK);
1087 	}
1088 	/*
1089 	 * Don't recycle if its filesystem is being suspended.
1090 	 */
1091 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
1092 		VOP_UNLOCK(vp, 0);
1093 		CTR2(KTR_VFS,
1094 		    "%s: impossible to recycle, cannot start the write for %p",
1095 		    __func__, vp);
1096 		return (EBUSY);
1097 	}
1098 	/*
1099 	 * If we got this far, we need to acquire the interlock and see if
1100 	 * anyone picked up this vnode from another list.  If not, we will
1101 	 * mark it with DOOMED via vgonel() so that anyone who does find it
1102 	 * will skip over it.
1103 	 */
1104 	VI_LOCK(vp);
1105 	if (vp->v_usecount) {
1106 		VOP_UNLOCK(vp, LK_INTERLOCK);
1107 		vn_finished_write(vnmp);
1108 		CTR2(KTR_VFS,
1109 		    "%s: impossible to recycle, %p is already referenced",
1110 		    __func__, vp);
1111 		return (EBUSY);
1112 	}
1113 	if ((vp->v_iflag & VI_DOOMED) == 0) {
1114 		atomic_add_long(&recycles_count, 1);
1115 		vgonel(vp);
1116 	}
1117 	VOP_UNLOCK(vp, LK_INTERLOCK);
1118 	vn_finished_write(vnmp);
1119 	return (0);
1120 }
1121 
1122 static void
1123 vcheckspace(void)
1124 {
1125 
1126 	if (vspace() < vlowat && vnlruproc_sig == 0) {
1127 		vnlruproc_sig = 1;
1128 		wakeup(vnlruproc);
1129 	}
1130 }
1131 
1132 /*
1133  * Wait if necessary for space for a new vnode.
1134  */
1135 static int
1136 getnewvnode_wait(int suspended)
1137 {
1138 
1139 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
1140 	if (numvnodes >= desiredvnodes) {
1141 		if (suspended) {
1142 			/*
1143 			 * The file system is being suspended.  We cannot
1144 			 * risk a deadlock here, so allow allocation of
1145 			 * another vnode even if this would give too many.
1146 			 */
1147 			return (0);
1148 		}
1149 		if (vnlruproc_sig == 0) {
1150 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
1151 			wakeup(vnlruproc);
1152 		}
1153 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
1154 		    "vlruwk", hz);
1155 	}
1156 	/* Post-adjust like the pre-adjust in getnewvnode(). */
1157 	if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
1158 		vnlru_free(1);
1159 	return (numvnodes >= desiredvnodes ? ENFILE : 0);
1160 }
1161 
1162 /*
1163  * This hack is fragile, and probably not needed any more now that the
1164  * watermark handling works.
1165  */
1166 void
1167 getnewvnode_reserve(u_int count)
1168 {
1169 	struct thread *td;
1170 
1171 	/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
1172 	/* XXX no longer so quick, but this part is not racy. */
1173 	mtx_lock(&vnode_free_list_mtx);
1174 	if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
1175 		vnlru_free(ulmin(numvnodes + count - desiredvnodes,
1176 		    freevnodes - wantfreevnodes));
1177 	mtx_unlock(&vnode_free_list_mtx);
1178 
1179 	td = curthread;
1180 	/* First try to be quick and racy. */
1181 	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
1182 		td->td_vp_reserv += count;
1183 		vcheckspace();	/* XXX no longer so quick, but more racy */
1184 		return;
1185 	} else
1186 		atomic_subtract_long(&numvnodes, count);
1187 
1188 	mtx_lock(&vnode_free_list_mtx);
1189 	while (count > 0) {
1190 		if (getnewvnode_wait(0) == 0) {
1191 			count--;
1192 			td->td_vp_reserv++;
1193 			atomic_add_long(&numvnodes, 1);
1194 		}
1195 	}
1196 	vcheckspace();
1197 	mtx_unlock(&vnode_free_list_mtx);
1198 }
1199 
1200 /*
1201  * This hack is fragile, especially if desiredvnodes or wantfreevnodes are
1202  * misconfigured or changed significantly.  Reducing desiredvnodes below
1203  * the reserved amount should cause bizarre behaviour like reducing it
1204  * below the number of active vnodes -- the system will try to reduce
1205  * numvnodes to match, but should fail, so the subtraction below should
1206  * not overflow.
1207  */
1208 void
1209 getnewvnode_drop_reserve(void)
1210 {
1211 	struct thread *td;
1212 
1213 	td = curthread;
1214 	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
1215 	td->td_vp_reserv = 0;
1216 }
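/*
 * Illustrative usage sketch (editorial addition, not compiled): a filesystem
 * that must create several vnodes while holding locks that make sleeping in
 * getnewvnode() awkward can pre-reserve them
 *
 *	getnewvnode_reserve(3);
 *	... allocate up to 3 vnodes via getnewvnode() / VFS_VGET() ...
 *	getnewvnode_drop_reserve();
 */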
1217 
1218 /*
1219  * Allocate and return a new vnode.
1220  */
1221 int
1222 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
1223     struct vnode **vpp)
1224 {
1225 	struct vnode *vp;
1226 	struct bufobj *bo;
1227 	struct thread *td;
1228 	static int cyclecount;
1229 	int error;
1230 
1231 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1232 	vp = NULL;
1233 	td = curthread;
1234 	if (td->td_vp_reserv > 0) {
1235 		td->td_vp_reserv -= 1;
1236 		goto alloc;
1237 	}
1238 	mtx_lock(&vnode_free_list_mtx);
1239 	if (numvnodes < desiredvnodes)
1240 		cyclecount = 0;
1241 	else if (cyclecount++ >= freevnodes) {
1242 		cyclecount = 0;
1243 		vstir = 1;
1244 	}
1245 	/*
1246 	 * Grow the vnode cache if it will not be above its target max
1247 	 * after growing.  Otherwise, if the free list is nonempty, try
1248 	 * to reclaim 1 item from it before growing the cache (possibly
1249 	 * above its target max if the reclamation failed or is delayed).
1250 	 * Otherwise, wait for some space.  In all cases, schedule
1251 	 * vnlru_proc() if we are getting short of space.  The watermarks
1252 	 * should be chosen so that we never wait or even reclaim from
1253 	 * the free list to below its target minimum.
1254 	 */
1255 	if (numvnodes + 1 <= desiredvnodes)
1256 		;
1257 	else if (freevnodes > 0)
1258 		vnlru_free(1);
1259 	else {
1260 		error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1261 		    MNTK_SUSPEND));
1262 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1263 		if (error != 0) {
1264 			mtx_unlock(&vnode_free_list_mtx);
1265 			return (error);
1266 		}
1267 #endif
1268 	}
1269 	vcheckspace();
1270 	atomic_add_long(&numvnodes, 1);
1271 	mtx_unlock(&vnode_free_list_mtx);
1272 alloc:
1273 	atomic_add_long(&vnodes_created, 1);
1274 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
1275 	/*
1276 	 * Setup locks.
1277 	 */
1278 	vp->v_vnlock = &vp->v_lock;
1279 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
1280 	/*
1281 	 * By default, don't allow shared locks unless filesystems
1282 	 * opt-in.
1283 	 */
1284 	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
1285 	/*
1286 	 * Initialize bufobj.
1287 	 */
1288 	bo = &vp->v_bufobj;
1289 	bo->__bo_vnode = vp;
1290 	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
1291 	bo->bo_ops = &buf_ops_bio;
1292 	bo->bo_private = vp;
1293 	TAILQ_INIT(&bo->bo_clean.bv_hd);
1294 	TAILQ_INIT(&bo->bo_dirty.bv_hd);
1295 	/*
1296 	 * Initialize namecache.
1297 	 */
1298 	LIST_INIT(&vp->v_cache_src);
1299 	TAILQ_INIT(&vp->v_cache_dst);
1300 	/*
1301 	 * Finalize various vnode identity bits.
1302 	 */
1303 	vp->v_type = VNON;
1304 	vp->v_tag = tag;
1305 	vp->v_op = vops;
1306 	v_init_counters(vp);
1307 	vp->v_data = NULL;
1308 #ifdef MAC
1309 	mac_vnode_init(vp);
1310 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1311 		mac_vnode_associate_singlelabel(mp, vp);
1312 	else if (mp == NULL && vops != &dead_vnodeops)
1313 		printf("NULL mp in getnewvnode()\n");
1314 #endif
1315 	if (mp != NULL) {
1316 		bo->bo_bsize = mp->mnt_stat.f_iosize;
1317 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1318 			vp->v_vflag |= VV_NOKNOTE;
1319 	}
1320 	rangelock_init(&vp->v_rl);
1321 
1322 	/*
1323 	 * For the filesystems which do not use vfs_hash_insert(),
1324 	 * still initialize v_hash to have vfs_hash_index() useful.
1325 	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1326 	 * its own hashing.
1327 	 */
1328 	vp->v_hash = (uintptr_t)vp >> vnsz2log;
1329 
1330 	*vpp = vp;
1331 	return (0);
1332 }
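/*
 * Illustrative usage sketch (editorial addition, not compiled): the usual
 * allocation sequence in a filesystem pairs getnewvnode() with insmntque(),
 * locking the vnode exclusively before putting it on the mount's list
 *
 *	error = getnewvnode("examplefs", mp, &examplefs_vnodeops, &vp);
 *	if (error != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	vp->v_data = ip;
 *	error = insmntque(vp, mp);
 *	if (error != 0)
 *		return (error);	(insmntque() has destroyed vp)
 *
 * "examplefs", examplefs_vnodeops and ip are placeholders for the
 * filesystem-specific pieces.
 */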
1333 
1334 /*
1335  * Delete from old mount point vnode list, if on one.
1336  */
1337 static void
1338 delmntque(struct vnode *vp)
1339 {
1340 	struct mount *mp;
1341 	int active;
1342 
1343 	mp = vp->v_mount;
1344 	if (mp == NULL)
1345 		return;
1346 	MNT_ILOCK(mp);
1347 	VI_LOCK(vp);
1348 	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1349 	    ("Active vnode list size %d > Vnode list size %d",
1350 	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1351 	active = vp->v_iflag & VI_ACTIVE;
1352 	vp->v_iflag &= ~VI_ACTIVE;
1353 	if (active) {
1354 		mtx_lock(&vnode_free_list_mtx);
1355 		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1356 		mp->mnt_activevnodelistsize--;
1357 		mtx_unlock(&vnode_free_list_mtx);
1358 	}
1359 	vp->v_mount = NULL;
1360 	VI_UNLOCK(vp);
1361 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1362 		("bad mount point vnode list size"));
1363 	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1364 	mp->mnt_nvnodelistsize--;
1365 	MNT_REL(mp);
1366 	MNT_IUNLOCK(mp);
1367 }
1368 
1369 static void
1370 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1371 {
1372 
1373 	vp->v_data = NULL;
1374 	vp->v_op = &dead_vnodeops;
1375 	vgone(vp);
1376 	vput(vp);
1377 }
1378 
1379 /*
1380  * Insert into list of vnodes for the new mount point, if available.
1381  */
1382 int
1383 insmntque1(struct vnode *vp, struct mount *mp,
1384 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1385 {
1386 
1387 	KASSERT(vp->v_mount == NULL,
1388 		("insmntque: vnode already on per mount vnode list"));
1389 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1390 	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1391 
1392 	/*
1393 	 * We acquire the vnode interlock early to ensure that the
1394 	 * vnode cannot be recycled by another process releasing a
1395 	 * holdcnt on it before we get it on both the vnode list
1396 	 * and the active vnode list. The mount mutex protects only
1397 	 * manipulation of the vnode list and the vnode freelist
1398 	 * mutex protects only manipulation of the active vnode list.
1399 	 * Hence the need to hold the vnode interlock throughout.
1400 	 */
1401 	MNT_ILOCK(mp);
1402 	VI_LOCK(vp);
1403 	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1404 	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1405 	    mp->mnt_nvnodelistsize == 0)) &&
1406 	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1407 		VI_UNLOCK(vp);
1408 		MNT_IUNLOCK(mp);
1409 		if (dtr != NULL)
1410 			dtr(vp, dtr_arg);
1411 		return (EBUSY);
1412 	}
1413 	vp->v_mount = mp;
1414 	MNT_REF(mp);
1415 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1416 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1417 		("neg mount point vnode list size"));
1418 	mp->mnt_nvnodelistsize++;
1419 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1420 	    ("Activating already active vnode"));
1421 	vp->v_iflag |= VI_ACTIVE;
1422 	mtx_lock(&vnode_free_list_mtx);
1423 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1424 	mp->mnt_activevnodelistsize++;
1425 	mtx_unlock(&vnode_free_list_mtx);
1426 	VI_UNLOCK(vp);
1427 	MNT_IUNLOCK(mp);
1428 	return (0);
1429 }
1430 
1431 int
1432 insmntque(struct vnode *vp, struct mount *mp)
1433 {
1434 
1435 	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1436 }
1437 
1438 /*
1439  * Flush out and invalidate all buffers associated with a bufobj
1440  * Called with the underlying object locked.
1441  */
1442 int
1443 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1444 {
1445 	int error;
1446 
1447 	BO_LOCK(bo);
1448 	if (flags & V_SAVE) {
1449 		error = bufobj_wwait(bo, slpflag, slptimeo);
1450 		if (error) {
1451 			BO_UNLOCK(bo);
1452 			return (error);
1453 		}
1454 		if (bo->bo_dirty.bv_cnt > 0) {
1455 			BO_UNLOCK(bo);
1456 			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1457 				return (error);
1458 			/*
1459 			 * XXX We could save a lock/unlock if this was only
1460 			 * enabled under INVARIANTS
1461 			 */
1462 			BO_LOCK(bo);
1463 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1464 				panic("vinvalbuf: dirty bufs");
1465 		}
1466 	}
1467 	/*
1468 	 * If you alter this loop please notice that interlock is dropped and
1469 	 * reacquired in flushbuflist.  Special care is needed to ensure that
1470 	 * no race conditions occur from this.
1471 	 */
1472 	do {
1473 		error = flushbuflist(&bo->bo_clean,
1474 		    flags, bo, slpflag, slptimeo);
1475 		if (error == 0 && !(flags & V_CLEANONLY))
1476 			error = flushbuflist(&bo->bo_dirty,
1477 			    flags, bo, slpflag, slptimeo);
1478 		if (error != 0 && error != EAGAIN) {
1479 			BO_UNLOCK(bo);
1480 			return (error);
1481 		}
1482 	} while (error != 0);
1483 
1484 	/*
1485 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1486 	 * have write I/O in-progress but if there is a VM object then the
1487 	 * VM object can also have read-I/O in-progress.
1488 	 */
1489 	do {
1490 		bufobj_wwait(bo, 0, 0);
1491 		BO_UNLOCK(bo);
1492 		if (bo->bo_object != NULL) {
1493 			VM_OBJECT_WLOCK(bo->bo_object);
1494 			vm_object_pip_wait(bo->bo_object, "bovlbx");
1495 			VM_OBJECT_WUNLOCK(bo->bo_object);
1496 		}
1497 		BO_LOCK(bo);
1498 	} while (bo->bo_numoutput > 0);
1499 	BO_UNLOCK(bo);
1500 
1501 	/*
1502 	 * Destroy the copy in the VM cache, too.
1503 	 */
1504 	if (bo->bo_object != NULL &&
1505 	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
1506 		VM_OBJECT_WLOCK(bo->bo_object);
1507 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1508 		    OBJPR_CLEANONLY : 0);
1509 		VM_OBJECT_WUNLOCK(bo->bo_object);
1510 	}
1511 
1512 #ifdef INVARIANTS
1513 	BO_LOCK(bo);
1514 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
1515 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1516 		panic("vinvalbuf: flush failed");
1517 	BO_UNLOCK(bo);
1518 #endif
1519 	return (0);
1520 }
1521 
1522 /*
1523  * Flush out and invalidate all buffers associated with a vnode.
1524  * Called with the underlying object locked.
1525  */
1526 int
1527 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1528 {
1529 
1530 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1531 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1532 	if (vp->v_object != NULL && vp->v_object->handle != vp)
1533 		return (0);
1534 	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1535 }
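/*
 * Illustrative usage sketch (editorial addition, not compiled): a filesystem
 * revoking or truncating a vnode flushes its buffers with the vnode locked
 *
 *	ASSERT_VOP_LOCKED(vp, "example");
 *	error = vinvalbuf(vp, V_SAVE, 0, 0);
 *
 * V_SAVE writes dirty buffers out first; with flags of 0 they are simply
 * discarded.
 */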
1536 
1537 /*
1538  * Flush out buffers on the specified list.
1539  *
1540  */
1541 static int
1542 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1543     int slptimeo)
1544 {
1545 	struct buf *bp, *nbp;
1546 	int retval, error;
1547 	daddr_t lblkno;
1548 	b_xflags_t xflags;
1549 
1550 	ASSERT_BO_WLOCKED(bo);
1551 
1552 	retval = 0;
1553 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1554 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1555 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1556 			continue;
1557 		}
1558 		lblkno = 0;
1559 		xflags = 0;
1560 		if (nbp != NULL) {
1561 			lblkno = nbp->b_lblkno;
1562 			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
1563 		}
1564 		retval = EAGAIN;
1565 		error = BUF_TIMELOCK(bp,
1566 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
1567 		    "flushbuf", slpflag, slptimeo);
1568 		if (error) {
1569 			BO_LOCK(bo);
1570 			return (error != ENOLCK ? error : EAGAIN);
1571 		}
1572 		KASSERT(bp->b_bufobj == bo,
1573 		    ("bp %p wrong b_bufobj %p should be %p",
1574 		    bp, bp->b_bufobj, bo));
1575 		/*
1576 		 * XXX Since there are no node locks for NFS, I
1577 		 * believe there is a slight chance that a delayed
1578 		 * write will occur while sleeping just above, so
1579 		 * check for it.
1580 		 */
1581 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1582 		    (flags & V_SAVE)) {
1583 			bremfree(bp);
1584 			bp->b_flags |= B_ASYNC;
1585 			bwrite(bp);
1586 			BO_LOCK(bo);
1587 			return (EAGAIN);	/* XXX: why not loop ? */
1588 		}
1589 		bremfree(bp);
1590 		bp->b_flags |= (B_INVAL | B_RELBUF);
1591 		bp->b_flags &= ~B_ASYNC;
1592 		brelse(bp);
1593 		BO_LOCK(bo);
1594 		if (nbp != NULL &&
1595 		    (nbp->b_bufobj != bo ||
1596 		     nbp->b_lblkno != lblkno ||
1597 		     (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1598 			break;			/* nbp invalid */
1599 	}
1600 	return (retval);
1601 }
1602 
1603 /*
1604  * Truncate a file's buffer and pages to a specified length.  This
1605  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1606  * sync activity.
1607  */
1608 int
1609 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1610 {
1611 	struct buf *bp, *nbp;
1612 	int anyfreed;
1613 	int trunclbn;
1614 	struct bufobj *bo;
1615 
1616 	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1617 	    vp, cred, blksize, (uintmax_t)length);
1618 
1619 	/*
1620 	 * Round up to the *next* lbn.
1621 	 */
1622 	trunclbn = (length + blksize - 1) / blksize;
1623 
1624 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1625 restart:
1626 	bo = &vp->v_bufobj;
1627 	BO_LOCK(bo);
1628 	anyfreed = 1;
1629 	for (;anyfreed;) {
1630 		anyfreed = 0;
1631 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1632 			if (bp->b_lblkno < trunclbn)
1633 				continue;
1634 			if (BUF_LOCK(bp,
1635 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1636 			    BO_LOCKPTR(bo)) == ENOLCK)
1637 				goto restart;
1638 
1639 			bremfree(bp);
1640 			bp->b_flags |= (B_INVAL | B_RELBUF);
1641 			bp->b_flags &= ~B_ASYNC;
1642 			brelse(bp);
1643 			anyfreed = 1;
1644 
1645 			BO_LOCK(bo);
1646 			if (nbp != NULL &&
1647 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1648 			    (nbp->b_vp != vp) ||
1649 			    (nbp->b_flags & B_DELWRI))) {
1650 				BO_UNLOCK(bo);
1651 				goto restart;
1652 			}
1653 		}
1654 
1655 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1656 			if (bp->b_lblkno < trunclbn)
1657 				continue;
1658 			if (BUF_LOCK(bp,
1659 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1660 			    BO_LOCKPTR(bo)) == ENOLCK)
1661 				goto restart;
1662 			bremfree(bp);
1663 			bp->b_flags |= (B_INVAL | B_RELBUF);
1664 			bp->b_flags &= ~B_ASYNC;
1665 			brelse(bp);
1666 			anyfreed = 1;
1667 
1668 			BO_LOCK(bo);
1669 			if (nbp != NULL &&
1670 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1671 			    (nbp->b_vp != vp) ||
1672 			    (nbp->b_flags & B_DELWRI) == 0)) {
1673 				BO_UNLOCK(bo);
1674 				goto restart;
1675 			}
1676 		}
1677 	}
1678 
1679 	if (length > 0) {
1680 restartsync:
1681 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1682 			if (bp->b_lblkno > 0)
1683 				continue;
1684 			/*
1685 			 * Since we hold the vnode lock, this should only
1686 			 * fail if we're racing with the buf daemon.
1687 			 */
1688 			if (BUF_LOCK(bp,
1689 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1690 			    BO_LOCKPTR(bo)) == ENOLCK) {
1691 				goto restart;
1692 			}
1693 			VNASSERT((bp->b_flags & B_DELWRI), vp,
1694 			    ("buf(%p) on dirty queue without DELWRI", bp));
1695 
1696 			bremfree(bp);
1697 			bawrite(bp);
1698 			BO_LOCK(bo);
1699 			goto restartsync;
1700 		}
1701 	}
1702 
1703 	bufobj_wwait(bo, 0, 0);
1704 	BO_UNLOCK(bo);
1705 	vnode_pager_setsize(vp, length);
1706 
1707 	return (0);
1708 }
1709 
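/*
 * Worked example of the trunclbn computation in vtruncbuf() above: it is a
 * ceiling division, so with blksize 16384 and length 20000,
 * trunclbn = (20000 + 16383) / 16384 = 2.  Logical blocks 0 and 1, which
 * still hold valid data, are kept, and every buffer with b_lblkno >= 2 is
 * invalidated.
 */
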
1710 static void
1711 buf_vlist_remove(struct buf *bp)
1712 {
1713 	struct bufv *bv;
1714 
1715 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1716 	ASSERT_BO_WLOCKED(bp->b_bufobj);
1717 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1718 	    (BX_VNDIRTY|BX_VNCLEAN),
1719 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1720 	if (bp->b_xflags & BX_VNDIRTY)
1721 		bv = &bp->b_bufobj->bo_dirty;
1722 	else
1723 		bv = &bp->b_bufobj->bo_clean;
1724 	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
1725 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1726 	bv->bv_cnt--;
1727 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1728 }
1729 
1730 /*
1731  * Add the buffer to the sorted clean or dirty block list.
1732  *
1733  * NOTE: xflags is passed as a constant, optimizing this inline function!
1734  */
1735 static void
1736 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1737 {
1738 	struct bufv *bv;
1739 	struct buf *n;
1740 	int error;
1741 
1742 	ASSERT_BO_WLOCKED(bo);
1743 	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
1744 	    ("dead bo %p", bo));
1745 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1746 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1747 	bp->b_xflags |= xflags;
1748 	if (xflags & BX_VNDIRTY)
1749 		bv = &bo->bo_dirty;
1750 	else
1751 		bv = &bo->bo_clean;
1752 
1753 	/*
1754 	 * Keep the list ordered.  Optimize empty list insertion.  Assume
1755 	 * we tend to grow at the tail so lookup_le should usually be cheaper
1756 	 * than _ge.
1757 	 */
1758 	if (bv->bv_cnt == 0 ||
1759 	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
1760 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1761 	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
1762 		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
1763 	else
1764 		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
1765 	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
1766 	if (error)
1767 		panic("buf_vlist_add:  Preallocated nodes insufficient.");
1768 	bv->bv_cnt++;
1769 }
1770 
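/*
 * Worked example of the ordered insert above: suppose the clean list holds
 * buffers at lblkno 2, 5 and 9.  Inserting lblkno 12 takes the fast
 * TAILQ_INSERT_TAIL path; inserting lblkno 1 finds no predecessor via
 * BUF_PCTRIE_LOOKUP_LE and goes to the head; inserting lblkno 7 finds the
 * buffer at lblkno 5 and is placed immediately after it.
 */
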
1771 /*
1772  * Look up a buffer using the buffer tries.
1773  */
1774 struct buf *
1775 gbincore(struct bufobj *bo, daddr_t lblkno)
1776 {
1777 	struct buf *bp;
1778 
1779 	ASSERT_BO_LOCKED(bo);
1780 	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
1781 	if (bp != NULL)
1782 		return (bp);
1783 	return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
1784 }
1785 
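/*
 * Minimal usage sketch for gbincore(): the caller must hold the bufobj
 * lock across the lookup.  The helper name below is invented for
 * illustration only.
 */
#if 0
static bool
example_blk_incore(struct vnode *vp, daddr_t lblkno)
{
	struct bufobj *bo;
	bool incore;

	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	incore = (gbincore(bo, lblkno) != NULL);
	BO_UNLOCK(bo);
	return (incore);
}
#endif
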
1786 /*
1787  * Associate a buffer with a vnode.
1788  */
1789 void
1790 bgetvp(struct vnode *vp, struct buf *bp)
1791 {
1792 	struct bufobj *bo;
1793 
1794 	bo = &vp->v_bufobj;
1795 	ASSERT_BO_WLOCKED(bo);
1796 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1797 
1798 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1799 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1800 	    ("bgetvp: bp already attached! %p", bp));
1801 
1802 	vhold(vp);
1803 	bp->b_vp = vp;
1804 	bp->b_bufobj = bo;
1805 	/*
1806 	 * Insert onto list for new vnode.
1807 	 */
1808 	buf_vlist_add(bp, bo, BX_VNCLEAN);
1809 }
1810 
1811 /*
1812  * Disassociate a buffer from a vnode.
1813  */
1814 void
1815 brelvp(struct buf *bp)
1816 {
1817 	struct bufobj *bo;
1818 	struct vnode *vp;
1819 
1820 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1821 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1822 
1823 	/*
1824 	 * Delete from old vnode list, if on one.
1825 	 */
1826 	vp = bp->b_vp;		/* XXX */
1827 	bo = bp->b_bufobj;
1828 	BO_LOCK(bo);
1829 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1830 		buf_vlist_remove(bp);
1831 	else
1832 		panic("brelvp: Buffer %p not on queue.", bp);
1833 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1834 		bo->bo_flag &= ~BO_ONWORKLST;
1835 		mtx_lock(&sync_mtx);
1836 		LIST_REMOVE(bo, bo_synclist);
1837 		syncer_worklist_len--;
1838 		mtx_unlock(&sync_mtx);
1839 	}
1840 	bp->b_vp = NULL;
1841 	bp->b_bufobj = NULL;
1842 	BO_UNLOCK(bo);
1843 	vdrop(vp);
1844 }
1845 
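/*
 * Minimal pairing sketch for bgetvp()/brelvp(): bgetvp() requires the
 * bufobj write lock and a buffer not yet attached to any vnode, while
 * brelvp() takes the lock itself.  The helper name is invented for
 * illustration only.
 */
#if 0
static void
example_attach_buf(struct vnode *vp, struct buf *bp)
{

	BO_LOCK(&vp->v_bufobj);
	bgetvp(vp, bp);		/* holds vp, places bp on the clean list */
	BO_UNLOCK(&vp->v_bufobj);

	/* ... later, when the buffer is being torn down ... */
	brelvp(bp);		/* removes bp from its list, drops the hold */
}
#endif
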
1846 /*
1847  * Add an item to the syncer work queue.
1848  */
1849 static void
1850 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1851 {
1852 	int slot;
1853 
1854 	ASSERT_BO_WLOCKED(bo);
1855 
1856 	mtx_lock(&sync_mtx);
1857 	if (bo->bo_flag & BO_ONWORKLST)
1858 		LIST_REMOVE(bo, bo_synclist);
1859 	else {
1860 		bo->bo_flag |= BO_ONWORKLST;
1861 		syncer_worklist_len++;
1862 	}
1863 
1864 	if (delay > syncer_maxdelay - 2)
1865 		delay = syncer_maxdelay - 2;
1866 	slot = (syncer_delayno + delay) & syncer_mask;
1867 
1868 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1869 	mtx_unlock(&sync_mtx);
1870 }
1871 
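/*
 * Worked example of the slot computation above, assuming the usual
 * power-of-two wheel where syncer_mask == syncer_maxdelay - 1: with
 * syncer_maxdelay 32, syncer_delayno 30 and a requested delay of 6, the
 * bufobj lands in slot (30 + 6) & 31 == 4 and is visited six wheel
 * advances (roughly six seconds) from now.  Requests larger than
 * syncer_maxdelay - 2 are clamped first.
 */
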
1872 static int
1873 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1874 {
1875 	int error, len;
1876 
1877 	mtx_lock(&sync_mtx);
1878 	len = syncer_worklist_len - sync_vnode_count;
1879 	mtx_unlock(&sync_mtx);
1880 	error = SYSCTL_OUT(req, &len, sizeof(len));
1881 	return (error);
1882 }
1883 
1884 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1885     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1886 
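/*
 * The handler above exports the number of non-syncer vnodes on the
 * worklist; from userland it can be read with, e.g.,
 * "sysctl vfs.worklist_len".
 */
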
1887 static struct proc *updateproc;
1888 static void sched_sync(void);
1889 static struct kproc_desc up_kp = {
1890 	"syncer",
1891 	sched_sync,
1892 	&updateproc
1893 };
1894 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1895 
1896 static int
1897 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1898 {
1899 	struct vnode *vp;
1900 	struct mount *mp;
1901 
1902 	*bo = LIST_FIRST(slp);
1903 	if (*bo == NULL)
1904 		return (0);
1905 	vp = (*bo)->__bo_vnode;	/* XXX */
1906 	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1907 		return (1);
1908 	/*
1909 	 * We use vhold in case the vnode does not
1910 	 * successfully sync.  vhold prevents the vnode from
1911 	 * going away when we unlock the sync_mtx so that
1912 	 * we can acquire the vnode interlock.
1913 	 */
1914 	vholdl(vp);
1915 	mtx_unlock(&sync_mtx);
1916 	VI_UNLOCK(vp);
1917 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1918 		vdrop(vp);
1919 		mtx_lock(&sync_mtx);
1920 		return (*bo == LIST_FIRST(slp));
1921 	}
1922 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1923 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1924 	VOP_UNLOCK(vp, 0);
1925 	vn_finished_write(mp);
1926 	BO_LOCK(*bo);
1927 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1928 		/*
1929 		 * Put us back on the worklist.  The worklist
1930 		 * routine will remove us from our current
1931 		 * position and then add us back in at a later
1932 		 * position.
1933 		 */
1934 		vn_syncer_add_to_worklist(*bo, syncdelay);
1935 	}
1936 	BO_UNLOCK(*bo);
1937 	vdrop(vp);
1938 	mtx_lock(&sync_mtx);
1939 	return (0);
1940 }
1941 
1942 static int first_printf = 1;
1943 
1944 /*
1945  * System filesystem synchronizer daemon.
1946  */
1947 static void
1948 sched_sync(void)
1949 {
1950 	struct synclist *next, *slp;
1951 	struct bufobj *bo;
1952 	long starttime;
1953 	struct thread *td = curthread;
1954 	int last_work_seen;
1955 	int net_worklist_len;
1956 	int syncer_final_iter;
1957 	int error;
1958 
1959 	last_work_seen = 0;
1960 	syncer_final_iter = 0;
1961 	syncer_state = SYNCER_RUNNING;
1962 	starttime = time_uptime;
1963 	td->td_pflags |= TDP_NORUNNINGBUF;
1964 
1965 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1966 	    SHUTDOWN_PRI_LAST);
1967 
1968 	mtx_lock(&sync_mtx);
1969 	for (;;) {
1970 		if (syncer_state == SYNCER_FINAL_DELAY &&
1971 		    syncer_final_iter == 0) {
1972 			mtx_unlock(&sync_mtx);
1973 			kproc_suspend_check(td->td_proc);
1974 			mtx_lock(&sync_mtx);
1975 		}
1976 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1977 		if (syncer_state != SYNCER_RUNNING &&
1978 		    starttime != time_uptime) {
1979 			if (first_printf) {
1980 				printf("\nSyncing disks, vnodes remaining...");
1981 				first_printf = 0;
1982 			}
1983 			printf("%d ", net_worklist_len);
1984 		}
1985 		starttime = time_uptime;
1986 
1987 		/*
1988 		 * Push files whose dirty time has expired.  Be careful
1989 		 * of interrupt race on slp queue.
1990 		 *
1991 		 * Skip over empty worklist slots when shutting down.
1992 		 */
1993 		do {
1994 			slp = &syncer_workitem_pending[syncer_delayno];
1995 			syncer_delayno += 1;
1996 			if (syncer_delayno == syncer_maxdelay)
1997 				syncer_delayno = 0;
1998 			next = &syncer_workitem_pending[syncer_delayno];
1999 			/*
2000 			 * If the worklist has wrapped since it was
2001 			 * emptied of all but syncer vnodes,
2002 			 * switch to the FINAL_DELAY state and run
2003 			 * for one more second.
2004 			 */
2005 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
2006 			    net_worklist_len == 0 &&
2007 			    last_work_seen == syncer_delayno) {
2008 				syncer_state = SYNCER_FINAL_DELAY;
2009 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
2010 			}
2011 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
2012 		    syncer_worklist_len > 0);
2013 
2014 		/*
2015 		 * Keep track of the last time there was anything
2016 		 * on the worklist other than syncer vnodes.
2017 		 * Return to the SHUTTING_DOWN state if any
2018 		 * new work appears.
2019 		 */
2020 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
2021 			last_work_seen = syncer_delayno;
2022 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
2023 			syncer_state = SYNCER_SHUTTING_DOWN;
2024 		while (!LIST_EMPTY(slp)) {
2025 			error = sync_vnode(slp, &bo, td);
2026 			if (error == 1) {
2027 				LIST_REMOVE(bo, bo_synclist);
2028 				LIST_INSERT_HEAD(next, bo, bo_synclist);
2029 				continue;
2030 			}
2031 
2032 			if (first_printf == 0) {
2033 				/*
2034 				 * Drop the sync mutex, because some watchdog
2035 				 * drivers need to sleep while patting the watchdog.
2036 				 */
2037 				mtx_unlock(&sync_mtx);
2038 				wdog_kern_pat(WD_LASTVAL);
2039 				mtx_lock(&sync_mtx);
2040 			}
2041 
2042 		}
2043 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
2044 			syncer_final_iter--;
2045 		/*
2046 		 * The variable rushjob allows the kernel to speed up the
2047 		 * processing of the filesystem syncer process. A rushjob
2048 		 * value of N tells the filesystem syncer to process the next
2049 		 * N seconds worth of work on its queue ASAP. Currently rushjob
2050 		 * is used by the soft update code to speed up the filesystem
2051 		 * syncer process when the incore state is getting so far
2052 		 * ahead of the disk that the kernel memory pool is being
2053 		 * threatened with exhaustion.
2054 		 */
2055 		if (rushjob > 0) {
2056 			rushjob -= 1;
2057 			continue;
2058 		}
2059 		/*
2060 		 * Just sleep for a short period of time between
2061 		 * iterations when shutting down to allow some I/O
2062 		 * to happen.
2063 		 *
2064 		 * If it has taken us less than a second to process the
2065 		 * current work, then wait. Otherwise start right over
2066 		 * again. We can still lose time if any single round
2067 		 * takes more than two seconds, but it does not really
2068 		 * matter as we are just trying to generally pace the
2069 		 * filesystem activity.
2070 		 */
2071 		if (syncer_state != SYNCER_RUNNING ||
2072 		    time_uptime == starttime) {
2073 			thread_lock(td);
2074 			sched_prio(td, PPAUSE);
2075 			thread_unlock(td);
2076 		}
2077 		if (syncer_state != SYNCER_RUNNING)
2078 			cv_timedwait(&sync_wakeup, &sync_mtx,
2079 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
2080 		else if (time_uptime == starttime)
2081 			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
2082 	}
2083 }
2084 
2085 /*
2086  * Request the syncer daemon to speed up its work.
2087  * We never push it to speed up more than half of its
2088  * normal turn time; otherwise it could take over the CPU.
2089  */
2090 int
2091 speedup_syncer(void)
2092 {
2093 	int ret = 0;
2094 
2095 	mtx_lock(&sync_mtx);
2096 	if (rushjob < syncdelay / 2) {
2097 		rushjob += 1;
2098 		stat_rush_requests += 1;
2099 		ret = 1;
2100 	}
2101 	mtx_unlock(&sync_mtx);
2102 	cv_broadcast(&sync_wakeup);
2103 	return (ret);
2104 }
2105 
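/*
 * Minimal usage sketch for speedup_syncer(): the return value is 1 if the
 * request was accepted and 0 if the syncer is already rushed as far as
 * allowed.  The helper name is invented for illustration only.
 */
#if 0
static void
example_rush_syncer(void)
{

	if (speedup_syncer() == 0)
		printf("syncer already running at maximum speed-up\n");
}
#endif
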
2106 /*
2107  * Tell the syncer to speed up its work and run through its work
2108  * list several times, then tell it to shut down.
2109  */
2110 static void
2111 syncer_shutdown(void *arg, int howto)
2112 {
2113 
2114 	if (howto & RB_NOSYNC)
2115 		return;
2116 	mtx_lock(&sync_mtx);
2117 	syncer_state = SYNCER_SHUTTING_DOWN;
2118 	rushjob = 0;
2119 	mtx_unlock(&sync_mtx);
2120 	cv_broadcast(&sync_wakeup);
2121 	kproc_shutdown(arg, howto);
2122 }
2123 
2124 void
2125 syncer_suspend(void)
2126 {
2127 
2128 	syncer_shutdown(updateproc, 0);
2129 }
2130 
2131 void
2132 syncer_resume(void)
2133 {
2134 
2135 	mtx_lock(&sync_mtx);
2136 	first_printf = 1;
2137 	syncer_state = SYNCER_RUNNING;
2138 	mtx_unlock(&sync_mtx);
2139 	cv_broadcast(&sync_wakeup);
2140 	kproc_resume(updateproc);
2141 }
2142 
2143 /*
2144  * Reassign a buffer from one vnode to another.
2145  * Used to assign file-specific control information
2146  * (indirect blocks) to the vnode to which they belong.
2147  */
2148 void
2149 reassignbuf(struct buf *bp)
2150 {
2151 	struct vnode *vp;
2152 	struct bufobj *bo;
2153 	int delay;
2154 #ifdef INVARIANTS
2155 	struct bufv *bv;
2156 #endif
2157 
2158 	vp = bp->b_vp;
2159 	bo = bp->b_bufobj;
2160 	++reassignbufcalls;
2161 
2162 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2163 	    bp, bp->b_vp, bp->b_flags);
2164 	/*
2165 	 * B_PAGING flagged buffers cannot be reassigned because their vp
2166 	 * is not fully linked in.
2167 	 */
2168 	if (bp->b_flags & B_PAGING)
2169 		panic("cannot reassign paging buffer");
2170 
2171 	/*
2172 	 * Delete from old vnode list, if on one.
2173 	 */
2174 	BO_LOCK(bo);
2175 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2176 		buf_vlist_remove(bp);
2177 	else
2178 		panic("reassignbuf: Buffer %p not on queue.", bp);
2179 	/*
2180 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
2181 	 * of clean buffers.
2182 	 */
2183 	if (bp->b_flags & B_DELWRI) {
2184 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2185 			switch (vp->v_type) {
2186 			case VDIR:
2187 				delay = dirdelay;
2188 				break;
2189 			case VCHR:
2190 				delay = metadelay;
2191 				break;
2192 			default:
2193 				delay = filedelay;
2194 			}
2195 			vn_syncer_add_to_worklist(bo, delay);
2196 		}
2197 		buf_vlist_add(bp, bo, BX_VNDIRTY);
2198 	} else {
2199 		buf_vlist_add(bp, bo, BX_VNCLEAN);
2200 
2201 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2202 			mtx_lock(&sync_mtx);
2203 			LIST_REMOVE(bo, bo_synclist);
2204 			syncer_worklist_len--;
2205 			mtx_unlock(&sync_mtx);
2206 			bo->bo_flag &= ~BO_ONWORKLST;
2207 		}
2208 	}
2209 #ifdef INVARIANTS
2210 	bv = &bo->bo_clean;
2211 	bp = TAILQ_FIRST(&bv->bv_hd);
2212 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2213 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2214 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2215 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2216 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2217 	bv = &bo->bo_dirty;
2218 	bp = TAILQ_FIRST(&bv->bv_hd);
2219 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2220 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2221 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2222 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2223 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2224 #endif
2225 	BO_UNLOCK(bo);
2226 }
2227 
2228 /*
2229  * A temporary hack until refcount_* APIs are sorted out.
2230  */
2231 static __inline int
2232 vfs_refcount_acquire_if_not_zero(volatile u_int *count)
2233 {
2234 	u_int old;
2235 
2236 	for (;;) {
2237 		old = *count;
2238 		if (old == 0)
2239 			return (0);
2240 		if (atomic_cmpset_int(count, old, old + 1))
2241 			return (1);
2242 	}
2243 }
2244 
2245 static __inline int
2246 vfs_refcount_release_if_not_last(volatile u_int *count)
2247 {
2248 	u_int old;
2249 
2250 	for (;;) {
2251 		old = *count;
2252 		if (old == 1)
2253 			return (0);
2254 		if (atomic_cmpset_int(count, old, old - 1))
2255 			return (1);
2256 	}
2257 }
2258 
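/*
 * Example of the helpers above: if *count is 3, the acquire variant
 * compare-and-swaps 3 -> 4 and returns 1; if *count is 0 it returns 0 and
 * the caller must fall back to the vnode interlock, as v_incr_usecount()
 * below does.  The release variant is symmetric and refuses to perform
 * the final 1 -> 0 transition without the interlock.
 */
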
2259 static void
2260 v_init_counters(struct vnode *vp)
2261 {
2262 
2263 	VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
2264 	    vp, ("%s called for an initialized vnode", __FUNCTION__));
2265 	ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
2266 
2267 	refcount_init(&vp->v_holdcnt, 1);
2268 	refcount_init(&vp->v_usecount, 1);
2269 }
2270 
2271 /*
2272  * Increment the use and hold counts on the vnode, taking care to reference
2273  * the driver's usecount if this is a chardev.  The _vhold() will remove
2274  * the vnode from the free list if it is presently free.
2275  */
2276 static void
2277 v_incr_usecount(struct vnode *vp)
2278 {
2279 
2280 	ASSERT_VI_UNLOCKED(vp, __func__);
2281 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2282 
2283 	if (vp->v_type == VCHR) {
2284 		VI_LOCK(vp);
2285 		_vhold(vp, true);
2286 		if (vp->v_iflag & VI_OWEINACT) {
2287 			VNASSERT(vp->v_usecount == 0, vp,
2288 			    ("vnode with usecount and VI_OWEINACT set"));
2289 			vp->v_iflag &= ~VI_OWEINACT;
2290 		}
2291 		refcount_acquire(&vp->v_usecount);
2292 		v_incr_devcount(vp);
2293 		VI_UNLOCK(vp);
2294 		return;
2295 	}
2296 
2297 	_vhold(vp, false);
2298 	if (vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
2299 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2300 		    ("vnode with usecount and VI_OWEINACT set"));
2301 	} else {
2302 		VI_LOCK(vp);
2303 		if (vp->v_iflag & VI_OWEINACT)
2304 			vp->v_iflag &= ~VI_OWEINACT;
2305 		refcount_acquire(&vp->v_usecount);
2306 		VI_UNLOCK(vp);
2307 	}
2308 }
2309 
2310 /*
2311  * Increment si_usecount of the associated device, if any.
2312  */
2313 static void
2314 v_incr_devcount(struct vnode *vp)
2315 {
2316 
2317 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2318 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2319 		dev_lock();
2320 		vp->v_rdev->si_usecount++;
2321 		dev_unlock();
2322 	}
2323 }
2324 
2325 /*
2326  * Decrement si_usecount of the associated device, if any.
2327  */
2328 static void
2329 v_decr_devcount(struct vnode *vp)
2330 {
2331 
2332 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2333 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2334 		dev_lock();
2335 		vp->v_rdev->si_usecount--;
2336 		dev_unlock();
2337 	}
2338 }
2339 
2340 /*
2341  * Grab a particular vnode from the free list, increment its
2342  * reference count and lock it.  VI_DOOMED is set if the vnode
2343  * is being destroyed.  Only callers who specify LK_RETRY will
2344  * see doomed vnodes.  If inactive processing was delayed in
2345  * vput try to do it here.
2346  *
2347  * Notes on lockless counter manipulation:
2348  * _vhold, vputx and other routines make various decisions based
2349  * on either holdcnt or usecount being 0. As long as either counter
2350  * is not transitioning 0->1 nor 1->0, the manipulation can be done
2351  * with atomic operations. Otherwise the interlock is taken.
2352  */
2353 int
2354 vget(struct vnode *vp, int flags, struct thread *td)
2355 {
2356 	int error, oweinact;
2357 
2358 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2359 	    ("vget: invalid lock operation"));
2360 
2361 	if ((flags & LK_INTERLOCK) != 0)
2362 		ASSERT_VI_LOCKED(vp, __func__);
2363 	else
2364 		ASSERT_VI_UNLOCKED(vp, __func__);
2365 	if ((flags & LK_VNHELD) != 0)
2366 		VNASSERT((vp->v_holdcnt > 0), vp,
2367 		    ("vget: LK_VNHELD passed but vnode not held"));
2368 
2369 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2370 
2371 	if ((flags & LK_VNHELD) == 0)
2372 		_vhold(vp, (flags & LK_INTERLOCK) != 0);
2373 
2374 	if ((error = vn_lock(vp, flags)) != 0) {
2375 		vdrop(vp);
2376 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2377 		    vp);
2378 		return (error);
2379 	}
2380 	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2381 		panic("vget: vn_lock failed to return ENOENT\n");
2382 	/*
2383 	 * We don't guarantee that any particular close will
2384 	 * trigger inactive processing so just make a best effort
2385 	 * here at preventing a reference to a removed file.  If
2386 	 * we don't succeed no harm is done.
2387 	 *
2388 	 * Upgrade our holdcnt to a usecount.
2389 	 */
2390 	if (vp->v_type != VCHR &&
2391 	    vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
2392 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2393 		    ("vnode with usecount and VI_OWEINACT set"));
2394 	} else {
2395 		VI_LOCK(vp);
2396 		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2397 			oweinact = 0;
2398 		} else {
2399 			oweinact = 1;
2400 			vp->v_iflag &= ~VI_OWEINACT;
2401 		}
2402 		refcount_acquire(&vp->v_usecount);
2403 		v_incr_devcount(vp);
2404 		if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2405 		    (flags & LK_NOWAIT) == 0)
2406 			vinactive(vp, td);
2407 		VI_UNLOCK(vp);
2408 	}
2409 	return (0);
2410 }
2411 
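/*
 * Minimal usage sketch for vget(): a caller that found vp under the vnode
 * interlock passes LK_INTERLOCK so that vget() releases the interlock for
 * it.  The helper name is invented for illustration only.
 */
#if 0
static int
example_ref_and_lock(struct vnode *vp)
{
	int error;

	VI_LOCK(vp);
	error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread);
	if (error != 0)
		return (error);		/* interlock already released */
	/* ... use the exclusively locked, referenced vnode ... */
	vput(vp);			/* unlock and release in one call */
	return (0);
}
#endif
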
2412 /*
2413  * Increase the reference count of a vnode.
2414  */
2415 void
2416 vref(struct vnode *vp)
2417 {
2418 
2419 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2420 	v_incr_usecount(vp);
2421 }
2422 
2423 /*
2424  * Return reference count of a vnode.
2425  *
2426  * The results of this call are only guaranteed when some mechanism is used to
2427  * stop other processes from gaining references to the vnode.  This may be the
2428  * case if the caller holds the only reference.  This is also useful when stale
2429  * data is acceptable as race conditions may be accounted for by some other
2430  * means.
2431  */
2432 int
2433 vrefcnt(struct vnode *vp)
2434 {
2435 
2436 	return (vp->v_usecount);
2437 }
2438 
2439 #define	VPUTX_VRELE	1
2440 #define	VPUTX_VPUT	2
2441 #define	VPUTX_VUNREF	3
2442 
2443 /*
2444  * Decrement the use and hold counts for a vnode.
2445  *
2446  * See an explanation near vget() as to why atomic operation is safe.
2447  */
2448 static void
2449 vputx(struct vnode *vp, int func)
2450 {
2451 	int error;
2452 
2453 	KASSERT(vp != NULL, ("vputx: null vp"));
2454 	if (func == VPUTX_VUNREF)
2455 		ASSERT_VOP_LOCKED(vp, "vunref");
2456 	else if (func == VPUTX_VPUT)
2457 		ASSERT_VOP_LOCKED(vp, "vput");
2458 	else
2459 		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2460 	ASSERT_VI_UNLOCKED(vp, __func__);
2461 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2462 
2463 	if (vp->v_type != VCHR &&
2464 	    vfs_refcount_release_if_not_last(&vp->v_usecount)) {
2465 		if (func == VPUTX_VPUT)
2466 			VOP_UNLOCK(vp, 0);
2467 		vdrop(vp);
2468 		return;
2469 	}
2470 
2471 	VI_LOCK(vp);
2472 
2473 	/*
2474 	 * We want to hold the vnode until the inactive finishes to
2475 	 * prevent vgone() races.  We drop the use count here and the
2476 	 * hold count below when we're done.
2477 	 */
2478 	if (!refcount_release(&vp->v_usecount) ||
2479 	    (vp->v_iflag & VI_DOINGINACT)) {
2480 		if (func == VPUTX_VPUT)
2481 			VOP_UNLOCK(vp, 0);
2482 		v_decr_devcount(vp);
2483 		vdropl(vp);
2484 		return;
2485 	}
2486 
2487 	v_decr_devcount(vp);
2488 
2489 	error = 0;
2490 
2491 	if (vp->v_usecount != 0) {
2492 		vprint("vputx: usecount not zero", vp);
2493 		panic("vputx: usecount not zero");
2494 	}
2495 
2496 	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2497 
2498 	/*
2499 	 * We must call VOP_INACTIVE with the node locked. Mark
2500 	 * as VI_DOINGINACT to avoid recursion.
2501 	 */
2502 	vp->v_iflag |= VI_OWEINACT;
2503 	switch (func) {
2504 	case VPUTX_VRELE:
2505 		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2506 		VI_LOCK(vp);
2507 		break;
2508 	case VPUTX_VPUT:
2509 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2510 			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2511 			    LK_NOWAIT);
2512 			VI_LOCK(vp);
2513 		}
2514 		break;
2515 	case VPUTX_VUNREF:
2516 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2517 			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
2518 			VI_LOCK(vp);
2519 		}
2520 		break;
2521 	}
2522 	VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
2523 	    ("vnode with usecount and VI_OWEINACT set"));
2524 	if (error == 0) {
2525 		if (vp->v_iflag & VI_OWEINACT)
2526 			vinactive(vp, curthread);
2527 		if (func != VPUTX_VUNREF)
2528 			VOP_UNLOCK(vp, 0);
2529 	}
2530 	vdropl(vp);
2531 }
2532 
2533 /*
2534  * Vnode put/release.
2535  * If count drops to zero, call inactive routine and return to freelist.
2536  */
2537 void
2538 vrele(struct vnode *vp)
2539 {
2540 
2541 	vputx(vp, VPUTX_VRELE);
2542 }
2543 
2544 /*
2545  * Release an already locked vnode.  This gives the same effect as
2546  * unlock+vrele(), but takes less time and avoids releasing and
2547  * re-acquiring the lock (as vrele() acquires the lock internally).
2548  */
2549 void
2550 vput(struct vnode *vp)
2551 {
2552 
2553 	vputx(vp, VPUTX_VPUT);
2554 }
2555 
2556 /*
2557  * Release an exclusively locked vnode. Do not unlock the vnode lock.
2558  */
2559 void
2560 vunref(struct vnode *vp)
2561 {
2562 
2563 	vputx(vp, VPUTX_VUNREF);
2564 }
2565 
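/*
 * Sketch of how the three release wrappers above differ: vrele() expects
 * an unlocked vnode, vput() unlocks as part of the release, and vunref()
 * releases while leaving the lock held.  The helper name is invented for
 * illustration only.
 */
#if 0
static void
example_ref_release(struct vnode *vp)
{

	vref(vp);			/* usecount++ on an unlocked vnode */
	vrele(vp);			/* matching release, vnode stays unlocked */

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	vref(vp);
	vput(vp);			/* release and VOP_UNLOCK() in one call */
}
#endif
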
2566 /*
2567  * Increase the hold count and activate if this is the first reference.
2568  */
2569 void
2570 _vhold(struct vnode *vp, bool locked)
2571 {
2572 	struct mount *mp;
2573 
2574 	if (locked)
2575 		ASSERT_VI_LOCKED(vp, __func__);
2576 	else
2577 		ASSERT_VI_UNLOCKED(vp, __func__);
2578 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2579 	if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
2580 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2581 		    ("_vhold: vnode with holdcnt is free"));
2582 		return;
2583 	}
2584 
2585 	if (!locked)
2586 		VI_LOCK(vp);
2587 	if ((vp->v_iflag & VI_FREE) == 0) {
2588 		refcount_acquire(&vp->v_holdcnt);
2589 		if (!locked)
2590 			VI_UNLOCK(vp);
2591 		return;
2592 	}
2593 	VNASSERT(vp->v_holdcnt == 0, vp,
2594 	    ("%s: wrong hold count", __func__));
2595 	VNASSERT(vp->v_op != NULL, vp,
2596 	    ("%s: vnode already reclaimed.", __func__));
2597 	/*
2598 	 * Remove a vnode from the free list, mark it as in use,
2599 	 * and put it on the active list.
2600 	 */
2601 	mtx_lock(&vnode_free_list_mtx);
2602 	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2603 	freevnodes--;
2604 	vp->v_iflag &= ~(VI_FREE|VI_AGE);
2605 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2606 	    ("Activating already active vnode"));
2607 	vp->v_iflag |= VI_ACTIVE;
2608 	mp = vp->v_mount;
2609 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2610 	mp->mnt_activevnodelistsize++;
2611 	mtx_unlock(&vnode_free_list_mtx);
2612 	refcount_acquire(&vp->v_holdcnt);
2613 	if (!locked)
2614 		VI_UNLOCK(vp);
2615 }
2616 
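/*
 * Minimal sketch of the hold count in use: code that must drop its locks
 * and sleep while remembering a vnode pointer brackets the unsafe region
 * with a hold, as sync_vnode() above does.  The helper name is invented
 * for illustration only.
 */
#if 0
static void
example_remember_vnode(struct vnode *vp)
{

	VI_LOCK(vp);
	vholdl(vp);		/* interlock held: use the locked variant */
	VI_UNLOCK(vp);
	/* ... sleep or do I/O; vp cannot be recycled while held ... */
	vdrop(vp);		/* may return vp to the free list */
}
#endif
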
2617 /*
2618  * Drop the hold count of the vnode.  If this is the last reference to
2619  * the vnode we place it on the free list unless it has been vgone'd
2620  * (marked VI_DOOMED) in which case we will free it.
2621  */
2622 void
2623 _vdrop(struct vnode *vp, bool locked)
2624 {
2625 	struct bufobj *bo;
2626 	struct mount *mp;
2627 	int active;
2628 
2629 	if (locked)
2630 		ASSERT_VI_LOCKED(vp, __func__);
2631 	else
2632 		ASSERT_VI_UNLOCKED(vp, __func__);
2633 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2634 	if ((int)vp->v_holdcnt <= 0)
2635 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2636 	if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) {
2637 		if (locked)
2638 			VI_UNLOCK(vp);
2639 		return;
2640 	}
2641 
2642 	if (!locked)
2643 		VI_LOCK(vp);
2644 	if (refcount_release(&vp->v_holdcnt) == 0) {
2645 		VI_UNLOCK(vp);
2646 		return;
2647 	}
2648 	if ((vp->v_iflag & VI_DOOMED) == 0) {
2649 		/*
2650 		 * Mark a vnode as free: remove it from its active list
2651 		 * and put it up for recycling on the freelist.
2652 		 */
2653 		VNASSERT(vp->v_op != NULL, vp,
2654 		    ("vdropl: vnode already reclaimed."));
2655 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2656 		    ("vnode already free"));
2657 		VNASSERT(vp->v_holdcnt == 0, vp,
2658 		    ("vdropl: freeing when we shouldn't"));
2659 		active = vp->v_iflag & VI_ACTIVE;
2660 		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2661 			vp->v_iflag &= ~VI_ACTIVE;
2662 			mp = vp->v_mount;
2663 			mtx_lock(&vnode_free_list_mtx);
2664 			if (active) {
2665 				TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2666 				    v_actfreelist);
2667 				mp->mnt_activevnodelistsize--;
2668 			}
2669 			/* XXX V*AGE hasn't been set since 1997. */
2670 			if (vp->v_iflag & VI_AGE) {
2671 				TAILQ_INSERT_HEAD(&vnode_free_list, vp,
2672 				    v_actfreelist);
2673 			} else {
2674 				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
2675 				    v_actfreelist);
2676 			}
2677 			freevnodes++;
2678 			vp->v_iflag &= ~VI_AGE;
2679 			vp->v_iflag |= VI_FREE;
2680 			mtx_unlock(&vnode_free_list_mtx);
2681 		} else {
2682 			atomic_add_long(&free_owe_inact, 1);
2683 		}
2684 		VI_UNLOCK(vp);
2685 		return;
2686 	}
2687 	/*
2688 	 * The vnode has been marked for destruction, so free it.
2689 	 */
2690 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2691 	atomic_subtract_long(&numvnodes, 1);
2692 	bo = &vp->v_bufobj;
2693 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2694 	    ("cleaned vnode still on the free list."));
2695 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2696 	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2697 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2698 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2699 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2700 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2701 	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
2702 	    ("clean blk trie not empty"));
2703 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2704 	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
2705 	    ("dirty blk trie not empty"));
2706 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2707 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2708 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2709 	VI_UNLOCK(vp);
2710 #ifdef MAC
2711 	mac_vnode_destroy(vp);
2712 #endif
2713 	if (vp->v_pollinfo != NULL)
2714 		destroy_vpollinfo(vp->v_pollinfo);
2715 #ifdef INVARIANTS
2716 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2717 	vp->v_op = NULL;
2718 #endif
2719 	rangelock_destroy(&vp->v_rl);
2720 	lockdestroy(vp->v_vnlock);
2721 	mtx_destroy(&vp->v_interlock);
2722 	rw_destroy(BO_LOCKPTR(bo));
2723 	uma_zfree(vnode_zone, vp);
2724 }
2725 
2726 /*
2727  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2728  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2729  * OWEINACT tracks whether a vnode missed a call to inactive due to a
2730  * failed lock upgrade.
2731  */
2732 void
2733 vinactive(struct vnode *vp, struct thread *td)
2734 {
2735 	struct vm_object *obj;
2736 
2737 	ASSERT_VOP_ELOCKED(vp, "vinactive");
2738 	ASSERT_VI_LOCKED(vp, "vinactive");
2739 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2740 	    ("vinactive: recursed on VI_DOINGINACT"));
2741 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2742 	vp->v_iflag |= VI_DOINGINACT;
2743 	vp->v_iflag &= ~VI_OWEINACT;
2744 	VI_UNLOCK(vp);
2745 	/*
2746 	 * Before moving off the active list, we must be sure that any
2747 	 * modified pages are on the vnode's dirty list since these will
2748 	 * no longer be checked once the vnode is on the inactive list.
2749 	 * Because the vnode vm object keeps a hold reference on the vnode
2750 	 * if there is at least one resident non-cached page, the vnode
2751 	 * cannot leave the active list without the page cleanup done.
2752 	 */
2753 	obj = vp->v_object;
2754 	if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2755 		VM_OBJECT_WLOCK(obj);
2756 		vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2757 		VM_OBJECT_WUNLOCK(obj);
2758 	}
2759 	VOP_INACTIVE(vp, td);
2760 	VI_LOCK(vp);
2761 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2762 	    ("vinactive: lost VI_DOINGINACT"));
2763 	vp->v_iflag &= ~VI_DOINGINACT;
2764 }
2765 
2766 /*
2767  * Remove any vnodes in the vnode table belonging to mount point mp.
2768  *
2769  * If FORCECLOSE is not specified, there should not be any active vnodes;
2770  * an error is returned if any are found (nb: this is a user error, not a
2771  * system error). If FORCECLOSE is specified, detach any active vnodes
2772  * that are found.
2773  *
2774  * If WRITECLOSE is set, only flush out regular file vnodes open for
2775  * writing.
2776  *
2777  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2778  *
2779  * `rootrefs' specifies the base reference count for the root vnode
2780  * of this filesystem. The root vnode is considered busy if its
2781  * v_usecount exceeds this value. On a successful return, vflush()
2782  * will call vrele() on the root vnode exactly rootrefs times.
2783  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2784  * be zero.
2785  */
2786 #ifdef DIAGNOSTIC
2787 static int busyprt = 0;		/* print out busy vnodes */
2788 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2789 #endif
2790 
2791 int
2792 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2793 {
2794 	struct vnode *vp, *mvp, *rootvp = NULL;
2795 	struct vattr vattr;
2796 	int busy = 0, error;
2797 
2798 	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2799 	    rootrefs, flags);
2800 	if (rootrefs > 0) {
2801 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2802 		    ("vflush: bad args"));
2803 		/*
2804 		 * Get the filesystem root vnode. We can vput() it
2805 		 * immediately, since with rootrefs > 0, it won't go away.
2806 		 */
2807 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2808 			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2809 			    __func__, error);
2810 			return (error);
2811 		}
2812 		vput(rootvp);
2813 	}
2814 loop:
2815 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2816 		vholdl(vp);
2817 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2818 		if (error) {
2819 			vdrop(vp);
2820 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2821 			goto loop;
2822 		}
2823 		/*
2824 		 * Skip over a vnodes marked VV_SYSTEM.
2825 		 */
2826 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2827 			VOP_UNLOCK(vp, 0);
2828 			vdrop(vp);
2829 			continue;
2830 		}
2831 		/*
2832 		 * If WRITECLOSE is set, flush out unlinked but still open
2833 		 * files (even if open only for reading) and regular file
2834 		 * vnodes open for writing.
2835 		 */
2836 		if (flags & WRITECLOSE) {
2837 			if (vp->v_object != NULL) {
2838 				VM_OBJECT_WLOCK(vp->v_object);
2839 				vm_object_page_clean(vp->v_object, 0, 0, 0);
2840 				VM_OBJECT_WUNLOCK(vp->v_object);
2841 			}
2842 			error = VOP_FSYNC(vp, MNT_WAIT, td);
2843 			if (error != 0) {
2844 				VOP_UNLOCK(vp, 0);
2845 				vdrop(vp);
2846 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2847 				return (error);
2848 			}
2849 			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2850 			VI_LOCK(vp);
2851 
2852 			if ((vp->v_type == VNON ||
2853 			    (error == 0 && vattr.va_nlink > 0)) &&
2854 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2855 				VOP_UNLOCK(vp, 0);
2856 				vdropl(vp);
2857 				continue;
2858 			}
2859 		} else
2860 			VI_LOCK(vp);
2861 		/*
2862 		 * With v_usecount == 0, all we need to do is clear out the
2863 		 * vnode data structures and we are done.
2864 		 *
2865 		 * If FORCECLOSE is set, forcibly close the vnode.
2866 		 */
2867 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2868 			vgonel(vp);
2869 		} else {
2870 			busy++;
2871 #ifdef DIAGNOSTIC
2872 			if (busyprt)
2873 				vprint("vflush: busy vnode", vp);
2874 #endif
2875 		}
2876 		VOP_UNLOCK(vp, 0);
2877 		vdropl(vp);
2878 	}
2879 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2880 		/*
2881 		 * If just the root vnode is busy, and if its refcount
2882 		 * is equal to `rootrefs', then go ahead and kill it.
2883 		 */
2884 		VI_LOCK(rootvp);
2885 		KASSERT(busy > 0, ("vflush: not busy"));
2886 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2887 		    ("vflush: usecount %d < rootrefs %d",
2888 		     rootvp->v_usecount, rootrefs));
2889 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2890 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2891 			vgone(rootvp);
2892 			VOP_UNLOCK(rootvp, 0);
2893 			busy = 0;
2894 		} else
2895 			VI_UNLOCK(rootvp);
2896 	}
2897 	if (busy) {
2898 		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2899 		    busy);
2900 		return (EBUSY);
2901 	}
2902 	for (; rootrefs > 0; rootrefs--)
2903 		vrele(rootvp);
2904 	return (0);
2905 }
2906 
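/*
 * Sketch of a typical vflush() call from an unmount path; the rootrefs and
 * flags values are only an illustration of the interface described above,
 * and the helper name is invented.
 */
#if 0
static int
example_unmount_flush(struct mount *mp, int mntflags, struct thread *td)
{
	int error;

	error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, td);
	if (error != 0)
		return (error);		/* EBUSY: some vnodes are still busy */
	return (0);
}
#endif
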
2907 /*
2908  * Recycle an unused vnode to the front of the free list.
2909  */
2910 int
2911 vrecycle(struct vnode *vp)
2912 {
2913 	int recycled;
2914 
2915 	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2916 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2917 	recycled = 0;
2918 	VI_LOCK(vp);
2919 	if (vp->v_usecount == 0) {
2920 		recycled = 1;
2921 		vgonel(vp);
2922 	}
2923 	VI_UNLOCK(vp);
2924 	return (recycled);
2925 }
2926 
2927 /*
2928  * Eliminate all activity associated with a vnode
2929  * in preparation for reuse.
2930  */
2931 void
2932 vgone(struct vnode *vp)
2933 {
2934 	VI_LOCK(vp);
2935 	vgonel(vp);
2936 	VI_UNLOCK(vp);
2937 }
2938 
2939 static void
2940 notify_lowervp_vfs_dummy(struct mount *mp __unused,
2941     struct vnode *lowervp __unused)
2942 {
2943 }
2944 
2945 /*
2946  * Notify upper mounts about reclaimed or unlinked vnode.
2947  */
2948 void
2949 vfs_notify_upper(struct vnode *vp, int event)
2950 {
2951 	static struct vfsops vgonel_vfsops = {
2952 		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
2953 		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
2954 	};
2955 	struct mount *mp, *ump, *mmp;
2956 
2957 	mp = vp->v_mount;
2958 	if (mp == NULL)
2959 		return;
2960 
2961 	MNT_ILOCK(mp);
2962 	if (TAILQ_EMPTY(&mp->mnt_uppers))
2963 		goto unlock;
2964 	MNT_IUNLOCK(mp);
2965 	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2966 	mmp->mnt_op = &vgonel_vfsops;
2967 	mmp->mnt_kern_flag |= MNTK_MARKER;
2968 	MNT_ILOCK(mp);
2969 	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
2970 	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
2971 		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
2972 			ump = TAILQ_NEXT(ump, mnt_upper_link);
2973 			continue;
2974 		}
2975 		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
2976 		MNT_IUNLOCK(mp);
2977 		switch (event) {
2978 		case VFS_NOTIFY_UPPER_RECLAIM:
2979 			VFS_RECLAIM_LOWERVP(ump, vp);
2980 			break;
2981 		case VFS_NOTIFY_UPPER_UNLINK:
2982 			VFS_UNLINK_LOWERVP(ump, vp);
2983 			break;
2984 		default:
2985 			KASSERT(0, ("invalid event %d", event));
2986 			break;
2987 		}
2988 		MNT_ILOCK(mp);
2989 		ump = TAILQ_NEXT(mmp, mnt_upper_link);
2990 		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
2991 	}
2992 	free(mmp, M_TEMP);
2993 	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
2994 	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
2995 		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
2996 		wakeup(&mp->mnt_uppers);
2997 	}
2998 unlock:
2999 	MNT_IUNLOCK(mp);
3000 }
3001 
3002 /*
3003  * vgone, with the vp interlock held.
3004  */
3005 static void
3006 vgonel(struct vnode *vp)
3007 {
3008 	struct thread *td;
3009 	int oweinact;
3010 	int active;
3011 	struct mount *mp;
3012 
3013 	ASSERT_VOP_ELOCKED(vp, "vgonel");
3014 	ASSERT_VI_LOCKED(vp, "vgonel");
3015 	VNASSERT(vp->v_holdcnt, vp,
3016 	    ("vgonel: vp %p has no reference.", vp));
3017 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3018 	td = curthread;
3019 
3020 	/*
3021 	 * Don't vgonel if we're already doomed.
3022 	 */
3023 	if (vp->v_iflag & VI_DOOMED)
3024 		return;
3025 	vp->v_iflag |= VI_DOOMED;
3026 
3027 	/*
3028 	 * Check to see if the vnode is in use.  If so, we have to call
3029 	 * VOP_CLOSE() and VOP_INACTIVE().
3030 	 */
3031 	active = vp->v_usecount;
3032 	oweinact = (vp->v_iflag & VI_OWEINACT);
3033 	VI_UNLOCK(vp);
3034 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
3035 
3036 	/*
3037 	 * If purging an active vnode, it must be closed and
3038 	 * deactivated before being reclaimed.
3039 	 */
3040 	if (active)
3041 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
3042 	if (oweinact || active) {
3043 		VI_LOCK(vp);
3044 		if ((vp->v_iflag & VI_DOINGINACT) == 0)
3045 			vinactive(vp, td);
3046 		VI_UNLOCK(vp);
3047 	}
3048 	if (vp->v_type == VSOCK)
3049 		vfs_unp_reclaim(vp);
3050 
3051 	/*
3052 	 * Clean out any buffers associated with the vnode.
3053 	 * If the flush fails, just toss the buffers.
3054 	 */
3055 	mp = NULL;
3056 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
3057 		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
3058 	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
3059 		while (vinvalbuf(vp, 0, 0, 0) != 0)
3060 			;
3061 	}
3062 
3063 	BO_LOCK(&vp->v_bufobj);
3064 	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
3065 	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
3066 	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
3067 	    vp->v_bufobj.bo_clean.bv_cnt == 0,
3068 	    ("vp %p bufobj not invalidated", vp));
3069 	vp->v_bufobj.bo_flag |= BO_DEAD;
3070 	BO_UNLOCK(&vp->v_bufobj);
3071 
3072 	/*
3073 	 * Reclaim the vnode.
3074 	 */
3075 	if (VOP_RECLAIM(vp, td))
3076 		panic("vgone: cannot reclaim");
3077 	if (mp != NULL)
3078 		vn_finished_secondary_write(mp);
3079 	VNASSERT(vp->v_object == NULL, vp,
3080 	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
3081 	/*
3082 	 * Clear the advisory locks and wake up waiting threads.
3083 	 */
3084 	(void)VOP_ADVLOCKPURGE(vp);
3085 	/*
3086 	 * Delete from old mount point vnode list.
3087 	 */
3088 	delmntque(vp);
3089 	cache_purge(vp);
3090 	/*
3091 	 * Done with purge, reset to the standard lock and invalidate
3092 	 * the vnode.
3093 	 */
3094 	VI_LOCK(vp);
3095 	vp->v_vnlock = &vp->v_lock;
3096 	vp->v_op = &dead_vnodeops;
3097 	vp->v_tag = "none";
3098 	vp->v_type = VBAD;
3099 }
3100 
3101 /*
3102  * Calculate the total number of references to a special device.
3103  */
3104 int
3105 vcount(struct vnode *vp)
3106 {
3107 	int count;
3108 
3109 	dev_lock();
3110 	count = vp->v_rdev->si_usecount;
3111 	dev_unlock();
3112 	return (count);
3113 }
3114 
3115 /*
3116  * Same as above, but using the struct cdev * as the argument.
3117  */
3118 int
3119 count_dev(struct cdev *dev)
3120 {
3121 	int count;
3122 
3123 	dev_lock();
3124 	count = dev->si_usecount;
3125 	dev_unlock();
3126 	return(count);
3127 }
3128 
3129 /*
3130  * Print out a description of a vnode.
3131  */
3132 static char *typename[] =
3133 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
3134  "VMARKER"};
3135 
3136 void
3137 vn_printf(struct vnode *vp, const char *fmt, ...)
3138 {
3139 	va_list ap;
3140 	char buf[256], buf2[16];
3141 	u_long flags;
3142 
3143 	va_start(ap, fmt);
3144 	vprintf(fmt, ap);
3145 	va_end(ap);
3146 	printf("%p: ", (void *)vp);
3147 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
3148 	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
3149 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
3150 	buf[0] = '\0';
3151 	buf[1] = '\0';
3152 	if (vp->v_vflag & VV_ROOT)
3153 		strlcat(buf, "|VV_ROOT", sizeof(buf));
3154 	if (vp->v_vflag & VV_ISTTY)
3155 		strlcat(buf, "|VV_ISTTY", sizeof(buf));
3156 	if (vp->v_vflag & VV_NOSYNC)
3157 		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
3158 	if (vp->v_vflag & VV_ETERNALDEV)
3159 		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
3160 	if (vp->v_vflag & VV_CACHEDLABEL)
3161 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
3162 	if (vp->v_vflag & VV_TEXT)
3163 		strlcat(buf, "|VV_TEXT", sizeof(buf));
3164 	if (vp->v_vflag & VV_COPYONWRITE)
3165 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
3166 	if (vp->v_vflag & VV_SYSTEM)
3167 		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
3168 	if (vp->v_vflag & VV_PROCDEP)
3169 		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
3170 	if (vp->v_vflag & VV_NOKNOTE)
3171 		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
3172 	if (vp->v_vflag & VV_DELETED)
3173 		strlcat(buf, "|VV_DELETED", sizeof(buf));
3174 	if (vp->v_vflag & VV_MD)
3175 		strlcat(buf, "|VV_MD", sizeof(buf));
3176 	if (vp->v_vflag & VV_FORCEINSMQ)
3177 		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
3178 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
3179 	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
3180 	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
3181 	if (flags != 0) {
3182 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
3183 		strlcat(buf, buf2, sizeof(buf));
3184 	}
3185 	if (vp->v_iflag & VI_MOUNT)
3186 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
3187 	if (vp->v_iflag & VI_AGE)
3188 		strlcat(buf, "|VI_AGE", sizeof(buf));
3189 	if (vp->v_iflag & VI_DOOMED)
3190 		strlcat(buf, "|VI_DOOMED", sizeof(buf));
3191 	if (vp->v_iflag & VI_FREE)
3192 		strlcat(buf, "|VI_FREE", sizeof(buf));
3193 	if (vp->v_iflag & VI_ACTIVE)
3194 		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
3195 	if (vp->v_iflag & VI_DOINGINACT)
3196 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
3197 	if (vp->v_iflag & VI_OWEINACT)
3198 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
3199 	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
3200 	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
3201 	if (flags != 0) {
3202 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
3203 		strlcat(buf, buf2, sizeof(buf));
3204 	}
3205 	printf("    flags (%s)\n", buf + 1);
3206 	if (mtx_owned(VI_MTX(vp)))
3207 		printf(" VI_LOCKed");
3208 	if (vp->v_object != NULL)
3209 		printf("    v_object %p ref %d pages %d "
3210 		    "cleanbuf %d dirtybuf %d\n",
3211 		    vp->v_object, vp->v_object->ref_count,
3212 		    vp->v_object->resident_page_count,
3213 		    vp->v_bufobj.bo_clean.bv_cnt,
3214 		    vp->v_bufobj.bo_dirty.bv_cnt);
3215 	printf("    ");
3216 	lockmgr_printinfo(vp->v_vnlock);
3217 	if (vp->v_data != NULL)
3218 		VOP_PRINT(vp);
3219 }
3220 
3221 #ifdef DDB
3222 /*
3223  * List all of the locked vnodes in the system.
3224  * Called when debugging the kernel.
3225  */
3226 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
3227 {
3228 	struct mount *mp;
3229 	struct vnode *vp;
3230 
3231 	/*
3232 	 * Note: because this is DDB, we can't obey the locking semantics
3233 	 * for these structures, which means we could catch an inconsistent
3234 	 * state and dereference a nasty pointer.  Not much to be done
3235 	 * about that.
3236 	 */
3237 	db_printf("Locked vnodes\n");
3238 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3239 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3240 			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
3241 				vprint("", vp);
3242 		}
3243 	}
3244 }
3245 
3246 /*
3247  * Show details about the given vnode.
3248  */
3249 DB_SHOW_COMMAND(vnode, db_show_vnode)
3250 {
3251 	struct vnode *vp;
3252 
3253 	if (!have_addr)
3254 		return;
3255 	vp = (struct vnode *)addr;
3256 	vn_printf(vp, "vnode ");
3257 }
3258 
3259 /*
3260  * Show details about the given mount point.
3261  */
3262 DB_SHOW_COMMAND(mount, db_show_mount)
3263 {
3264 	struct mount *mp;
3265 	struct vfsopt *opt;
3266 	struct statfs *sp;
3267 	struct vnode *vp;
3268 	char buf[512];
3269 	uint64_t mflags;
3270 	u_int flags;
3271 
3272 	if (!have_addr) {
3273 		/* No address given, print short info about all mount points. */
3274 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3275 			db_printf("%p %s on %s (%s)\n", mp,
3276 			    mp->mnt_stat.f_mntfromname,
3277 			    mp->mnt_stat.f_mntonname,
3278 			    mp->mnt_stat.f_fstypename);
3279 			if (db_pager_quit)
3280 				break;
3281 		}
3282 		db_printf("\nMore info: show mount <addr>\n");
3283 		return;
3284 	}
3285 
3286 	mp = (struct mount *)addr;
3287 	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3288 	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3289 
3290 	buf[0] = '\0';
3291 	mflags = mp->mnt_flag;
3292 #define	MNT_FLAG(flag)	do {						\
3293 	if (mflags & (flag)) {						\
3294 		if (buf[0] != '\0')					\
3295 			strlcat(buf, ", ", sizeof(buf));		\
3296 		strlcat(buf, (#flag) + 4, sizeof(buf));			\
3297 		mflags &= ~(flag);					\
3298 	}								\
3299 } while (0)
3300 	MNT_FLAG(MNT_RDONLY);
3301 	MNT_FLAG(MNT_SYNCHRONOUS);
3302 	MNT_FLAG(MNT_NOEXEC);
3303 	MNT_FLAG(MNT_NOSUID);
3304 	MNT_FLAG(MNT_NFS4ACLS);
3305 	MNT_FLAG(MNT_UNION);
3306 	MNT_FLAG(MNT_ASYNC);
3307 	MNT_FLAG(MNT_SUIDDIR);
3308 	MNT_FLAG(MNT_SOFTDEP);
3309 	MNT_FLAG(MNT_NOSYMFOLLOW);
3310 	MNT_FLAG(MNT_GJOURNAL);
3311 	MNT_FLAG(MNT_MULTILABEL);
3312 	MNT_FLAG(MNT_ACLS);
3313 	MNT_FLAG(MNT_NOATIME);
3314 	MNT_FLAG(MNT_NOCLUSTERR);
3315 	MNT_FLAG(MNT_NOCLUSTERW);
3316 	MNT_FLAG(MNT_SUJ);
3317 	MNT_FLAG(MNT_EXRDONLY);
3318 	MNT_FLAG(MNT_EXPORTED);
3319 	MNT_FLAG(MNT_DEFEXPORTED);
3320 	MNT_FLAG(MNT_EXPORTANON);
3321 	MNT_FLAG(MNT_EXKERB);
3322 	MNT_FLAG(MNT_EXPUBLIC);
3323 	MNT_FLAG(MNT_LOCAL);
3324 	MNT_FLAG(MNT_QUOTA);
3325 	MNT_FLAG(MNT_ROOTFS);
3326 	MNT_FLAG(MNT_USER);
3327 	MNT_FLAG(MNT_IGNORE);
3328 	MNT_FLAG(MNT_UPDATE);
3329 	MNT_FLAG(MNT_DELEXPORT);
3330 	MNT_FLAG(MNT_RELOAD);
3331 	MNT_FLAG(MNT_FORCE);
3332 	MNT_FLAG(MNT_SNAPSHOT);
3333 	MNT_FLAG(MNT_BYFSID);
3334 #undef MNT_FLAG
3335 	if (mflags != 0) {
3336 		if (buf[0] != '\0')
3337 			strlcat(buf, ", ", sizeof(buf));
3338 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3339 		    "0x%016jx", mflags);
3340 	}
3341 	db_printf("    mnt_flag = %s\n", buf);
3342 
3343 	buf[0] = '\0';
3344 	flags = mp->mnt_kern_flag;
3345 #define	MNT_KERN_FLAG(flag)	do {					\
3346 	if (flags & (flag)) {						\
3347 		if (buf[0] != '\0')					\
3348 			strlcat(buf, ", ", sizeof(buf));		\
3349 		strlcat(buf, (#flag) + 5, sizeof(buf));			\
3350 		flags &= ~(flag);					\
3351 	}								\
3352 } while (0)
3353 	MNT_KERN_FLAG(MNTK_UNMOUNTF);
3354 	MNT_KERN_FLAG(MNTK_ASYNC);
3355 	MNT_KERN_FLAG(MNTK_SOFTDEP);
3356 	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3357 	MNT_KERN_FLAG(MNTK_DRAINING);
3358 	MNT_KERN_FLAG(MNTK_REFEXPIRE);
3359 	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3360 	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3361 	MNT_KERN_FLAG(MNTK_NO_IOPF);
3362 	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3363 	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3364 	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3365 	MNT_KERN_FLAG(MNTK_MARKER);
3366 	MNT_KERN_FLAG(MNTK_USES_BCACHE);
3367 	MNT_KERN_FLAG(MNTK_NOASYNC);
3368 	MNT_KERN_FLAG(MNTK_UNMOUNT);
3369 	MNT_KERN_FLAG(MNTK_MWAIT);
3370 	MNT_KERN_FLAG(MNTK_SUSPEND);
3371 	MNT_KERN_FLAG(MNTK_SUSPEND2);
3372 	MNT_KERN_FLAG(MNTK_SUSPENDED);
3373 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3374 	MNT_KERN_FLAG(MNTK_NOKNOTE);
3375 #undef MNT_KERN_FLAG
3376 	if (flags != 0) {
3377 		if (buf[0] != '\0')
3378 			strlcat(buf, ", ", sizeof(buf));
3379 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3380 		    "0x%08x", flags);
3381 	}
3382 	db_printf("    mnt_kern_flag = %s\n", buf);
3383 
3384 	db_printf("    mnt_opt = ");
3385 	opt = TAILQ_FIRST(mp->mnt_opt);
3386 	if (opt != NULL) {
3387 		db_printf("%s", opt->name);
3388 		opt = TAILQ_NEXT(opt, link);
3389 		while (opt != NULL) {
3390 			db_printf(", %s", opt->name);
3391 			opt = TAILQ_NEXT(opt, link);
3392 		}
3393 	}
3394 	db_printf("\n");
3395 
3396 	sp = &mp->mnt_stat;
3397 	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3398 	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3399 	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3400 	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3401 	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3402 	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3403 	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3404 	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3405 	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3406 	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3407 	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3408 	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3409 
3410 	db_printf("    mnt_cred = { uid=%u ruid=%u",
3411 	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3412 	if (jailed(mp->mnt_cred))
3413 		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3414 	db_printf(" }\n");
3415 	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3416 	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3417 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3418 	db_printf("    mnt_activevnodelistsize = %d\n",
3419 	    mp->mnt_activevnodelistsize);
3420 	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3421 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3422 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3423 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3424 	db_printf("    mnt_lockref = %d\n", mp->mnt_lockref);
3425 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3426 	db_printf("    mnt_secondary_accwrites = %d\n",
3427 	    mp->mnt_secondary_accwrites);
3428 	db_printf("    mnt_gjprovider = %s\n",
3429 	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3430 
3431 	db_printf("\n\nList of active vnodes\n");
3432 	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3433 		if (vp->v_type != VMARKER) {
3434 			vn_printf(vp, "vnode ");
3435 			if (db_pager_quit)
3436 				break;
3437 		}
3438 	}
3439 	db_printf("\n\nList of inactive vnodes\n");
3440 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3441 		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3442 			vn_printf(vp, "vnode ");
3443 			if (db_pager_quit)
3444 				break;
3445 		}
3446 	}
3447 }
3448 #endif	/* DDB */
3449 
3450 /*
3451  * Fill in a struct xvfsconf based on a struct vfsconf.
3452  */
3453 static int
3454 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3455 {
3456 	struct xvfsconf xvfsp;
3457 
3458 	bzero(&xvfsp, sizeof(xvfsp));
3459 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3460 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3461 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3462 	xvfsp.vfc_flags = vfsp->vfc_flags;
3463 	/*
3464 	 * These are unused in userland; we keep them
3465 	 * to avoid breaking binary compatibility.
3466 	 */
3467 	xvfsp.vfc_vfsops = NULL;
3468 	xvfsp.vfc_next = NULL;
3469 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3470 }
3471 
3472 #ifdef COMPAT_FREEBSD32
3473 struct xvfsconf32 {
3474 	uint32_t	vfc_vfsops;
3475 	char		vfc_name[MFSNAMELEN];
3476 	int32_t		vfc_typenum;
3477 	int32_t		vfc_refcount;
3478 	int32_t		vfc_flags;
3479 	uint32_t	vfc_next;
3480 };
3481 
3482 static int
3483 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
3484 {
3485 	struct xvfsconf32 xvfsp;
3486 
3487 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3488 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3489 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3490 	xvfsp.vfc_flags = vfsp->vfc_flags;
3491 	xvfsp.vfc_vfsops = 0;
3492 	xvfsp.vfc_next = 0;
3493 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3494 }
3495 #endif
3496 
3497 /*
3498  * Top level filesystem related information gathering.
3499  */
3500 static int
3501 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3502 {
3503 	struct vfsconf *vfsp;
3504 	int error;
3505 
3506 	error = 0;
3507 	vfsconf_slock();
3508 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3509 #ifdef COMPAT_FREEBSD32
3510 		if (req->flags & SCTL_MASK32)
3511 			error = vfsconf2x32(req, vfsp);
3512 		else
3513 #endif
3514 			error = vfsconf2x(req, vfsp);
3515 		if (error)
3516 			break;
3517 	}
3518 	vfsconf_sunlock();
3519 	return (error);
3520 }
3521 
3522 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
3523     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
3524     "S,xvfsconf", "List of all configured filesystems");
3525 
3526 #ifndef BURN_BRIDGES
3527 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3528 
3529 static int
3530 vfs_sysctl(SYSCTL_HANDLER_ARGS)
3531 {
3532 	int *name = (int *)arg1 - 1;	/* XXX */
3533 	u_int namelen = arg2 + 1;	/* XXX */
3534 	struct vfsconf *vfsp;
3535 
3536 	log(LOG_WARNING, "userland calling deprecated sysctl, "
3537 	    "please rebuild world\n");
3538 
3539 #if 1 || defined(COMPAT_PRELITE2)
3540 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3541 	if (namelen == 1)
3542 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3543 #endif
3544 
3545 	switch (name[1]) {
3546 	case VFS_MAXTYPENUM:
3547 		if (namelen != 2)
3548 			return (ENOTDIR);
3549 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3550 	case VFS_CONF:
3551 		if (namelen != 3)
3552 			return (ENOTDIR);	/* overloaded */
3553 		vfsconf_slock();
3554 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3555 			if (vfsp->vfc_typenum == name[2])
3556 				break;
3557 		}
3558 		vfsconf_sunlock();
3559 		if (vfsp == NULL)
3560 			return (EOPNOTSUPP);
3561 #ifdef COMPAT_FREEBSD32
3562 		if (req->flags & SCTL_MASK32)
3563 			return (vfsconf2x32(req, vfsp));
3564 		else
3565 #endif
3566 			return (vfsconf2x(req, vfsp));
3567 	}
3568 	return (EOPNOTSUPP);
3569 }
3570 
3571 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
3572     CTLFLAG_MPSAFE, vfs_sysctl,
3573     "Generic filesystem");
3574 
3575 #if 1 || defined(COMPAT_PRELITE2)
3576 
3577 static int
3578 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3579 {
3580 	int error;
3581 	struct vfsconf *vfsp;
3582 	struct ovfsconf ovfs;
3583 
3584 	vfsconf_slock();
3585 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3586 		bzero(&ovfs, sizeof(ovfs));
3587 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3588 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3589 		ovfs.vfc_index = vfsp->vfc_typenum;
3590 		ovfs.vfc_refcount = vfsp->vfc_refcount;
3591 		ovfs.vfc_flags = vfsp->vfc_flags;
3592 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3593 		if (error != 0) {
3594 			vfsconf_sunlock();
3595 			return (error);
3596 		}
3597 	}
3598 	vfsconf_sunlock();
3599 	return (0);
3600 }
3601 
3602 #endif /* 1 || COMPAT_PRELITE2 */
3603 #endif /* !BURN_BRIDGES */
3604 
3605 #define KINFO_VNODESLOP		10
3606 #ifdef notyet
3607 /*
3608  * Dump vnode list (via sysctl).
3609  */
3610 /* ARGSUSED */
3611 static int
3612 sysctl_vnode(SYSCTL_HANDLER_ARGS)
3613 {
3614 	struct xvnode *xvn;
3615 	struct mount *mp;
3616 	struct vnode *vp;
3617 	int error, len, n;
3618 
3619 	/*
3620 	 * Stale numvnodes access is not fatal here.
3621 	 */
3622 	req->lock = 0;
3623 	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3624 	if (!req->oldptr)
3625 		/* Make an estimate */
3626 		return (SYSCTL_OUT(req, 0, len));
3627 
3628 	error = sysctl_wire_old_buffer(req, 0);
3629 	if (error != 0)
3630 		return (error);
3631 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3632 	n = 0;
3633 	mtx_lock(&mountlist_mtx);
3634 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3635 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3636 			continue;
3637 		MNT_ILOCK(mp);
3638 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3639 			if (n == len)
3640 				break;
3641 			vref(vp);
3642 			xvn[n].xv_size = sizeof *xvn;
3643 			xvn[n].xv_vnode = vp;
3644 			xvn[n].xv_id = 0;	/* XXX compat */
3645 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3646 			XV_COPY(usecount);
3647 			XV_COPY(writecount);
3648 			XV_COPY(holdcnt);
3649 			XV_COPY(mount);
3650 			XV_COPY(numoutput);
3651 			XV_COPY(type);
3652 #undef XV_COPY
3653 			xvn[n].xv_flag = vp->v_vflag;
3654 
3655 			switch (vp->v_type) {
3656 			case VREG:
3657 			case VDIR:
3658 			case VLNK:
3659 				break;
3660 			case VBLK:
3661 			case VCHR:
3662 				if (vp->v_rdev == NULL) {
3663 					vrele(vp);
3664 					continue;
3665 				}
3666 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3667 				break;
3668 			case VSOCK:
3669 				xvn[n].xv_socket = vp->v_socket;
3670 				break;
3671 			case VFIFO:
3672 				xvn[n].xv_fifo = vp->v_fifoinfo;
3673 				break;
3674 			case VNON:
3675 			case VBAD:
3676 			default:
3677 				/* shouldn't happen? */
3678 				vrele(vp);
3679 				continue;
3680 			}
3681 			vrele(vp);
3682 			++n;
3683 		}
3684 		MNT_IUNLOCK(mp);
3685 		mtx_lock(&mountlist_mtx);
3686 		vfs_unbusy(mp);
3687 		if (n == len)
3688 			break;
3689 	}
3690 	mtx_unlock(&mountlist_mtx);
3691 
3692 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3693 	free(xvn, M_TEMP);
3694 	return (error);
3695 }
3696 
3697 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
3698     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
3699     "");
3700 #endif
3701 
3702 static void
3703 unmount_or_warn(struct mount *mp)
3704 {
3705 	int error;
3706 
3707 	error = dounmount(mp, MNT_FORCE, curthread);
3708 	if (error != 0) {
3709 		printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
3710 		if (error == EBUSY)
3711 			printf("BUSY)\n");
3712 		else
3713 			printf("%d)\n", error);
3714 	}
3715 }
3716 
3717 /*
3718  * Unmount all filesystems. The list is traversed in reverse order
3719  * of mounting to avoid dependencies.
3720  */
3721 void
3722 vfs_unmountall(void)
3723 {
3724 	struct mount *mp, *tmp;
3725 
3726 	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3727 
3728 	/*
3729 	 * Since this only runs when rebooting, it is not interlocked.
3730 	 */
3731 	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
3732 		vfs_ref(mp);
3733 
3734 		/*
3735 		 * Forcibly unmounting "/dev" before "/" would prevent clean
3736 		 * unmount of the latter.
3737 		 */
3738 		if (mp == rootdevmp)
3739 			continue;
3740 
3741 		unmount_or_warn(mp);
3742 	}
3743 
3744 	if (rootdevmp != NULL)
3745 		unmount_or_warn(rootdevmp);
3746 }
3747 
3748 /*
3749  * Perform msync on all vnodes under a mount point.
3750  * The mount point must be locked.
3751  */
3752 void
3753 vfs_msync(struct mount *mp, int flags)
3754 {
3755 	struct vnode *vp, *mvp;
3756 	struct vm_object *obj;
3757 
3758 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3759 	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3760 		obj = vp->v_object;
3761 		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3762 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3763 			if (!vget(vp,
3764 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3765 			    curthread)) {
3766 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3767 					vput(vp);
3768 					continue;
3769 				}
3770 
3771 				obj = vp->v_object;
3772 				if (obj != NULL) {
3773 					VM_OBJECT_WLOCK(obj);
3774 					vm_object_page_clean(obj, 0, 0,
3775 					    flags == MNT_WAIT ?
3776 					    OBJPC_SYNC : OBJPC_NOSYNC);
3777 					VM_OBJECT_WUNLOCK(obj);
3778 				}
3779 				vput(vp);
3780 			}
3781 		} else
3782 			VI_UNLOCK(vp);
3783 	}
3784 }
3785 
3786 static void
3787 destroy_vpollinfo_free(struct vpollinfo *vi)
3788 {
3789 
3790 	knlist_destroy(&vi->vpi_selinfo.si_note);
3791 	mtx_destroy(&vi->vpi_lock);
3792 	uma_zfree(vnodepoll_zone, vi);
3793 }
3794 
3795 static void
3796 destroy_vpollinfo(struct vpollinfo *vi)
3797 {
3798 
3799 	knlist_clear(&vi->vpi_selinfo.si_note, 1);
3800 	seldrain(&vi->vpi_selinfo);
3801 	destroy_vpollinfo_free(vi);
3802 }
3803 
3804 /*
3805  * Initialize the per-vnode helper structure to hold poll-related state.
3806  */
3807 void
3808 v_addpollinfo(struct vnode *vp)
3809 {
3810 	struct vpollinfo *vi;
3811 
3812 	if (vp->v_pollinfo != NULL)
3813 		return;
3814 	vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
3815 	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3816 	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3817 	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3818 	VI_LOCK(vp);
3819 	if (vp->v_pollinfo != NULL) {
3820 		VI_UNLOCK(vp);
3821 		destroy_vpollinfo_free(vi);
3822 		return;
3823 	}
3824 	vp->v_pollinfo = vi;
3825 	VI_UNLOCK(vp);
3826 }
3827 
3828 /*
3829  * Record a process's interest in events which might happen to
3830  * a vnode.  Because poll uses the historic select-style interface
3831  * internally, this routine serves as both the ``check for any
3832  * pending events'' and the ``record my interest in future events''
3833  * functions.  (These are done together, while the lock is held,
3834  * to avoid race conditions.)
3835  */
3836 int
3837 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3838 {
3839 
3840 	v_addpollinfo(vp);
3841 	mtx_lock(&vp->v_pollinfo->vpi_lock);
3842 	if (vp->v_pollinfo->vpi_revents & events) {
3843 		/*
3844 		 * This leaves events we are not interested
3845 		 * in available for the other process which
3846 		 * presumably had requested them
3847 		 * (otherwise they would never have been
3848 		 * recorded).
3849 		 */
3850 		events &= vp->v_pollinfo->vpi_revents;
3851 		vp->v_pollinfo->vpi_revents &= ~events;
3852 
3853 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3854 		return (events);
3855 	}
3856 	vp->v_pollinfo->vpi_events |= events;
3857 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3858 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3859 	return (0);
3860 }
3861 
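/*
 * Example (illustrative sketch only; "examplefs" is a hypothetical
 * filesystem): a minimal VOP_POLL implementation can defer entirely to
 * vn_pollrecord(), which both reports already-posted events and records
 * interest in future ones.
 */
#if 0
static int
examplefs_poll(struct vop_poll_args *ap)
{

	/* Report pending events, or record interest in future ones. */
	return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
}
#endif
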
3862 /*
3863  * Routine to create and manage a filesystem syncer vnode.
3864  */
3865 #define sync_close ((int (*)(struct vop_close_args *))nullop)
3866 static int	sync_fsync(struct vop_fsync_args *);
3867 static int	sync_inactive(struct vop_inactive_args *);
3868 static int	sync_reclaim(struct vop_reclaim_args *);
3869 
3870 static struct vop_vector sync_vnodeops = {
3871 	.vop_bypass =	VOP_EOPNOTSUPP,
3872 	.vop_close =	sync_close,		/* close */
3873 	.vop_fsync =	sync_fsync,		/* fsync */
3874 	.vop_inactive =	sync_inactive,	/* inactive */
3875 	.vop_reclaim =	sync_reclaim,	/* reclaim */
3876 	.vop_lock1 =	vop_stdlock,	/* lock */
3877 	.vop_unlock =	vop_stdunlock,	/* unlock */
3878 	.vop_islocked =	vop_stdislocked,	/* islocked */
3879 };
3880 
3881 /*
3882  * Create a new filesystem syncer vnode for the specified mount point.
3883  */
3884 void
3885 vfs_allocate_syncvnode(struct mount *mp)
3886 {
3887 	struct vnode *vp;
3888 	struct bufobj *bo;
3889 	static long start, incr, next;
3890 	int error;
3891 
3892 	/* Allocate a new vnode */
3893 	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3894 	if (error != 0)
3895 		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3896 	vp->v_type = VNON;
3897 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3898 	vp->v_vflag |= VV_FORCEINSMQ;
3899 	error = insmntque(vp, mp);
3900 	if (error != 0)
3901 		panic("vfs_allocate_syncvnode: insmntque() failed");
3902 	vp->v_vflag &= ~VV_FORCEINSMQ;
3903 	VOP_UNLOCK(vp, 0);
3904 	/*
3905 	 * Place the vnode onto the syncer worklist. We attempt to
3906 	 * scatter them about on the list so that they will go off
3907 	 * at evenly distributed times even if all the filesystems
3908 	 * are mounted at once.
3909 	 */
3910 	next += incr;
3911 	if (next == 0 || next > syncer_maxdelay) {
3912 		start /= 2;
3913 		incr /= 2;
3914 		if (start == 0) {
3915 			start = syncer_maxdelay / 2;
3916 			incr = syncer_maxdelay;
3917 		}
3918 		next = start;
3919 	}
3920 	bo = &vp->v_bufobj;
3921 	BO_LOCK(bo);
3922 	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3923 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3924 	mtx_lock(&sync_mtx);
3925 	sync_vnode_count++;
3926 	if (mp->mnt_syncer == NULL) {
3927 		mp->mnt_syncer = vp;
3928 		vp = NULL;
3929 	}
3930 	mtx_unlock(&sync_mtx);
3931 	BO_UNLOCK(bo);
3932 	if (vp != NULL) {
3933 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3934 		vgone(vp);
3935 		vput(vp);
3936 	}
3937 }
3938 
3939 void
3940 vfs_deallocate_syncvnode(struct mount *mp)
3941 {
3942 	struct vnode *vp;
3943 
3944 	mtx_lock(&sync_mtx);
3945 	vp = mp->mnt_syncer;
3946 	if (vp != NULL)
3947 		mp->mnt_syncer = NULL;
3948 	mtx_unlock(&sync_mtx);
3949 	if (vp != NULL)
3950 		vrele(vp);
3951 }
3952 
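/*
 * Example (illustrative sketch only; "examplefs" is hypothetical): a
 * filesystem that wants periodic lazy syncs typically allocates its
 * syncer vnode once the mount is read-write and releases it again on
 * unmount or on a downgrade to read-only.
 */
#if 0
static void
examplefs_setup_syncer(struct mount *mp)
{

	if ((mp->mnt_flag & MNT_RDONLY) == 0)
		vfs_allocate_syncvnode(mp);
	else
		vfs_deallocate_syncvnode(mp);
}
#endif
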
3953 /*
3954  * Do a lazy sync of the filesystem.
3955  */
3956 static int
3957 sync_fsync(struct vop_fsync_args *ap)
3958 {
3959 	struct vnode *syncvp = ap->a_vp;
3960 	struct mount *mp = syncvp->v_mount;
3961 	int error, save;
3962 	struct bufobj *bo;
3963 
3964 	/*
3965 	 * We only need to do something if this is a lazy evaluation.
3966 	 */
3967 	if (ap->a_waitfor != MNT_LAZY)
3968 		return (0);
3969 
3970 	/*
3971 	 * Move ourselves to the back of the sync list.
3972 	 */
3973 	bo = &syncvp->v_bufobj;
3974 	BO_LOCK(bo);
3975 	vn_syncer_add_to_worklist(bo, syncdelay);
3976 	BO_UNLOCK(bo);
3977 
3978 	/*
3979 	 * Walk the list of vnodes pushing all that are dirty and
3980 	 * not already on the sync list.
3981 	 */
3982 	if (vfs_busy(mp, MBF_NOWAIT) != 0)
3983 		return (0);
3984 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3985 		vfs_unbusy(mp);
3986 		return (0);
3987 	}
3988 	save = curthread_pflags_set(TDP_SYNCIO);
3989 	vfs_msync(mp, MNT_NOWAIT);
3990 	error = VFS_SYNC(mp, MNT_LAZY);
3991 	curthread_pflags_restore(save);
3992 	vn_finished_write(mp);
3993 	vfs_unbusy(mp);
3994 	return (error);
3995 }
3996 
3997 /*
3998  * The syncer vnode is no longer referenced.
3999  */
4000 static int
4001 sync_inactive(struct vop_inactive_args *ap)
4002 {
4003 
4004 	vgone(ap->a_vp);
4005 	return (0);
4006 }
4007 
4008 /*
4009  * The syncer vnode is no longer needed and is being decommissioned.
4010  *
4011  * Modifications to the worklist must be protected by sync_mtx.
4012  */
4013 static int
4014 sync_reclaim(struct vop_reclaim_args *ap)
4015 {
4016 	struct vnode *vp = ap->a_vp;
4017 	struct bufobj *bo;
4018 
4019 	bo = &vp->v_bufobj;
4020 	BO_LOCK(bo);
4021 	mtx_lock(&sync_mtx);
4022 	if (vp->v_mount->mnt_syncer == vp)
4023 		vp->v_mount->mnt_syncer = NULL;
4024 	if (bo->bo_flag & BO_ONWORKLST) {
4025 		LIST_REMOVE(bo, bo_synclist);
4026 		syncer_worklist_len--;
4027 		sync_vnode_count--;
4028 		bo->bo_flag &= ~BO_ONWORKLST;
4029 	}
4030 	mtx_unlock(&sync_mtx);
4031 	BO_UNLOCK(bo);
4032 
4033 	return (0);
4034 }
4035 
4036 /*
4037  * Check if the vnode represents a disk device.
4038  */
4039 int
4040 vn_isdisk(struct vnode *vp, int *errp)
4041 {
4042 	int error;
4043 
4044 	if (vp->v_type != VCHR) {
4045 		error = ENOTBLK;
4046 		goto out;
4047 	}
4048 	error = 0;
4049 	dev_lock();
4050 	if (vp->v_rdev == NULL)
4051 		error = ENXIO;
4052 	else if (vp->v_rdev->si_devsw == NULL)
4053 		error = ENXIO;
4054 	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
4055 		error = ENOTBLK;
4056 	dev_unlock();
4057 out:
4058 	if (errp != NULL)
4059 		*errp = error;
4060 	return (error == 0);
4061 }
4062 
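/*
 * Example (illustrative sketch only; "examplefs" is hypothetical): a
 * mount routine commonly uses vn_isdisk() to reject device vnodes that
 * are not backed by a disk.
 */
#if 0
static int
examplefs_check_devvp(struct vnode *devvp)
{
	int error;

	if (!vn_isdisk(devvp, &error))
		return (error);		/* ENOTBLK or ENXIO */
	return (0);
}
#endif
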
4063 /*
4064  * Common filesystem object access control check routine.  Accepts a
4065  * vnode's type, "mode", uid and gid, requested access mode, credentials,
4066  * and optional call-by-reference privused argument allowing vaccess()
4067  * to indicate to the caller whether privilege was used to satisfy the
4068  * request (obsoleted).  Returns 0 on success, or an errno on failure.
4069  */
4070 int
4071 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
4072     accmode_t accmode, struct ucred *cred, int *privused)
4073 {
4074 	accmode_t dac_granted;
4075 	accmode_t priv_granted;
4076 
4077 	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
4078 	    ("invalid bit in accmode"));
4079 	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
4080 	    ("VAPPEND without VWRITE"));
4081 
4082 	/*
4083 	 * Look for a normal, non-privileged way to access the file/directory
4084 	 * as requested.  If it exists, go with that.
4085 	 */
4086 
4087 	if (privused != NULL)
4088 		*privused = 0;
4089 
4090 	dac_granted = 0;
4091 
4092 	/* Check the owner. */
4093 	if (cred->cr_uid == file_uid) {
4094 		dac_granted |= VADMIN;
4095 		if (file_mode & S_IXUSR)
4096 			dac_granted |= VEXEC;
4097 		if (file_mode & S_IRUSR)
4098 			dac_granted |= VREAD;
4099 		if (file_mode & S_IWUSR)
4100 			dac_granted |= (VWRITE | VAPPEND);
4101 
4102 		if ((accmode & dac_granted) == accmode)
4103 			return (0);
4104 
4105 		goto privcheck;
4106 	}
4107 
4108 	/* Otherwise, check the groups (first match) */
4109 	if (groupmember(file_gid, cred)) {
4110 		if (file_mode & S_IXGRP)
4111 			dac_granted |= VEXEC;
4112 		if (file_mode & S_IRGRP)
4113 			dac_granted |= VREAD;
4114 		if (file_mode & S_IWGRP)
4115 			dac_granted |= (VWRITE | VAPPEND);
4116 
4117 		if ((accmode & dac_granted) == accmode)
4118 			return (0);
4119 
4120 		goto privcheck;
4121 	}
4122 
4123 	/* Otherwise, check everyone else. */
4124 	if (file_mode & S_IXOTH)
4125 		dac_granted |= VEXEC;
4126 	if (file_mode & S_IROTH)
4127 		dac_granted |= VREAD;
4128 	if (file_mode & S_IWOTH)
4129 		dac_granted |= (VWRITE | VAPPEND);
4130 	if ((accmode & dac_granted) == accmode)
4131 		return (0);
4132 
4133 privcheck:
4134 	/*
4135 	 * Build a privilege mask to determine if the set of privileges
4136 	 * satisfies the requirements when combined with the granted mask
4137 	 * from above.  For each privilege, if the privilege is required,
4138 	 * bitwise or the request type onto the priv_granted mask.
4139 	 */
4140 	priv_granted = 0;
4141 
4142 	if (type == VDIR) {
4143 		/*
4144 		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
4145 		 * requests, instead of PRIV_VFS_EXEC.
4146 		 */
4147 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
4148 		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
4149 			priv_granted |= VEXEC;
4150 	} else {
4151 		/*
4152 		 * Ensure that at least one execute bit is on. Otherwise,
4153 		 * a privileged user will always succeed, and we don't want
4154 		 * this to happen unless the file really is executable.
4155 		 */
4156 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
4157 		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
4158 		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
4159 			priv_granted |= VEXEC;
4160 	}
4161 
4162 	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
4163 	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
4164 		priv_granted |= VREAD;
4165 
4166 	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
4167 	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
4168 		priv_granted |= (VWRITE | VAPPEND);
4169 
4170 	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
4171 	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
4172 		priv_granted |= VADMIN;
4173 
4174 	if ((accmode & (priv_granted | dac_granted)) == accmode) {
4175 		/* XXX audit: privilege used */
4176 		if (privused != NULL)
4177 			*privused = 1;
4178 		return (0);
4179 	}
4180 
4181 	return ((accmode & VADMIN) ? EPERM : EACCES);
4182 }
4183 
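/*
 * Example (illustrative sketch only; "examplefs" and its "en_*" fields
 * are hypothetical): a typical VOP_ACCESS implementation ends by handing
 * the file's mode, owner and group to vaccess().
 */
#if 0
static int
examplefs_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct examplefs_node *enp = vp->v_data;	/* hypothetical */

	return (vaccess(vp->v_type, enp->en_mode, enp->en_uid,
	    enp->en_gid, ap->a_accmode, ap->a_cred, NULL));
}
#endif
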
4184 /*
4185  * Credential check based on the process requesting the service and on
4186  * per-attribute permissions.
4187  */
4188 int
4189 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
4190     struct thread *td, accmode_t accmode)
4191 {
4192 
4193 	/*
4194 	 * Kernel-invoked requests always succeed.
4195 	 */
4196 	if (cred == NOCRED)
4197 		return (0);
4198 
4199 	/*
4200 	 * Do not allow privileged processes in jail to directly manipulate
4201 	 * system attributes.
4202 	 */
4203 	switch (attrnamespace) {
4204 	case EXTATTR_NAMESPACE_SYSTEM:
4205 		/* Potentially should be: return (EPERM); */
4206 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
4207 	case EXTATTR_NAMESPACE_USER:
4208 		return (VOP_ACCESS(vp, accmode, cred, td));
4209 	default:
4210 		return (EPERM);
4211 	}
4212 }
4213 
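/*
 * Example (illustrative sketch only; "examplefs" is hypothetical): an
 * extended attribute VOP implementation calls extattr_check_cred() with
 * VREAD or VWRITE before touching the attribute data.
 */
#if 0
static int
examplefs_getextattr(struct vop_getextattr_args *ap)
{
	int error;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);
	/* ... copy the attribute value into ap->a_uio here ... */
	return (EOPNOTSUPP);
}
#endif
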
4214 #ifdef DEBUG_VFS_LOCKS
4215 /*
4216  * This only exists to suppress warnings from unlocked specfs accesses.  It is
4217  * no longer ok to have an unlocked VFS.
4218  */
4219 #define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
4220 	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
4221 
4222 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
4223 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
4224     "Drop into debugger on lock violation");
4225 
4226 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
4227 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
4228     0, "Check for interlock across VOPs");
4229 
4230 int vfs_badlock_print = 1;	/* Print lock violations. */
4231 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
4232     0, "Print lock violations");
4233 
4234 #ifdef KDB
4235 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
4236 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
4237     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
4238 #endif
4239 
4240 static void
4241 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
4242 {
4243 
4244 #ifdef KDB
4245 	if (vfs_badlock_backtrace)
4246 		kdb_backtrace();
4247 #endif
4248 	if (vfs_badlock_print)
4249 		printf("%s: %p %s\n", str, (void *)vp, msg);
4250 	if (vfs_badlock_ddb)
4251 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4252 }
4253 
4254 void
4255 assert_vi_locked(struct vnode *vp, const char *str)
4256 {
4257 
4258 	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
4259 		vfs_badlock("interlock is not locked but should be", str, vp);
4260 }
4261 
4262 void
4263 assert_vi_unlocked(struct vnode *vp, const char *str)
4264 {
4265 
4266 	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
4267 		vfs_badlock("interlock is locked but should not be", str, vp);
4268 }
4269 
4270 void
4271 assert_vop_locked(struct vnode *vp, const char *str)
4272 {
4273 	int locked;
4274 
4275 	if (!IGNORE_LOCK(vp)) {
4276 		locked = VOP_ISLOCKED(vp);
4277 		if (locked == 0 || locked == LK_EXCLOTHER)
4278 			vfs_badlock("is not locked but should be", str, vp);
4279 	}
4280 }
4281 
4282 void
4283 assert_vop_unlocked(struct vnode *vp, const char *str)
4284 {
4285 
4286 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4287 		vfs_badlock("is locked but should not be", str, vp);
4288 }
4289 
4290 void
4291 assert_vop_elocked(struct vnode *vp, const char *str)
4292 {
4293 
4294 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4295 		vfs_badlock("is not exclusive locked but should be", str, vp);
4296 }
4297 
4298 #if 0
4299 void
4300 assert_vop_elocked_other(struct vnode *vp, const char *str)
4301 {
4302 
4303 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
4304 		vfs_badlock("is not exclusive locked by another thread",
4305 		    str, vp);
4306 }
4307 
4308 void
4309 assert_vop_slocked(struct vnode *vp, const char *str)
4310 {
4311 
4312 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
4313 		vfs_badlock("is not locked shared but should be", str, vp);
4314 }
4315 #endif /* 0 */
4316 #endif /* DEBUG_VFS_LOCKS */
4317 
4318 void
4319 vop_rename_fail(struct vop_rename_args *ap)
4320 {
4321 
4322 	if (ap->a_tvp != NULL)
4323 		vput(ap->a_tvp);
4324 	if (ap->a_tdvp == ap->a_tvp)
4325 		vrele(ap->a_tdvp);
4326 	else
4327 		vput(ap->a_tdvp);
4328 	vrele(ap->a_fdvp);
4329 	vrele(ap->a_fvp);
4330 }
4331 
4332 void
4333 vop_rename_pre(void *ap)
4334 {
4335 	struct vop_rename_args *a = ap;
4336 
4337 #ifdef DEBUG_VFS_LOCKS
4338 	if (a->a_tvp)
4339 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4340 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4341 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4342 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4343 
4344 	/* Check the source (from). */
4345 	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4346 	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4347 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4348 	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4349 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4350 
4351 	/* Check the target. */
4352 	if (a->a_tvp)
4353 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4354 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4355 #endif
4356 	if (a->a_tdvp != a->a_fdvp)
4357 		vhold(a->a_fdvp);
4358 	if (a->a_tvp != a->a_fvp)
4359 		vhold(a->a_fvp);
4360 	vhold(a->a_tdvp);
4361 	if (a->a_tvp)
4362 		vhold(a->a_tvp);
4363 }
4364 
4365 void
4366 vop_strategy_pre(void *ap)
4367 {
4368 #ifdef DEBUG_VFS_LOCKS
4369 	struct vop_strategy_args *a;
4370 	struct buf *bp;
4371 
4372 	a = ap;
4373 	bp = a->a_bp;
4374 
4375 	/*
4376 	 * Cluster ops lock their component buffers but not the IO container.
4377 	 */
4378 	if ((bp->b_flags & B_CLUSTER) != 0)
4379 		return;
4380 
4381 	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
4382 		if (vfs_badlock_print)
4383 			printf(
4384 			    "VOP_STRATEGY: bp is not locked but should be\n");
4385 		if (vfs_badlock_ddb)
4386 			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4387 	}
4388 #endif
4389 }
4390 
4391 void
4392 vop_lock_pre(void *ap)
4393 {
4394 #ifdef DEBUG_VFS_LOCKS
4395 	struct vop_lock1_args *a = ap;
4396 
4397 	if ((a->a_flags & LK_INTERLOCK) == 0)
4398 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4399 	else
4400 		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
4401 #endif
4402 }
4403 
4404 void
4405 vop_lock_post(void *ap, int rc)
4406 {
4407 #ifdef DEBUG_VFS_LOCKS
4408 	struct vop_lock1_args *a = ap;
4409 
4410 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4411 	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
4412 		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
4413 #endif
4414 }
4415 
4416 void
4417 vop_unlock_pre(void *ap)
4418 {
4419 #ifdef DEBUG_VFS_LOCKS
4420 	struct vop_unlock_args *a = ap;
4421 
4422 	if (a->a_flags & LK_INTERLOCK)
4423 		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
4424 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
4425 #endif
4426 }
4427 
4428 void
4429 vop_unlock_post(void *ap, int rc)
4430 {
4431 #ifdef DEBUG_VFS_LOCKS
4432 	struct vop_unlock_args *a = ap;
4433 
4434 	if (a->a_flags & LK_INTERLOCK)
4435 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
4436 #endif
4437 }
4438 
4439 void
4440 vop_create_post(void *ap, int rc)
4441 {
4442 	struct vop_create_args *a = ap;
4443 
4444 	if (!rc)
4445 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4446 }
4447 
4448 void
4449 vop_deleteextattr_post(void *ap, int rc)
4450 {
4451 	struct vop_deleteextattr_args *a = ap;
4452 
4453 	if (!rc)
4454 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4455 }
4456 
4457 void
4458 vop_link_post(void *ap, int rc)
4459 {
4460 	struct vop_link_args *a = ap;
4461 
4462 	if (!rc) {
4463 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4464 		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4465 	}
4466 }
4467 
4468 void
4469 vop_mkdir_post(void *ap, int rc)
4470 {
4471 	struct vop_mkdir_args *a = ap;
4472 
4473 	if (!rc)
4474 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4475 }
4476 
4477 void
4478 vop_mknod_post(void *ap, int rc)
4479 {
4480 	struct vop_mknod_args *a = ap;
4481 
4482 	if (!rc)
4483 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4484 }
4485 
4486 void
4487 vop_reclaim_post(void *ap, int rc)
4488 {
4489 	struct vop_reclaim_args *a = ap;
4490 
4491 	if (!rc)
4492 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
4493 }
4494 
4495 void
4496 vop_remove_post(void *ap, int rc)
4497 {
4498 	struct vop_remove_args *a = ap;
4499 
4500 	if (!rc) {
4501 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4502 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4503 	}
4504 }
4505 
4506 void
4507 vop_rename_post(void *ap, int rc)
4508 {
4509 	struct vop_rename_args *a = ap;
4510 
4511 	if (!rc) {
4512 		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4513 		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4514 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4515 		if (a->a_tvp)
4516 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4517 	}
4518 	if (a->a_tdvp != a->a_fdvp)
4519 		vdrop(a->a_fdvp);
4520 	if (a->a_tvp != a->a_fvp)
4521 		vdrop(a->a_fvp);
4522 	vdrop(a->a_tdvp);
4523 	if (a->a_tvp)
4524 		vdrop(a->a_tvp);
4525 }
4526 
4527 void
4528 vop_rmdir_post(void *ap, int rc)
4529 {
4530 	struct vop_rmdir_args *a = ap;
4531 
4532 	if (!rc) {
4533 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4534 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4535 	}
4536 }
4537 
4538 void
4539 vop_setattr_post(void *ap, int rc)
4540 {
4541 	struct vop_setattr_args *a = ap;
4542 
4543 	if (!rc)
4544 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4545 }
4546 
4547 void
4548 vop_setextattr_post(void *ap, int rc)
4549 {
4550 	struct vop_setextattr_args *a = ap;
4551 
4552 	if (!rc)
4553 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4554 }
4555 
4556 void
4557 vop_symlink_post(void *ap, int rc)
4558 {
4559 	struct vop_symlink_args *a = ap;
4560 
4561 	if (!rc)
4562 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4563 }
4564 
4565 static struct knlist fs_knlist;
4566 
4567 static void
4568 vfs_event_init(void *arg)
4569 {
4570 	knlist_init_mtx(&fs_knlist, NULL);
4571 }
4572 /* XXX - correct order? */
4573 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4574 
4575 void
4576 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4577 {
4578 
4579 	KNOTE_UNLOCKED(&fs_knlist, event);
4580 }
4581 
4582 static int	filt_fsattach(struct knote *kn);
4583 static void	filt_fsdetach(struct knote *kn);
4584 static int	filt_fsevent(struct knote *kn, long hint);
4585 
4586 struct filterops fs_filtops = {
4587 	.f_isfd = 0,
4588 	.f_attach = filt_fsattach,
4589 	.f_detach = filt_fsdetach,
4590 	.f_event = filt_fsevent
4591 };
4592 
4593 static int
4594 filt_fsattach(struct knote *kn)
4595 {
4596 
4597 	kn->kn_flags |= EV_CLEAR;
4598 	knlist_add(&fs_knlist, kn, 0);
4599 	return (0);
4600 }
4601 
4602 static void
4603 filt_fsdetach(struct knote *kn)
4604 {
4605 
4606 	knlist_remove(&fs_knlist, kn, 0);
4607 }
4608 
4609 static int
4610 filt_fsevent(struct knote *kn, long hint)
4611 {
4612 
4613 	kn->kn_fflags |= hint;
4614 	return (kn->kn_fflags != 0);
4615 }
4616 
4617 static int
4618 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4619 {
4620 	struct vfsidctl vc;
4621 	int error;
4622 	struct mount *mp;
4623 
4624 	error = SYSCTL_IN(req, &vc, sizeof(vc));
4625 	if (error)
4626 		return (error);
4627 	if (vc.vc_vers != VFS_CTL_VERS1)
4628 		return (EINVAL);
4629 	mp = vfs_getvfs(&vc.vc_fsid);
4630 	if (mp == NULL)
4631 		return (ENOENT);
4632 	/* ensure that a specific sysctl goes to the right filesystem. */
4633 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4634 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4635 		vfs_rel(mp);
4636 		return (EINVAL);
4637 	}
4638 	VCTLTOREQ(&vc, req);
4639 	error = VFS_SYSCTL(mp, vc.vc_op, req);
4640 	vfs_rel(mp);
4641 	return (error);
4642 }
4643 
4644 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4645     NULL, 0, sysctl_vfs_ctl, "",
4646     "Sysctl by fsid");
4647 
4648 /*
4649  * Function to initialize a va_filerev field sensibly.
4650  * XXX: Wouldn't a random number make a lot more sense ??
4651  */
4652 u_quad_t
4653 init_va_filerev(void)
4654 {
4655 	struct bintime bt;
4656 
4657 	getbinuptime(&bt);
4658 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4659 }
4660 
4661 static int	filt_vfsread(struct knote *kn, long hint);
4662 static int	filt_vfswrite(struct knote *kn, long hint);
4663 static int	filt_vfsvnode(struct knote *kn, long hint);
4664 static void	filt_vfsdetach(struct knote *kn);
4665 static struct filterops vfsread_filtops = {
4666 	.f_isfd = 1,
4667 	.f_detach = filt_vfsdetach,
4668 	.f_event = filt_vfsread
4669 };
4670 static struct filterops vfswrite_filtops = {
4671 	.f_isfd = 1,
4672 	.f_detach = filt_vfsdetach,
4673 	.f_event = filt_vfswrite
4674 };
4675 static struct filterops vfsvnode_filtops = {
4676 	.f_isfd = 1,
4677 	.f_detach = filt_vfsdetach,
4678 	.f_event = filt_vfsvnode
4679 };
4680 
4681 static void
4682 vfs_knllock(void *arg)
4683 {
4684 	struct vnode *vp = arg;
4685 
4686 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4687 }
4688 
4689 static void
4690 vfs_knlunlock(void *arg)
4691 {
4692 	struct vnode *vp = arg;
4693 
4694 	VOP_UNLOCK(vp, 0);
4695 }
4696 
4697 static void
4698 vfs_knl_assert_locked(void *arg)
4699 {
4700 #ifdef DEBUG_VFS_LOCKS
4701 	struct vnode *vp = arg;
4702 
4703 	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4704 #endif
4705 }
4706 
4707 static void
4708 vfs_knl_assert_unlocked(void *arg)
4709 {
4710 #ifdef DEBUG_VFS_LOCKS
4711 	struct vnode *vp = arg;
4712 
4713 	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4714 #endif
4715 }
4716 
4717 int
4718 vfs_kqfilter(struct vop_kqfilter_args *ap)
4719 {
4720 	struct vnode *vp = ap->a_vp;
4721 	struct knote *kn = ap->a_kn;
4722 	struct knlist *knl;
4723 
4724 	switch (kn->kn_filter) {
4725 	case EVFILT_READ:
4726 		kn->kn_fop = &vfsread_filtops;
4727 		break;
4728 	case EVFILT_WRITE:
4729 		kn->kn_fop = &vfswrite_filtops;
4730 		break;
4731 	case EVFILT_VNODE:
4732 		kn->kn_fop = &vfsvnode_filtops;
4733 		break;
4734 	default:
4735 		return (EINVAL);
4736 	}
4737 
4738 	kn->kn_hook = (caddr_t)vp;
4739 
4740 	v_addpollinfo(vp);
4741 	if (vp->v_pollinfo == NULL)
4742 		return (ENOMEM);
4743 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4744 	vhold(vp);
4745 	knlist_add(knl, kn, 0);
4746 
4747 	return (0);
4748 }
4749 
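/*
 * Example (illustrative sketch only; "examplefs" is hypothetical): a
 * filesystem can point its VOP_KQFILTER entry at vfs_kqfilter() to use
 * the generic vnode read/write/vnode filters below.
 */
#if 0
static struct vop_vector examplefs_vnodeops = {
	.vop_default =	&default_vnodeops,
	.vop_kqfilter =	vfs_kqfilter,
};
#endif
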
4750 /*
4751  * Detach knote from vnode
4752  */
4753 static void
4754 filt_vfsdetach(struct knote *kn)
4755 {
4756 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4757 
4758 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4759 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4760 	vdrop(vp);
4761 }
4762 
4763 /*ARGSUSED*/
4764 static int
4765 filt_vfsread(struct knote *kn, long hint)
4766 {
4767 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4768 	struct vattr va;
4769 	int res;
4770 
4771 	/*
4772 	 * The filesystem is gone, so set the EOF flag and schedule
4773 	 * the knote for deletion.
4774 	 */
4775 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
4776 		VI_LOCK(vp);
4777 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4778 		VI_UNLOCK(vp);
4779 		return (1);
4780 	}
4781 
4782 	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4783 		return (0);
4784 
4785 	VI_LOCK(vp);
4786 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4787 	res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
4788 	VI_UNLOCK(vp);
4789 	return (res);
4790 }
4791 
4792 /*ARGSUSED*/
4793 static int
4794 filt_vfswrite(struct knote *kn, long hint)
4795 {
4796 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4797 
4798 	VI_LOCK(vp);
4799 
4800 	/*
4801 	 * The filesystem is gone, so set the EOF flag and schedule
4802 	 * the knote for deletion.
4803 	 */
4804 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
4805 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4806 
4807 	kn->kn_data = 0;
4808 	VI_UNLOCK(vp);
4809 	return (1);
4810 }
4811 
4812 static int
4813 filt_vfsvnode(struct knote *kn, long hint)
4814 {
4815 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4816 	int res;
4817 
4818 	VI_LOCK(vp);
4819 	if (kn->kn_sfflags & hint)
4820 		kn->kn_fflags |= hint;
4821 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
4822 		kn->kn_flags |= EV_EOF;
4823 		VI_UNLOCK(vp);
4824 		return (1);
4825 	}
4826 	res = (kn->kn_fflags != 0);
4827 	VI_UNLOCK(vp);
4828 	return (res);
4829 }
4830 
4831 int
4832 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4833 {
4834 	int error;
4835 
4836 	if (dp->d_reclen > ap->a_uio->uio_resid)
4837 		return (ENAMETOOLONG);
4838 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4839 	if (error) {
4840 		if (ap->a_ncookies != NULL) {
4841 			if (ap->a_cookies != NULL)
4842 				free(ap->a_cookies, M_TEMP);
4843 			ap->a_cookies = NULL;
4844 			*ap->a_ncookies = 0;
4845 		}
4846 		return (error);
4847 	}
4848 	if (ap->a_ncookies == NULL)
4849 		return (0);
4850 
4851 	KASSERT(ap->a_cookies,
4852 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4853 
4854 	*ap->a_cookies = realloc(*ap->a_cookies,
4855 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4856 	(*ap->a_cookies)[*ap->a_ncookies] = off;
4857 	return (0);
4858 }
4859 
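/*
 * Example (illustrative sketch only; "examplefs" and the entry contents
 * are hypothetical): a VOP_READDIR implementation builds a struct dirent
 * for each entry and emits it with vfs_read_dirent(), which also records
 * the cookie when one was requested.
 */
#if 0
static int
examplefs_readdir(struct vop_readdir_args *ap)
{
	struct dirent de;
	int error;

	error = 0;
	if (ap->a_uio->uio_offset == 0) {
		/* Emit a single synthetic "." entry. */
		bzero(&de, sizeof(de));
		de.d_fileno = 2;		/* hypothetical inode number */
		de.d_type = DT_DIR;
		de.d_namlen = 1;
		strcpy(de.d_name, ".");
		de.d_reclen = GENERIC_DIRSIZ(&de);
		error = vfs_read_dirent(ap, &de, ap->a_uio->uio_offset);
		if (error == 0)
			ap->a_uio->uio_offset = de.d_reclen;
	}
	if (error == 0 && ap->a_eofflag != NULL)
		*ap->a_eofflag = 1;
	return (error);
}
#endif
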
4860 /*
4861  * Mark for update the access time of the file if the filesystem
4862  * supports VOP_MARKATIME.  This functionality is used by execve and
4863  * mmap, so we want to avoid the I/O implied by directly setting
4864  * va_atime for the sake of efficiency.
4865  */
4866 void
4867 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4868 {
4869 	struct mount *mp;
4870 
4871 	mp = vp->v_mount;
4872 	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4873 	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4874 		(void)VOP_MARKATIME(vp);
4875 }
4876 
4877 /*
4878  * The purpose of this routine is to remove granularity from accmode_t,
4879  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4880  * VADMIN and VAPPEND.
4881  *
4882  * If it returns 0, the caller is supposed to continue with the usual
4883  * access checks using 'accmode' as modified by this routine.  If it
4884  * returns a nonzero value, the caller is supposed to return that value
4885  * as errno.
4886  *
4887  * Note that after this routine runs, accmode may be zero.
4888  */
4889 int
4890 vfs_unixify_accmode(accmode_t *accmode)
4891 {
4892 	/*
4893 	 * There is no way to specify an explicit "deny" rule using
4894 	 * file mode or POSIX.1e ACLs.
4895 	 */
4896 	if (*accmode & VEXPLICIT_DENY) {
4897 		*accmode = 0;
4898 		return (0);
4899 	}
4900 
4901 	/*
4902 	 * None of these can be translated into the usual access bits.
4903 	 * Also, the common case for NFSv4 ACLs is to not contain
4904 	 * either of these bits.  The caller should check for VWRITE
4905 	 * on the containing directory instead.
4906 	 */
4907 	if (*accmode & (VDELETE_CHILD | VDELETE))
4908 		return (EPERM);
4909 
4910 	if (*accmode & VADMIN_PERMS) {
4911 		*accmode &= ~VADMIN_PERMS;
4912 		*accmode |= VADMIN;
4913 	}
4914 
4915 	/*
4916 	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4917 	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4918 	 */
4919 	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4920 
4921 	return (0);
4922 }
4923 
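/*
 * Example (illustrative sketch only; "examplefs" is hypothetical and the
 * mode/uid/gid values are placeholders): a VOP_ACCESSX-style caller
 * first folds the fine-grained bits down with vfs_unixify_accmode() and
 * then performs the usual checks.
 */
#if 0
static int
examplefs_accessx(struct vop_accessx_args *ap)
{
	accmode_t accmode = ap->a_accmode;
	int error;

	error = vfs_unixify_accmode(&accmode);
	if (error != 0)
		return (error);
	if (accmode == 0)
		return (0);
	return (vaccess(ap->a_vp->v_type, 0555, 0, 0,	/* placeholders */
	    accmode, ap->a_cred, NULL));
}
#endif
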
4924 /*
4925  * These are helper functions for filesystems to traverse all
4926  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4927  *
4928  * This interface replaces MNT_VNODE_FOREACH.
4929  */
4930 
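/*
 * Example (illustrative sketch only; "examplefs" is hypothetical): a
 * filesystem's sync or flush routine walks its vnodes roughly like this.
 * The iterator returns each vnode with its interlock held; the body must
 * release it, either via VI_UNLOCK() when skipping the vnode or by
 * letting vget() consume it with LK_INTERLOCK.
 */
#if 0
static void
examplefs_visit_all_vnodes(struct mount *mp)
{
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		if (vp->v_type == VNON) {
			VI_UNLOCK(vp);
			continue;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
		    curthread) != 0)
			continue;
		/* ... inspect or flush the vnode here ... */
		vput(vp);
	}
}
#endif
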
4931 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4932 
4933 struct vnode *
4934 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4935 {
4936 	struct vnode *vp;
4937 
4938 	if (should_yield())
4939 		kern_yield(PRI_USER);
4940 	MNT_ILOCK(mp);
4941 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4942 	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4943 	while (vp != NULL && (vp->v_type == VMARKER ||
4944 	    (vp->v_iflag & VI_DOOMED) != 0))
4945 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4946 
4947 	/* Check if we are done */
4948 	if (vp == NULL) {
4949 		__mnt_vnode_markerfree_all(mvp, mp);
4950 		/* MNT_IUNLOCK(mp); -- done in above function */
4951 		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4952 		return (NULL);
4953 	}
4954 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4955 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4956 	VI_LOCK(vp);
4957 	MNT_IUNLOCK(mp);
4958 	return (vp);
4959 }
4960 
4961 struct vnode *
4962 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4963 {
4964 	struct vnode *vp;
4965 
4966 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4967 	MNT_ILOCK(mp);
4968 	MNT_REF(mp);
4969 	(*mvp)->v_type = VMARKER;
4970 
4971 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4972 	while (vp != NULL && (vp->v_type == VMARKER ||
4973 	    (vp->v_iflag & VI_DOOMED) != 0))
4974 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4975 
4976 	/* Check if we are done */
4977 	if (vp == NULL) {
4978 		MNT_REL(mp);
4979 		MNT_IUNLOCK(mp);
4980 		free(*mvp, M_VNODE_MARKER);
4981 		*mvp = NULL;
4982 		return (NULL);
4983 	}
4984 	(*mvp)->v_mount = mp;
4985 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4986 	VI_LOCK(vp);
4987 	MNT_IUNLOCK(mp);
4988 	return (vp);
4989 }
4990 
4991 
4992 void
4993 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4994 {
4995 
4996 	if (*mvp == NULL) {
4997 		MNT_IUNLOCK(mp);
4998 		return;
4999 	}
5000 
5001 	mtx_assert(MNT_MTX(mp), MA_OWNED);
5002 
5003 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
5004 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
5005 	MNT_REL(mp);
5006 	MNT_IUNLOCK(mp);
5007 	free(*mvp, M_VNODE_MARKER);
5008 	*mvp = NULL;
5009 }
5010 
5011 /*
5012  * These are helper functions for filesystems to traverse their
5013  * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h.
5014  */
5015 static void
5016 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
5017 {
5018 
5019 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
5020 
5021 	MNT_ILOCK(mp);
5022 	MNT_REL(mp);
5023 	MNT_IUNLOCK(mp);
5024 	free(*mvp, M_VNODE_MARKER);
5025 	*mvp = NULL;
5026 }
5027 
5028 static struct vnode *
5029 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
5030 {
5031 	struct vnode *vp, *nvp;
5032 
5033 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
5034 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
5035 restart:
5036 	vp = TAILQ_NEXT(*mvp, v_actfreelist);
5037 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
5038 	while (vp != NULL) {
5039 		if (vp->v_type == VMARKER) {
5040 			vp = TAILQ_NEXT(vp, v_actfreelist);
5041 			continue;
5042 		}
5043 		if (!VI_TRYLOCK(vp)) {
5044 			if (mp_ncpus == 1 || should_yield()) {
5045 				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
5046 				mtx_unlock(&vnode_free_list_mtx);
5047 				pause("vnacti", 1);
5048 				mtx_lock(&vnode_free_list_mtx);
5049 				goto restart;
5050 			}
5051 			continue;
5052 		}
5053 		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
5054 		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
5055 		    ("alien vnode on the active list %p %p", vp, mp));
5056 		if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
5057 			break;
5058 		nvp = TAILQ_NEXT(vp, v_actfreelist);
5059 		VI_UNLOCK(vp);
5060 		vp = nvp;
5061 	}
5062 
5063 	/* Check if we are done */
5064 	if (vp == NULL) {
5065 		mtx_unlock(&vnode_free_list_mtx);
5066 		mnt_vnode_markerfree_active(mvp, mp);
5067 		return (NULL);
5068 	}
5069 	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
5070 	mtx_unlock(&vnode_free_list_mtx);
5071 	ASSERT_VI_LOCKED(vp, "active iter");
5072 	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
5073 	return (vp);
5074 }
5075 
5076 struct vnode *
5077 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
5078 {
5079 
5080 	if (should_yield())
5081 		kern_yield(PRI_USER);
5082 	mtx_lock(&vnode_free_list_mtx);
5083 	return (mnt_vnode_next_active(mvp, mp));
5084 }
5085 
5086 struct vnode *
5087 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
5088 {
5089 	struct vnode *vp;
5090 
5091 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
5092 	MNT_ILOCK(mp);
5093 	MNT_REF(mp);
5094 	MNT_IUNLOCK(mp);
5095 	(*mvp)->v_type = VMARKER;
5096 	(*mvp)->v_mount = mp;
5097 
5098 	mtx_lock(&vnode_free_list_mtx);
5099 	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
5100 	if (vp == NULL) {
5101 		mtx_unlock(&vnode_free_list_mtx);
5102 		mnt_vnode_markerfree_active(mvp, mp);
5103 		return (NULL);
5104 	}
5105 	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
5106 	return (mnt_vnode_next_active(mvp, mp));
5107 }
5108 
5109 void
5110 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
5111 {
5112 
5113 	if (*mvp == NULL)
5114 		return;
5115 
5116 	mtx_lock(&vnode_free_list_mtx);
5117 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
5118 	mtx_unlock(&vnode_free_list_mtx);
5119 	mnt_vnode_markerfree_active(mvp, mp);
5120 }
5121