xref: /freebsd/sys/kern/vfs_subr.c (revision db7b0ba78fea6ccdf0a708c73923ad24c009a58c)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35  */
36 
37 /*
38  * External virtual filesystem routines
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_compat.h"
45 #include "opt_ddb.h"
46 #include "opt_watchdog.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/bio.h>
51 #include <sys/buf.h>
52 #include <sys/condvar.h>
53 #include <sys/conf.h>
54 #include <sys/dirent.h>
55 #include <sys/event.h>
56 #include <sys/eventhandler.h>
57 #include <sys/extattr.h>
58 #include <sys/file.h>
59 #include <sys/fcntl.h>
60 #include <sys/jail.h>
61 #include <sys/kdb.h>
62 #include <sys/kernel.h>
63 #include <sys/kthread.h>
64 #include <sys/lockf.h>
65 #include <sys/malloc.h>
66 #include <sys/mount.h>
67 #include <sys/namei.h>
68 #include <sys/priv.h>
69 #include <sys/reboot.h>
70 #include <sys/sched.h>
71 #include <sys/sleepqueue.h>
72 #include <sys/smp.h>
73 #include <sys/stat.h>
74 #include <sys/sysctl.h>
75 #include <sys/syslog.h>
76 #include <sys/vmmeter.h>
77 #include <sys/vnode.h>
78 #include <sys/watchdog.h>
79 
80 #include <machine/stdarg.h>
81 
82 #include <security/mac/mac_framework.h>
83 
84 #include <vm/vm.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_extern.h>
87 #include <vm/pmap.h>
88 #include <vm/vm_map.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_kern.h>
91 #include <vm/uma.h>
92 
93 #ifdef DDB
94 #include <ddb/ddb.h>
95 #endif
96 
/*
 * Forward declarations of the file-local (static) helpers, listed
 * alphabetically.
 */
static void	delmntque(struct vnode *vp);
static void	destroy_vpollinfo(struct vpollinfo *vi);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static void	v_decr_usecount(struct vnode *vp);
static void	v_decr_useonly(struct vnode *vp);
static void	v_incr_usecount(struct vnode *vp);
static void	v_upgrade_usecount(struct vnode *vp);
static void	vfs_knl_assert_locked(void *arg);
static void	vfs_knl_assert_unlocked(void *arg);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static void	vgonel(struct vnode *vp);
static void	vnlru_free(int count);
static int	vtryrecycle(struct vnode *vp);
113 
114 /*
115  * Number of vnodes in existence.  Increased whenever getnewvnode()
116  * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
117  */
118 static unsigned long	numvnodes;
119 
120 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
121     "Number of vnodes in existence");
122 
123 /*
124  * Conversion tables for conversion from vnode types to inode formats
125  * and back.
126  */
127 enum vtype iftovt_tab[16] = {
128 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
129 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
130 };
131 int vttoif_tab[10] = {
132 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
133 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
134 };
135 
136 /*
137  * List of vnodes that are ready for recycling.
138  */
139 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
140 
141 /*
142  * Free vnode target.  Free vnodes may simply be files which have been stat'd
143  * but not read.  This is somewhat common, and a small cache of such files
144  * should be kept to avoid recreation costs.
145  */
146 static u_long wantfreevnodes;
147 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
148 /* Number of vnodes in the free list. */
149 static u_long freevnodes;
150 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
151     "Number of vnodes in the free list");
152 
153 static int vlru_allow_cache_src;
154 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
155     &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
156 
157 /*
158  * Various variables used for debugging the new implementation of
159  * reassignbuf().
160  * XXX these are probably of (very) limited utility now.
161  */
162 static int reassignbufcalls;
163 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
164     "Number of calls to reassignbuf");
165 
166 /*
167  * Cache for the mount type id assigned to NFS.  This is used for
168  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
169  */
170 int	nfs_mount_type = -1;
171 
172 /* To keep more than one thread at a time from running vfs_getnewfsid */
173 static struct mtx mntid_mtx;
174 
175 /*
176  * Lock for any access to the following:
177  *	vnode_free_list
178  *	numvnodes
179  *	freevnodes
180  */
181 static struct mtx vnode_free_list_mtx;
182 
183 /* Publicly exported FS */
184 struct nfs_public nfs_pub;
185 
186 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
187 static uma_zone_t vnode_zone;
188 static uma_zone_t vnodepoll_zone;
189 
190 /*
191  * The workitem queue.
192  *
193  * It is useful to delay writes of file data and filesystem metadata
194  * for tens of seconds so that quickly created and deleted files need
195  * not waste disk bandwidth being created and removed. To realize this,
196  * we append vnodes to a "workitem" queue. When running with a soft
197  * updates implementation, most pending metadata dependencies should
198  * not wait for more than a few seconds. Thus, writes to mounted block
199  * devices are delayed only about half the time that file data is delayed.
200  * Similarly, directory updates are more critical, so are only delayed
201  * about a third the time that file data is delayed. Thus, there are
202  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
203  * one each second (driven off the filesystem syncer process). The
204  * syncer_delayno variable indicates the next queue that is to be processed.
205  * Items that need to be processed soon are placed in this queue:
206  *
207  *	syncer_workitem_pending[syncer_delayno]
208  *
209  * A delay of fifteen seconds is done by placing the request fifteen
210  * entries later in the queue:
211  *
212  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
213  *
214  */
215 static int syncer_delayno;
216 static long syncer_mask;
217 LIST_HEAD(synclist, bufobj);
218 static struct synclist *syncer_workitem_pending;
219 /*
220  * The sync_mtx protects:
221  *	bo->bo_synclist
222  *	sync_vnode_count
223  *	syncer_delayno
224  *	syncer_state
225  *	syncer_workitem_pending
226  *	syncer_worklist_len
227  *	rushjob
228  */
229 static struct mtx sync_mtx;
230 static struct cv sync_wakeup;
231 
232 #define SYNCER_MAXDELAY		32
233 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
234 static int syncdelay = 30;		/* max time to delay syncing data */
235 static int filedelay = 30;		/* time to delay syncing files */
236 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
237     "Time to delay syncing files (in seconds)");
238 static int dirdelay = 29;		/* time to delay syncing directories */
239 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
240     "Time to delay syncing directories (in seconds)");
241 static int metadelay = 28;		/* time to delay syncing metadata */
242 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
243     "Time to delay syncing metadata (in seconds)");
244 static int rushjob;		/* number of slots to run ASAP */
245 static int stat_rush_requests;	/* number of times I/O speeded up */
246 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
247     "Number of times I/O speeded up (rush requests)");
248 
/*
 * Speed-up factor for the syncer during shutdown: it runs at four times
 * its normal speed.
 */
#define SYNCER_SHUTDOWN_SPEEDUP		4

/* Syncer bookkeeping; all three are protected by sync_mtx. */
static int sync_vnode_count;
static int syncer_worklist_len;
static enum {
	SYNCER_RUNNING,
	SYNCER_SHUTTING_DOWN,
	SYNCER_FINAL_DELAY
} syncer_state;
257 
258 /*
259  * Number of vnodes we want to exist at any one time.  This is mostly used
260  * to size hash tables in vnode-related code.  It is normally not used in
261  * getnewvnode(), as wantfreevnodes is normally nonzero.)
262  *
263  * XXX desiredvnodes is historical cruft and should not exist.
264  */
265 int desiredvnodes;
266 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
267     &desiredvnodes, 0, "Maximum number of vnodes");
268 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
269     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
270 static int vnlru_nowhere;
271 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
272     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
273 
/*
 * Predicates deciding when a vnode is freed and recycled.  Each one reads
 * v_iflag and v_holdcnt, so the caller must hold the vnode interlock.
 *
 *	VCANRECYCLE:	on the free list (VI_FREE) and not held.
 *	VSHOULDFREE:	not on the free list and not held.
 *	VSHOULDBUSY:	on the free list but held.
 */
#define VCANRECYCLE(vp)							\
	(((vp)->v_iflag & VI_FREE) != 0 && (vp)->v_holdcnt == 0)
#define VSHOULDFREE(vp)							\
	(((vp)->v_iflag & VI_FREE) == 0 && (vp)->v_holdcnt == 0)
#define VSHOULDBUSY(vp)							\
	(((vp)->v_iflag & VI_FREE) != 0 && (vp)->v_holdcnt != 0)

/*
 * Shift count for (uintptr_t)vp to initialize vp->v_hash; vntblinit()
 * sets it to floor(log2(sizeof(struct vnode))).
 */
static int vnsz2log;
284 
285 /*
286  * Initialize the vnode management data structures.
287  *
288  * Reevaluate the following cap on the number of vnodes after the physical
289  * memory size exceeds 512GB.  In the limit, as the physical memory size
290  * grows, the ratio of physical pages to vnodes approaches sixteen to one.
291  */
292 #ifndef	MAXVNODES_MAX
293 #define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
294 #endif
295 static void
296 vntblinit(void *dummy __unused)
297 {
298 	u_int i;
299 	int physvnodes, virtvnodes;
300 
301 	/*
302 	 * Desiredvnodes is a function of the physical memory size and the
303 	 * kernel's heap size.  Generally speaking, it scales with the
304 	 * physical memory size.  The ratio of desiredvnodes to physical pages
305 	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
306 	 * marginal ratio of desiredvnodes to physical pages is one to
307 	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
308 	 * size.  The memory required by desiredvnodes vnodes and vm objects
309 	 * may not exceed one seventh of the kernel's heap size.
310 	 */
311 	physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
312 	    cnt.v_page_count) / 16;
313 	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
314 	    sizeof(struct vnode)));
315 	desiredvnodes = min(physvnodes, virtvnodes);
316 	if (desiredvnodes > MAXVNODES_MAX) {
317 		if (bootverbose)
318 			printf("Reducing kern.maxvnodes %d -> %d\n",
319 			    desiredvnodes, MAXVNODES_MAX);
320 		desiredvnodes = MAXVNODES_MAX;
321 	}
322 	wantfreevnodes = desiredvnodes / 4;
323 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
324 	TAILQ_INIT(&vnode_free_list);
325 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
326 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
327 	    NULL, NULL, UMA_ALIGN_PTR, 0);
328 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
329 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
330 	/*
331 	 * Initialize the filesystem syncer.
332 	 */
333 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
334 	    &syncer_mask);
335 	syncer_maxdelay = syncer_mask + 1;
336 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
337 	cv_init(&sync_wakeup, "syncer");
338 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
339 		vnsz2log++;
340 	vnsz2log--;
341 }
342 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
343 
344 
345 /*
346  * Mark a mount point as busy. Used to synchronize access and to delay
347  * unmounting. With MBF_MNTLSTLOCK, mountlist_mtx stays held on failure.
348  *
349  * vfs_busy() is a custom lock, it can block the caller.
350  * vfs_busy() only sleeps if the unmount is active on the mount point.
351  * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
352  * vnode belonging to mp.
353  *
354  * Lookup uses vfs_busy() to traverse mount points.
355  * root fs			var fs
356  * / vnode lock		A	/ vnode lock (/var)		D
357  * /var vnode lock	B	/log vnode lock(/var/log)	E
358  * vfs_busy lock	C	vfs_busy lock			F
359  *
360  * Within each file system, the lock order is C->A->B and F->D->E.
361  *
362  * When traversing across mounts, the system follows that lock order:
363  *
364  *        C->A->B
365  *              |
366  *              +->F->D->E
367  *
368  * The lookup() process for namei("/var") illustrates the process:
369  *  VOP_LOOKUP() obtains B while A is held
370  *  vfs_busy() obtains a shared lock on F while A and B are held
371  *  vput() releases lock on B
372  *  vput() releases lock on A
373  *  VFS_ROOT() obtains lock on D while shared lock on F is held
374  *  vfs_unbusy() releases shared lock on F
375  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
376  *    Attempt to lock A (instead of vp_crossmp) while D is held would
377  *    violate the global order, causing deadlocks.
378  *
379  * dounmount() locks B while F is drained.
380  */
381 int
382 vfs_busy(struct mount *mp, int flags)
383 {
384 
385 	MPASS((flags & ~MBF_MASK) == 0);
386 	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
387 
388 	MNT_ILOCK(mp);
389 	MNT_REF(mp);
390 	/*
391 	 * If mount point is currenly being unmounted, sleep until the
392 	 * mount point fate is decided.  If thread doing the unmounting fails,
393 	 * it will clear MNTK_UNMOUNT flag before waking us up, indicating
394 	 * that this mount point has survived the unmount attempt and vfs_busy
395 	 * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
396 	 * flag in addition to MNTK_UNMOUNT, indicating that mount point is
397 	 * about to be really destroyed.  vfs_busy needs to release its
398 	 * reference on the mount point in this case and return with ENOENT,
399 	 * telling the caller that mount mount it tried to busy is no longer
400 	 * valid.
401 	 */
402 	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
403 		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
404 			MNT_REL(mp);
405 			MNT_IUNLOCK(mp);
406 			CTR1(KTR_VFS, "%s: failed busying before sleeping",
407 			    __func__);
408 			return (ENOENT);
409 		}
410 		if (flags & MBF_MNTLSTLOCK)
411 			mtx_unlock(&mountlist_mtx);
412 		mp->mnt_kern_flag |= MNTK_MWAIT;
413 		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
414 		if (flags & MBF_MNTLSTLOCK)
415 			mtx_lock(&mountlist_mtx);
416 		MNT_ILOCK(mp);
417 	}
418 	if (flags & MBF_MNTLSTLOCK)
419 		mtx_unlock(&mountlist_mtx);
420 	mp->mnt_lockref++;
421 	MNT_IUNLOCK(mp);
422 	return (0);
423 }
424 
425 /*
426  * Free a busy filesystem.
427  */
428 void
429 vfs_unbusy(struct mount *mp)
430 {
431 
432 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
433 	MNT_ILOCK(mp);
434 	MNT_REL(mp);
435 	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
436 	mp->mnt_lockref--;
437 	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
438 		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
439 		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
440 		mp->mnt_kern_flag &= ~MNTK_DRAINING;
441 		wakeup(&mp->mnt_lockref);
442 	}
443 	MNT_IUNLOCK(mp);
444 }
445 
446 /*
447  * Lookup a mount point by filesystem identifier.
448  */
449 struct mount *
450 vfs_getvfs(fsid_t *fsid)
451 {
452 	struct mount *mp;
453 
454 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
455 	mtx_lock(&mountlist_mtx);
456 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
457 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
458 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
459 			vfs_ref(mp);
460 			mtx_unlock(&mountlist_mtx);
461 			return (mp);
462 		}
463 	}
464 	mtx_unlock(&mountlist_mtx);
465 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
466 	return ((struct mount *) 0);
467 }
468 
469 /*
470  * Lookup a mount point by filesystem identifier, busying it before
471  * returning.
472  */
473 struct mount *
474 vfs_busyfs(fsid_t *fsid)
475 {
476 	struct mount *mp;
477 	int error;
478 
479 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
480 	mtx_lock(&mountlist_mtx);
481 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
482 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
483 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
484 			error = vfs_busy(mp, MBF_MNTLSTLOCK);
485 			if (error) {
486 				mtx_unlock(&mountlist_mtx);
487 				return (NULL);
488 			}
489 			return (mp);
490 		}
491 	}
492 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
493 	mtx_unlock(&mountlist_mtx);
494 	return ((struct mount *) 0);
495 }
496 
497 /*
498  * Check if a user can access privileged mount options.
499  */
500 int
501 vfs_suser(struct mount *mp, struct thread *td)
502 {
503 	int error;
504 
505 	/*
506 	 * If the thread is jailed, but this is not a jail-friendly file
507 	 * system, deny immediately.
508 	 */
509 	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
510 		return (EPERM);
511 
512 	/*
513 	 * If the file system was mounted outside the jail of the calling
514 	 * thread, deny immediately.
515 	 */
516 	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
517 		return (EPERM);
518 
519 	/*
520 	 * If file system supports delegated administration, we don't check
521 	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
522 	 * by the file system itself.
523 	 * If this is not the user that did original mount, we check for
524 	 * the PRIV_VFS_MOUNT_OWNER privilege.
525 	 */
526 	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
527 	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
528 		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
529 			return (error);
530 	}
531 	return (0);
532 }
533 
534 /*
535  * Get a new unique fsid.  Try to make its val[0] unique, since this value
536  * will be used to create fake device numbers for stat().  Also try (but
537  * not so hard) make its val[0] unique mod 2^16, since some emulators only
538  * support 16-bit device numbers.  We end up with unique val[0]'s for the
539  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
540  *
541  * Keep in mind that several mounts may be running in parallel.  Starting
542  * the search one past where the previous search terminated is both a
543  * micro-optimization and a defense against returning the same fsid to
544  * different mounts.
545  */
546 void
547 vfs_getnewfsid(struct mount *mp)
548 {
549 	static uint16_t mntid_base;
550 	struct mount *nmp;
551 	fsid_t tfsid;
552 	int mtype;
553 
554 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
555 	mtx_lock(&mntid_mtx);
556 	mtype = mp->mnt_vfc->vfc_typenum;
557 	tfsid.val[1] = mtype;
558 	mtype = (mtype & 0xFF) << 24;
559 	for (;;) {
560 		tfsid.val[0] = makedev(255,
561 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
562 		mntid_base++;
563 		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
564 			break;
565 		vfs_rel(nmp);
566 	}
567 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
568 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
569 	mtx_unlock(&mntid_mtx);
570 }
571 
572 /*
573  * Knob to control the precision of file timestamps:
574  *
575  *   0 = seconds only; nanoseconds zeroed.
576  *   1 = seconds and nanoseconds, accurate within 1/HZ.
577  *   2 = seconds and nanoseconds, truncated to microseconds.
578  * >=3 = seconds and nanoseconds, maximum precision.
579  */
580 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
581 
582 static int timestamp_precision = TSP_SEC;
583 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
584     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
585     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
586     "3+: sec + ns (max. precision))");
587 
588 /*
589  * Get a current timestamp.
590  */
591 void
592 vfs_timestamp(struct timespec *tsp)
593 {
594 	struct timeval tv;
595 
596 	switch (timestamp_precision) {
597 	case TSP_SEC:
598 		tsp->tv_sec = time_second;
599 		tsp->tv_nsec = 0;
600 		break;
601 	case TSP_HZ:
602 		getnanotime(tsp);
603 		break;
604 	case TSP_USEC:
605 		microtime(&tv);
606 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
607 		break;
608 	case TSP_NSEC:
609 	default:
610 		nanotime(tsp);
611 		break;
612 	}
613 }
614 
615 /*
616  * Set vnode attributes to VNOVAL
617  */
618 void
619 vattr_null(struct vattr *vap)
620 {
621 
622 	vap->va_type = VNON;
623 	vap->va_size = VNOVAL;
624 	vap->va_bytes = VNOVAL;
625 	vap->va_mode = VNOVAL;
626 	vap->va_nlink = VNOVAL;
627 	vap->va_uid = VNOVAL;
628 	vap->va_gid = VNOVAL;
629 	vap->va_fsid = VNOVAL;
630 	vap->va_fileid = VNOVAL;
631 	vap->va_blocksize = VNOVAL;
632 	vap->va_rdev = VNOVAL;
633 	vap->va_atime.tv_sec = VNOVAL;
634 	vap->va_atime.tv_nsec = VNOVAL;
635 	vap->va_mtime.tv_sec = VNOVAL;
636 	vap->va_mtime.tv_nsec = VNOVAL;
637 	vap->va_ctime.tv_sec = VNOVAL;
638 	vap->va_ctime.tv_nsec = VNOVAL;
639 	vap->va_birthtime.tv_sec = VNOVAL;
640 	vap->va_birthtime.tv_nsec = VNOVAL;
641 	vap->va_flags = VNOVAL;
642 	vap->va_gen = VNOVAL;
643 	vap->va_vaflags = 0;
644 }
645 
646 /*
647  * This routine is called when we have too many vnodes.  It attempts
648  * to free <count> vnodes and will potentially free vnodes that still
649  * have VM backing store (VM backing store is typically the cause
650  * of a vnode blowout so we want to do this).  Therefore, this operation
651  * is not considered cheap.
652  *
653  * A number of conditions may prevent a vnode from being reclaimed.
654  * the buffer cache may have references on the vnode, a directory
655  * vnode may still have references due to the namei cache representing
656  * underlying files, or the vnode may be in active use.   It is not
657  * desireable to reuse such vnodes.  These conditions may cause the
658  * number of vnodes to reach some minimum value regardless of what
659  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
660  */
661 static int
662 vlrureclaim(struct mount *mp)
663 {
664 	struct vnode *vp;
665 	int done;
666 	int trigger;
667 	int usevnodes;
668 	int count;
669 
670 	/*
671 	 * Calculate the trigger point, don't allow user
672 	 * screwups to blow us up.   This prevents us from
673 	 * recycling vnodes with lots of resident pages.  We
674 	 * aren't trying to free memory, we are trying to
675 	 * free vnodes.
676 	 */
677 	usevnodes = desiredvnodes;
678 	if (usevnodes <= 0)
679 		usevnodes = 1;
680 	trigger = cnt.v_page_count * 2 / usevnodes;
681 	done = 0;
682 	vn_start_write(NULL, &mp, V_WAIT);
683 	MNT_ILOCK(mp);
684 	count = mp->mnt_nvnodelistsize / 10 + 1;
685 	while (count != 0) {
686 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
687 		while (vp != NULL && vp->v_type == VMARKER)
688 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
689 		if (vp == NULL)
690 			break;
691 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
692 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
693 		--count;
694 		if (!VI_TRYLOCK(vp))
695 			goto next_iter;
696 		/*
697 		 * If it's been deconstructed already, it's still
698 		 * referenced, or it exceeds the trigger, skip it.
699 		 */
700 		if (vp->v_usecount ||
701 		    (!vlru_allow_cache_src &&
702 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
703 		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
704 		    vp->v_object->resident_page_count > trigger)) {
705 			VI_UNLOCK(vp);
706 			goto next_iter;
707 		}
708 		MNT_IUNLOCK(mp);
709 		vholdl(vp);
710 		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
711 			vdrop(vp);
712 			goto next_iter_mntunlocked;
713 		}
714 		VI_LOCK(vp);
715 		/*
716 		 * v_usecount may have been bumped after VOP_LOCK() dropped
717 		 * the vnode interlock and before it was locked again.
718 		 *
719 		 * It is not necessary to recheck VI_DOOMED because it can
720 		 * only be set by another thread that holds both the vnode
721 		 * lock and vnode interlock.  If another thread has the
722 		 * vnode lock before we get to VOP_LOCK() and obtains the
723 		 * vnode interlock after VOP_LOCK() drops the vnode
724 		 * interlock, the other thread will be unable to drop the
725 		 * vnode lock before our VOP_LOCK() call fails.
726 		 */
727 		if (vp->v_usecount ||
728 		    (!vlru_allow_cache_src &&
729 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
730 		    (vp->v_object != NULL &&
731 		    vp->v_object->resident_page_count > trigger)) {
732 			VOP_UNLOCK(vp, LK_INTERLOCK);
733 			goto next_iter_mntunlocked;
734 		}
735 		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
736 		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
737 		vgonel(vp);
738 		VOP_UNLOCK(vp, 0);
739 		vdropl(vp);
740 		done++;
741 next_iter_mntunlocked:
742 		if (!should_yield())
743 			goto relock_mnt;
744 		goto yield;
745 next_iter:
746 		if (!should_yield())
747 			continue;
748 		MNT_IUNLOCK(mp);
749 yield:
750 		kern_yield(PRI_USER);
751 relock_mnt:
752 		MNT_ILOCK(mp);
753 	}
754 	MNT_IUNLOCK(mp);
755 	vn_finished_write(mp);
756 	return done;
757 }
758 
759 /*
760  * Attempt to keep the free list at wantfreevnodes length.
761  */
762 static void
763 vnlru_free(int count)
764 {
765 	struct vnode *vp;
766 
767 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
768 	for (; count > 0; count--) {
769 		vp = TAILQ_FIRST(&vnode_free_list);
770 		/*
771 		 * The list can be modified while the free_list_mtx
772 		 * has been dropped and vp could be NULL here.
773 		 */
774 		if (!vp)
775 			break;
776 		VNASSERT(vp->v_op != NULL, vp,
777 		    ("vnlru_free: vnode already reclaimed."));
778 		KASSERT((vp->v_iflag & VI_FREE) != 0,
779 		    ("Removing vnode not on freelist"));
780 		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
781 		    ("Mangling active vnode"));
782 		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
783 		/*
784 		 * Don't recycle if we can't get the interlock.
785 		 */
786 		if (!VI_TRYLOCK(vp)) {
787 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
788 			continue;
789 		}
790 		VNASSERT(VCANRECYCLE(vp), vp,
791 		    ("vp inconsistent on freelist"));
792 		freevnodes--;
793 		vp->v_iflag &= ~VI_FREE;
794 		vholdl(vp);
795 		mtx_unlock(&vnode_free_list_mtx);
796 		VI_UNLOCK(vp);
797 		vtryrecycle(vp);
798 		/*
799 		 * If the recycled succeeded this vdrop will actually free
800 		 * the vnode.  If not it will simply place it back on
801 		 * the free list.
802 		 */
803 		vdrop(vp);
804 		mtx_lock(&vnode_free_list_mtx);
805 	}
806 }
807 /*
808  * Attempt to recycle vnodes in a context that is always safe to block.
809  * Calling vlrurecycle() from the bowels of filesystem code has some
810  * interesting deadlock problems.
811  */
812 static struct proc *vnlruproc;
813 static int vnlruproc_sig;
814 
815 static void
816 vnlru_proc(void)
817 {
818 	struct mount *mp, *nmp;
819 	int done;
820 	struct proc *p = vnlruproc;
821 
822 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
823 	    SHUTDOWN_PRI_FIRST);
824 
825 	for (;;) {
826 		kproc_suspend_check(p);
827 		mtx_lock(&vnode_free_list_mtx);
828 		if (freevnodes > wantfreevnodes)
829 			vnlru_free(freevnodes - wantfreevnodes);
830 		if (numvnodes <= desiredvnodes * 9 / 10) {
831 			vnlruproc_sig = 0;
832 			wakeup(&vnlruproc_sig);
833 			msleep(vnlruproc, &vnode_free_list_mtx,
834 			    PVFS|PDROP, "vlruwt", hz);
835 			continue;
836 		}
837 		mtx_unlock(&vnode_free_list_mtx);
838 		done = 0;
839 		mtx_lock(&mountlist_mtx);
840 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
841 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
842 				nmp = TAILQ_NEXT(mp, mnt_list);
843 				continue;
844 			}
845 			done += vlrureclaim(mp);
846 			mtx_lock(&mountlist_mtx);
847 			nmp = TAILQ_NEXT(mp, mnt_list);
848 			vfs_unbusy(mp);
849 		}
850 		mtx_unlock(&mountlist_mtx);
851 		if (done == 0) {
852 #if 0
853 			/* These messages are temporary debugging aids */
854 			if (vnlru_nowhere < 5)
855 				printf("vnlru process getting nowhere..\n");
856 			else if (vnlru_nowhere == 5)
857 				printf("vnlru process messages stopped.\n");
858 #endif
859 			vnlru_nowhere++;
860 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
861 		} else
862 			kern_yield(PRI_USER);
863 	}
864 }
865 
866 static struct kproc_desc vnlru_kp = {
867 	"vnlru",
868 	vnlru_proc,
869 	&vnlruproc
870 };
871 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
872     &vnlru_kp);
873 
874 /*
875  * Routines having to do with the management of the vnode table.
876  */
877 
878 /*
879  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
880  * before we actually vgone().  This function must be called with the vnode
881  * held to prevent the vnode from being returned to the free list midway
882  * through vgone().
883  */
884 static int
885 vtryrecycle(struct vnode *vp)
886 {
887 	struct mount *vnmp;
888 
889 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
890 	VNASSERT(vp->v_holdcnt, vp,
891 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
892 	/*
893 	 * This vnode may found and locked via some other list, if so we
894 	 * can't recycle it yet.
895 	 */
896 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
897 		CTR2(KTR_VFS,
898 		    "%s: impossible to recycle, vp %p lock is already held",
899 		    __func__, vp);
900 		return (EWOULDBLOCK);
901 	}
902 	/*
903 	 * Don't recycle if its filesystem is being suspended.
904 	 */
905 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
906 		VOP_UNLOCK(vp, 0);
907 		CTR2(KTR_VFS,
908 		    "%s: impossible to recycle, cannot start the write for %p",
909 		    __func__, vp);
910 		return (EBUSY);
911 	}
912 	/*
913 	 * If we got this far, we need to acquire the interlock and see if
914 	 * anyone picked up this vnode from another list.  If not, we will
915 	 * mark it with DOOMED via vgonel() so that anyone who does find it
916 	 * will skip over it.
917 	 */
918 	VI_LOCK(vp);
919 	if (vp->v_usecount) {
920 		VOP_UNLOCK(vp, LK_INTERLOCK);
921 		vn_finished_write(vnmp);
922 		CTR2(KTR_VFS,
923 		    "%s: impossible to recycle, %p is already referenced",
924 		    __func__, vp);
925 		return (EBUSY);
926 	}
927 	if ((vp->v_iflag & VI_DOOMED) == 0)
928 		vgonel(vp);
929 	VOP_UNLOCK(vp, LK_INTERLOCK);
930 	vn_finished_write(vnmp);
931 	return (0);
932 }
933 
934 /*
935  * Wait for available vnodes.
936  */
937 static int
938 getnewvnode_wait(int suspended)
939 {
940 
941 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
942 	if (numvnodes > desiredvnodes) {
943 		if (suspended) {
944 			/*
945 			 * File system is beeing suspended, we cannot risk a
946 			 * deadlock here, so allocate new vnode anyway.
947 			 */
948 			if (freevnodes > wantfreevnodes)
949 				vnlru_free(freevnodes - wantfreevnodes);
950 			return (0);
951 		}
952 		if (vnlruproc_sig == 0) {
953 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
954 			wakeup(vnlruproc);
955 		}
956 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
957 		    "vlruwk", hz);
958 	}
959 	return (numvnodes > desiredvnodes ? ENFILE : 0);
960 }
961 
962 void
963 getnewvnode_reserve(u_int count)
964 {
965 	struct thread *td;
966 
967 	td = curthread;
968 	mtx_lock(&vnode_free_list_mtx);
969 	while (count > 0) {
970 		if (getnewvnode_wait(0) == 0) {
971 			count--;
972 			td->td_vp_reserv++;
973 			numvnodes++;
974 		}
975 	}
976 	mtx_unlock(&vnode_free_list_mtx);
977 }
978 
979 void
980 getnewvnode_drop_reserve(void)
981 {
982 	struct thread *td;
983 
984 	td = curthread;
985 	mtx_lock(&vnode_free_list_mtx);
986 	KASSERT(numvnodes >= td->td_vp_reserv, ("reserve too large"));
987 	numvnodes -= td->td_vp_reserv;
988 	mtx_unlock(&vnode_free_list_mtx);
989 	td->td_vp_reserv = 0;
990 }
991 
992 /*
993  * Return the next vnode from the free list.
994  */
995 int
996 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
997     struct vnode **vpp)
998 {
999 	struct vnode *vp;
1000 	struct bufobj *bo;
1001 	struct thread *td;
1002 	int error;
1003 
1004 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1005 	vp = NULL;
1006 	td = curthread;
1007 	if (td->td_vp_reserv > 0) {
1008 		td->td_vp_reserv -= 1;
1009 		goto alloc;
1010 	}
1011 	mtx_lock(&vnode_free_list_mtx);
1012 	/*
1013 	 * Lend our context to reclaim vnodes if they've exceeded the max.
1014 	 */
1015 	if (freevnodes > wantfreevnodes)
1016 		vnlru_free(1);
1017 	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1018 	    MNTK_SUSPEND));
1019 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1020 	if (error != 0) {
1021 		mtx_unlock(&vnode_free_list_mtx);
1022 		return (error);
1023 	}
1024 #endif
1025 	numvnodes++;
1026 	mtx_unlock(&vnode_free_list_mtx);
1027 alloc:
1028 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
1029 	/*
1030 	 * Setup locks.
1031 	 */
1032 	vp->v_vnlock = &vp->v_lock;
1033 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
1034 	/*
1035 	 * By default, don't allow shared locks unless filesystems
1036 	 * opt-in.
1037 	 */
1038 	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
1039 	/*
1040 	 * Initialize bufobj.
1041 	 */
1042 	bo = &vp->v_bufobj;
1043 	bo->__bo_vnode = vp;
1044 	mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
1045 	bo->bo_ops = &buf_ops_bio;
1046 	bo->bo_private = vp;
1047 	TAILQ_INIT(&bo->bo_clean.bv_hd);
1048 	TAILQ_INIT(&bo->bo_dirty.bv_hd);
1049 	/*
1050 	 * Initialize namecache.
1051 	 */
1052 	LIST_INIT(&vp->v_cache_src);
1053 	TAILQ_INIT(&vp->v_cache_dst);
1054 	/*
1055 	 * Finalize various vnode identity bits.
1056 	 */
1057 	vp->v_type = VNON;
1058 	vp->v_tag = tag;
1059 	vp->v_op = vops;
1060 	v_incr_usecount(vp);
1061 	vp->v_data = NULL;
1062 #ifdef MAC
1063 	mac_vnode_init(vp);
1064 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1065 		mac_vnode_associate_singlelabel(mp, vp);
1066 	else if (mp == NULL && vops != &dead_vnodeops)
1067 		printf("NULL mp in getnewvnode()\n");
1068 #endif
1069 	if (mp != NULL) {
1070 		bo->bo_bsize = mp->mnt_stat.f_iosize;
1071 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1072 			vp->v_vflag |= VV_NOKNOTE;
1073 	}
1074 	rangelock_init(&vp->v_rl);
1075 
1076 	/*
1077 	 * For the filesystems which do not use vfs_hash_insert(),
1078 	 * still initialize v_hash to have vfs_hash_index() useful.
1079 	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1080 	 * its own hashing.
1081 	 */
1082 	vp->v_hash = (uintptr_t)vp >> vnsz2log;
1083 
1084 	*vpp = vp;
1085 	return (0);
1086 }
1087 
1088 /*
1089  * Delete from old mount point vnode list, if on one.
1090  */
1091 static void
1092 delmntque(struct vnode *vp)
1093 {
1094 	struct mount *mp;
1095 	int active;
1096 
1097 	mp = vp->v_mount;
1098 	if (mp == NULL)
1099 		return;
1100 	MNT_ILOCK(mp);
1101 	VI_LOCK(vp);
1102 	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1103 	    ("Active vnode list size %d > Vnode list size %d",
1104 	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1105 	active = vp->v_iflag & VI_ACTIVE;
1106 	vp->v_iflag &= ~VI_ACTIVE;
1107 	if (active) {
1108 		mtx_lock(&vnode_free_list_mtx);
1109 		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1110 		mp->mnt_activevnodelistsize--;
1111 		mtx_unlock(&vnode_free_list_mtx);
1112 	}
1113 	vp->v_mount = NULL;
1114 	VI_UNLOCK(vp);
1115 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1116 		("bad mount point vnode list size"));
1117 	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1118 	mp->mnt_nvnodelistsize--;
1119 	MNT_REL(mp);
1120 	MNT_IUNLOCK(mp);
1121 }
1122 
1123 static void
1124 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1125 {
1126 
1127 	vp->v_data = NULL;
1128 	vp->v_op = &dead_vnodeops;
1129 	vgone(vp);
1130 	vput(vp);
1131 }
1132 
1133 /*
1134  * Insert into list of vnodes for the new mount point, if available.
1135  */
1136 int
1137 insmntque1(struct vnode *vp, struct mount *mp,
1138 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1139 {
1140 
1141 	KASSERT(vp->v_mount == NULL,
1142 		("insmntque: vnode already on per mount vnode list"));
1143 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1144 	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1145 
1146 	/*
1147 	 * We acquire the vnode interlock early to ensure that the
1148 	 * vnode cannot be recycled by another process releasing a
1149 	 * holdcnt on it before we get it on both the vnode list
1150 	 * and the active vnode list. The mount mutex protects only
1151 	 * manipulation of the vnode list and the vnode freelist
1152 	 * mutex protects only manipulation of the active vnode list.
1153 	 * Hence the need to hold the vnode interlock throughout.
1154 	 */
1155 	MNT_ILOCK(mp);
1156 	VI_LOCK(vp);
1157 	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1158 	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1159 	    mp->mnt_nvnodelistsize == 0)) &&
1160 	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1161 		VI_UNLOCK(vp);
1162 		MNT_IUNLOCK(mp);
1163 		if (dtr != NULL)
1164 			dtr(vp, dtr_arg);
1165 		return (EBUSY);
1166 	}
1167 	vp->v_mount = mp;
1168 	MNT_REF(mp);
1169 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1170 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1171 		("neg mount point vnode list size"));
1172 	mp->mnt_nvnodelistsize++;
1173 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1174 	    ("Activating already active vnode"));
1175 	vp->v_iflag |= VI_ACTIVE;
1176 	mtx_lock(&vnode_free_list_mtx);
1177 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1178 	mp->mnt_activevnodelistsize++;
1179 	mtx_unlock(&vnode_free_list_mtx);
1180 	VI_UNLOCK(vp);
1181 	MNT_IUNLOCK(mp);
1182 	return (0);
1183 }
1184 
1185 int
1186 insmntque(struct vnode *vp, struct mount *mp)
1187 {
1188 
1189 	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1190 }
1191 
1192 /*
1193  * Flush out and invalidate all buffers associated with a bufobj
1194  * Called with the underlying object locked.
1195  */
1196 int
1197 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1198 {
1199 	int error;
1200 
1201 	BO_LOCK(bo);
1202 	if (flags & V_SAVE) {
1203 		error = bufobj_wwait(bo, slpflag, slptimeo);
1204 		if (error) {
1205 			BO_UNLOCK(bo);
1206 			return (error);
1207 		}
1208 		if (bo->bo_dirty.bv_cnt > 0) {
1209 			BO_UNLOCK(bo);
1210 			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1211 				return (error);
1212 			/*
1213 			 * XXX We could save a lock/unlock if this was only
1214 			 * enabled under INVARIANTS
1215 			 */
1216 			BO_LOCK(bo);
1217 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1218 				panic("vinvalbuf: dirty bufs");
1219 		}
1220 	}
1221 	/*
1222 	 * If you alter this loop please notice that interlock is dropped and
1223 	 * reacquired in flushbuflist.  Special care is needed to ensure that
1224 	 * no race conditions occur from this.
1225 	 */
1226 	do {
1227 		error = flushbuflist(&bo->bo_clean,
1228 		    flags, bo, slpflag, slptimeo);
1229 		if (error == 0 && !(flags & V_CLEANONLY))
1230 			error = flushbuflist(&bo->bo_dirty,
1231 			    flags, bo, slpflag, slptimeo);
1232 		if (error != 0 && error != EAGAIN) {
1233 			BO_UNLOCK(bo);
1234 			return (error);
1235 		}
1236 	} while (error != 0);
1237 
1238 	/*
1239 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1240 	 * have write I/O in-progress but if there is a VM object then the
1241 	 * VM object can also have read-I/O in-progress.
1242 	 */
1243 	do {
1244 		bufobj_wwait(bo, 0, 0);
1245 		BO_UNLOCK(bo);
1246 		if (bo->bo_object != NULL) {
1247 			VM_OBJECT_LOCK(bo->bo_object);
1248 			vm_object_pip_wait(bo->bo_object, "bovlbx");
1249 			VM_OBJECT_UNLOCK(bo->bo_object);
1250 		}
1251 		BO_LOCK(bo);
1252 	} while (bo->bo_numoutput > 0);
1253 	BO_UNLOCK(bo);
1254 
1255 	/*
1256 	 * Destroy the copy in the VM cache, too.
1257 	 */
1258 	if (bo->bo_object != NULL &&
1259 	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
1260 		VM_OBJECT_LOCK(bo->bo_object);
1261 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1262 		    OBJPR_CLEANONLY : 0);
1263 		VM_OBJECT_UNLOCK(bo->bo_object);
1264 	}
1265 
1266 #ifdef INVARIANTS
1267 	BO_LOCK(bo);
1268 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
1269 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1270 		panic("vinvalbuf: flush failed");
1271 	BO_UNLOCK(bo);
1272 #endif
1273 	return (0);
1274 }
1275 
1276 /*
1277  * Flush out and invalidate all buffers associated with a vnode.
1278  * Called with the underlying object locked.
1279  */
1280 int
1281 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1282 {
1283 
1284 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1285 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1286 	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1287 }
1288 
1289 /*
1290  * Flush out buffers on the specified list.
1291  *
1292  */
1293 static int
1294 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1295     int slptimeo)
1296 {
1297 	struct buf *bp, *nbp;
1298 	int retval, error;
1299 	daddr_t lblkno;
1300 	b_xflags_t xflags;
1301 
1302 	ASSERT_BO_LOCKED(bo);
1303 
1304 	retval = 0;
1305 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1306 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1307 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1308 			continue;
1309 		}
1310 		lblkno = 0;
1311 		xflags = 0;
1312 		if (nbp != NULL) {
1313 			lblkno = nbp->b_lblkno;
1314 			xflags = nbp->b_xflags &
1315 				(BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
1316 		}
1317 		retval = EAGAIN;
1318 		error = BUF_TIMELOCK(bp,
1319 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
1320 		    "flushbuf", slpflag, slptimeo);
1321 		if (error) {
1322 			BO_LOCK(bo);
1323 			return (error != ENOLCK ? error : EAGAIN);
1324 		}
1325 		KASSERT(bp->b_bufobj == bo,
1326 		    ("bp %p wrong b_bufobj %p should be %p",
1327 		    bp, bp->b_bufobj, bo));
1328 		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1329 			BUF_UNLOCK(bp);
1330 			BO_LOCK(bo);
1331 			return (EAGAIN);
1332 		}
1333 		/*
1334 		 * XXX Since there are no node locks for NFS, I
1335 		 * believe there is a slight chance that a delayed
1336 		 * write will occur while sleeping just above, so
1337 		 * check for it.
1338 		 */
1339 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1340 		    (flags & V_SAVE)) {
1341 			BO_LOCK(bo);
1342 			bremfree(bp);
1343 			BO_UNLOCK(bo);
1344 			bp->b_flags |= B_ASYNC;
1345 			bwrite(bp);
1346 			BO_LOCK(bo);
1347 			return (EAGAIN);	/* XXX: why not loop ? */
1348 		}
1349 		BO_LOCK(bo);
1350 		bremfree(bp);
1351 		BO_UNLOCK(bo);
1352 		bp->b_flags |= (B_INVAL | B_RELBUF);
1353 		bp->b_flags &= ~B_ASYNC;
1354 		brelse(bp);
1355 		BO_LOCK(bo);
1356 		if (nbp != NULL &&
1357 		    (nbp->b_bufobj != bo ||
1358 		     nbp->b_lblkno != lblkno ||
1359 		     (nbp->b_xflags &
1360 		      (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1361 			break;			/* nbp invalid */
1362 	}
1363 	return (retval);
1364 }
1365 
1366 /*
1367  * Truncate a file's buffer and pages to a specified length.  This
1368  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1369  * sync activity.
1370  */
1371 int
1372 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1373 {
1374 	struct buf *bp, *nbp;
1375 	int anyfreed;
1376 	int trunclbn;
1377 	struct bufobj *bo;
1378 
1379 	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1380 	    vp, cred, blksize, (uintmax_t)length);
1381 
1382 	/*
1383 	 * Round up to the *next* lbn.
1384 	 */
1385 	trunclbn = (length + blksize - 1) / blksize;
1386 
1387 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1388 restart:
1389 	bo = &vp->v_bufobj;
1390 	BO_LOCK(bo);
1391 	anyfreed = 1;
1392 	for (;anyfreed;) {
1393 		anyfreed = 0;
1394 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1395 			if (bp->b_lblkno < trunclbn)
1396 				continue;
1397 			if (BUF_LOCK(bp,
1398 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1399 			    BO_MTX(bo)) == ENOLCK)
1400 				goto restart;
1401 
1402 			BO_LOCK(bo);
1403 			bremfree(bp);
1404 			BO_UNLOCK(bo);
1405 			bp->b_flags |= (B_INVAL | B_RELBUF);
1406 			bp->b_flags &= ~B_ASYNC;
1407 			brelse(bp);
1408 			anyfreed = 1;
1409 
1410 			BO_LOCK(bo);
1411 			if (nbp != NULL &&
1412 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1413 			    (nbp->b_vp != vp) ||
1414 			    (nbp->b_flags & B_DELWRI))) {
1415 				BO_UNLOCK(bo);
1416 				goto restart;
1417 			}
1418 		}
1419 
1420 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1421 			if (bp->b_lblkno < trunclbn)
1422 				continue;
1423 			if (BUF_LOCK(bp,
1424 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1425 			    BO_MTX(bo)) == ENOLCK)
1426 				goto restart;
1427 			BO_LOCK(bo);
1428 			bremfree(bp);
1429 			BO_UNLOCK(bo);
1430 			bp->b_flags |= (B_INVAL | B_RELBUF);
1431 			bp->b_flags &= ~B_ASYNC;
1432 			brelse(bp);
1433 			anyfreed = 1;
1434 
1435 			BO_LOCK(bo);
1436 			if (nbp != NULL &&
1437 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1438 			    (nbp->b_vp != vp) ||
1439 			    (nbp->b_flags & B_DELWRI) == 0)) {
1440 				BO_UNLOCK(bo);
1441 				goto restart;
1442 			}
1443 		}
1444 	}
1445 
1446 	if (length > 0) {
1447 restartsync:
1448 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1449 			if (bp->b_lblkno > 0)
1450 				continue;
1451 			/*
1452 			 * Since we hold the vnode lock this should only
1453 			 * fail if we're racing with the buf daemon.
1454 			 */
1455 			if (BUF_LOCK(bp,
1456 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1457 			    BO_MTX(bo)) == ENOLCK) {
1458 				goto restart;
1459 			}
1460 			VNASSERT((bp->b_flags & B_DELWRI), vp,
1461 			    ("buf(%p) on dirty queue without DELWRI", bp));
1462 
1463 			BO_LOCK(bo);
1464 			bremfree(bp);
1465 			BO_UNLOCK(bo);
1466 			bawrite(bp);
1467 			BO_LOCK(bo);
1468 			goto restartsync;
1469 		}
1470 	}
1471 
1472 	bufobj_wwait(bo, 0, 0);
1473 	BO_UNLOCK(bo);
1474 	vnode_pager_setsize(vp, length);
1475 
1476 	return (0);
1477 }
1478 
/*
 * buf_splay() - splay tree core for the clean/dirty list of buffers in
 *		 a vnode.
 *
 *	NOTE: We have to deal with the special case of a background bitmap
 *	buffer, a situation where two buffers will have the same logical
 *	block offset.  We want (1) only the foreground buffer to be accessed
 *	in a lookup and (2) must differentiate between the foreground and
 *	background buffer in the splay tree algorithm because the splay
 *	tree cannot normally handle multiple entities with the same 'index'.
 *	We accomplish this by adding differentiating flags to the splay tree's
 *	numerical domain.
 *
 *	The key is the pair (lblkno, xflags & BX_BKGRDMARKER), compared
 *	first by lblkno and then by the marker bit.  The tree rooted at
 *	`root' is reorganized and its new root is returned: the node with
 *	that key if there is one, otherwise the node at which the search
 *	ended.  Returns NULL for an empty tree.  The caller is responsible
 *	for storing the returned root.
 */
static
struct buf *
buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
{
	/*
	 * dummy only lends its b_left/b_right fields as the initial
	 * attachment points of the two side trees built during the descent.
	 */
	struct buf dummy;
	struct buf *lefttreemax, *righttreemin, *y;

	if (root == NULL)
		return (NULL);
	lefttreemax = righttreemin = &dummy;
	for (;;) {
		/* Key sorts before root: descend to the left. */
		if (lblkno < root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_left) == NULL)
				break;
			if (lblkno < y->b_lblkno) {
				/* Rotate right. */
				root->b_left = y->b_right;
				y->b_right = root;
				root = y;
				if ((y = root->b_left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->b_left = root;
			righttreemin = root;
		/* Key sorts after root: descend to the right. */
		} else if (lblkno > root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_right) == NULL)
				break;
			if (lblkno > y->b_lblkno) {
				/* Rotate left. */
				root->b_right = y->b_left;
				y->b_left = root;
				root = y;
				if ((y = root->b_right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->b_right = root;
			lefttreemax = root;
		} else {
			/* Exact match on both block number and marker bit. */
			break;
		}
		root = y;
	}
	/*
	 * Assemble the new root.  If a side tree never received a node, the
	 * corresponding store below goes through &dummy, so dummy's fields
	 * are always written before they are read.
	 */
	lefttreemax->b_right = root->b_left;
	righttreemin->b_left = root->b_right;
	root->b_left = dummy.b_right;
	root->b_right = dummy.b_left;
	return (root);
}
1547 
1548 static void
1549 buf_vlist_remove(struct buf *bp)
1550 {
1551 	struct buf *root;
1552 	struct bufv *bv;
1553 
1554 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1555 	ASSERT_BO_LOCKED(bp->b_bufobj);
1556 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1557 	    (BX_VNDIRTY|BX_VNCLEAN),
1558 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1559 	if (bp->b_xflags & BX_VNDIRTY)
1560 		bv = &bp->b_bufobj->bo_dirty;
1561 	else
1562 		bv = &bp->b_bufobj->bo_clean;
1563 	if (bp != bv->bv_root) {
1564 		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1565 		KASSERT(root == bp, ("splay lookup failed in remove"));
1566 	}
1567 	if (bp->b_left == NULL) {
1568 		root = bp->b_right;
1569 	} else {
1570 		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1571 		root->b_right = bp->b_right;
1572 	}
1573 	bv->bv_root = root;
1574 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1575 	bv->bv_cnt--;
1576 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1577 }
1578 
1579 /*
1580  * Add the buffer to the sorted clean or dirty block list using a
1581  * splay tree algorithm.
1582  *
1583  * NOTE: xflags is passed as a constant, optimizing this inline function!
1584  */
1585 static void
1586 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1587 {
1588 	struct buf *root;
1589 	struct bufv *bv;
1590 
1591 	ASSERT_BO_LOCKED(bo);
1592 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1593 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1594 	bp->b_xflags |= xflags;
1595 	if (xflags & BX_VNDIRTY)
1596 		bv = &bo->bo_dirty;
1597 	else
1598 		bv = &bo->bo_clean;
1599 
1600 	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1601 	if (root == NULL) {
1602 		bp->b_left = NULL;
1603 		bp->b_right = NULL;
1604 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1605 	} else if (bp->b_lblkno < root->b_lblkno ||
1606 	    (bp->b_lblkno == root->b_lblkno &&
1607 	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1608 		bp->b_left = root->b_left;
1609 		bp->b_right = root;
1610 		root->b_left = NULL;
1611 		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
1612 	} else {
1613 		bp->b_right = root->b_right;
1614 		bp->b_left = root;
1615 		root->b_right = NULL;
1616 		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
1617 	}
1618 	bv->bv_cnt++;
1619 	bv->bv_root = bp;
1620 }
1621 
1622 /*
1623  * Lookup a buffer using the splay tree.  Note that we specifically avoid
1624  * shadow buffers used in background bitmap writes.
1625  *
1626  * This code isn't quite efficient as it could be because we are maintaining
1627  * two sorted lists and do not know which list the block resides in.
1628  *
1629  * During a "make buildworld" the desired buffer is found at one of
1630  * the roots more than 60% of the time.  Thus, checking both roots
1631  * before performing either splay eliminates unnecessary splays on the
1632  * first tree splayed.
1633  */
1634 struct buf *
1635 gbincore(struct bufobj *bo, daddr_t lblkno)
1636 {
1637 	struct buf *bp;
1638 
1639 	ASSERT_BO_LOCKED(bo);
1640 	if ((bp = bo->bo_clean.bv_root) != NULL &&
1641 	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1642 		return (bp);
1643 	if ((bp = bo->bo_dirty.bv_root) != NULL &&
1644 	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1645 		return (bp);
1646 	if ((bp = bo->bo_clean.bv_root) != NULL) {
1647 		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
1648 		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1649 			return (bp);
1650 	}
1651 	if ((bp = bo->bo_dirty.bv_root) != NULL) {
1652 		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
1653 		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1654 			return (bp);
1655 	}
1656 	return (NULL);
1657 }
1658 
1659 /*
1660  * Associate a buffer with a vnode.
1661  */
1662 void
1663 bgetvp(struct vnode *vp, struct buf *bp)
1664 {
1665 	struct bufobj *bo;
1666 
1667 	bo = &vp->v_bufobj;
1668 	ASSERT_BO_LOCKED(bo);
1669 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1670 
1671 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1672 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1673 	    ("bgetvp: bp already attached! %p", bp));
1674 
1675 	vhold(vp);
1676 	bp->b_vp = vp;
1677 	bp->b_bufobj = bo;
1678 	/*
1679 	 * Insert onto list for new vnode.
1680 	 */
1681 	buf_vlist_add(bp, bo, BX_VNCLEAN);
1682 }
1683 
1684 /*
1685  * Disassociate a buffer from a vnode.
1686  */
1687 void
1688 brelvp(struct buf *bp)
1689 {
1690 	struct bufobj *bo;
1691 	struct vnode *vp;
1692 
1693 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1694 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1695 
1696 	/*
1697 	 * Delete from old vnode list, if on one.
1698 	 */
1699 	vp = bp->b_vp;		/* XXX */
1700 	bo = bp->b_bufobj;
1701 	BO_LOCK(bo);
1702 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1703 		buf_vlist_remove(bp);
1704 	else
1705 		panic("brelvp: Buffer %p not on queue.", bp);
1706 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1707 		bo->bo_flag &= ~BO_ONWORKLST;
1708 		mtx_lock(&sync_mtx);
1709 		LIST_REMOVE(bo, bo_synclist);
1710 		syncer_worklist_len--;
1711 		mtx_unlock(&sync_mtx);
1712 	}
1713 	bp->b_vp = NULL;
1714 	bp->b_bufobj = NULL;
1715 	BO_UNLOCK(bo);
1716 	vdrop(vp);
1717 }
1718 
1719 /*
1720  * Add an item to the syncer work queue.
1721  */
1722 static void
1723 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1724 {
1725 	int slot;
1726 
1727 	ASSERT_BO_LOCKED(bo);
1728 
1729 	mtx_lock(&sync_mtx);
1730 	if (bo->bo_flag & BO_ONWORKLST)
1731 		LIST_REMOVE(bo, bo_synclist);
1732 	else {
1733 		bo->bo_flag |= BO_ONWORKLST;
1734 		syncer_worklist_len++;
1735 	}
1736 
1737 	if (delay > syncer_maxdelay - 2)
1738 		delay = syncer_maxdelay - 2;
1739 	slot = (syncer_delayno + delay) & syncer_mask;
1740 
1741 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1742 	mtx_unlock(&sync_mtx);
1743 }
1744 
1745 static int
1746 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1747 {
1748 	int error, len;
1749 
1750 	mtx_lock(&sync_mtx);
1751 	len = syncer_worklist_len - sync_vnode_count;
1752 	mtx_unlock(&sync_mtx);
1753 	error = SYSCTL_OUT(req, &len, sizeof(len));
1754 	return (error);
1755 }
1756 
1757 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1758     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1759 
/*
 * The syncer kernel process.  up_kp carries the string "syncer", the entry
 * point sched_sync and the address of updateproc; the SYSINIT below passes
 * it to kproc_start() at SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST.
 */
static struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1768 
1769 static int
1770 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1771 {
1772 	struct vnode *vp;
1773 	struct mount *mp;
1774 
1775 	*bo = LIST_FIRST(slp);
1776 	if (*bo == NULL)
1777 		return (0);
1778 	vp = (*bo)->__bo_vnode;	/* XXX */
1779 	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1780 		return (1);
1781 	/*
1782 	 * We use vhold in case the vnode does not
1783 	 * successfully sync.  vhold prevents the vnode from
1784 	 * going away when we unlock the sync_mtx so that
1785 	 * we can acquire the vnode interlock.
1786 	 */
1787 	vholdl(vp);
1788 	mtx_unlock(&sync_mtx);
1789 	VI_UNLOCK(vp);
1790 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1791 		vdrop(vp);
1792 		mtx_lock(&sync_mtx);
1793 		return (*bo == LIST_FIRST(slp));
1794 	}
1795 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1796 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1797 	VOP_UNLOCK(vp, 0);
1798 	vn_finished_write(mp);
1799 	BO_LOCK(*bo);
1800 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1801 		/*
1802 		 * Put us back on the worklist.  The worklist
1803 		 * routine will remove us from our current
1804 		 * position and then add us back in at a later
1805 		 * position.
1806 		 */
1807 		vn_syncer_add_to_worklist(*bo, syncdelay);
1808 	}
1809 	BO_UNLOCK(*bo);
1810 	vdrop(vp);
1811 	mtx_lock(&sync_mtx);
1812 	return (0);
1813 }
1814 
/*
 * System filesystem synchronizer daemon.
 *
 * Main loop of the syncer process; it never returns.  Each iteration
 * advances syncer_delayno by one slot, syncs the bufobjs queued on the
 * slot just left via sync_vnode(), and then either continues at once
 * (rushjob), or sleeps on sync_wakeup.  sync_mtx is held throughout except
 * where it is explicitly dropped here or inside sync_vnode().
 */
static void
sched_sync(void)
{
	struct synclist *next, *slp;
	struct bufobj *bo;
	long starttime;
	struct thread *td = curthread;
	int last_work_seen;
	int net_worklist_len;
	int syncer_final_iter;
	int first_printf;
	int error;

	last_work_seen = 0;
	syncer_final_iter = 0;
	first_printf = 1;
	syncer_state = SYNCER_RUNNING;
	starttime = time_uptime;
	td->td_pflags |= TDP_NORUNNINGBUF;

	/* syncer_shutdown() moves the state machine out of SYNCER_RUNNING. */
	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	mtx_lock(&sync_mtx);
	for (;;) {
		/*
		 * Final delay has run its course: this is the point at
		 * which a pending suspend request is honoured.
		 */
		if (syncer_state == SYNCER_FINAL_DELAY &&
		    syncer_final_iter == 0) {
			mtx_unlock(&sync_mtx);
			kproc_suspend_check(td->td_proc);
			mtx_lock(&sync_mtx);
		}
		net_worklist_len = syncer_worklist_len - sync_vnode_count;
		/* While shutting down, report progress when time_uptime changes. */
		if (syncer_state != SYNCER_RUNNING &&
		    starttime != time_uptime) {
			if (first_printf) {
				printf("\nSyncing disks, vnodes remaining...");
				first_printf = 0;
			}
			printf("%d ", net_worklist_len);
		}
		starttime = time_uptime;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 *
		 * Skip over empty worklist slots when shutting down.
		 */
		do {
			slp = &syncer_workitem_pending[syncer_delayno];
			syncer_delayno += 1;
			if (syncer_delayno == syncer_maxdelay)
				syncer_delayno = 0;
			next = &syncer_workitem_pending[syncer_delayno];
			/*
			 * If the worklist has wrapped since it was
			 * emptied of all but syncer vnodes,
			 * switch to the FINAL_DELAY state and run
			 * for one more second.
			 */
			if (syncer_state == SYNCER_SHUTTING_DOWN &&
			    net_worklist_len == 0 &&
			    last_work_seen == syncer_delayno) {
				syncer_state = SYNCER_FINAL_DELAY;
				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
			}
		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
		    syncer_worklist_len > 0);

		/*
		 * Keep track of the last time there was anything
		 * on the worklist other than syncer vnodes.
		 * Return to the SHUTTING_DOWN state if any
		 * new work appears.
		 */
		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
			last_work_seen = syncer_delayno;
		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
			syncer_state = SYNCER_SHUTTING_DOWN;
		while (!LIST_EMPTY(slp)) {
			error = sync_vnode(slp, &bo, td);
			if (error == 1) {
				/*
				 * The bufobj could not be processed now;
				 * defer it to the next slot.
				 */
				LIST_REMOVE(bo, bo_synclist);
				LIST_INSERT_HEAD(next, bo, bo_synclist);
				continue;
			}

			/* Only once the shutdown progress report has begun. */
			if (first_printf == 0)
				wdog_kern_pat(WD_LASTVAL);

		}
		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
			syncer_final_iter--;
		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * Just sleep for a short period of time between
		 * iterations when shutting down to allow some I/O
		 * to happen.
		 *
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (syncer_state != SYNCER_RUNNING ||
		    time_uptime == starttime) {
			thread_lock(td);
			sched_prio(td, PPAUSE);
			thread_unlock(td);
		}
		if (syncer_state != SYNCER_RUNNING)
			cv_timedwait(&sync_wakeup, &sync_mtx,
			    hz / SYNCER_SHUTDOWN_SPEEDUP);
		else if (time_uptime == starttime)
			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
	}
}
1950 
1951 /*
1952  * Request the syncer daemon to speed up its work.
1953  * We never push it to speed up more than half of its
1954  * normal turn time, otherwise it could take over the cpu.
1955  */
1956 int
1957 speedup_syncer(void)
1958 {
1959 	int ret = 0;
1960 
1961 	mtx_lock(&sync_mtx);
1962 	if (rushjob < syncdelay / 2) {
1963 		rushjob += 1;
1964 		stat_rush_requests += 1;
1965 		ret = 1;
1966 	}
1967 	mtx_unlock(&sync_mtx);
1968 	cv_broadcast(&sync_wakeup);
1969 	return (ret);
1970 }
1971 
1972 /*
1973  * Tell the syncer to speed up its work and run though its work
1974  * list several times, then tell it to shut down.
1975  */
1976 static void
1977 syncer_shutdown(void *arg, int howto)
1978 {
1979 
1980 	if (howto & RB_NOSYNC)
1981 		return;
1982 	mtx_lock(&sync_mtx);
1983 	syncer_state = SYNCER_SHUTTING_DOWN;
1984 	rushjob = 0;
1985 	mtx_unlock(&sync_mtx);
1986 	cv_broadcast(&sync_wakeup);
1987 	kproc_shutdown(arg, howto);
1988 }
1989 
1990 /*
1991  * Reassign a buffer from one vnode to another.
1992  * Used to assign file specific control information
1993  * (indirect blocks) to the vnode to which they belong.
1994  */
1995 void
1996 reassignbuf(struct buf *bp)
1997 {
1998 	struct vnode *vp;
1999 	struct bufobj *bo;
2000 	int delay;
2001 #ifdef INVARIANTS
2002 	struct bufv *bv;
2003 #endif
2004 
2005 	vp = bp->b_vp;
2006 	bo = bp->b_bufobj;
2007 	++reassignbufcalls;
2008 
2009 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2010 	    bp, bp->b_vp, bp->b_flags);
2011 	/*
2012 	 * B_PAGING flagged buffers cannot be reassigned because their vp
2013 	 * is not fully linked in.
2014 	 */
2015 	if (bp->b_flags & B_PAGING)
2016 		panic("cannot reassign paging buffer");
2017 
2018 	/*
2019 	 * Delete from old vnode list, if on one.
2020 	 */
2021 	BO_LOCK(bo);
2022 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2023 		buf_vlist_remove(bp);
2024 	else
2025 		panic("reassignbuf: Buffer %p not on queue.", bp);
2026 	/*
2027 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
2028 	 * of clean buffers.
2029 	 */
2030 	if (bp->b_flags & B_DELWRI) {
2031 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2032 			switch (vp->v_type) {
2033 			case VDIR:
2034 				delay = dirdelay;
2035 				break;
2036 			case VCHR:
2037 				delay = metadelay;
2038 				break;
2039 			default:
2040 				delay = filedelay;
2041 			}
2042 			vn_syncer_add_to_worklist(bo, delay);
2043 		}
2044 		buf_vlist_add(bp, bo, BX_VNDIRTY);
2045 	} else {
2046 		buf_vlist_add(bp, bo, BX_VNCLEAN);
2047 
2048 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2049 			mtx_lock(&sync_mtx);
2050 			LIST_REMOVE(bo, bo_synclist);
2051 			syncer_worklist_len--;
2052 			mtx_unlock(&sync_mtx);
2053 			bo->bo_flag &= ~BO_ONWORKLST;
2054 		}
2055 	}
2056 #ifdef INVARIANTS
2057 	bv = &bo->bo_clean;
2058 	bp = TAILQ_FIRST(&bv->bv_hd);
2059 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2060 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2061 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2062 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2063 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2064 	bv = &bo->bo_dirty;
2065 	bp = TAILQ_FIRST(&bv->bv_hd);
2066 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2067 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2068 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2069 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2070 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2071 #endif
2072 	BO_UNLOCK(bo);
2073 }
2074 
2075 /*
2076  * Increment the use and hold counts on the vnode, taking care to reference
2077  * the driver's usecount if this is a chardev.  The vholdl() will remove
2078  * the vnode from the free list if it is presently free.  Requires the
2079  * vnode interlock and returns with it held.
2080  */
2081 static void
2082 v_incr_usecount(struct vnode *vp)
2083 {
2084 
2085 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2086 	vp->v_usecount++;
2087 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2088 		dev_lock();
2089 		vp->v_rdev->si_usecount++;
2090 		dev_unlock();
2091 	}
2092 	vholdl(vp);
2093 }
2094 
2095 /*
2096  * Turn a holdcnt into a use+holdcnt such that only one call to
2097  * v_decr_usecount is needed.
2098  */
2099 static void
2100 v_upgrade_usecount(struct vnode *vp)
2101 {
2102 
2103 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2104 	vp->v_usecount++;
2105 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2106 		dev_lock();
2107 		vp->v_rdev->si_usecount++;
2108 		dev_unlock();
2109 	}
2110 }
2111 
2112 /*
2113  * Decrement the vnode use and hold count along with the driver's usecount
2114  * if this is a chardev.  The vdropl() below releases the vnode interlock
2115  * as it may free the vnode.
2116  */
2117 static void
2118 v_decr_usecount(struct vnode *vp)
2119 {
2120 
2121 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2122 	VNASSERT(vp->v_usecount > 0, vp,
2123 	    ("v_decr_usecount: negative usecount"));
2124 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2125 	vp->v_usecount--;
2126 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2127 		dev_lock();
2128 		vp->v_rdev->si_usecount--;
2129 		dev_unlock();
2130 	}
2131 	vdropl(vp);
2132 }
2133 
2134 /*
2135  * Decrement only the use count and driver use count.  This is intended to
2136  * be paired with a follow on vdropl() to release the remaining hold count.
2137  * In this way we may vgone() a vnode with a 0 usecount without risk of
2138  * having it end up on a free list because the hold count is kept above 0.
2139  */
2140 static void
2141 v_decr_useonly(struct vnode *vp)
2142 {
2143 
2144 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2145 	VNASSERT(vp->v_usecount > 0, vp,
2146 	    ("v_decr_useonly: negative usecount"));
2147 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2148 	vp->v_usecount--;
2149 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2150 		dev_lock();
2151 		vp->v_rdev->si_usecount--;
2152 		dev_unlock();
2153 	}
2154 }
2155 
2156 /*
2157  * Grab a particular vnode from the free list, increment its
2158  * reference count and lock it.  VI_DOOMED is set if the vnode
2159  * is being destroyed.  Only callers who specify LK_RETRY will
2160  * see doomed vnodes.  If inactive processing was delayed in
2161  * vput try to do it here.
2162  */
2163 int
2164 vget(struct vnode *vp, int flags, struct thread *td)
2165 {
2166 	int error;
2167 
2168 	error = 0;
2169 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2170 	    ("vget: invalid lock operation"));
2171 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2172 
2173 	if ((flags & LK_INTERLOCK) == 0)
2174 		VI_LOCK(vp);
2175 	vholdl(vp);
2176 	if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2177 		vdrop(vp);
2178 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2179 		    vp);
2180 		return (error);
2181 	}
2182 	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2183 		panic("vget: vn_lock failed to return ENOENT\n");
2184 	VI_LOCK(vp);
2185 	/* Upgrade our holdcnt to a usecount. */
2186 	v_upgrade_usecount(vp);
2187 	/*
2188 	 * We don't guarantee that any particular close will
2189 	 * trigger inactive processing so just make a best effort
2190 	 * here at preventing a reference to a removed file.  If
2191 	 * we don't succeed no harm is done.
2192 	 */
2193 	if (vp->v_iflag & VI_OWEINACT) {
2194 		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2195 		    (flags & LK_NOWAIT) == 0)
2196 			vinactive(vp, td);
2197 		vp->v_iflag &= ~VI_OWEINACT;
2198 	}
2199 	VI_UNLOCK(vp);
2200 	return (0);
2201 }
2202 
2203 /*
2204  * Increase the reference count of a vnode.
2205  */
2206 void
2207 vref(struct vnode *vp)
2208 {
2209 
2210 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2211 	VI_LOCK(vp);
2212 	v_incr_usecount(vp);
2213 	VI_UNLOCK(vp);
2214 }
2215 
2216 /*
2217  * Return reference count of a vnode.
2218  *
2219  * The results of this call are only guaranteed when some mechanism other
2220  * than the VI lock is used to stop other processes from gaining references
2221  * to the vnode.  This may be the case if the caller holds the only reference.
2222  * This is also useful when stale data is acceptable as race conditions may
2223  * be accounted for by some other means.
2224  */
2225 int
2226 vrefcnt(struct vnode *vp)
2227 {
2228 	int usecnt;
2229 
2230 	VI_LOCK(vp);
2231 	usecnt = vp->v_usecount;
2232 	VI_UNLOCK(vp);
2233 
2234 	return (usecnt);
2235 }
2236 
2237 #define	VPUTX_VRELE	1
2238 #define	VPUTX_VPUT	2
2239 #define	VPUTX_VUNREF	3
2240 
2241 static void
2242 vputx(struct vnode *vp, int func)
2243 {
2244 	int error;
2245 
2246 	KASSERT(vp != NULL, ("vputx: null vp"));
2247 	if (func == VPUTX_VUNREF)
2248 		ASSERT_VOP_LOCKED(vp, "vunref");
2249 	else if (func == VPUTX_VPUT)
2250 		ASSERT_VOP_LOCKED(vp, "vput");
2251 	else
2252 		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2253 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2254 	VI_LOCK(vp);
2255 
2256 	/* Skip this v_writecount check if we're going to panic below. */
2257 	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2258 	    ("vputx: missed vn_close"));
2259 	error = 0;
2260 
2261 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2262 	    vp->v_usecount == 1)) {
2263 		if (func == VPUTX_VPUT)
2264 			VOP_UNLOCK(vp, 0);
2265 		v_decr_usecount(vp);
2266 		return;
2267 	}
2268 
2269 	if (vp->v_usecount != 1) {
2270 		vprint("vputx: negative ref count", vp);
2271 		panic("vputx: negative ref cnt");
2272 	}
2273 	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2274 	/*
2275 	 * We want to hold the vnode until the inactive finishes to
2276 	 * prevent vgone() races.  We drop the use count here and the
2277 	 * hold count below when we're done.
2278 	 */
2279 	v_decr_useonly(vp);
2280 	/*
2281 	 * We must call VOP_INACTIVE with the node locked. Mark
2282 	 * as VI_DOINGINACT to avoid recursion.
2283 	 */
2284 	vp->v_iflag |= VI_OWEINACT;
2285 	switch (func) {
2286 	case VPUTX_VRELE:
2287 		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2288 		VI_LOCK(vp);
2289 		break;
2290 	case VPUTX_VPUT:
2291 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2292 			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2293 			    LK_NOWAIT);
2294 			VI_LOCK(vp);
2295 		}
2296 		break;
2297 	case VPUTX_VUNREF:
2298 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
2299 			error = EBUSY;
2300 		break;
2301 	}
2302 	if (vp->v_usecount > 0)
2303 		vp->v_iflag &= ~VI_OWEINACT;
2304 	if (error == 0) {
2305 		if (vp->v_iflag & VI_OWEINACT)
2306 			vinactive(vp, curthread);
2307 		if (func != VPUTX_VUNREF)
2308 			VOP_UNLOCK(vp, 0);
2309 	}
2310 	vdropl(vp);
2311 }
2312 
2313 /*
2314  * Vnode put/release.
2315  * If count drops to zero, call inactive routine and return to freelist.
2316  */
2317 void
2318 vrele(struct vnode *vp)
2319 {
2320 
2321 	vputx(vp, VPUTX_VRELE);
2322 }
2323 
2324 /*
2325  * Release an already locked vnode.  This give the same effects as
2326  * unlock+vrele(), but takes less time and avoids releasing and
2327  * re-aquiring the lock (as vrele() acquires the lock internally.)
2328  */
2329 void
2330 vput(struct vnode *vp)
2331 {
2332 
2333 	vputx(vp, VPUTX_VPUT);
2334 }
2335 
2336 /*
2337  * Release an exclusively locked vnode. Do not unlock the vnode lock.
2338  */
2339 void
2340 vunref(struct vnode *vp)
2341 {
2342 
2343 	vputx(vp, VPUTX_VUNREF);
2344 }
2345 
/*
 * Take a hold reference on a vnode so that it is not recycled.
 * Wrapper around vholdl() that acquires and releases the vnode interlock.
 */
void
vhold(struct vnode *vp)
{

	VI_LOCK(vp);
	vholdl(vp);
	VI_UNLOCK(vp);
}
2357 
2358 /*
2359  * Increase the hold count and activate if this is the first reference.
2360  */
2361 void
2362 vholdl(struct vnode *vp)
2363 {
2364 	struct mount *mp;
2365 
2366 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2367 	vp->v_holdcnt++;
2368 	if (!VSHOULDBUSY(vp))
2369 		return;
2370 	ASSERT_VI_LOCKED(vp, "vholdl");
2371 	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
2372 	VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
2373 	/*
2374 	 * Remove a vnode from the free list, mark it as in use,
2375 	 * and put it on the active list.
2376 	 */
2377 	mtx_lock(&vnode_free_list_mtx);
2378 	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2379 	freevnodes--;
2380 	vp->v_iflag &= ~(VI_FREE|VI_AGE);
2381 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2382 	    ("Activating already active vnode"));
2383 	vp->v_iflag |= VI_ACTIVE;
2384 	mp = vp->v_mount;
2385 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2386 	mp->mnt_activevnodelistsize++;
2387 	mtx_unlock(&vnode_free_list_mtx);
2388 }
2389 
/*
 * Give up a hold reference; the counterpart of vhold().
 * The vnode interlock is acquired here and released inside vdropl().
 */
void
vdrop(struct vnode *vp)
{

	VI_LOCK(vp);
	vdropl(vp);
}
2401 
2402 /*
2403  * Drop the hold count of the vnode.  If this is the last reference to
2404  * the vnode we place it on the free list unless it has been vgone'd
2405  * (marked VI_DOOMED) in which case we will free it.
2406  */
2407 void
2408 vdropl(struct vnode *vp)
2409 {
2410 	struct bufobj *bo;
2411 	struct mount *mp;
2412 	int active;
2413 
2414 	ASSERT_VI_LOCKED(vp, "vdropl");
2415 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2416 	if (vp->v_holdcnt <= 0)
2417 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2418 	vp->v_holdcnt--;
2419 	if (vp->v_holdcnt > 0) {
2420 		VI_UNLOCK(vp);
2421 		return;
2422 	}
2423 	if ((vp->v_iflag & VI_DOOMED) == 0) {
2424 		/*
2425 		 * Mark a vnode as free: remove it from its active list
2426 		 * and put it up for recycling on the freelist.
2427 		 */
2428 		VNASSERT(vp->v_op != NULL, vp,
2429 		    ("vdropl: vnode already reclaimed."));
2430 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2431 		    ("vnode already free"));
2432 		VNASSERT(VSHOULDFREE(vp), vp,
2433 		    ("vdropl: freeing when we shouldn't"));
2434 		active = vp->v_iflag & VI_ACTIVE;
2435 		vp->v_iflag &= ~VI_ACTIVE;
2436 		mp = vp->v_mount;
2437 		mtx_lock(&vnode_free_list_mtx);
2438 		if (active) {
2439 			TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2440 			    v_actfreelist);
2441 			mp->mnt_activevnodelistsize--;
2442 		}
2443 		if (vp->v_iflag & VI_AGE) {
2444 			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist);
2445 		} else {
2446 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
2447 		}
2448 		freevnodes++;
2449 		vp->v_iflag &= ~VI_AGE;
2450 		vp->v_iflag |= VI_FREE;
2451 		mtx_unlock(&vnode_free_list_mtx);
2452 		VI_UNLOCK(vp);
2453 		return;
2454 	}
2455 	/*
2456 	 * The vnode has been marked for destruction, so free it.
2457 	 */
2458 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2459 	mtx_lock(&vnode_free_list_mtx);
2460 	numvnodes--;
2461 	mtx_unlock(&vnode_free_list_mtx);
2462 	bo = &vp->v_bufobj;
2463 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2464 	    ("cleaned vnode still on the free list."));
2465 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2466 	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2467 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2468 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2469 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2470 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2471 	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
2472 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2473 	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
2474 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2475 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2476 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2477 	VI_UNLOCK(vp);
2478 #ifdef MAC
2479 	mac_vnode_destroy(vp);
2480 #endif
2481 	if (vp->v_pollinfo != NULL)
2482 		destroy_vpollinfo(vp->v_pollinfo);
2483 #ifdef INVARIANTS
2484 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2485 	vp->v_op = NULL;
2486 #endif
2487 	rangelock_destroy(&vp->v_rl);
2488 	lockdestroy(vp->v_vnlock);
2489 	mtx_destroy(&vp->v_interlock);
2490 	mtx_destroy(BO_MTX(bo));
2491 	uma_zfree(vnode_zone, vp);
2492 }
2493 
2494 /*
2495  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2496  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2497  * OWEINACT tracks whether a vnode missed a call to inactive due to a
2498  * failed lock upgrade.
2499  */
2500 void
2501 vinactive(struct vnode *vp, struct thread *td)
2502 {
2503 	struct vm_object *obj;
2504 
2505 	ASSERT_VOP_ELOCKED(vp, "vinactive");
2506 	ASSERT_VI_LOCKED(vp, "vinactive");
2507 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2508 	    ("vinactive: recursed on VI_DOINGINACT"));
2509 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2510 	vp->v_iflag |= VI_DOINGINACT;
2511 	vp->v_iflag &= ~VI_OWEINACT;
2512 	VI_UNLOCK(vp);
2513 	/*
2514 	 * Before moving off the active list, we must be sure that any
2515 	 * modified pages are on the vnode's dirty list since these will
2516 	 * no longer be checked once the vnode is on the inactive list.
2517 	 * Because the vnode vm object keeps a hold reference on the vnode
2518 	 * if there is at least one resident non-cached page, the vnode
2519 	 * cannot leave the active list without the page cleanup done.
2520 	 */
2521 	obj = vp->v_object;
2522 	if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2523 		VM_OBJECT_LOCK(obj);
2524 		vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2525 		VM_OBJECT_UNLOCK(obj);
2526 	}
2527 	VOP_INACTIVE(vp, td);
2528 	VI_LOCK(vp);
2529 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2530 	    ("vinactive: lost VI_DOINGINACT"));
2531 	vp->v_iflag &= ~VI_DOINGINACT;
2532 }
2533 
2534 /*
2535  * Remove any vnodes in the vnode table belonging to mount point mp.
2536  *
2537  * If FORCECLOSE is not specified, there should not be any active ones,
2538  * return error if any are found (nb: this is a user error, not a
2539  * system error). If FORCECLOSE is specified, detach any active vnodes
2540  * that are found.
2541  *
2542  * If WRITECLOSE is set, only flush out regular file vnodes open for
2543  * writing.
2544  *
2545  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2546  *
2547  * `rootrefs' specifies the base reference count for the root vnode
2548  * of this filesystem. The root vnode is considered busy if its
2549  * v_usecount exceeds this value. On a successful return, vflush(, td)
2550  * will call vrele() on the root vnode exactly rootrefs times.
2551  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2552  * be zero.
2553  */
2554 #ifdef DIAGNOSTIC
2555 static int busyprt = 0;		/* print out busy vnodes */
2556 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2557 #endif
2558 
2559 int
2560 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2561 {
2562 	struct vnode *vp, *mvp, *rootvp = NULL;
2563 	struct vattr vattr;
2564 	int busy = 0, error;
2565 
2566 	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2567 	    rootrefs, flags);
2568 	if (rootrefs > 0) {
2569 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2570 		    ("vflush: bad args"));
2571 		/*
2572 		 * Get the filesystem root vnode. We can vput() it
2573 		 * immediately, since with rootrefs > 0, it won't go away.
2574 		 */
2575 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2576 			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2577 			    __func__, error);
2578 			return (error);
2579 		}
2580 		vput(rootvp);
2581 	}
2582 loop:
2583 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2584 		vholdl(vp);
2585 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2586 		if (error) {
2587 			vdrop(vp);
2588 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2589 			goto loop;
2590 		}
2591 		/*
2592 		 * Skip over a vnodes marked VV_SYSTEM.
2593 		 */
2594 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2595 			VOP_UNLOCK(vp, 0);
2596 			vdrop(vp);
2597 			continue;
2598 		}
2599 		/*
2600 		 * If WRITECLOSE is set, flush out unlinked but still open
2601 		 * files (even if open only for reading) and regular file
2602 		 * vnodes open for writing.
2603 		 */
2604 		if (flags & WRITECLOSE) {
2605 			if (vp->v_object != NULL) {
2606 				VM_OBJECT_LOCK(vp->v_object);
2607 				vm_object_page_clean(vp->v_object, 0, 0, 0);
2608 				VM_OBJECT_UNLOCK(vp->v_object);
2609 			}
2610 			error = VOP_FSYNC(vp, MNT_WAIT, td);
2611 			if (error != 0) {
2612 				VOP_UNLOCK(vp, 0);
2613 				vdrop(vp);
2614 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2615 				return (error);
2616 			}
2617 			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2618 			VI_LOCK(vp);
2619 
2620 			if ((vp->v_type == VNON ||
2621 			    (error == 0 && vattr.va_nlink > 0)) &&
2622 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2623 				VOP_UNLOCK(vp, 0);
2624 				vdropl(vp);
2625 				continue;
2626 			}
2627 		} else
2628 			VI_LOCK(vp);
2629 		/*
2630 		 * With v_usecount == 0, all we need to do is clear out the
2631 		 * vnode data structures and we are done.
2632 		 *
2633 		 * If FORCECLOSE is set, forcibly close the vnode.
2634 		 */
2635 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2636 			VNASSERT(vp->v_usecount == 0 ||
2637 			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2638 			    ("device VNODE %p is FORCECLOSED", vp));
2639 			vgonel(vp);
2640 		} else {
2641 			busy++;
2642 #ifdef DIAGNOSTIC
2643 			if (busyprt)
2644 				vprint("vflush: busy vnode", vp);
2645 #endif
2646 		}
2647 		VOP_UNLOCK(vp, 0);
2648 		vdropl(vp);
2649 	}
2650 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2651 		/*
2652 		 * If just the root vnode is busy, and if its refcount
2653 		 * is equal to `rootrefs', then go ahead and kill it.
2654 		 */
2655 		VI_LOCK(rootvp);
2656 		KASSERT(busy > 0, ("vflush: not busy"));
2657 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2658 		    ("vflush: usecount %d < rootrefs %d",
2659 		     rootvp->v_usecount, rootrefs));
2660 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2661 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2662 			vgone(rootvp);
2663 			VOP_UNLOCK(rootvp, 0);
2664 			busy = 0;
2665 		} else
2666 			VI_UNLOCK(rootvp);
2667 	}
2668 	if (busy) {
2669 		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2670 		    busy);
2671 		return (EBUSY);
2672 	}
2673 	for (; rootrefs > 0; rootrefs--)
2674 		vrele(rootvp);
2675 	return (0);
2676 }
2677 
2678 /*
2679  * Recycle an unused vnode to the front of the free list.
2680  */
2681 int
2682 vrecycle(struct vnode *vp)
2683 {
2684 	int recycled;
2685 
2686 	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2687 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2688 	recycled = 0;
2689 	VI_LOCK(vp);
2690 	if (vp->v_usecount == 0) {
2691 		recycled = 1;
2692 		vgonel(vp);
2693 	}
2694 	VI_UNLOCK(vp);
2695 	return (recycled);
2696 }
2697 
/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Wrapper around vgonel() that acquires the vnode interlock
 * first and releases it afterwards.
 */
void
vgone(struct vnode *vp)
{

	VI_LOCK(vp);
	vgonel(vp);
	VI_UNLOCK(vp);
}
2709 
2710 static void
2711 vgonel_reclaim_lowervp_vfs(struct mount *mp __unused,
2712     struct vnode *lowervp __unused)
2713 {
2714 }
2715 
2716 /*
2717  * Notify upper mounts about reclaimed vnode.
2718  */
2719 static void
2720 vgonel_reclaim_lowervp(struct vnode *vp)
2721 {
2722 	static struct vfsops vgonel_vfsops = {
2723 		.vfs_reclaim_lowervp = vgonel_reclaim_lowervp_vfs
2724 	};
2725 	struct mount *mp, *ump, *mmp;
2726 
2727 	mp = vp->v_mount;
2728 	if (mp == NULL)
2729 		return;
2730 
2731 	MNT_ILOCK(mp);
2732 	if (TAILQ_EMPTY(&mp->mnt_uppers))
2733 		goto unlock;
2734 	MNT_IUNLOCK(mp);
2735 	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2736 	mmp->mnt_op = &vgonel_vfsops;
2737 	mmp->mnt_kern_flag |= MNTK_MARKER;
2738 	MNT_ILOCK(mp);
2739 	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
2740 	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
2741 		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
2742 			ump = TAILQ_NEXT(ump, mnt_upper_link);
2743 			continue;
2744 		}
2745 		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
2746 		MNT_IUNLOCK(mp);
2747 		VFS_RECLAIM_LOWERVP(ump, vp);
2748 		MNT_ILOCK(mp);
2749 		ump = TAILQ_NEXT(mmp, mnt_upper_link);
2750 		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
2751 	}
2752 	free(mmp, M_TEMP);
2753 	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
2754 	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
2755 		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
2756 		wakeup(&mp->mnt_uppers);
2757 	}
2758 unlock:
2759 	MNT_IUNLOCK(mp);
2760 }
2761 
2762 /*
2763  * vgone, with the vp interlock held.
2764  */
2765 void
2766 vgonel(struct vnode *vp)
2767 {
2768 	struct thread *td;
2769 	int oweinact;
2770 	int active;
2771 	struct mount *mp;
2772 
2773 	ASSERT_VOP_ELOCKED(vp, "vgonel");
2774 	ASSERT_VI_LOCKED(vp, "vgonel");
2775 	VNASSERT(vp->v_holdcnt, vp,
2776 	    ("vgonel: vp %p has no reference.", vp));
2777 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2778 	td = curthread;
2779 
2780 	/*
2781 	 * Don't vgonel if we're already doomed.
2782 	 */
2783 	if (vp->v_iflag & VI_DOOMED)
2784 		return;
2785 	vp->v_iflag |= VI_DOOMED;
2786 
2787 	/*
2788 	 * Check to see if the vnode is in use.  If so, we have to call
2789 	 * VOP_CLOSE() and VOP_INACTIVE().
2790 	 */
2791 	active = vp->v_usecount;
2792 	oweinact = (vp->v_iflag & VI_OWEINACT);
2793 	VI_UNLOCK(vp);
2794 	vgonel_reclaim_lowervp(vp);
2795 
2796 	/*
2797 	 * Clean out any buffers associated with the vnode.
2798 	 * If the flush fails, just toss the buffers.
2799 	 */
2800 	mp = NULL;
2801 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2802 		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2803 	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
2804 		vinvalbuf(vp, 0, 0, 0);
2805 
2806 	/*
2807 	 * If purging an active vnode, it must be closed and
2808 	 * deactivated before being reclaimed.
2809 	 */
2810 	if (active)
2811 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2812 	if (oweinact || active) {
2813 		VI_LOCK(vp);
2814 		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2815 			vinactive(vp, td);
2816 		VI_UNLOCK(vp);
2817 	}
2818 	if (vp->v_type == VSOCK)
2819 		vfs_unp_reclaim(vp);
2820 	/*
2821 	 * Reclaim the vnode.
2822 	 */
2823 	if (VOP_RECLAIM(vp, td))
2824 		panic("vgone: cannot reclaim");
2825 	if (mp != NULL)
2826 		vn_finished_secondary_write(mp);
2827 	VNASSERT(vp->v_object == NULL, vp,
2828 	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2829 	/*
2830 	 * Clear the advisory locks and wake up waiting threads.
2831 	 */
2832 	(void)VOP_ADVLOCKPURGE(vp);
2833 	/*
2834 	 * Delete from old mount point vnode list.
2835 	 */
2836 	delmntque(vp);
2837 	cache_purge(vp);
2838 	/*
2839 	 * Done with purge, reset to the standard lock and invalidate
2840 	 * the vnode.
2841 	 */
2842 	VI_LOCK(vp);
2843 	vp->v_vnlock = &vp->v_lock;
2844 	vp->v_op = &dead_vnodeops;
2845 	vp->v_tag = "none";
2846 	vp->v_type = VBAD;
2847 }
2848 
2849 /*
2850  * Calculate the total number of references to a special device.
2851  */
2852 int
2853 vcount(struct vnode *vp)
2854 {
2855 	int count;
2856 
2857 	dev_lock();
2858 	count = vp->v_rdev->si_usecount;
2859 	dev_unlock();
2860 	return (count);
2861 }
2862 
2863 /*
2864  * Same as above, but using the struct cdev *as argument
2865  */
2866 int
2867 count_dev(struct cdev *dev)
2868 {
2869 	int count;
2870 
2871 	dev_lock();
2872 	count = dev->si_usecount;
2873 	dev_unlock();
2874 	return(count);
2875 }
2876 
2877 /*
2878  * Print out a description of a vnode.
2879  */
2880 static char *typename[] =
2881 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2882  "VMARKER"};
2883 
2884 void
2885 vn_printf(struct vnode *vp, const char *fmt, ...)
2886 {
2887 	va_list ap;
2888 	char buf[256], buf2[16];
2889 	u_long flags;
2890 
2891 	va_start(ap, fmt);
2892 	vprintf(fmt, ap);
2893 	va_end(ap);
2894 	printf("%p: ", (void *)vp);
2895 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2896 	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2897 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2898 	buf[0] = '\0';
2899 	buf[1] = '\0';
2900 	if (vp->v_vflag & VV_ROOT)
2901 		strlcat(buf, "|VV_ROOT", sizeof(buf));
2902 	if (vp->v_vflag & VV_ISTTY)
2903 		strlcat(buf, "|VV_ISTTY", sizeof(buf));
2904 	if (vp->v_vflag & VV_NOSYNC)
2905 		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2906 	if (vp->v_vflag & VV_ETERNALDEV)
2907 		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
2908 	if (vp->v_vflag & VV_CACHEDLABEL)
2909 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2910 	if (vp->v_vflag & VV_TEXT)
2911 		strlcat(buf, "|VV_TEXT", sizeof(buf));
2912 	if (vp->v_vflag & VV_COPYONWRITE)
2913 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2914 	if (vp->v_vflag & VV_SYSTEM)
2915 		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2916 	if (vp->v_vflag & VV_PROCDEP)
2917 		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2918 	if (vp->v_vflag & VV_NOKNOTE)
2919 		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2920 	if (vp->v_vflag & VV_DELETED)
2921 		strlcat(buf, "|VV_DELETED", sizeof(buf));
2922 	if (vp->v_vflag & VV_MD)
2923 		strlcat(buf, "|VV_MD", sizeof(buf));
2924 	if (vp->v_vflag & VV_FORCEINSMQ)
2925 		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
2926 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
2927 	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2928 	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
2929 	if (flags != 0) {
2930 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2931 		strlcat(buf, buf2, sizeof(buf));
2932 	}
2933 	if (vp->v_iflag & VI_MOUNT)
2934 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
2935 	if (vp->v_iflag & VI_AGE)
2936 		strlcat(buf, "|VI_AGE", sizeof(buf));
2937 	if (vp->v_iflag & VI_DOOMED)
2938 		strlcat(buf, "|VI_DOOMED", sizeof(buf));
2939 	if (vp->v_iflag & VI_FREE)
2940 		strlcat(buf, "|VI_FREE", sizeof(buf));
2941 	if (vp->v_iflag & VI_ACTIVE)
2942 		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
2943 	if (vp->v_iflag & VI_DOINGINACT)
2944 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2945 	if (vp->v_iflag & VI_OWEINACT)
2946 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2947 	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2948 	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
2949 	if (flags != 0) {
2950 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2951 		strlcat(buf, buf2, sizeof(buf));
2952 	}
2953 	printf("    flags (%s)\n", buf + 1);
2954 	if (mtx_owned(VI_MTX(vp)))
2955 		printf(" VI_LOCKed");
2956 	if (vp->v_object != NULL)
2957 		printf("    v_object %p ref %d pages %d\n",
2958 		    vp->v_object, vp->v_object->ref_count,
2959 		    vp->v_object->resident_page_count);
2960 	printf("    ");
2961 	lockmgr_printinfo(vp->v_vnlock);
2962 	if (vp->v_data != NULL)
2963 		VOP_PRINT(vp);
2964 }
2965 
2966 #ifdef DDB
2967 /*
2968  * List all of the locked vnodes in the system.
2969  * Called when debugging the kernel.
2970  */
2971 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2972 {
2973 	struct mount *mp, *nmp;
2974 	struct vnode *vp;
2975 
2976 	/*
2977 	 * Note: because this is DDB, we can't obey the locking semantics
2978 	 * for these structures, which means we could catch an inconsistent
2979 	 * state and dereference a nasty pointer.  Not much to be done
2980 	 * about that.
2981 	 */
2982 	db_printf("Locked vnodes\n");
2983 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2984 		nmp = TAILQ_NEXT(mp, mnt_list);
2985 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2986 			if (vp->v_type != VMARKER &&
2987 			    VOP_ISLOCKED(vp))
2988 				vprint("", vp);
2989 		}
2990 		nmp = TAILQ_NEXT(mp, mnt_list);
2991 	}
2992 }
2993 
2994 /*
2995  * Show details about the given vnode.
2996  */
2997 DB_SHOW_COMMAND(vnode, db_show_vnode)
2998 {
2999 	struct vnode *vp;
3000 
3001 	if (!have_addr)
3002 		return;
3003 	vp = (struct vnode *)addr;
3004 	vn_printf(vp, "vnode ");
3005 }
3006 
3007 /*
3008  * Show details about the given mount point.
3009  */
3010 DB_SHOW_COMMAND(mount, db_show_mount)
3011 {
3012 	struct mount *mp;
3013 	struct vfsopt *opt;
3014 	struct statfs *sp;
3015 	struct vnode *vp;
3016 	char buf[512];
3017 	uint64_t mflags;
3018 	u_int flags;
3019 
3020 	if (!have_addr) {
3021 		/* No address given, print short info about all mount points. */
3022 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3023 			db_printf("%p %s on %s (%s)\n", mp,
3024 			    mp->mnt_stat.f_mntfromname,
3025 			    mp->mnt_stat.f_mntonname,
3026 			    mp->mnt_stat.f_fstypename);
3027 			if (db_pager_quit)
3028 				break;
3029 		}
3030 		db_printf("\nMore info: show mount <addr>\n");
3031 		return;
3032 	}
3033 
3034 	mp = (struct mount *)addr;
3035 	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3036 	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3037 
3038 	buf[0] = '\0';
3039 	mflags = mp->mnt_flag;
3040 #define	MNT_FLAG(flag)	do {						\
3041 	if (mflags & (flag)) {						\
3042 		if (buf[0] != '\0')					\
3043 			strlcat(buf, ", ", sizeof(buf));		\
3044 		strlcat(buf, (#flag) + 4, sizeof(buf));			\
3045 		mflags &= ~(flag);					\
3046 	}								\
3047 } while (0)
3048 	MNT_FLAG(MNT_RDONLY);
3049 	MNT_FLAG(MNT_SYNCHRONOUS);
3050 	MNT_FLAG(MNT_NOEXEC);
3051 	MNT_FLAG(MNT_NOSUID);
3052 	MNT_FLAG(MNT_NFS4ACLS);
3053 	MNT_FLAG(MNT_UNION);
3054 	MNT_FLAG(MNT_ASYNC);
3055 	MNT_FLAG(MNT_SUIDDIR);
3056 	MNT_FLAG(MNT_SOFTDEP);
3057 	MNT_FLAG(MNT_NOSYMFOLLOW);
3058 	MNT_FLAG(MNT_GJOURNAL);
3059 	MNT_FLAG(MNT_MULTILABEL);
3060 	MNT_FLAG(MNT_ACLS);
3061 	MNT_FLAG(MNT_NOATIME);
3062 	MNT_FLAG(MNT_NOCLUSTERR);
3063 	MNT_FLAG(MNT_NOCLUSTERW);
3064 	MNT_FLAG(MNT_SUJ);
3065 	MNT_FLAG(MNT_EXRDONLY);
3066 	MNT_FLAG(MNT_EXPORTED);
3067 	MNT_FLAG(MNT_DEFEXPORTED);
3068 	MNT_FLAG(MNT_EXPORTANON);
3069 	MNT_FLAG(MNT_EXKERB);
3070 	MNT_FLAG(MNT_EXPUBLIC);
3071 	MNT_FLAG(MNT_LOCAL);
3072 	MNT_FLAG(MNT_QUOTA);
3073 	MNT_FLAG(MNT_ROOTFS);
3074 	MNT_FLAG(MNT_USER);
3075 	MNT_FLAG(MNT_IGNORE);
3076 	MNT_FLAG(MNT_UPDATE);
3077 	MNT_FLAG(MNT_DELEXPORT);
3078 	MNT_FLAG(MNT_RELOAD);
3079 	MNT_FLAG(MNT_FORCE);
3080 	MNT_FLAG(MNT_SNAPSHOT);
3081 	MNT_FLAG(MNT_BYFSID);
3082 #undef MNT_FLAG
3083 	if (mflags != 0) {
3084 		if (buf[0] != '\0')
3085 			strlcat(buf, ", ", sizeof(buf));
3086 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3087 		    "0x%016jx", mflags);
3088 	}
3089 	db_printf("    mnt_flag = %s\n", buf);
3090 
3091 	buf[0] = '\0';
3092 	flags = mp->mnt_kern_flag;
3093 #define	MNT_KERN_FLAG(flag)	do {					\
3094 	if (flags & (flag)) {						\
3095 		if (buf[0] != '\0')					\
3096 			strlcat(buf, ", ", sizeof(buf));		\
3097 		strlcat(buf, (#flag) + 5, sizeof(buf));			\
3098 		flags &= ~(flag);					\
3099 	}								\
3100 } while (0)
3101 	MNT_KERN_FLAG(MNTK_UNMOUNTF);
3102 	MNT_KERN_FLAG(MNTK_ASYNC);
3103 	MNT_KERN_FLAG(MNTK_SOFTDEP);
3104 	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3105 	MNT_KERN_FLAG(MNTK_DRAINING);
3106 	MNT_KERN_FLAG(MNTK_REFEXPIRE);
3107 	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3108 	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3109 	MNT_KERN_FLAG(MNTK_NO_IOPF);
3110 	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3111 	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3112 	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3113 	MNT_KERN_FLAG(MNTK_MARKER);
3114 	MNT_KERN_FLAG(MNTK_NOASYNC);
3115 	MNT_KERN_FLAG(MNTK_UNMOUNT);
3116 	MNT_KERN_FLAG(MNTK_MWAIT);
3117 	MNT_KERN_FLAG(MNTK_SUSPEND);
3118 	MNT_KERN_FLAG(MNTK_SUSPEND2);
3119 	MNT_KERN_FLAG(MNTK_SUSPENDED);
3120 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3121 	MNT_KERN_FLAG(MNTK_NOKNOTE);
3122 #undef MNT_KERN_FLAG
3123 	if (flags != 0) {
3124 		if (buf[0] != '\0')
3125 			strlcat(buf, ", ", sizeof(buf));
3126 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3127 		    "0x%08x", flags);
3128 	}
3129 	db_printf("    mnt_kern_flag = %s\n", buf);
3130 
3131 	db_printf("    mnt_opt = ");
3132 	opt = TAILQ_FIRST(mp->mnt_opt);
3133 	if (opt != NULL) {
3134 		db_printf("%s", opt->name);
3135 		opt = TAILQ_NEXT(opt, link);
3136 		while (opt != NULL) {
3137 			db_printf(", %s", opt->name);
3138 			opt = TAILQ_NEXT(opt, link);
3139 		}
3140 	}
3141 	db_printf("\n");
3142 
3143 	sp = &mp->mnt_stat;
3144 	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3145 	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3146 	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3147 	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3148 	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3149 	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3150 	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3151 	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3152 	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3153 	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3154 	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3155 	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3156 
3157 	db_printf("    mnt_cred = { uid=%u ruid=%u",
3158 	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3159 	if (jailed(mp->mnt_cred))
3160 		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3161 	db_printf(" }\n");
3162 	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3163 	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3164 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3165 	db_printf("    mnt_activevnodelistsize = %d\n",
3166 	    mp->mnt_activevnodelistsize);
3167 	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3168 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3169 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3170 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3171 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3172 	db_printf("    mnt_secondary_accwrites = %d\n",
3173 	    mp->mnt_secondary_accwrites);
3174 	db_printf("    mnt_gjprovider = %s\n",
3175 	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3176 
3177 	db_printf("\n\nList of active vnodes\n");
3178 	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3179 		if (vp->v_type != VMARKER) {
3180 			vn_printf(vp, "vnode ");
3181 			if (db_pager_quit)
3182 				break;
3183 		}
3184 	}
3185 	db_printf("\n\nList of inactive vnodes\n");
3186 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3187 		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3188 			vn_printf(vp, "vnode ");
3189 			if (db_pager_quit)
3190 				break;
3191 		}
3192 	}
3193 }
3194 #endif	/* DDB */
3195 
3196 /*
3197  * Fill in a struct xvfsconf based on a struct vfsconf.
3198  */
3199 static int
3200 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3201 {
3202 	struct xvfsconf xvfsp;
3203 
3204 	bzero(&xvfsp, sizeof(xvfsp));
3205 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3206 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3207 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3208 	xvfsp.vfc_flags = vfsp->vfc_flags;
3209 	/*
3210 	 * These are unused in userland, we keep them
3211 	 * to not break binary compatibility.
3212 	 */
3213 	xvfsp.vfc_vfsops = NULL;
3214 	xvfsp.vfc_next = NULL;
3215 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3216 }
3217 
#ifdef COMPAT_FREEBSD32
/*
 * Layout of struct xvfsconf for 32-bit requests: the two pointer
 * members are replaced by 32-bit integers.
 */
struct xvfsconf32 {
	uint32_t	vfc_vfsops;
	char		vfc_name[MFSNAMELEN];
	int32_t		vfc_typenum;
	int32_t		vfc_refcount;
	int32_t		vfc_flags;
	uint32_t	vfc_next;
};

/*
 * Build a struct xvfsconf32 from the given struct vfsconf and hand it to
 * the sysctl request.  Returns whatever SYSCTL_OUT() returns.
 */
static int
vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
{
	struct xvfsconf32 xvfsp;

	/*
	 * Zero the structure first.  The whole object is copied out, and
	 * strcpy() only writes up to the terminating NUL, so without this
	 * the tail of vfc_name would carry uninitialized stack bytes.
	 */
	bzero(&xvfsp, sizeof(xvfsp));
	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
	xvfsp.vfc_typenum = vfsp->vfc_typenum;
	xvfsp.vfc_refcount = vfsp->vfc_refcount;
	xvfsp.vfc_flags = vfsp->vfc_flags;
	/* Pointer slots are reported as zero. */
	xvfsp.vfc_vfsops = 0;
	xvfsp.vfc_next = 0;
	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}
#endif
3242 
3243 /*
3244  * Top level filesystem related information gathering.
3245  */
3246 static int
3247 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3248 {
3249 	struct vfsconf *vfsp;
3250 	int error;
3251 
3252 	error = 0;
3253 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3254 #ifdef COMPAT_FREEBSD32
3255 		if (req->flags & SCTL_MASK32)
3256 			error = vfsconf2x32(req, vfsp);
3257 		else
3258 #endif
3259 			error = vfsconf2x(req, vfsp);
3260 		if (error)
3261 			break;
3262 	}
3263 	return (error);
3264 }
3265 
3266 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD,
3267     NULL, 0, sysctl_vfs_conflist,
3268     "S,xvfsconf", "List of all configured filesystems");
3269 
3270 #ifndef BURN_BRIDGES
3271 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3272 
3273 static int
3274 vfs_sysctl(SYSCTL_HANDLER_ARGS)
3275 {
3276 	int *name = (int *)arg1 - 1;	/* XXX */
3277 	u_int namelen = arg2 + 1;	/* XXX */
3278 	struct vfsconf *vfsp;
3279 
3280 	log(LOG_WARNING, "userland calling deprecated sysctl, "
3281 	    "please rebuild world\n");
3282 
3283 #if 1 || defined(COMPAT_PRELITE2)
3284 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3285 	if (namelen == 1)
3286 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3287 #endif
3288 
3289 	switch (name[1]) {
3290 	case VFS_MAXTYPENUM:
3291 		if (namelen != 2)
3292 			return (ENOTDIR);
3293 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3294 	case VFS_CONF:
3295 		if (namelen != 3)
3296 			return (ENOTDIR);	/* overloaded */
3297 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
3298 			if (vfsp->vfc_typenum == name[2])
3299 				break;
3300 		if (vfsp == NULL)
3301 			return (EOPNOTSUPP);
3302 #ifdef COMPAT_FREEBSD32
3303 		if (req->flags & SCTL_MASK32)
3304 			return (vfsconf2x32(req, vfsp));
3305 		else
3306 #endif
3307 			return (vfsconf2x(req, vfsp));
3308 	}
3309 	return (EOPNOTSUPP);
3310 }
3311 
3312 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
3313     vfs_sysctl, "Generic filesystem");
3314 
3315 #if 1 || defined(COMPAT_PRELITE2)
3316 
3317 static int
3318 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3319 {
3320 	int error;
3321 	struct vfsconf *vfsp;
3322 	struct ovfsconf ovfs;
3323 
3324 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3325 		bzero(&ovfs, sizeof(ovfs));
3326 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3327 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3328 		ovfs.vfc_index = vfsp->vfc_typenum;
3329 		ovfs.vfc_refcount = vfsp->vfc_refcount;
3330 		ovfs.vfc_flags = vfsp->vfc_flags;
3331 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3332 		if (error)
3333 			return error;
3334 	}
3335 	return 0;
3336 }
3337 
3338 #endif /* 1 || COMPAT_PRELITE2 */
3339 #endif /* !BURN_BRIDGES */
3340 
3341 #define KINFO_VNODESLOP		10
3342 #ifdef notyet
3343 /*
3344  * Dump vnode list (via sysctl).
3345  */
3346 /* ARGSUSED */
3347 static int
3348 sysctl_vnode(SYSCTL_HANDLER_ARGS)
3349 {
3350 	struct xvnode *xvn;
3351 	struct mount *mp;
3352 	struct vnode *vp;
3353 	int error, len, n;
3354 
3355 	/*
3356 	 * Stale numvnodes access is not fatal here.
3357 	 */
3358 	req->lock = 0;
3359 	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3360 	if (!req->oldptr)
3361 		/* Make an estimate */
3362 		return (SYSCTL_OUT(req, 0, len));
3363 
3364 	error = sysctl_wire_old_buffer(req, 0);
3365 	if (error != 0)
3366 		return (error);
3367 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3368 	n = 0;
3369 	mtx_lock(&mountlist_mtx);
3370 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3371 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3372 			continue;
3373 		MNT_ILOCK(mp);
3374 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3375 			if (n == len)
3376 				break;
3377 			vref(vp);
3378 			xvn[n].xv_size = sizeof *xvn;
3379 			xvn[n].xv_vnode = vp;
3380 			xvn[n].xv_id = 0;	/* XXX compat */
3381 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3382 			XV_COPY(usecount);
3383 			XV_COPY(writecount);
3384 			XV_COPY(holdcnt);
3385 			XV_COPY(mount);
3386 			XV_COPY(numoutput);
3387 			XV_COPY(type);
3388 #undef XV_COPY
3389 			xvn[n].xv_flag = vp->v_vflag;
3390 
3391 			switch (vp->v_type) {
3392 			case VREG:
3393 			case VDIR:
3394 			case VLNK:
3395 				break;
3396 			case VBLK:
3397 			case VCHR:
3398 				if (vp->v_rdev == NULL) {
3399 					vrele(vp);
3400 					continue;
3401 				}
3402 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3403 				break;
3404 			case VSOCK:
3405 				xvn[n].xv_socket = vp->v_socket;
3406 				break;
3407 			case VFIFO:
3408 				xvn[n].xv_fifo = vp->v_fifoinfo;
3409 				break;
3410 			case VNON:
3411 			case VBAD:
3412 			default:
3413 				/* shouldn't happen? */
3414 				vrele(vp);
3415 				continue;
3416 			}
3417 			vrele(vp);
3418 			++n;
3419 		}
3420 		MNT_IUNLOCK(mp);
3421 		mtx_lock(&mountlist_mtx);
3422 		vfs_unbusy(mp);
3423 		if (n == len)
3424 			break;
3425 	}
3426 	mtx_unlock(&mountlist_mtx);
3427 
3428 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3429 	free(xvn, M_TEMP);
3430 	return (error);
3431 }
3432 
3433 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
3434     0, 0, sysctl_vnode, "S,xvnode", "");
3435 #endif
3436 
3437 /*
3438  * Unmount all filesystems. The list is traversed in reverse order
3439  * of mounting to avoid dependencies.
3440  */
3441 void
3442 vfs_unmountall(void)
3443 {
3444 	struct mount *mp;
3445 	struct thread *td;
3446 	int error;
3447 
3448 	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3449 	td = curthread;
3450 
3451 	/*
3452 	 * Since this only runs when rebooting, it is not interlocked.
3453 	 */
3454 	while(!TAILQ_EMPTY(&mountlist)) {
3455 		mp = TAILQ_LAST(&mountlist, mntlist);
3456 		error = dounmount(mp, MNT_FORCE, td);
3457 		if (error) {
3458 			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3459 			/*
3460 			 * XXX: Due to the way in which we mount the root
3461 			 * file system off of devfs, devfs will generate a
3462 			 * "busy" warning when we try to unmount it before
3463 			 * the root.  Don't print a warning as a result in
3464 			 * order to avoid false positive errors that may
3465 			 * cause needless upset.
3466 			 */
3467 			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3468 				printf("unmount of %s failed (",
3469 				    mp->mnt_stat.f_mntonname);
3470 				if (error == EBUSY)
3471 					printf("BUSY)\n");
3472 				else
3473 					printf("%d)\n", error);
3474 			}
3475 		} else {
3476 			/* The unmount has removed mp from the mountlist */
3477 		}
3478 	}
3479 }
3480 
3481 /*
3482  * perform msync on all vnodes under a mount point
3483  * the mount point must be locked.
3484  */
3485 void
3486 vfs_msync(struct mount *mp, int flags)
3487 {
3488 	struct vnode *vp, *mvp;
3489 	struct vm_object *obj;
3490 
3491 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3492 	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3493 		obj = vp->v_object;
3494 		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3495 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3496 			if (!vget(vp,
3497 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3498 			    curthread)) {
3499 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3500 					vput(vp);
3501 					continue;
3502 				}
3503 
3504 				obj = vp->v_object;
3505 				if (obj != NULL) {
3506 					VM_OBJECT_LOCK(obj);
3507 					vm_object_page_clean(obj, 0, 0,
3508 					    flags == MNT_WAIT ?
3509 					    OBJPC_SYNC : OBJPC_NOSYNC);
3510 					VM_OBJECT_UNLOCK(obj);
3511 				}
3512 				vput(vp);
3513 			}
3514 		} else
3515 			VI_UNLOCK(vp);
3516 	}
3517 }
3518 
3519 static void
3520 destroy_vpollinfo(struct vpollinfo *vi)
3521 {
3522 	seldrain(&vi->vpi_selinfo);
3523 	knlist_destroy(&vi->vpi_selinfo.si_note);
3524 	mtx_destroy(&vi->vpi_lock);
3525 	uma_zfree(vnodepoll_zone, vi);
3526 }
3527 
3528 /*
3529  * Initalize per-vnode helper structure to hold poll-related state.
3530  */
3531 void
3532 v_addpollinfo(struct vnode *vp)
3533 {
3534 	struct vpollinfo *vi;
3535 
3536 	if (vp->v_pollinfo != NULL)
3537 		return;
3538 	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3539 	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3540 	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3541 	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3542 	VI_LOCK(vp);
3543 	if (vp->v_pollinfo != NULL) {
3544 		VI_UNLOCK(vp);
3545 		destroy_vpollinfo(vi);
3546 		return;
3547 	}
3548 	vp->v_pollinfo = vi;
3549 	VI_UNLOCK(vp);
3550 }
3551 
3552 /*
3553  * Record a process's interest in events which might happen to
3554  * a vnode.  Because poll uses the historic select-style interface
3555  * internally, this routine serves as both the ``check for any
3556  * pending events'' and the ``record my interest in future events''
3557  * functions.  (These are done together, while the lock is held,
3558  * to avoid race conditions.)
3559  */
3560 int
3561 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3562 {
3563 
3564 	v_addpollinfo(vp);
3565 	mtx_lock(&vp->v_pollinfo->vpi_lock);
3566 	if (vp->v_pollinfo->vpi_revents & events) {
3567 		/*
3568 		 * This leaves events we are not interested
3569 		 * in available for the other process which
3570 		 * which presumably had requested them
3571 		 * (otherwise they would never have been
3572 		 * recorded).
3573 		 */
3574 		events &= vp->v_pollinfo->vpi_revents;
3575 		vp->v_pollinfo->vpi_revents &= ~events;
3576 
3577 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3578 		return (events);
3579 	}
3580 	vp->v_pollinfo->vpi_events |= events;
3581 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3582 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3583 	return (0);
3584 }
3585 
3586 /*
3587  * Routine to create and manage a filesystem syncer vnode.
3588  */
3589 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
3590 static int	sync_fsync(struct  vop_fsync_args *);
3591 static int	sync_inactive(struct  vop_inactive_args *);
3592 static int	sync_reclaim(struct  vop_reclaim_args *);
3593 
3594 static struct vop_vector sync_vnodeops = {
3595 	.vop_bypass =	VOP_EOPNOTSUPP,
3596 	.vop_close =	sync_close,		/* close */
3597 	.vop_fsync =	sync_fsync,		/* fsync */
3598 	.vop_inactive =	sync_inactive,	/* inactive */
3599 	.vop_reclaim =	sync_reclaim,	/* reclaim */
3600 	.vop_lock1 =	vop_stdlock,	/* lock */
3601 	.vop_unlock =	vop_stdunlock,	/* unlock */
3602 	.vop_islocked =	vop_stdislocked,	/* islocked */
3603 };
3604 
3605 /*
3606  * Create a new filesystem syncer vnode for the specified mount point.
3607  */
3608 void
3609 vfs_allocate_syncvnode(struct mount *mp)
3610 {
3611 	struct vnode *vp;
3612 	struct bufobj *bo;
3613 	static long start, incr, next;
3614 	int error;
3615 
3616 	/* Allocate a new vnode */
3617 	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3618 	if (error != 0)
3619 		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3620 	vp->v_type = VNON;
3621 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3622 	vp->v_vflag |= VV_FORCEINSMQ;
3623 	error = insmntque(vp, mp);
3624 	if (error != 0)
3625 		panic("vfs_allocate_syncvnode: insmntque() failed");
3626 	vp->v_vflag &= ~VV_FORCEINSMQ;
3627 	VOP_UNLOCK(vp, 0);
3628 	/*
3629 	 * Place the vnode onto the syncer worklist. We attempt to
3630 	 * scatter them about on the list so that they will go off
3631 	 * at evenly distributed times even if all the filesystems
3632 	 * are mounted at once.
3633 	 */
3634 	next += incr;
3635 	if (next == 0 || next > syncer_maxdelay) {
3636 		start /= 2;
3637 		incr /= 2;
3638 		if (start == 0) {
3639 			start = syncer_maxdelay / 2;
3640 			incr = syncer_maxdelay;
3641 		}
3642 		next = start;
3643 	}
3644 	bo = &vp->v_bufobj;
3645 	BO_LOCK(bo);
3646 	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3647 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3648 	mtx_lock(&sync_mtx);
3649 	sync_vnode_count++;
3650 	if (mp->mnt_syncer == NULL) {
3651 		mp->mnt_syncer = vp;
3652 		vp = NULL;
3653 	}
3654 	mtx_unlock(&sync_mtx);
3655 	BO_UNLOCK(bo);
3656 	if (vp != NULL) {
3657 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3658 		vgone(vp);
3659 		vput(vp);
3660 	}
3661 }
3662 
3663 void
3664 vfs_deallocate_syncvnode(struct mount *mp)
3665 {
3666 	struct vnode *vp;
3667 
3668 	mtx_lock(&sync_mtx);
3669 	vp = mp->mnt_syncer;
3670 	if (vp != NULL)
3671 		mp->mnt_syncer = NULL;
3672 	mtx_unlock(&sync_mtx);
3673 	if (vp != NULL)
3674 		vrele(vp);
3675 }
3676 
3677 /*
3678  * Do a lazy sync of the filesystem.
3679  */
3680 static int
3681 sync_fsync(struct vop_fsync_args *ap)
3682 {
3683 	struct vnode *syncvp = ap->a_vp;
3684 	struct mount *mp = syncvp->v_mount;
3685 	int error, save;
3686 	struct bufobj *bo;
3687 
3688 	/*
3689 	 * We only need to do something if this is a lazy evaluation.
3690 	 */
3691 	if (ap->a_waitfor != MNT_LAZY)
3692 		return (0);
3693 
3694 	/*
3695 	 * Move ourselves to the back of the sync list.
3696 	 */
3697 	bo = &syncvp->v_bufobj;
3698 	BO_LOCK(bo);
3699 	vn_syncer_add_to_worklist(bo, syncdelay);
3700 	BO_UNLOCK(bo);
3701 
3702 	/*
3703 	 * Walk the list of vnodes pushing all that are dirty and
3704 	 * not already on the sync list.
3705 	 */
3706 	mtx_lock(&mountlist_mtx);
3707 	if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
3708 		mtx_unlock(&mountlist_mtx);
3709 		return (0);
3710 	}
3711 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3712 		vfs_unbusy(mp);
3713 		return (0);
3714 	}
3715 	save = curthread_pflags_set(TDP_SYNCIO);
3716 	vfs_msync(mp, MNT_NOWAIT);
3717 	error = VFS_SYNC(mp, MNT_LAZY);
3718 	curthread_pflags_restore(save);
3719 	vn_finished_write(mp);
3720 	vfs_unbusy(mp);
3721 	return (error);
3722 }
3723 
3724 /*
3725  * The syncer vnode is no referenced.
3726  */
3727 static int
3728 sync_inactive(struct vop_inactive_args *ap)
3729 {
3730 
3731 	vgone(ap->a_vp);
3732 	return (0);
3733 }
3734 
3735 /*
3736  * The syncer vnode is no longer needed and is being decommissioned.
3737  *
3738  * Modifications to the worklist must be protected by sync_mtx.
3739  */
3740 static int
3741 sync_reclaim(struct vop_reclaim_args *ap)
3742 {
3743 	struct vnode *vp = ap->a_vp;
3744 	struct bufobj *bo;
3745 
3746 	bo = &vp->v_bufobj;
3747 	BO_LOCK(bo);
3748 	mtx_lock(&sync_mtx);
3749 	if (vp->v_mount->mnt_syncer == vp)
3750 		vp->v_mount->mnt_syncer = NULL;
3751 	if (bo->bo_flag & BO_ONWORKLST) {
3752 		LIST_REMOVE(bo, bo_synclist);
3753 		syncer_worklist_len--;
3754 		sync_vnode_count--;
3755 		bo->bo_flag &= ~BO_ONWORKLST;
3756 	}
3757 	mtx_unlock(&sync_mtx);
3758 	BO_UNLOCK(bo);
3759 
3760 	return (0);
3761 }
3762 
3763 /*
3764  * Check if vnode represents a disk device
3765  */
3766 int
3767 vn_isdisk(struct vnode *vp, int *errp)
3768 {
3769 	int error;
3770 
3771 	error = 0;
3772 	dev_lock();
3773 	if (vp->v_type != VCHR)
3774 		error = ENOTBLK;
3775 	else if (vp->v_rdev == NULL)
3776 		error = ENXIO;
3777 	else if (vp->v_rdev->si_devsw == NULL)
3778 		error = ENXIO;
3779 	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3780 		error = ENOTBLK;
3781 	dev_unlock();
3782 	if (errp != NULL)
3783 		*errp = error;
3784 	return (error == 0);
3785 }
3786 
3787 /*
3788  * Common filesystem object access control check routine.  Accepts a
3789  * vnode's type, "mode", uid and gid, requested access mode, credentials,
3790  * and optional call-by-reference privused argument allowing vaccess()
3791  * to indicate to the caller whether privilege was used to satisfy the
3792  * request (obsoleted).  Returns 0 on success, or an errno on failure.
3793  */
3794 int
3795 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3796     accmode_t accmode, struct ucred *cred, int *privused)
3797 {
3798 	accmode_t dac_granted;
3799 	accmode_t priv_granted;
3800 
3801 	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3802 	    ("invalid bit in accmode"));
3803 	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3804 	    ("VAPPEND without VWRITE"));
3805 
3806 	/*
3807 	 * Look for a normal, non-privileged way to access the file/directory
3808 	 * as requested.  If it exists, go with that.
3809 	 */
3810 
3811 	if (privused != NULL)
3812 		*privused = 0;
3813 
3814 	dac_granted = 0;
3815 
3816 	/* Check the owner. */
3817 	if (cred->cr_uid == file_uid) {
3818 		dac_granted |= VADMIN;
3819 		if (file_mode & S_IXUSR)
3820 			dac_granted |= VEXEC;
3821 		if (file_mode & S_IRUSR)
3822 			dac_granted |= VREAD;
3823 		if (file_mode & S_IWUSR)
3824 			dac_granted |= (VWRITE | VAPPEND);
3825 
3826 		if ((accmode & dac_granted) == accmode)
3827 			return (0);
3828 
3829 		goto privcheck;
3830 	}
3831 
3832 	/* Otherwise, check the groups (first match) */
3833 	if (groupmember(file_gid, cred)) {
3834 		if (file_mode & S_IXGRP)
3835 			dac_granted |= VEXEC;
3836 		if (file_mode & S_IRGRP)
3837 			dac_granted |= VREAD;
3838 		if (file_mode & S_IWGRP)
3839 			dac_granted |= (VWRITE | VAPPEND);
3840 
3841 		if ((accmode & dac_granted) == accmode)
3842 			return (0);
3843 
3844 		goto privcheck;
3845 	}
3846 
3847 	/* Otherwise, check everyone else. */
3848 	if (file_mode & S_IXOTH)
3849 		dac_granted |= VEXEC;
3850 	if (file_mode & S_IROTH)
3851 		dac_granted |= VREAD;
3852 	if (file_mode & S_IWOTH)
3853 		dac_granted |= (VWRITE | VAPPEND);
3854 	if ((accmode & dac_granted) == accmode)
3855 		return (0);
3856 
3857 privcheck:
3858 	/*
3859 	 * Build a privilege mask to determine if the set of privileges
3860 	 * satisfies the requirements when combined with the granted mask
3861 	 * from above.  For each privilege, if the privilege is required,
3862 	 * bitwise or the request type onto the priv_granted mask.
3863 	 */
3864 	priv_granted = 0;
3865 
3866 	if (type == VDIR) {
3867 		/*
3868 		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3869 		 * requests, instead of PRIV_VFS_EXEC.
3870 		 */
3871 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3872 		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3873 			priv_granted |= VEXEC;
3874 	} else {
3875 		/*
3876 		 * Ensure that at least one execute bit is on. Otherwise,
3877 		 * a privileged user will always succeed, and we don't want
3878 		 * this to happen unless the file really is executable.
3879 		 */
3880 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3881 		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
3882 		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3883 			priv_granted |= VEXEC;
3884 	}
3885 
3886 	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
3887 	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
3888 		priv_granted |= VREAD;
3889 
3890 	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3891 	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3892 		priv_granted |= (VWRITE | VAPPEND);
3893 
3894 	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3895 	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3896 		priv_granted |= VADMIN;
3897 
3898 	if ((accmode & (priv_granted | dac_granted)) == accmode) {
3899 		/* XXX audit: privilege used */
3900 		if (privused != NULL)
3901 			*privused = 1;
3902 		return (0);
3903 	}
3904 
3905 	return ((accmode & VADMIN) ? EPERM : EACCES);
3906 }
3907 
3908 /*
3909  * Credential check based on process requesting service, and per-attribute
3910  * permissions.
3911  */
3912 int
3913 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3914     struct thread *td, accmode_t accmode)
3915 {
3916 
3917 	/*
3918 	 * Kernel-invoked always succeeds.
3919 	 */
3920 	if (cred == NOCRED)
3921 		return (0);
3922 
3923 	/*
3924 	 * Do not allow privileged processes in jail to directly manipulate
3925 	 * system attributes.
3926 	 */
3927 	switch (attrnamespace) {
3928 	case EXTATTR_NAMESPACE_SYSTEM:
3929 		/* Potentially should be: return (EPERM); */
3930 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3931 	case EXTATTR_NAMESPACE_USER:
3932 		return (VOP_ACCESS(vp, accmode, cred, td));
3933 	default:
3934 		return (EPERM);
3935 	}
3936 }
3937 
3938 #ifdef DEBUG_VFS_LOCKS
/*
 * This only exists to suppress warnings from unlocked specfs accesses.  It is
 * no longer ok to have an unlocked VFS.
 *
 * IGNORE_LOCK(vp) is true, i.e. lock assertions are skipped, when the
 * system has panicked, when vp is NULL, or when vp is of type VCHR or
 * VBAD.
 */
#define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)

/*
 * Run-time knobs (debug.vfs_badlock_*) controlling what happens when a
 * lock assertion fails; all of them default to enabled.
 */
int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
    "Drop into debugger on lock violation");

int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
    0, "Check for interlock across VOPs");

int vfs_badlock_print = 1;	/* Print lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
    0, "Print lock violations");

#ifdef KDB
int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
#endif
3963 
3964 static void
3965 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3966 {
3967 
3968 #ifdef KDB
3969 	if (vfs_badlock_backtrace)
3970 		kdb_backtrace();
3971 #endif
3972 	if (vfs_badlock_print)
3973 		printf("%s: %p %s\n", str, (void *)vp, msg);
3974 	if (vfs_badlock_ddb)
3975 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3976 }
3977 
3978 void
3979 assert_vi_locked(struct vnode *vp, const char *str)
3980 {
3981 
3982 	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3983 		vfs_badlock("interlock is not locked but should be", str, vp);
3984 }
3985 
3986 void
3987 assert_vi_unlocked(struct vnode *vp, const char *str)
3988 {
3989 
3990 	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3991 		vfs_badlock("interlock is locked but should not be", str, vp);
3992 }
3993 
3994 void
3995 assert_vop_locked(struct vnode *vp, const char *str)
3996 {
3997 	int locked;
3998 
3999 	if (!IGNORE_LOCK(vp)) {
4000 		locked = VOP_ISLOCKED(vp);
4001 		if (locked == 0 || locked == LK_EXCLOTHER)
4002 			vfs_badlock("is not locked but should be", str, vp);
4003 	}
4004 }
4005 
4006 void
4007 assert_vop_unlocked(struct vnode *vp, const char *str)
4008 {
4009 
4010 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4011 		vfs_badlock("is locked but should not be", str, vp);
4012 }
4013 
4014 void
4015 assert_vop_elocked(struct vnode *vp, const char *str)
4016 {
4017 
4018 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4019 		vfs_badlock("is not exclusive locked but should be", str, vp);
4020 }
4021 
4022 #if 0
4023 void
4024 assert_vop_elocked_other(struct vnode *vp, const char *str)
4025 {
4026 
4027 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
4028 		vfs_badlock("is not exclusive locked by another thread",
4029 		    str, vp);
4030 }
4031 
4032 void
4033 assert_vop_slocked(struct vnode *vp, const char *str)
4034 {
4035 
4036 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
4037 		vfs_badlock("is not locked shared but should be", str, vp);
4038 }
4039 #endif /* 0 */
4040 #endif /* DEBUG_VFS_LOCKS */
4041 
4042 void
4043 vop_rename_fail(struct vop_rename_args *ap)
4044 {
4045 
4046 	if (ap->a_tvp != NULL)
4047 		vput(ap->a_tvp);
4048 	if (ap->a_tdvp == ap->a_tvp)
4049 		vrele(ap->a_tdvp);
4050 	else
4051 		vput(ap->a_tdvp);
4052 	vrele(ap->a_fdvp);
4053 	vrele(ap->a_fvp);
4054 }
4055 
4056 void
4057 vop_rename_pre(void *ap)
4058 {
4059 	struct vop_rename_args *a = ap;
4060 
4061 #ifdef DEBUG_VFS_LOCKS
4062 	if (a->a_tvp)
4063 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4064 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4065 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4066 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4067 
4068 	/* Check the source (from). */
4069 	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4070 	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4071 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4072 	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4073 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4074 
4075 	/* Check the target. */
4076 	if (a->a_tvp)
4077 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4078 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4079 #endif
4080 	if (a->a_tdvp != a->a_fdvp)
4081 		vhold(a->a_fdvp);
4082 	if (a->a_tvp != a->a_fvp)
4083 		vhold(a->a_fvp);
4084 	vhold(a->a_tdvp);
4085 	if (a->a_tvp)
4086 		vhold(a->a_tvp);
4087 }
4088 
/*
 * Pre-operation hook for VOP_STRATEGY.  With DEBUG_VFS_LOCKS, reports
 * a violation when the buffer is not locked; cluster buffers are
 * exempt, and nothing is reported after a panic.  A no-op otherwise.
 */
void
vop_strategy_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_strategy_args *a = ap;
	struct buf *bp = a->a_bp;

	/*
	 * Cluster ops lock their component buffers but not the IO container.
	 */
	if ((bp->b_flags & B_CLUSTER) != 0)
		return;

	if (panicstr != NULL || BUF_ISLOCKED(bp))
		return;

	if (vfs_badlock_print)
		printf(
		    "VOP_STRATEGY: bp is not locked but should be\n");
	if (vfs_badlock_ddb)
		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
#endif
}
4114 
/*
 * Pre-operation hook for VOP_LOCK.  With DEBUG_VFS_LOCKS it asserts that
 * the vnode interlock is held exactly when LK_INTERLOCK was passed.
 */
void
vop_lock_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lock1_args *a;

	a = ap;
	if ((a->a_flags & LK_INTERLOCK) != 0) {
		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
	} else {
		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
	}
#endif
}
4127 
/*
 * Post-operation hook for VOP_LOCK.  With DEBUG_VFS_LOCKS it asserts that
 * the interlock is no longer held and, when the operation returned 0,
 * that the vnode lock is held.
 */
void
vop_lock_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lock1_args *a;

	a = ap;
	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
	if (rc != 0)
		return;
	ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
#endif
}
4139 
/*
 * Pre-operation hook for VOP_UNLOCK.  With DEBUG_VFS_LOCKS it asserts
 * that the vnode lock is held, and that the interlock is held when
 * LK_INTERLOCK was passed.
 */
void
vop_unlock_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_unlock_args *a;

	a = ap;
	if ((a->a_flags & LK_INTERLOCK) != 0) {
		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
	}
	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
#endif
}
4151 
/*
 * Post-operation hook for VOP_UNLOCK.  With DEBUG_VFS_LOCKS it asserts
 * that the interlock is no longer held when LK_INTERLOCK was passed.
 * The return code is not examined.
 */
void
vop_unlock_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_unlock_args *a;

	a = ap;
	if ((a->a_flags & LK_INTERLOCK) != 0) {
		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
	}
#endif
}
4162 
4163 void
4164 vop_create_post(void *ap, int rc)
4165 {
4166 	struct vop_create_args *a = ap;
4167 
4168 	if (!rc)
4169 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4170 }
4171 
4172 void
4173 vop_deleteextattr_post(void *ap, int rc)
4174 {
4175 	struct vop_deleteextattr_args *a = ap;
4176 
4177 	if (!rc)
4178 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4179 }
4180 
4181 void
4182 vop_link_post(void *ap, int rc)
4183 {
4184 	struct vop_link_args *a = ap;
4185 
4186 	if (!rc) {
4187 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4188 		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4189 	}
4190 }
4191 
4192 void
4193 vop_mkdir_post(void *ap, int rc)
4194 {
4195 	struct vop_mkdir_args *a = ap;
4196 
4197 	if (!rc)
4198 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4199 }
4200 
4201 void
4202 vop_mknod_post(void *ap, int rc)
4203 {
4204 	struct vop_mknod_args *a = ap;
4205 
4206 	if (!rc)
4207 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4208 }
4209 
4210 void
4211 vop_remove_post(void *ap, int rc)
4212 {
4213 	struct vop_remove_args *a = ap;
4214 
4215 	if (!rc) {
4216 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4217 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4218 	}
4219 }
4220 
4221 void
4222 vop_rename_post(void *ap, int rc)
4223 {
4224 	struct vop_rename_args *a = ap;
4225 
4226 	if (!rc) {
4227 		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4228 		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4229 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4230 		if (a->a_tvp)
4231 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4232 	}
4233 	if (a->a_tdvp != a->a_fdvp)
4234 		vdrop(a->a_fdvp);
4235 	if (a->a_tvp != a->a_fvp)
4236 		vdrop(a->a_fvp);
4237 	vdrop(a->a_tdvp);
4238 	if (a->a_tvp)
4239 		vdrop(a->a_tvp);
4240 }
4241 
4242 void
4243 vop_rmdir_post(void *ap, int rc)
4244 {
4245 	struct vop_rmdir_args *a = ap;
4246 
4247 	if (!rc) {
4248 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4249 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4250 	}
4251 }
4252 
4253 void
4254 vop_setattr_post(void *ap, int rc)
4255 {
4256 	struct vop_setattr_args *a = ap;
4257 
4258 	if (!rc)
4259 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4260 }
4261 
4262 void
4263 vop_setextattr_post(void *ap, int rc)
4264 {
4265 	struct vop_setextattr_args *a = ap;
4266 
4267 	if (!rc)
4268 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4269 }
4270 
4271 void
4272 vop_symlink_post(void *ap, int rc)
4273 {
4274 	struct vop_symlink_args *a = ap;
4275 
4276 	if (!rc)
4277 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4278 }
4279 
4280 static struct knlist fs_knlist;
4281 
4282 static void
4283 vfs_event_init(void *arg)
4284 {
4285 	knlist_init_mtx(&fs_knlist, NULL);
4286 }
4287 /* XXX - correct order? */
4288 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4289 
4290 void
4291 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4292 {
4293 
4294 	KNOTE_UNLOCKED(&fs_knlist, event);
4295 }
4296 
4297 static int	filt_fsattach(struct knote *kn);
4298 static void	filt_fsdetach(struct knote *kn);
4299 static int	filt_fsevent(struct knote *kn, long hint);
4300 
4301 struct filterops fs_filtops = {
4302 	.f_isfd = 0,
4303 	.f_attach = filt_fsattach,
4304 	.f_detach = filt_fsdetach,
4305 	.f_event = filt_fsevent
4306 };
4307 
4308 static int
4309 filt_fsattach(struct knote *kn)
4310 {
4311 
4312 	kn->kn_flags |= EV_CLEAR;
4313 	knlist_add(&fs_knlist, kn, 0);
4314 	return (0);
4315 }
4316 
4317 static void
4318 filt_fsdetach(struct knote *kn)
4319 {
4320 
4321 	knlist_remove(&fs_knlist, kn, 0);
4322 }
4323 
4324 static int
4325 filt_fsevent(struct knote *kn, long hint)
4326 {
4327 
4328 	kn->kn_fflags |= hint;
4329 	return (kn->kn_fflags != 0);
4330 }
4331 
4332 static int
4333 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4334 {
4335 	struct vfsidctl vc;
4336 	int error;
4337 	struct mount *mp;
4338 
4339 	error = SYSCTL_IN(req, &vc, sizeof(vc));
4340 	if (error)
4341 		return (error);
4342 	if (vc.vc_vers != VFS_CTL_VERS1)
4343 		return (EINVAL);
4344 	mp = vfs_getvfs(&vc.vc_fsid);
4345 	if (mp == NULL)
4346 		return (ENOENT);
4347 	/* ensure that a specific sysctl goes to the right filesystem. */
4348 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4349 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4350 		vfs_rel(mp);
4351 		return (EINVAL);
4352 	}
4353 	VCTLTOREQ(&vc, req);
4354 	error = VFS_SYSCTL(mp, vc.vc_op, req);
4355 	vfs_rel(mp);
4356 	return (error);
4357 }
4358 
4359 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4360     NULL, 0, sysctl_vfs_ctl, "",
4361     "Sysctl by fsid");
4362 
4363 /*
4364  * Function to initialize a va_filerev field sensibly.
4365  * XXX: Wouldn't a random number make a lot more sense ??
4366  */
4367 u_quad_t
4368 init_va_filerev(void)
4369 {
4370 	struct bintime bt;
4371 
4372 	getbinuptime(&bt);
4373 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4374 }
4375 
4376 static int	filt_vfsread(struct knote *kn, long hint);
4377 static int	filt_vfswrite(struct knote *kn, long hint);
4378 static int	filt_vfsvnode(struct knote *kn, long hint);
4379 static void	filt_vfsdetach(struct knote *kn);
4380 static struct filterops vfsread_filtops = {
4381 	.f_isfd = 1,
4382 	.f_detach = filt_vfsdetach,
4383 	.f_event = filt_vfsread
4384 };
4385 static struct filterops vfswrite_filtops = {
4386 	.f_isfd = 1,
4387 	.f_detach = filt_vfsdetach,
4388 	.f_event = filt_vfswrite
4389 };
4390 static struct filterops vfsvnode_filtops = {
4391 	.f_isfd = 1,
4392 	.f_detach = filt_vfsdetach,
4393 	.f_event = filt_vfsvnode
4394 };
4395 
4396 static void
4397 vfs_knllock(void *arg)
4398 {
4399 	struct vnode *vp = arg;
4400 
4401 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4402 }
4403 
/*
 * Knote-list unlock callback: release the vnode lock.  'arg' is the
 * vnode.
 */
static void
vfs_knlunlock(void *arg)
{
	struct vnode *vp;

	vp = arg;
	VOP_UNLOCK(vp, 0);
}
4411 
/*
 * Knote-list assertion callback: with DEBUG_VFS_LOCKS, assert that the
 * vnode passed as 'arg' is locked.  A no-op otherwise.
 */
static void
vfs_knl_assert_locked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp;

	vp = arg;
	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
#endif
}
4421 
/*
 * Knote-list assertion callback: with DEBUG_VFS_LOCKS, assert that the
 * vnode passed as 'arg' is not locked.  A no-op otherwise.
 */
static void
vfs_knl_assert_unlocked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp;

	vp = arg;
	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
#endif
}
4431 
4432 int
4433 vfs_kqfilter(struct vop_kqfilter_args *ap)
4434 {
4435 	struct vnode *vp = ap->a_vp;
4436 	struct knote *kn = ap->a_kn;
4437 	struct knlist *knl;
4438 
4439 	switch (kn->kn_filter) {
4440 	case EVFILT_READ:
4441 		kn->kn_fop = &vfsread_filtops;
4442 		break;
4443 	case EVFILT_WRITE:
4444 		kn->kn_fop = &vfswrite_filtops;
4445 		break;
4446 	case EVFILT_VNODE:
4447 		kn->kn_fop = &vfsvnode_filtops;
4448 		break;
4449 	default:
4450 		return (EINVAL);
4451 	}
4452 
4453 	kn->kn_hook = (caddr_t)vp;
4454 
4455 	v_addpollinfo(vp);
4456 	if (vp->v_pollinfo == NULL)
4457 		return (ENOMEM);
4458 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4459 	knlist_add(knl, kn, 0);
4460 
4461 	return (0);
4462 }
4463 
4464 /*
4465  * Detach knote from vnode
4466  */
4467 static void
4468 filt_vfsdetach(struct knote *kn)
4469 {
4470 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4471 
4472 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4473 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4474 }
4475 
4476 /*ARGSUSED*/
4477 static int
4478 filt_vfsread(struct knote *kn, long hint)
4479 {
4480 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4481 	struct vattr va;
4482 	int res;
4483 
4484 	/*
4485 	 * filesystem is gone, so set the EOF flag and schedule
4486 	 * the knote for deletion.
4487 	 */
4488 	if (hint == NOTE_REVOKE) {
4489 		VI_LOCK(vp);
4490 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4491 		VI_UNLOCK(vp);
4492 		return (1);
4493 	}
4494 
4495 	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4496 		return (0);
4497 
4498 	VI_LOCK(vp);
4499 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4500 	res = (kn->kn_data != 0);
4501 	VI_UNLOCK(vp);
4502 	return (res);
4503 }
4504 
4505 /*ARGSUSED*/
4506 static int
4507 filt_vfswrite(struct knote *kn, long hint)
4508 {
4509 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4510 
4511 	VI_LOCK(vp);
4512 
4513 	/*
4514 	 * filesystem is gone, so set the EOF flag and schedule
4515 	 * the knote for deletion.
4516 	 */
4517 	if (hint == NOTE_REVOKE)
4518 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4519 
4520 	kn->kn_data = 0;
4521 	VI_UNLOCK(vp);
4522 	return (1);
4523 }
4524 
4525 static int
4526 filt_vfsvnode(struct knote *kn, long hint)
4527 {
4528 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4529 	int res;
4530 
4531 	VI_LOCK(vp);
4532 	if (kn->kn_sfflags & hint)
4533 		kn->kn_fflags |= hint;
4534 	if (hint == NOTE_REVOKE) {
4535 		kn->kn_flags |= EV_EOF;
4536 		VI_UNLOCK(vp);
4537 		return (1);
4538 	}
4539 	res = (kn->kn_fflags != 0);
4540 	VI_UNLOCK(vp);
4541 	return (res);
4542 }
4543 
4544 int
4545 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4546 {
4547 	int error;
4548 
4549 	if (dp->d_reclen > ap->a_uio->uio_resid)
4550 		return (ENAMETOOLONG);
4551 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4552 	if (error) {
4553 		if (ap->a_ncookies != NULL) {
4554 			if (ap->a_cookies != NULL)
4555 				free(ap->a_cookies, M_TEMP);
4556 			ap->a_cookies = NULL;
4557 			*ap->a_ncookies = 0;
4558 		}
4559 		return (error);
4560 	}
4561 	if (ap->a_ncookies == NULL)
4562 		return (0);
4563 
4564 	KASSERT(ap->a_cookies,
4565 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4566 
4567 	*ap->a_cookies = realloc(*ap->a_cookies,
4568 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4569 	(*ap->a_cookies)[*ap->a_ncookies] = off;
4570 	return (0);
4571 }
4572 
4573 /*
4574  * Mark for update the access time of the file if the filesystem
4575  * supports VOP_MARKATIME.  This functionality is used by execve and
4576  * mmap, so we want to avoid the I/O implied by directly setting
4577  * va_atime for the sake of efficiency.
4578  */
4579 void
4580 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4581 {
4582 	struct mount *mp;
4583 
4584 	mp = vp->v_mount;
4585 	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4586 	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4587 		(void)VOP_MARKATIME(vp);
4588 }
4589 
4590 /*
4591  * The purpose of this routine is to remove granularity from accmode_t,
4592  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4593  * VADMIN and VAPPEND.
4594  *
4595  * If it returns 0, the caller is supposed to continue with the usual
4596  * access checks using 'accmode' as modified by this routine.  If it
4597  * returns nonzero value, the caller is supposed to return that value
4598  * as errno.
4599  *
4600  * Note that after this routine runs, accmode may be zero.
4601  */
4602 int
4603 vfs_unixify_accmode(accmode_t *accmode)
4604 {
4605 	/*
4606 	 * There is no way to specify explicit "deny" rule using
4607 	 * file mode or POSIX.1e ACLs.
4608 	 */
4609 	if (*accmode & VEXPLICIT_DENY) {
4610 		*accmode = 0;
4611 		return (0);
4612 	}
4613 
4614 	/*
4615 	 * None of these can be translated into usual access bits.
4616 	 * Also, the common case for NFSv4 ACLs is to not contain
4617 	 * either of these bits. Caller should check for VWRITE
4618 	 * on the containing directory instead.
4619 	 */
4620 	if (*accmode & (VDELETE_CHILD | VDELETE))
4621 		return (EPERM);
4622 
4623 	if (*accmode & VADMIN_PERMS) {
4624 		*accmode &= ~VADMIN_PERMS;
4625 		*accmode |= VADMIN;
4626 	}
4627 
4628 	/*
4629 	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4630 	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4631 	 */
4632 	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4633 
4634 	return (0);
4635 }
4636 
4637 /*
4638  * These are helper functions for filesystems to traverse all
4639  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4640  *
4641  * This interface replaces MNT_VNODE_FOREACH.
4642  */
4643 
4644 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4645 
4646 struct vnode *
4647 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4648 {
4649 	struct vnode *vp;
4650 
4651 	if (should_yield())
4652 		kern_yield(PRI_USER);
4653 	MNT_ILOCK(mp);
4654 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4655 	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4656 	while (vp != NULL && (vp->v_type == VMARKER ||
4657 	    (vp->v_iflag & VI_DOOMED) != 0))
4658 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4659 
4660 	/* Check if we are done */
4661 	if (vp == NULL) {
4662 		__mnt_vnode_markerfree_all(mvp, mp);
4663 		/* MNT_IUNLOCK(mp); -- done in above function */
4664 		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4665 		return (NULL);
4666 	}
4667 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4668 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4669 	VI_LOCK(vp);
4670 	MNT_IUNLOCK(mp);
4671 	return (vp);
4672 }
4673 
4674 struct vnode *
4675 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4676 {
4677 	struct vnode *vp;
4678 
4679 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4680 	MNT_ILOCK(mp);
4681 	MNT_REF(mp);
4682 	(*mvp)->v_type = VMARKER;
4683 
4684 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4685 	while (vp != NULL && (vp->v_type == VMARKER ||
4686 	    (vp->v_iflag & VI_DOOMED) != 0))
4687 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4688 
4689 	/* Check if we are done */
4690 	if (vp == NULL) {
4691 		MNT_REL(mp);
4692 		MNT_IUNLOCK(mp);
4693 		free(*mvp, M_VNODE_MARKER);
4694 		*mvp = NULL;
4695 		return (NULL);
4696 	}
4697 	(*mvp)->v_mount = mp;
4698 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4699 	VI_LOCK(vp);
4700 	MNT_IUNLOCK(mp);
4701 	return (vp);
4702 }
4703 
4704 
4705 void
4706 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4707 {
4708 
4709 	if (*mvp == NULL) {
4710 		MNT_IUNLOCK(mp);
4711 		return;
4712 	}
4713 
4714 	mtx_assert(MNT_MTX(mp), MA_OWNED);
4715 
4716 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4717 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4718 	MNT_REL(mp);
4719 	MNT_IUNLOCK(mp);
4720 	free(*mvp, M_VNODE_MARKER);
4721 	*mvp = NULL;
4722 }
4723 
4724 /*
4725  * These are helper functions for filesystems to traverse their
4726  * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
4727  */
4728 static void
4729 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4730 {
4731 
4732 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4733 
4734 	MNT_ILOCK(mp);
4735 	MNT_REL(mp);
4736 	MNT_IUNLOCK(mp);
4737 	free(*mvp, M_VNODE_MARKER);
4738 	*mvp = NULL;
4739 }
4740 
4741 static struct vnode *
4742 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4743 {
4744 	struct vnode *vp, *nvp;
4745 
4746 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
4747 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4748 restart:
4749 	vp = TAILQ_NEXT(*mvp, v_actfreelist);
4750 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4751 	while (vp != NULL) {
4752 		if (vp->v_type == VMARKER) {
4753 			vp = TAILQ_NEXT(vp, v_actfreelist);
4754 			continue;
4755 		}
4756 		if (!VI_TRYLOCK(vp)) {
4757 			if (mp_ncpus == 1 || should_yield()) {
4758 				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4759 				mtx_unlock(&vnode_free_list_mtx);
4760 				kern_yield(PRI_USER);
4761 				mtx_lock(&vnode_free_list_mtx);
4762 				goto restart;
4763 			}
4764 			continue;
4765 		}
4766 		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
4767 		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
4768 		    ("alien vnode on the active list %p %p", vp, mp));
4769 		if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
4770 			break;
4771 		nvp = TAILQ_NEXT(vp, v_actfreelist);
4772 		VI_UNLOCK(vp);
4773 		vp = nvp;
4774 	}
4775 
4776 	/* Check if we are done */
4777 	if (vp == NULL) {
4778 		mtx_unlock(&vnode_free_list_mtx);
4779 		mnt_vnode_markerfree_active(mvp, mp);
4780 		return (NULL);
4781 	}
4782 	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
4783 	mtx_unlock(&vnode_free_list_mtx);
4784 	ASSERT_VI_LOCKED(vp, "active iter");
4785 	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
4786 	return (vp);
4787 }
4788 
4789 struct vnode *
4790 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4791 {
4792 
4793 	if (should_yield())
4794 		kern_yield(PRI_USER);
4795 	mtx_lock(&vnode_free_list_mtx);
4796 	return (mnt_vnode_next_active(mvp, mp));
4797 }
4798 
4799 struct vnode *
4800 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
4801 {
4802 	struct vnode *vp;
4803 
4804 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4805 	MNT_ILOCK(mp);
4806 	MNT_REF(mp);
4807 	MNT_IUNLOCK(mp);
4808 	(*mvp)->v_type = VMARKER;
4809 	(*mvp)->v_mount = mp;
4810 
4811 	mtx_lock(&vnode_free_list_mtx);
4812 	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
4813 	if (vp == NULL) {
4814 		mtx_unlock(&vnode_free_list_mtx);
4815 		mnt_vnode_markerfree_active(mvp, mp);
4816 		return (NULL);
4817 	}
4818 	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4819 	return (mnt_vnode_next_active(mvp, mp));
4820 }
4821 
4822 void
4823 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4824 {
4825 
4826 	if (*mvp == NULL)
4827 		return;
4828 
4829 	mtx_lock(&vnode_free_list_mtx);
4830 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4831 	mtx_unlock(&vnode_free_list_mtx);
4832 	mnt_vnode_markerfree_active(mvp, mp);
4833 }
4834