xref: /freebsd/sys/kern/vfs_subr.c (revision d1a0d267b78b542fbd7e6553af2493760f49bfa8)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35  */
36 
37 /*
38  * External virtual filesystem routines
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_compat.h"
45 #include "opt_ddb.h"
46 #include "opt_watchdog.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/bio.h>
51 #include <sys/buf.h>
52 #include <sys/condvar.h>
53 #include <sys/conf.h>
54 #include <sys/dirent.h>
55 #include <sys/event.h>
56 #include <sys/eventhandler.h>
57 #include <sys/extattr.h>
58 #include <sys/file.h>
59 #include <sys/fcntl.h>
60 #include <sys/jail.h>
61 #include <sys/kdb.h>
62 #include <sys/kernel.h>
63 #include <sys/kthread.h>
64 #include <sys/lockf.h>
65 #include <sys/malloc.h>
66 #include <sys/mount.h>
67 #include <sys/namei.h>
68 #include <sys/pctrie.h>
69 #include <sys/priv.h>
70 #include <sys/reboot.h>
71 #include <sys/refcount.h>
72 #include <sys/rwlock.h>
73 #include <sys/sched.h>
74 #include <sys/sleepqueue.h>
75 #include <sys/smp.h>
76 #include <sys/stat.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/vmmeter.h>
80 #include <sys/vnode.h>
81 #include <sys/watchdog.h>
82 
83 #include <machine/stdarg.h>
84 
85 #include <security/mac/mac_framework.h>
86 
87 #include <vm/vm.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_extern.h>
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_kern.h>
94 #include <vm/uma.h>
95 
96 #ifdef DDB
97 #include <ddb/ddb.h>
98 #endif
99 
100 static void	delmntque(struct vnode *vp);
101 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
102 		    int slpflag, int slptimeo);
103 static void	syncer_shutdown(void *arg, int howto);
104 static int	vtryrecycle(struct vnode *vp);
105 static void	v_init_counters(struct vnode *);
106 static void	v_incr_usecount(struct vnode *);
107 static void	v_incr_devcount(struct vnode *);
108 static void	v_decr_devcount(struct vnode *);
109 static void	vnlru_free(int);
110 static void	vgonel(struct vnode *);
111 static void	vfs_knllock(void *arg);
112 static void	vfs_knlunlock(void *arg);
113 static void	vfs_knl_assert_locked(void *arg);
114 static void	vfs_knl_assert_unlocked(void *arg);
115 static void	destroy_vpollinfo(struct vpollinfo *vi);
116 
117 /*
118  * Number of vnodes in existence.  Increased whenever getnewvnode()
119  * allocates a new vnode, decreased in vdropl() for a VI_DOOMED vnode.
120  */
121 static unsigned long	numvnodes;
122 
123 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
124     "Number of vnodes in existence");
125 
126 static u_long vnodes_created;
127 SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
128     0, "Number of vnodes created by getnewvnode");
129 
130 /*
131  * Conversion tables for conversion from vnode types to inode formats
132  * and back.
133  */
134 enum vtype iftovt_tab[16] = {
135 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
136 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
137 };
138 int vttoif_tab[10] = {
139 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
140 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
141 };
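/*
 * For illustration: the IFTOVT() and VTTOIF() macros in sys/vnode.h index
 * these tables, e.g.
 *
 *	IFTOVT(S_IFDIR)  ->  iftovt_tab[(S_IFDIR & S_IFMT) >> 12]  ->  VDIR
 *	VTTOIF(VREG)     ->  vttoif_tab[VREG]                      ->  S_IFREG
 */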
142 
143 /*
144  * List of vnodes that are ready for recycling.
145  */
146 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
147 
148 /*
149  * Free vnode target.  Free vnodes may simply be files which have been stat'd
150  * but not read.  This is somewhat common, and a small cache of such files
151  * should be kept to avoid recreation costs.
152  */
153 static u_long wantfreevnodes;
154 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
155 /* Number of vnodes in the free list. */
156 static u_long freevnodes;
157 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
158     "Number of vnodes in the free list");
159 
160 static int vlru_allow_cache_src;
161 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
162     &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
163 
164 static u_long recycles_count;
165 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
166     "Number of vnodes recycled to avoid exceding kern.maxvnodes");
167 
168 /*
169  * Various variables used for debugging the new implementation of
170  * reassignbuf().
171  * XXX these are probably of (very) limited utility now.
172  */
173 static int reassignbufcalls;
174 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
175     "Number of calls to reassignbuf");
176 
177 static u_long free_owe_inact;
178 SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0,
179     "Number of times free vnodes kept on active list due to VFS "
180     "owing inactivation");
181 
182 /*
183  * Cache for the mount type id assigned to NFS.  This is used for
184  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
185  */
186 int	nfs_mount_type = -1;
187 
188 /* To keep more than one thread at a time from running vfs_getnewfsid */
189 static struct mtx mntid_mtx;
190 
191 /*
192  * Lock for any access to the following:
193  *	vnode_free_list
194  *	numvnodes
195  *	freevnodes
196  */
197 static struct mtx vnode_free_list_mtx;
198 
199 /* Publicly exported FS */
200 struct nfs_public nfs_pub;
201 
202 static uma_zone_t buf_trie_zone;
203 
204 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
205 static uma_zone_t vnode_zone;
206 static uma_zone_t vnodepoll_zone;
207 
208 /*
209  * The workitem queue.
210  *
211  * It is useful to delay writes of file data and filesystem metadata
212  * for tens of seconds so that quickly created and deleted files need
213  * not waste disk bandwidth being created and removed. To realize this,
214  * we append vnodes to a "workitem" queue. When running with a soft
215  * updates implementation, most pending metadata dependencies should
216  * not wait for more than a few seconds. Thus, mounted on block devices
217  * are delayed only about a half the time that file data is delayed.
218  * Similarly, directory updates are more critical, so are only delayed
219  * about a third the time that file data is delayed. Thus, there are
220  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
221  * one each second (driven off the filesystem syncer process). The
222  * syncer_delayno variable indicates the next queue that is to be processed.
223  * Items that need to be processed soon are placed in this queue:
224  *
225  *	syncer_workitem_pending[syncer_delayno]
226  *
227  * A delay of fifteen seconds is done by placing the request fifteen
228  * entries later in the queue:
229  *
230  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
231  *
232  */
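/*
 * Worked example (assuming the default SYNCER_MAXDELAY of 32, which gives
 * syncer_mask == 31): with syncer_delayno == 20, a request delayed by
 * filedelay == 30 seconds is placed in slot (20 + 30) & 31 == 18, which the
 * once-per-second syncer loop reaches 30 iterations later.
 */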
233 static int syncer_delayno;
234 static long syncer_mask;
235 LIST_HEAD(synclist, bufobj);
236 static struct synclist *syncer_workitem_pending;
237 /*
238  * The sync_mtx protects:
239  *	bo->bo_synclist
240  *	sync_vnode_count
241  *	syncer_delayno
242  *	syncer_state
243  *	syncer_workitem_pending
244  *	syncer_worklist_len
245  *	rushjob
246  */
247 static struct mtx sync_mtx;
248 static struct cv sync_wakeup;
249 
250 #define SYNCER_MAXDELAY		32
251 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
252 static int syncdelay = 30;		/* max time to delay syncing data */
253 static int filedelay = 30;		/* time to delay syncing files */
254 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
255     "Time to delay syncing files (in seconds)");
256 static int dirdelay = 29;		/* time to delay syncing directories */
257 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
258     "Time to delay syncing directories (in seconds)");
259 static int metadelay = 28;		/* time to delay syncing metadata */
260 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
261     "Time to delay syncing metadata (in seconds)");
262 static int rushjob;		/* number of slots to run ASAP */
263 static int stat_rush_requests;	/* number of times I/O speeded up */
264 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
265     "Number of times I/O speeded up (rush requests)");
266 
267 /*
268  * When shutting down the syncer, run it at four times normal speed.
269  */
270 #define SYNCER_SHUTDOWN_SPEEDUP		4
271 static int sync_vnode_count;
272 static int syncer_worklist_len;
273 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
274     syncer_state;
275 
276 /*
277  * Number of vnodes we want to exist at any one time.  This is mostly used
278  * to size hash tables in vnode-related code.  It is normally not used in
279  * getnewvnode(), as wantfreevnodes is normally nonzero.
280  *
281  * XXX desiredvnodes is historical cruft and should not exist.
282  */
283 int desiredvnodes;
284 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
285     &desiredvnodes, 0, "Maximum number of vnodes");
286 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
287     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
288 static int vnlru_nowhere;
289 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
290     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
291 
292 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
293 static int vnsz2log;
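/*
 * Worked example (sketch): if sizeof(struct vnode) were, say, 480 bytes, the
 * loop at the end of vntblinit() below leaves vnsz2log == 8, so vp->v_hash is
 * initialized to the vnode address shifted right by 8 bits, stripping the
 * low-order bits that carry little entropy between fixed-size UMA allocations.
 */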
294 
295 /*
296  * Support for the bufobj clean & dirty pctrie.
297  */
298 static void *
299 buf_trie_alloc(struct pctrie *ptree)
300 {
301 
302 	return (uma_zalloc(buf_trie_zone, M_NOWAIT));
303 }
304 
305 static void
306 buf_trie_free(struct pctrie *ptree, void *node)
307 {
308 
309 	uma_zfree(buf_trie_zone, node);
310 }
311 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
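/*
 * PCTRIE_DEFINE() above generates the BUF_PCTRIE_INSERT(), BUF_PCTRIE_LOOKUP(),
 * BUF_PCTRIE_LOOKUP_LE() and BUF_PCTRIE_REMOVE() wrappers keyed on b_lblkno;
 * they are used by buf_vlist_add(), buf_vlist_remove() and gbincore() below.
 */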
312 
313 /*
314  * Initialize the vnode management data structures.
315  *
316  * Reevaluate the following cap on the number of vnodes after the physical
317  * memory size exceeds 512GB.  In the limit, as the physical memory size
318  * grows, the ratio of physical pages to vnodes approaches sixteen to one.
319  */
320 #ifndef	MAXVNODES_MAX
321 #define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
322 #endif
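/*
 * Worked example (assuming 4 KB pages): MAXVNODES_MAX evaluates to
 * 512 * (1024 * 1024 * 1024 / 4096 / 16) == 512 * 16384 == 8388608, i.e. the
 * cap equals the one-vnode-per-sixteen-pages ratio applied to 512 GB of RAM.
 */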
323 static void
324 vntblinit(void *dummy __unused)
325 {
326 	u_int i;
327 	int physvnodes, virtvnodes;
328 
329 	/*
330 	 * Desiredvnodes is a function of the physical memory size and the
331 	 * kernel's heap size.  Generally speaking, it scales with the
332 	 * physical memory size.  The ratio of desiredvnodes to physical pages
333 	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
334 	 * marginal ratio of desiredvnodes to physical pages is one to
335 	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
336 	 * size.  The memory required by desiredvnodes vnodes and vm objects
337 	 * may not exceed one seventh of the kernel's heap size.
338 	 */
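	/*
	 * Worked example (sketch, assuming 4 KB pages and 4 GB of RAM, i.e.
	 * roughly 1048576 physical pages): physvnodes is about
	 * maxproc + 1048576 / 16 + 3 * min(98304 * 4, 1048576) / 16 ==
	 * maxproc + 65536 + 73728, so the physical-memory term allows on the
	 * order of 140k vnodes before virtvnodes can clamp it further.
	 */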
339 	physvnodes = maxproc + vm_cnt.v_page_count / 16 + 3 * min(98304 * 4,
340 	    vm_cnt.v_page_count) / 16;
341 	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
342 	    sizeof(struct vnode)));
343 	desiredvnodes = min(physvnodes, virtvnodes);
344 	if (desiredvnodes > MAXVNODES_MAX) {
345 		if (bootverbose)
346 			printf("Reducing kern.maxvnodes %d -> %d\n",
347 			    desiredvnodes, MAXVNODES_MAX);
348 		desiredvnodes = MAXVNODES_MAX;
349 	}
350 	wantfreevnodes = desiredvnodes / 4;
351 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
352 	TAILQ_INIT(&vnode_free_list);
353 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
354 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
355 	    NULL, NULL, UMA_ALIGN_PTR, 0);
356 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
357 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
358 	/*
359 	 * Preallocate enough nodes to support one per buf so that
360 	 * we cannot fail an insert.  reassignbuf() callers cannot
361 	 * tolerate insertion failure.
362 	 */
363 	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
364 	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
365 	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
366 	uma_prealloc(buf_trie_zone, nbuf);
367 	/*
368 	 * Initialize the filesystem syncer.
369 	 */
370 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
371 	    &syncer_mask);
372 	syncer_maxdelay = syncer_mask + 1;
373 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
374 	cv_init(&sync_wakeup, "syncer");
375 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
376 		vnsz2log++;
377 	vnsz2log--;
378 }
379 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
380 
381 
382 /*
383  * Mark a mount point as busy. Used to synchronize access and to delay
384  * unmounting. Note that mountlist_mtx is not released on failure.
385  *
386  * vfs_busy() is a custom lock; it can block the caller.
387  * vfs_busy() only sleeps if the unmount is active on the mount point.
388  * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
389  * lock of any vnode belonging to mp.
390  *
391  * Lookup uses vfs_busy() to traverse mount points.
392  * root fs			var fs
393  * / vnode lock		A	/ vnode lock (/var)		D
394  * /var vnode lock	B	/log vnode lock(/var/log)	E
395  * vfs_busy lock	C	vfs_busy lock			F
396  *
397  * Within each file system, the lock order is C->A->B and F->D->E.
398  *
399  * When traversing across mounts, the system follows that lock order:
400  *
401  *        C->A->B
402  *              |
403  *              +->F->D->E
404  *
405  * The lookup() process for namei("/var") illustrates the process:
406  *  VOP_LOOKUP() obtains B while A is held
407  *  vfs_busy() obtains a shared lock on F while A and B are held
408  *  vput() releases lock on B
409  *  vput() releases lock on A
410  *  VFS_ROOT() obtains lock on D while shared lock on F is held
411  *  vfs_unbusy() releases shared lock on F
412  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
413  *    An attempt to lock A (instead of vp_crossmp) while D is held would
414  *    violate the global order, causing deadlocks.
415  *
416  * dounmount() locks B while F is drained.
417  */
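/*
 * Minimal usage sketch (hypothetical caller, not taken from this file):
 *
 *	if (vfs_busy(mp, MBF_NOWAIT) != 0)
 *		return (EBUSY);
 *	error = VFS_ROOT(mp, LK_SHARED, &vp);
 *	vfs_unbusy(mp);
 */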
418 int
419 vfs_busy(struct mount *mp, int flags)
420 {
421 
422 	MPASS((flags & ~MBF_MASK) == 0);
423 	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
424 
425 	MNT_ILOCK(mp);
426 	MNT_REF(mp);
427 	/*
428 	 * If the mount point is currently being unmounted, sleep until its
429 	 * fate is decided.  If the thread doing the unmount fails, it will
430 	 * clear the MNTK_UNMOUNT flag before waking us up, indicating that
431 	 * this mount point has survived the unmount attempt and vfs_busy
432 	 * should retry.  Otherwise the unmounting thread will set the
433 	 * MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating that
434 	 * the mount point is about to be destroyed.  vfs_busy needs to
435 	 * release its reference on the mount point in this case and return
436 	 * ENOENT, telling the caller that the mount it tried to busy is no
437 	 * longer valid.
438 	 */
439 	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
440 		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
441 			MNT_REL(mp);
442 			MNT_IUNLOCK(mp);
443 			CTR1(KTR_VFS, "%s: failed busying before sleeping",
444 			    __func__);
445 			return (ENOENT);
446 		}
447 		if (flags & MBF_MNTLSTLOCK)
448 			mtx_unlock(&mountlist_mtx);
449 		mp->mnt_kern_flag |= MNTK_MWAIT;
450 		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
451 		if (flags & MBF_MNTLSTLOCK)
452 			mtx_lock(&mountlist_mtx);
453 		MNT_ILOCK(mp);
454 	}
455 	if (flags & MBF_MNTLSTLOCK)
456 		mtx_unlock(&mountlist_mtx);
457 	mp->mnt_lockref++;
458 	MNT_IUNLOCK(mp);
459 	return (0);
460 }
461 
462 /*
463  * Free a busy filesystem.
464  */
465 void
466 vfs_unbusy(struct mount *mp)
467 {
468 
469 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
470 	MNT_ILOCK(mp);
471 	MNT_REL(mp);
472 	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
473 	mp->mnt_lockref--;
474 	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
475 		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
476 		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
477 		mp->mnt_kern_flag &= ~MNTK_DRAINING;
478 		wakeup(&mp->mnt_lockref);
479 	}
480 	MNT_IUNLOCK(mp);
481 }
482 
483 /*
484  * Lookup a mount point by filesystem identifier.
485  */
486 struct mount *
487 vfs_getvfs(fsid_t *fsid)
488 {
489 	struct mount *mp;
490 
491 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
492 	mtx_lock(&mountlist_mtx);
493 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
494 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
495 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
496 			vfs_ref(mp);
497 			mtx_unlock(&mountlist_mtx);
498 			return (mp);
499 		}
500 	}
501 	mtx_unlock(&mountlist_mtx);
502 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
503 	return ((struct mount *) 0);
504 }
505 
506 /*
507  * Lookup a mount point by filesystem identifier, busying it before
508  * returning.
509  *
510  * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
511  * cache for popular filesystem identifiers.  The cache is lockless, relying
512  * on the fact that struct mount objects are never freed.  In the worst case
513  * we may get a pointer to an unmounted or even a different filesystem, so we
514  * have to check what we got and fall back to the slow path if so.
515  */
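/*
 * Worked example (sketch): for an fsid with val[0] == 0x12345678 and
 * val[1] == 0x11, hash is first 0x12345669 and the final index
 * ((hash >> 16) ^ hash) & (FSID_CACHE_SIZE - 1) is 0x5d, i.e. cache slot 93.
 */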
516 struct mount *
517 vfs_busyfs(fsid_t *fsid)
518 {
519 #define	FSID_CACHE_SIZE	256
520 	typedef struct mount * volatile vmp_t;
521 	static vmp_t cache[FSID_CACHE_SIZE];
522 	struct mount *mp;
523 	int error;
524 	uint32_t hash;
525 
526 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
527 	hash = fsid->val[0] ^ fsid->val[1];
528 	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
529 	mp = cache[hash];
530 	if (mp == NULL ||
531 	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
532 	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
533 		goto slow;
534 	if (vfs_busy(mp, 0) != 0) {
535 		cache[hash] = NULL;
536 		goto slow;
537 	}
538 	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
539 	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
540 		return (mp);
541 	else
542 		vfs_unbusy(mp);
543 
544 slow:
545 	mtx_lock(&mountlist_mtx);
546 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
547 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
548 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
549 			error = vfs_busy(mp, MBF_MNTLSTLOCK);
550 			if (error) {
551 				cache[hash] = NULL;
552 				mtx_unlock(&mountlist_mtx);
553 				return (NULL);
554 			}
555 			cache[hash] = mp;
556 			return (mp);
557 		}
558 	}
559 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
560 	mtx_unlock(&mountlist_mtx);
561 	return ((struct mount *) 0);
562 }
563 
564 /*
565  * Check if a user can access privileged mount options.
566  */
567 int
568 vfs_suser(struct mount *mp, struct thread *td)
569 {
570 	int error;
571 
572 	/*
573 	 * If the thread is jailed, but this is not a jail-friendly file
574 	 * system, deny immediately.
575 	 */
576 	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
577 		return (EPERM);
578 
579 	/*
580 	 * If the file system was mounted outside the jail of the calling
581 	 * thread, deny immediately.
582 	 */
583 	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
584 		return (EPERM);
585 
586 	/*
587 	 * If file system supports delegated administration, we don't check
588 	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
589 	 * by the file system itself.
590 	 * If this is not the user that did original mount, we check for
591 	 * the PRIV_VFS_MOUNT_OWNER privilege.
592 	 */
593 	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
594 	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
595 		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
596 			return (error);
597 	}
598 	return (0);
599 }
600 
601 /*
602  * Get a new unique fsid.  Try to make its val[0] unique, since this value
603  * will be used to create fake device numbers for stat().  Also try (but
604  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
605  * support 16-bit device numbers.  We end up with unique val[0]'s for the
606  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
607  *
608  * Keep in mind that several mounts may be running in parallel.  Starting
609  * the search one past where the previous search terminated is both a
610  * micro-optimization and a defense against returning the same fsid to
611  * different mounts.
612  */
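/*
 * Worked example (sketch): for a filesystem type with vfc_typenum == 5 and
 * mntid_base == 0x1234, the minor number handed to makedev(255, ...) is
 * (5 << 24) | ((0x1234 & 0xff00) << 8) | (0x1234 & 0xff) == 0x05120034, so
 * successive mounts differ both in the low byte and in bits 16-23.
 */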
613 void
614 vfs_getnewfsid(struct mount *mp)
615 {
616 	static uint16_t mntid_base;
617 	struct mount *nmp;
618 	fsid_t tfsid;
619 	int mtype;
620 
621 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
622 	mtx_lock(&mntid_mtx);
623 	mtype = mp->mnt_vfc->vfc_typenum;
624 	tfsid.val[1] = mtype;
625 	mtype = (mtype & 0xFF) << 24;
626 	for (;;) {
627 		tfsid.val[0] = makedev(255,
628 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
629 		mntid_base++;
630 		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
631 			break;
632 		vfs_rel(nmp);
633 	}
634 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
635 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
636 	mtx_unlock(&mntid_mtx);
637 }
638 
639 /*
640  * Knob to control the precision of file timestamps:
641  *
642  *   0 = seconds only; nanoseconds zeroed.
643  *   1 = seconds and nanoseconds, accurate within 1/HZ.
644  *   2 = seconds and nanoseconds, truncated to microseconds.
645  * >=3 = seconds and nanoseconds, maximum precision.
646  */
647 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
648 
649 static int timestamp_precision = TSP_USEC;
650 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
651     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
652     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
653     "3+: sec + ns (max. precision))");
654 
655 /*
656  * Get a current timestamp.
657  */
658 void
659 vfs_timestamp(struct timespec *tsp)
660 {
661 	struct timeval tv;
662 
663 	switch (timestamp_precision) {
664 	case TSP_SEC:
665 		tsp->tv_sec = time_second;
666 		tsp->tv_nsec = 0;
667 		break;
668 	case TSP_HZ:
669 		getnanotime(tsp);
670 		break;
671 	case TSP_USEC:
672 		microtime(&tv);
673 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
674 		break;
675 	case TSP_NSEC:
676 	default:
677 		nanotime(tsp);
678 		break;
679 	}
680 }
681 
682 /*
683  * Set vnode attributes to VNOVAL
684  */
685 void
686 vattr_null(struct vattr *vap)
687 {
688 
689 	vap->va_type = VNON;
690 	vap->va_size = VNOVAL;
691 	vap->va_bytes = VNOVAL;
692 	vap->va_mode = VNOVAL;
693 	vap->va_nlink = VNOVAL;
694 	vap->va_uid = VNOVAL;
695 	vap->va_gid = VNOVAL;
696 	vap->va_fsid = VNOVAL;
697 	vap->va_fileid = VNOVAL;
698 	vap->va_blocksize = VNOVAL;
699 	vap->va_rdev = VNOVAL;
700 	vap->va_atime.tv_sec = VNOVAL;
701 	vap->va_atime.tv_nsec = VNOVAL;
702 	vap->va_mtime.tv_sec = VNOVAL;
703 	vap->va_mtime.tv_nsec = VNOVAL;
704 	vap->va_ctime.tv_sec = VNOVAL;
705 	vap->va_ctime.tv_nsec = VNOVAL;
706 	vap->va_birthtime.tv_sec = VNOVAL;
707 	vap->va_birthtime.tv_nsec = VNOVAL;
708 	vap->va_flags = VNOVAL;
709 	vap->va_gen = VNOVAL;
710 	vap->va_vaflags = 0;
711 }
712 
713 /*
714  * This routine is called when we have too many vnodes.  It attempts
715  * to free <count> vnodes and will potentially free vnodes that still
716  * have VM backing store (VM backing store is typically the cause
717  * of a vnode blowout so we want to do this).  Therefore, this operation
718  * is not considered cheap.
719  *
720  * A number of conditions may prevent a vnode from being reclaimed.
721  * the buffer cache may have references on the vnode, a directory
722  * vnode may still have references due to the namei cache representing
723  * underlying files, or the vnode may be in active use.   It is not
724  * desireable to reuse such vnodes.  These conditions may cause the
725  * number of vnodes to reach some minimum value regardless of what
726  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
727  */
728 static int
729 vlrureclaim(struct mount *mp)
730 {
731 	struct vnode *vp;
732 	int done;
733 	int trigger;
734 	int usevnodes;
735 	int count;
736 
737 	/*
738 	 * Calculate the trigger point; don't let user
739 	 * screwups blow us up.  This prevents us from
740 	 * recycling vnodes with lots of resident pages.  We
741 	 * aren't trying to free memory, we are trying to
742 	 * free vnodes.
743 	 */
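	/*
	 * Worked example (sketch): with desiredvnodes == 100000 and
	 * vm_cnt.v_page_count == 1000000, trigger == 2 * 1000000 / 100000
	 * == 20, so vnodes caching more than 20 resident pages are skipped
	 * below.
	 */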
744 	usevnodes = desiredvnodes;
745 	if (usevnodes <= 0)
746 		usevnodes = 1;
747 	trigger = vm_cnt.v_page_count * 2 / usevnodes;
748 	done = 0;
749 	vn_start_write(NULL, &mp, V_WAIT);
750 	MNT_ILOCK(mp);
751 	count = mp->mnt_nvnodelistsize / 10 + 1;
752 	while (count != 0) {
753 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
754 		while (vp != NULL && vp->v_type == VMARKER)
755 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
756 		if (vp == NULL)
757 			break;
758 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
759 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
760 		--count;
761 		if (!VI_TRYLOCK(vp))
762 			goto next_iter;
763 		/*
764 		 * If it's been deconstructed already, it's still
765 		 * referenced, or it exceeds the trigger, skip it.
766 		 */
767 		if (vp->v_usecount ||
768 		    (!vlru_allow_cache_src &&
769 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
770 		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
771 		    vp->v_object->resident_page_count > trigger)) {
772 			VI_UNLOCK(vp);
773 			goto next_iter;
774 		}
775 		MNT_IUNLOCK(mp);
776 		vholdl(vp);
777 		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
778 			vdrop(vp);
779 			goto next_iter_mntunlocked;
780 		}
781 		VI_LOCK(vp);
782 		/*
783 		 * v_usecount may have been bumped after VOP_LOCK() dropped
784 		 * the vnode interlock and before it was locked again.
785 		 *
786 		 * It is not necessary to recheck VI_DOOMED because it can
787 		 * only be set by another thread that holds both the vnode
788 		 * lock and vnode interlock.  If another thread has the
789 		 * vnode lock before we get to VOP_LOCK() and obtains the
790 		 * vnode interlock after VOP_LOCK() drops the vnode
791 		 * interlock, the other thread will be unable to drop the
792 		 * vnode lock before our VOP_LOCK() call fails.
793 		 */
794 		if (vp->v_usecount ||
795 		    (!vlru_allow_cache_src &&
796 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
797 		    (vp->v_object != NULL &&
798 		    vp->v_object->resident_page_count > trigger)) {
799 			VOP_UNLOCK(vp, LK_INTERLOCK);
800 			vdrop(vp);
801 			goto next_iter_mntunlocked;
802 		}
803 		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
804 		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
805 		atomic_add_long(&recycles_count, 1);
806 		vgonel(vp);
807 		VOP_UNLOCK(vp, 0);
808 		vdropl(vp);
809 		done++;
810 next_iter_mntunlocked:
811 		if (!should_yield())
812 			goto relock_mnt;
813 		goto yield;
814 next_iter:
815 		if (!should_yield())
816 			continue;
817 		MNT_IUNLOCK(mp);
818 yield:
819 		kern_yield(PRI_USER);
820 relock_mnt:
821 		MNT_ILOCK(mp);
822 	}
823 	MNT_IUNLOCK(mp);
824 	vn_finished_write(mp);
825 	return (done);
826 }
827 
828 /*
829  * Attempt to keep the free list at wantfreevnodes length.
830  */
831 static void
832 vnlru_free(int count)
833 {
834 	struct vnode *vp;
835 
836 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
837 	for (; count > 0; count--) {
838 		vp = TAILQ_FIRST(&vnode_free_list);
839 		/*
840 		 * The list can be modified while the free_list_mtx
841 		 * has been dropped and vp could be NULL here.
842 		 */
843 		if (vp == NULL)
844 			break;
845 		VNASSERT(vp->v_op != NULL, vp,
846 		    ("vnlru_free: vnode already reclaimed."));
847 		KASSERT((vp->v_iflag & VI_FREE) != 0,
848 		    ("Removing vnode not on freelist"));
849 		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
850 		    ("Mangling active vnode"));
851 		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
852 		/*
853 		 * Don't recycle if we can't get the interlock.
854 		 */
855 		if (!VI_TRYLOCK(vp)) {
856 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
857 			continue;
858 		}
859 		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
860 		    vp, ("vp inconsistent on freelist"));
861 
862 		/*
863 		 * The clear of VI_FREE prevents activation of the
864 		 * vnode.  There is no sense in putting the vnode on
865 		 * the mount point active list, only to remove it
866 		 * later during recycling.  Inline the relevant part
867 		 * of vholdl(), to avoid triggering assertions or
868 		 * activating.
869 		 */
870 		freevnodes--;
871 		vp->v_iflag &= ~VI_FREE;
872 		refcount_acquire(&vp->v_holdcnt);
873 
874 		mtx_unlock(&vnode_free_list_mtx);
875 		VI_UNLOCK(vp);
876 		vtryrecycle(vp);
877 		/*
878 		 * If the recycle succeeded, this vdrop will actually free
879 		 * the vnode.  If not, it will simply place it back on
880 		 * the free list.
881 		 */
882 		vdrop(vp);
883 		mtx_lock(&vnode_free_list_mtx);
884 	}
885 }
886 /*
887  * Attempt to recycle vnodes in a context that is always safe to block.
888  * Calling vlrureclaim() from the bowels of filesystem code has some
889  * interesting deadlock problems.
890  */
891 static struct proc *vnlruproc;
892 static int vnlruproc_sig;
893 
894 static void
895 vnlru_proc(void)
896 {
897 	struct mount *mp, *nmp;
898 	int done;
899 	struct proc *p = vnlruproc;
900 
901 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
902 	    SHUTDOWN_PRI_FIRST);
903 
904 	for (;;) {
905 		kproc_suspend_check(p);
906 		mtx_lock(&vnode_free_list_mtx);
907 		if (freevnodes > wantfreevnodes)
908 			vnlru_free(freevnodes - wantfreevnodes);
909 		if (numvnodes <= desiredvnodes * 9 / 10) {
910 			vnlruproc_sig = 0;
911 			wakeup(&vnlruproc_sig);
912 			msleep(vnlruproc, &vnode_free_list_mtx,
913 			    PVFS|PDROP, "vlruwt", hz);
914 			continue;
915 		}
916 		mtx_unlock(&vnode_free_list_mtx);
917 		done = 0;
918 		mtx_lock(&mountlist_mtx);
919 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
920 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
921 				nmp = TAILQ_NEXT(mp, mnt_list);
922 				continue;
923 			}
924 			done += vlrureclaim(mp);
925 			mtx_lock(&mountlist_mtx);
926 			nmp = TAILQ_NEXT(mp, mnt_list);
927 			vfs_unbusy(mp);
928 		}
929 		mtx_unlock(&mountlist_mtx);
930 		if (done == 0) {
931 #if 0
932 			/* These messages are temporary debugging aids */
933 			if (vnlru_nowhere < 5)
934 				printf("vnlru process getting nowhere..\n");
935 			else if (vnlru_nowhere == 5)
936 				printf("vnlru process messages stopped.\n");
937 #endif
938 			vnlru_nowhere++;
939 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
940 		} else
941 			kern_yield(PRI_USER);
942 	}
943 }
944 
945 static struct kproc_desc vnlru_kp = {
946 	"vnlru",
947 	vnlru_proc,
948 	&vnlruproc
949 };
950 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
951     &vnlru_kp);
952 
953 /*
954  * Routines having to do with the management of the vnode table.
955  */
956 
957 /*
958  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
959  * before we actually vgone().  This function must be called with the vnode
960  * held to prevent the vnode from being returned to the free list midway
961  * through vgone().
962  */
963 static int
964 vtryrecycle(struct vnode *vp)
965 {
966 	struct mount *vnmp;
967 
968 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
969 	VNASSERT(vp->v_holdcnt, vp,
970 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
971 	/*
972 	 * This vnode may be found and locked via some other list; if so we
973 	 * can't recycle it yet.
974 	 */
975 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
976 		CTR2(KTR_VFS,
977 		    "%s: impossible to recycle, vp %p lock is already held",
978 		    __func__, vp);
979 		return (EWOULDBLOCK);
980 	}
981 	/*
982 	 * Don't recycle if its filesystem is being suspended.
983 	 */
984 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
985 		VOP_UNLOCK(vp, 0);
986 		CTR2(KTR_VFS,
987 		    "%s: impossible to recycle, cannot start the write for %p",
988 		    __func__, vp);
989 		return (EBUSY);
990 	}
991 	/*
992 	 * If we got this far, we need to acquire the interlock and see if
993 	 * anyone picked up this vnode from another list.  If not, we will
994 	 * mark it with DOOMED via vgonel() so that anyone who does find it
995 	 * will skip over it.
996 	 */
997 	VI_LOCK(vp);
998 	if (vp->v_usecount) {
999 		VOP_UNLOCK(vp, LK_INTERLOCK);
1000 		vn_finished_write(vnmp);
1001 		CTR2(KTR_VFS,
1002 		    "%s: impossible to recycle, %p is already referenced",
1003 		    __func__, vp);
1004 		return (EBUSY);
1005 	}
1006 	if ((vp->v_iflag & VI_DOOMED) == 0) {
1007 		atomic_add_long(&recycles_count, 1);
1008 		vgonel(vp);
1009 	}
1010 	VOP_UNLOCK(vp, LK_INTERLOCK);
1011 	vn_finished_write(vnmp);
1012 	return (0);
1013 }
1014 
1015 /*
1016  * Wait for available vnodes.
1017  */
1018 static int
1019 getnewvnode_wait(int suspended)
1020 {
1021 
1022 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
1023 	if (numvnodes > desiredvnodes) {
1024 		if (suspended) {
1025 			/*
1026 			 * The file system is being suspended; we cannot risk a
1027 			 * deadlock here, so allocate the new vnode anyway.
1028 			 */
1029 			if (freevnodes > wantfreevnodes)
1030 				vnlru_free(freevnodes - wantfreevnodes);
1031 			return (0);
1032 		}
1033 		if (vnlruproc_sig == 0) {
1034 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
1035 			wakeup(vnlruproc);
1036 		}
1037 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
1038 		    "vlruwk", hz);
1039 	}
1040 	return (numvnodes > desiredvnodes ? ENFILE : 0);
1041 }
1042 
1043 void
1044 getnewvnode_reserve(u_int count)
1045 {
1046 	struct thread *td;
1047 
1048 	td = curthread;
1049 	/* First try to be quick and racy. */
1050 	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
1051 		td->td_vp_reserv += count;
1052 		return;
1053 	} else
1054 		atomic_subtract_long(&numvnodes, count);
1055 
1056 	mtx_lock(&vnode_free_list_mtx);
1057 	while (count > 0) {
1058 		if (getnewvnode_wait(0) == 0) {
1059 			count--;
1060 			td->td_vp_reserv++;
1061 			atomic_add_long(&numvnodes, 1);
1062 		}
1063 	}
1064 	mtx_unlock(&vnode_free_list_mtx);
1065 }
1066 
1067 void
1068 getnewvnode_drop_reserve(void)
1069 {
1070 	struct thread *td;
1071 
1072 	td = curthread;
1073 	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
1074 	td->td_vp_reserv = 0;
1075 }
1076 
1077 /*
1078  * Return the next vnode from the free list.
1079  */
1080 int
1081 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
1082     struct vnode **vpp)
1083 {
1084 	struct vnode *vp;
1085 	struct bufobj *bo;
1086 	struct thread *td;
1087 	int error;
1088 
1089 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1090 	vp = NULL;
1091 	td = curthread;
1092 	if (td->td_vp_reserv > 0) {
1093 		td->td_vp_reserv -= 1;
1094 		goto alloc;
1095 	}
1096 	mtx_lock(&vnode_free_list_mtx);
1097 	/*
1098 	 * Lend our context to reclaim vnodes if they've exceeded the max.
1099 	 */
1100 	if (freevnodes > wantfreevnodes)
1101 		vnlru_free(1);
1102 	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1103 	    MNTK_SUSPEND));
1104 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1105 	if (error != 0) {
1106 		mtx_unlock(&vnode_free_list_mtx);
1107 		return (error);
1108 	}
1109 #endif
1110 	atomic_add_long(&numvnodes, 1);
1111 	mtx_unlock(&vnode_free_list_mtx);
1112 alloc:
1113 	atomic_add_long(&vnodes_created, 1);
1114 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
1115 	/*
1116 	 * Setup locks.
1117 	 */
1118 	vp->v_vnlock = &vp->v_lock;
1119 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
1120 	/*
1121 	 * By default, don't allow shared locks unless filesystems
1122 	 * opt-in.
1123 	 */
1124 	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
1125 	/*
1126 	 * Initialize bufobj.
1127 	 */
1128 	bo = &vp->v_bufobj;
1129 	bo->__bo_vnode = vp;
1130 	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
1131 	bo->bo_ops = &buf_ops_bio;
1132 	bo->bo_private = vp;
1133 	TAILQ_INIT(&bo->bo_clean.bv_hd);
1134 	TAILQ_INIT(&bo->bo_dirty.bv_hd);
1135 	/*
1136 	 * Initialize namecache.
1137 	 */
1138 	LIST_INIT(&vp->v_cache_src);
1139 	TAILQ_INIT(&vp->v_cache_dst);
1140 	/*
1141 	 * Finalize various vnode identity bits.
1142 	 */
1143 	vp->v_type = VNON;
1144 	vp->v_tag = tag;
1145 	vp->v_op = vops;
1146 	v_init_counters(vp);
1147 	vp->v_data = NULL;
1148 #ifdef MAC
1149 	mac_vnode_init(vp);
1150 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1151 		mac_vnode_associate_singlelabel(mp, vp);
1152 	else if (mp == NULL && vops != &dead_vnodeops)
1153 		printf("NULL mp in getnewvnode()\n");
1154 #endif
1155 	if (mp != NULL) {
1156 		bo->bo_bsize = mp->mnt_stat.f_iosize;
1157 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1158 			vp->v_vflag |= VV_NOKNOTE;
1159 	}
1160 	rangelock_init(&vp->v_rl);
1161 
1162 	/*
1163 	 * For filesystems which do not use vfs_hash_insert(),
1164 	 * still initialize v_hash so that vfs_hash_index() stays useful.
1165 	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1166 	 * its own hashing.
1167 	 */
1168 	vp->v_hash = (uintptr_t)vp >> vnsz2log;
1169 
1170 	*vpp = vp;
1171 	return (0);
1172 }
1173 
1174 /*
1175  * Delete from old mount point vnode list, if on one.
1176  */
1177 static void
1178 delmntque(struct vnode *vp)
1179 {
1180 	struct mount *mp;
1181 	int active;
1182 
1183 	mp = vp->v_mount;
1184 	if (mp == NULL)
1185 		return;
1186 	MNT_ILOCK(mp);
1187 	VI_LOCK(vp);
1188 	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1189 	    ("Active vnode list size %d > Vnode list size %d",
1190 	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1191 	active = vp->v_iflag & VI_ACTIVE;
1192 	vp->v_iflag &= ~VI_ACTIVE;
1193 	if (active) {
1194 		mtx_lock(&vnode_free_list_mtx);
1195 		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1196 		mp->mnt_activevnodelistsize--;
1197 		mtx_unlock(&vnode_free_list_mtx);
1198 	}
1199 	vp->v_mount = NULL;
1200 	VI_UNLOCK(vp);
1201 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1202 		("bad mount point vnode list size"));
1203 	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1204 	mp->mnt_nvnodelistsize--;
1205 	MNT_REL(mp);
1206 	MNT_IUNLOCK(mp);
1207 }
1208 
1209 static void
1210 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1211 {
1212 
1213 	vp->v_data = NULL;
1214 	vp->v_op = &dead_vnodeops;
1215 	vgone(vp);
1216 	vput(vp);
1217 }
1218 
1219 /*
1220  * Insert into list of vnodes for the new mount point, if available.
1221  */
1222 int
1223 insmntque1(struct vnode *vp, struct mount *mp,
1224 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1225 {
1226 
1227 	KASSERT(vp->v_mount == NULL,
1228 		("insmntque: vnode already on per mount vnode list"));
1229 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1230 	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1231 
1232 	/*
1233 	 * We acquire the vnode interlock early to ensure that the
1234 	 * vnode cannot be recycled by another process releasing a
1235 	 * holdcnt on it before we get it on both the vnode list
1236 	 * and the active vnode list. The mount mutex protects only
1237 	 * manipulation of the vnode list and the vnode freelist
1238 	 * mutex protects only manipulation of the active vnode list.
1239 	 * Hence the need to hold the vnode interlock throughout.
1240 	 */
1241 	MNT_ILOCK(mp);
1242 	VI_LOCK(vp);
1243 	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1244 	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1245 	    mp->mnt_nvnodelistsize == 0)) &&
1246 	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1247 		VI_UNLOCK(vp);
1248 		MNT_IUNLOCK(mp);
1249 		if (dtr != NULL)
1250 			dtr(vp, dtr_arg);
1251 		return (EBUSY);
1252 	}
1253 	vp->v_mount = mp;
1254 	MNT_REF(mp);
1255 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1256 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1257 		("neg mount point vnode list size"));
1258 	mp->mnt_nvnodelistsize++;
1259 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1260 	    ("Activating already active vnode"));
1261 	vp->v_iflag |= VI_ACTIVE;
1262 	mtx_lock(&vnode_free_list_mtx);
1263 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1264 	mp->mnt_activevnodelistsize++;
1265 	mtx_unlock(&vnode_free_list_mtx);
1266 	VI_UNLOCK(vp);
1267 	MNT_IUNLOCK(mp);
1268 	return (0);
1269 }
1270 
1271 int
1272 insmntque(struct vnode *vp, struct mount *mp)
1273 {
1274 
1275 	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1276 }
1277 
1278 /*
1279  * Flush out and invalidate all buffers associated with a bufobj
1280  * Called with the underlying object locked.
1281  */
1282 int
1283 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1284 {
1285 	int error;
1286 
1287 	BO_LOCK(bo);
1288 	if (flags & V_SAVE) {
1289 		error = bufobj_wwait(bo, slpflag, slptimeo);
1290 		if (error) {
1291 			BO_UNLOCK(bo);
1292 			return (error);
1293 		}
1294 		if (bo->bo_dirty.bv_cnt > 0) {
1295 			BO_UNLOCK(bo);
1296 			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1297 				return (error);
1298 			/*
1299 			 * XXX We could save a lock/unlock if this was only
1300 			 * enabled under INVARIANTS
1301 			 */
1302 			BO_LOCK(bo);
1303 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1304 				panic("vinvalbuf: dirty bufs");
1305 		}
1306 	}
1307 	/*
1308 	 * If you alter this loop please notice that interlock is dropped and
1309 	 * reacquired in flushbuflist.  Special care is needed to ensure that
1310 	 * no race conditions occur from this.
1311 	 */
1312 	do {
1313 		error = flushbuflist(&bo->bo_clean,
1314 		    flags, bo, slpflag, slptimeo);
1315 		if (error == 0 && !(flags & V_CLEANONLY))
1316 			error = flushbuflist(&bo->bo_dirty,
1317 			    flags, bo, slpflag, slptimeo);
1318 		if (error != 0 && error != EAGAIN) {
1319 			BO_UNLOCK(bo);
1320 			return (error);
1321 		}
1322 	} while (error != 0);
1323 
1324 	/*
1325 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1326 	 * have write I/O in-progress but if there is a VM object then the
1327 	 * VM object can also have read-I/O in-progress.
1328 	 */
1329 	do {
1330 		bufobj_wwait(bo, 0, 0);
1331 		BO_UNLOCK(bo);
1332 		if (bo->bo_object != NULL) {
1333 			VM_OBJECT_WLOCK(bo->bo_object);
1334 			vm_object_pip_wait(bo->bo_object, "bovlbx");
1335 			VM_OBJECT_WUNLOCK(bo->bo_object);
1336 		}
1337 		BO_LOCK(bo);
1338 	} while (bo->bo_numoutput > 0);
1339 	BO_UNLOCK(bo);
1340 
1341 	/*
1342 	 * Destroy the copy in the VM cache, too.
1343 	 */
1344 	if (bo->bo_object != NULL &&
1345 	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
1346 		VM_OBJECT_WLOCK(bo->bo_object);
1347 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1348 		    OBJPR_CLEANONLY : 0);
1349 		VM_OBJECT_WUNLOCK(bo->bo_object);
1350 	}
1351 
1352 #ifdef INVARIANTS
1353 	BO_LOCK(bo);
1354 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
1355 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1356 		panic("vinvalbuf: flush failed");
1357 	BO_UNLOCK(bo);
1358 #endif
1359 	return (0);
1360 }
1361 
1362 /*
1363  * Flush out and invalidate all buffers associated with a vnode.
1364  * Called with the underlying object locked.
1365  */
1366 int
1367 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1368 {
1369 
1370 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1371 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1372 	if (vp->v_object != NULL && vp->v_object->handle != vp)
1373 		return (0);
1374 	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1375 }
1376 
1377 /*
1378  * Flush out buffers on the specified list.
1379  *
1380  */
1381 static int
1382 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1383     int slptimeo)
1384 {
1385 	struct buf *bp, *nbp;
1386 	int retval, error;
1387 	daddr_t lblkno;
1388 	b_xflags_t xflags;
1389 
1390 	ASSERT_BO_WLOCKED(bo);
1391 
1392 	retval = 0;
1393 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1394 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1395 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1396 			continue;
1397 		}
1398 		lblkno = 0;
1399 		xflags = 0;
1400 		if (nbp != NULL) {
1401 			lblkno = nbp->b_lblkno;
1402 			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
1403 		}
1404 		retval = EAGAIN;
1405 		error = BUF_TIMELOCK(bp,
1406 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
1407 		    "flushbuf", slpflag, slptimeo);
1408 		if (error) {
1409 			BO_LOCK(bo);
1410 			return (error != ENOLCK ? error : EAGAIN);
1411 		}
1412 		KASSERT(bp->b_bufobj == bo,
1413 		    ("bp %p wrong b_bufobj %p should be %p",
1414 		    bp, bp->b_bufobj, bo));
1415 		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1416 			BUF_UNLOCK(bp);
1417 			BO_LOCK(bo);
1418 			return (EAGAIN);
1419 		}
1420 		/*
1421 		 * XXX Since there are no node locks for NFS, I
1422 		 * believe there is a slight chance that a delayed
1423 		 * write will occur while sleeping just above, so
1424 		 * check for it.
1425 		 */
1426 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1427 		    (flags & V_SAVE)) {
1428 			bremfree(bp);
1429 			bp->b_flags |= B_ASYNC;
1430 			bwrite(bp);
1431 			BO_LOCK(bo);
1432 			return (EAGAIN);	/* XXX: why not loop ? */
1433 		}
1434 		bremfree(bp);
1435 		bp->b_flags |= (B_INVAL | B_RELBUF);
1436 		bp->b_flags &= ~B_ASYNC;
1437 		brelse(bp);
1438 		BO_LOCK(bo);
1439 		if (nbp != NULL &&
1440 		    (nbp->b_bufobj != bo ||
1441 		     nbp->b_lblkno != lblkno ||
1442 		     (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1443 			break;			/* nbp invalid */
1444 	}
1445 	return (retval);
1446 }
1447 
1448 /*
1449  * Truncate a file's buffer and pages to a specified length.  This
1450  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1451  * sync activity.
1452  */
1453 int
1454 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1455 {
1456 	struct buf *bp, *nbp;
1457 	int anyfreed;
1458 	int trunclbn;
1459 	struct bufobj *bo;
1460 
1461 	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1462 	    vp, cred, blksize, (uintmax_t)length);
1463 
1464 	/*
1465 	 * Round up to the *next* lbn.
1466 	 */
1467 	trunclbn = (length + blksize - 1) / blksize;
1468 
1469 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1470 restart:
1471 	bo = &vp->v_bufobj;
1472 	BO_LOCK(bo);
1473 	anyfreed = 1;
1474 	for (;anyfreed;) {
1475 		anyfreed = 0;
1476 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1477 			if (bp->b_lblkno < trunclbn)
1478 				continue;
1479 			if (BUF_LOCK(bp,
1480 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1481 			    BO_LOCKPTR(bo)) == ENOLCK)
1482 				goto restart;
1483 
1484 			bremfree(bp);
1485 			bp->b_flags |= (B_INVAL | B_RELBUF);
1486 			bp->b_flags &= ~B_ASYNC;
1487 			brelse(bp);
1488 			anyfreed = 1;
1489 
1490 			BO_LOCK(bo);
1491 			if (nbp != NULL &&
1492 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1493 			    (nbp->b_vp != vp) ||
1494 			    (nbp->b_flags & B_DELWRI))) {
1495 				BO_UNLOCK(bo);
1496 				goto restart;
1497 			}
1498 		}
1499 
1500 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1501 			if (bp->b_lblkno < trunclbn)
1502 				continue;
1503 			if (BUF_LOCK(bp,
1504 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1505 			    BO_LOCKPTR(bo)) == ENOLCK)
1506 				goto restart;
1507 			bremfree(bp);
1508 			bp->b_flags |= (B_INVAL | B_RELBUF);
1509 			bp->b_flags &= ~B_ASYNC;
1510 			brelse(bp);
1511 			anyfreed = 1;
1512 
1513 			BO_LOCK(bo);
1514 			if (nbp != NULL &&
1515 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1516 			    (nbp->b_vp != vp) ||
1517 			    (nbp->b_flags & B_DELWRI) == 0)) {
1518 				BO_UNLOCK(bo);
1519 				goto restart;
1520 			}
1521 		}
1522 	}
1523 
1524 	if (length > 0) {
1525 restartsync:
1526 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1527 			if (bp->b_lblkno > 0)
1528 				continue;
1529 			/*
1530 			 * Since we hold the vnode lock this should only
1531 			 * fail if we're racing with the buf daemon.
1532 			 */
1533 			if (BUF_LOCK(bp,
1534 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1535 			    BO_LOCKPTR(bo)) == ENOLCK) {
1536 				goto restart;
1537 			}
1538 			VNASSERT((bp->b_flags & B_DELWRI), vp,
1539 			    ("buf(%p) on dirty queue without DELWRI", bp));
1540 
1541 			bremfree(bp);
1542 			bawrite(bp);
1543 			BO_LOCK(bo);
1544 			goto restartsync;
1545 		}
1546 	}
1547 
1548 	bufobj_wwait(bo, 0, 0);
1549 	BO_UNLOCK(bo);
1550 	vnode_pager_setsize(vp, length);
1551 
1552 	return (0);
1553 }
1554 
1555 static void
1556 buf_vlist_remove(struct buf *bp)
1557 {
1558 	struct bufv *bv;
1559 
1560 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1561 	ASSERT_BO_WLOCKED(bp->b_bufobj);
1562 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1563 	    (BX_VNDIRTY|BX_VNCLEAN),
1564 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1565 	if (bp->b_xflags & BX_VNDIRTY)
1566 		bv = &bp->b_bufobj->bo_dirty;
1567 	else
1568 		bv = &bp->b_bufobj->bo_clean;
1569 	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
1570 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1571 	bv->bv_cnt--;
1572 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1573 }
1574 
1575 /*
1576  * Add the buffer to the sorted clean or dirty block list.
1577  *
1578  * NOTE: xflags is passed as a constant, optimizing this inline function!
1579  */
1580 static void
1581 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1582 {
1583 	struct bufv *bv;
1584 	struct buf *n;
1585 	int error;
1586 
1587 	ASSERT_BO_WLOCKED(bo);
1588 	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
1589 	    ("dead bo %p", bo));
1590 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1591 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1592 	bp->b_xflags |= xflags;
1593 	if (xflags & BX_VNDIRTY)
1594 		bv = &bo->bo_dirty;
1595 	else
1596 		bv = &bo->bo_clean;
1597 
1598 	/*
1599 	 * Keep the list ordered.  Optimize empty list insertion.  Assume
1600 	 * we tend to grow at the tail so lookup_le should usually be cheaper
1601 	 * than _ge.
1602 	 */
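	/*
	 * Sketch of the three cases below: an empty list, or a logical block
	 * number beyond the current tail, appends; no lookup_le() neighbour in
	 * the trie makes the buf the new head; otherwise it is inserted right
	 * after its lookup_le() neighbour.
	 */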
1603 	if (bv->bv_cnt == 0 ||
1604 	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
1605 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1606 	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
1607 		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
1608 	else
1609 		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
1610 	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
1611 	if (error)
1612 		panic("buf_vlist_add:  Preallocated nodes insufficient.");
1613 	bv->bv_cnt++;
1614 }
1615 
1616 /*
1617  * Look up a buffer using the buffer tries.
1618  */
1619 struct buf *
1620 gbincore(struct bufobj *bo, daddr_t lblkno)
1621 {
1622 	struct buf *bp;
1623 
1624 	ASSERT_BO_LOCKED(bo);
1625 	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
1626 	if (bp != NULL)
1627 		return (bp);
1628 	return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno));
1629 }
1630 
1631 /*
1632  * Associate a buffer with a vnode.
1633  */
1634 void
1635 bgetvp(struct vnode *vp, struct buf *bp)
1636 {
1637 	struct bufobj *bo;
1638 
1639 	bo = &vp->v_bufobj;
1640 	ASSERT_BO_WLOCKED(bo);
1641 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1642 
1643 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1644 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1645 	    ("bgetvp: bp already attached! %p", bp));
1646 
1647 	vhold(vp);
1648 	bp->b_vp = vp;
1649 	bp->b_bufobj = bo;
1650 	/*
1651 	 * Insert onto list for new vnode.
1652 	 */
1653 	buf_vlist_add(bp, bo, BX_VNCLEAN);
1654 }
1655 
1656 /*
1657  * Disassociate a buffer from a vnode.
1658  */
1659 void
1660 brelvp(struct buf *bp)
1661 {
1662 	struct bufobj *bo;
1663 	struct vnode *vp;
1664 
1665 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1666 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1667 
1668 	/*
1669 	 * Delete from old vnode list, if on one.
1670 	 */
1671 	vp = bp->b_vp;		/* XXX */
1672 	bo = bp->b_bufobj;
1673 	BO_LOCK(bo);
1674 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1675 		buf_vlist_remove(bp);
1676 	else
1677 		panic("brelvp: Buffer %p not on queue.", bp);
1678 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1679 		bo->bo_flag &= ~BO_ONWORKLST;
1680 		mtx_lock(&sync_mtx);
1681 		LIST_REMOVE(bo, bo_synclist);
1682 		syncer_worklist_len--;
1683 		mtx_unlock(&sync_mtx);
1684 	}
1685 	bp->b_vp = NULL;
1686 	bp->b_bufobj = NULL;
1687 	BO_UNLOCK(bo);
1688 	vdrop(vp);
1689 }
1690 
1691 /*
1692  * Add an item to the syncer work queue.
1693  */
1694 static void
1695 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1696 {
1697 	int slot;
1698 
1699 	ASSERT_BO_WLOCKED(bo);
1700 
1701 	mtx_lock(&sync_mtx);
1702 	if (bo->bo_flag & BO_ONWORKLST)
1703 		LIST_REMOVE(bo, bo_synclist);
1704 	else {
1705 		bo->bo_flag |= BO_ONWORKLST;
1706 		syncer_worklist_len++;
1707 	}
1708 
1709 	if (delay > syncer_maxdelay - 2)
1710 		delay = syncer_maxdelay - 2;
1711 	slot = (syncer_delayno + delay) & syncer_mask;
1712 
1713 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1714 	mtx_unlock(&sync_mtx);
1715 }
1716 
1717 static int
1718 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1719 {
1720 	int error, len;
1721 
1722 	mtx_lock(&sync_mtx);
1723 	len = syncer_worklist_len - sync_vnode_count;
1724 	mtx_unlock(&sync_mtx);
1725 	error = SYSCTL_OUT(req, &len, sizeof(len));
1726 	return (error);
1727 }
1728 
1729 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1730     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1731 
1732 static struct proc *updateproc;
1733 static void sched_sync(void);
1734 static struct kproc_desc up_kp = {
1735 	"syncer",
1736 	sched_sync,
1737 	&updateproc
1738 };
1739 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1740 
1741 static int
1742 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1743 {
1744 	struct vnode *vp;
1745 	struct mount *mp;
1746 
1747 	*bo = LIST_FIRST(slp);
1748 	if (*bo == NULL)
1749 		return (0);
1750 	vp = (*bo)->__bo_vnode;	/* XXX */
1751 	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1752 		return (1);
1753 	/*
1754 	 * We use vhold in case the vnode does not
1755 	 * successfully sync.  vhold prevents the vnode from
1756 	 * going away when we unlock the sync_mtx so that
1757 	 * we can acquire the vnode interlock.
1758 	 */
1759 	vholdl(vp);
1760 	mtx_unlock(&sync_mtx);
1761 	VI_UNLOCK(vp);
1762 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1763 		vdrop(vp);
1764 		mtx_lock(&sync_mtx);
1765 		return (*bo == LIST_FIRST(slp));
1766 	}
1767 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1768 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1769 	VOP_UNLOCK(vp, 0);
1770 	vn_finished_write(mp);
1771 	BO_LOCK(*bo);
1772 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1773 		/*
1774 		 * Put us back on the worklist.  The worklist
1775 		 * routine will remove us from our current
1776 		 * position and then add us back in at a later
1777 		 * position.
1778 		 */
1779 		vn_syncer_add_to_worklist(*bo, syncdelay);
1780 	}
1781 	BO_UNLOCK(*bo);
1782 	vdrop(vp);
1783 	mtx_lock(&sync_mtx);
1784 	return (0);
1785 }
1786 
1787 static int first_printf = 1;
1788 
1789 /*
1790  * System filesystem synchronizer daemon.
1791  */
1792 static void
1793 sched_sync(void)
1794 {
1795 	struct synclist *next, *slp;
1796 	struct bufobj *bo;
1797 	long starttime;
1798 	struct thread *td = curthread;
1799 	int last_work_seen;
1800 	int net_worklist_len;
1801 	int syncer_final_iter;
1802 	int error;
1803 
1804 	last_work_seen = 0;
1805 	syncer_final_iter = 0;
1806 	syncer_state = SYNCER_RUNNING;
1807 	starttime = time_uptime;
1808 	td->td_pflags |= TDP_NORUNNINGBUF;
1809 
1810 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1811 	    SHUTDOWN_PRI_LAST);
1812 
1813 	mtx_lock(&sync_mtx);
1814 	for (;;) {
1815 		if (syncer_state == SYNCER_FINAL_DELAY &&
1816 		    syncer_final_iter == 0) {
1817 			mtx_unlock(&sync_mtx);
1818 			kproc_suspend_check(td->td_proc);
1819 			mtx_lock(&sync_mtx);
1820 		}
1821 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1822 		if (syncer_state != SYNCER_RUNNING &&
1823 		    starttime != time_uptime) {
1824 			if (first_printf) {
1825 				printf("\nSyncing disks, vnodes remaining...");
1826 				first_printf = 0;
1827 			}
1828 			printf("%d ", net_worklist_len);
1829 		}
1830 		starttime = time_uptime;
1831 
1832 		/*
1833 		 * Push files whose dirty time has expired.  Be careful
1834 		 * of interrupt race on slp queue.
1835 		 *
1836 		 * Skip over empty worklist slots when shutting down.
1837 		 */
1838 		do {
1839 			slp = &syncer_workitem_pending[syncer_delayno];
1840 			syncer_delayno += 1;
1841 			if (syncer_delayno == syncer_maxdelay)
1842 				syncer_delayno = 0;
1843 			next = &syncer_workitem_pending[syncer_delayno];
1844 			/*
1845 			 * If the worklist has wrapped since it was
1846 			 * emptied of all but syncer vnodes,
1847 			 * switch to the FINAL_DELAY state and run
1848 			 * for one more second.
1849 			 */
1850 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1851 			    net_worklist_len == 0 &&
1852 			    last_work_seen == syncer_delayno) {
1853 				syncer_state = SYNCER_FINAL_DELAY;
1854 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1855 			}
1856 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1857 		    syncer_worklist_len > 0);
1858 
1859 		/*
1860 		 * Keep track of the last time there was anything
1861 		 * on the worklist other than syncer vnodes.
1862 		 * Return to the SHUTTING_DOWN state if any
1863 		 * new work appears.
1864 		 */
1865 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1866 			last_work_seen = syncer_delayno;
1867 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1868 			syncer_state = SYNCER_SHUTTING_DOWN;
1869 		while (!LIST_EMPTY(slp)) {
1870 			error = sync_vnode(slp, &bo, td);
1871 			if (error == 1) {
1872 				LIST_REMOVE(bo, bo_synclist);
1873 				LIST_INSERT_HEAD(next, bo, bo_synclist);
1874 				continue;
1875 			}
1876 
1877 			if (first_printf == 0) {
1878 				/*
1879 				 * Drop the sync mutex, because some watchdog
1880 				 * drivers need to sleep while patting the watchdog.
1881 				 */
1882 				mtx_unlock(&sync_mtx);
1883 				wdog_kern_pat(WD_LASTVAL);
1884 				mtx_lock(&sync_mtx);
1885 			}
1886 
1887 		}
1888 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1889 			syncer_final_iter--;
1890 		/*
1891 		 * The variable rushjob allows the kernel to speed up the
1892 		 * processing of the filesystem syncer process. A rushjob
1893 		 * value of N tells the filesystem syncer to process the next
1894 		 * N seconds worth of work on its queue ASAP. Currently rushjob
1895 		 * is used by the soft update code to speed up the filesystem
1896 		 * syncer process when the incore state is getting so far
1897 		 * ahead of the disk that the kernel memory pool is being
1898 		 * threatened with exhaustion.
1899 		 */
1900 		if (rushjob > 0) {
1901 			rushjob -= 1;
1902 			continue;
1903 		}
1904 		/*
1905 		 * Just sleep for a short period of time between
1906 		 * iterations when shutting down to allow some I/O
1907 		 * to happen.
1908 		 *
1909 		 * If it has taken us less than a second to process the
1910 		 * current work, then wait. Otherwise start right over
1911 		 * again. We can still lose time if any single round
1912 		 * takes more than two seconds, but it does not really
1913 		 * matter as we are just trying to generally pace the
1914 		 * filesystem activity.
1915 		 */
1916 		if (syncer_state != SYNCER_RUNNING ||
1917 		    time_uptime == starttime) {
1918 			thread_lock(td);
1919 			sched_prio(td, PPAUSE);
1920 			thread_unlock(td);
1921 		}
1922 		if (syncer_state != SYNCER_RUNNING)
1923 			cv_timedwait(&sync_wakeup, &sync_mtx,
1924 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1925 		else if (time_uptime == starttime)
1926 			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1927 	}
1928 }
1929 
1930 /*
1931  * Request the syncer daemon to speed up its work.
1932  * We never push it to speed up more than half of its
1933  * normal turn time, otherwise it could take over the cpu.
1934  */
1935 int
1936 speedup_syncer(void)
1937 {
1938 	int ret = 0;
1939 
1940 	mtx_lock(&sync_mtx);
1941 	if (rushjob < syncdelay / 2) {
1942 		rushjob += 1;
1943 		stat_rush_requests += 1;
1944 		ret = 1;
1945 	}
1946 	mtx_unlock(&sync_mtx);
1947 	cv_broadcast(&sync_wakeup);
1948 	return (ret);
1949 }
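
/*
 * Illustrative caller sketch (an assumption, not code from this file):
 * a subsystem producing dirty data faster than the disk can absorb it
 * might nudge the syncer, and throttle itself when the return value
 * shows rushjob was already at its cap:
 *
 *	if (speedup_syncer() == 0)
 *		(void)tsleep(&hypothetical_chan, PRIBIO, "syncwt", hz / 10);
 *
 * "hypothetical_chan" is a made-up wait channel; the real soft updates
 * code has its own pacing logic on top of speedup_syncer().
 */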
1950 
1951 /*
1952  * Tell the syncer to speed up its work and run through its work
1953  * list several times, then tell it to shut down.
1954  */
1955 static void
1956 syncer_shutdown(void *arg, int howto)
1957 {
1958 
1959 	if (howto & RB_NOSYNC)
1960 		return;
1961 	mtx_lock(&sync_mtx);
1962 	syncer_state = SYNCER_SHUTTING_DOWN;
1963 	rushjob = 0;
1964 	mtx_unlock(&sync_mtx);
1965 	cv_broadcast(&sync_wakeup);
1966 	kproc_shutdown(arg, howto);
1967 }
1968 
1969 void
1970 syncer_suspend(void)
1971 {
1972 
1973 	syncer_shutdown(updateproc, 0);
1974 }
1975 
1976 void
1977 syncer_resume(void)
1978 {
1979 
1980 	mtx_lock(&sync_mtx);
1981 	first_printf = 1;
1982 	syncer_state = SYNCER_RUNNING;
1983 	mtx_unlock(&sync_mtx);
1984 	cv_broadcast(&sync_wakeup);
1985 	kproc_resume(updateproc);
1986 }
1987 
1988 /*
1989  * Reassign a buffer from one vnode to another.
1990  * Used to assign file specific control information
1991  * (indirect blocks) to the vnode to which they belong.
1992  */
1993 void
1994 reassignbuf(struct buf *bp)
1995 {
1996 	struct vnode *vp;
1997 	struct bufobj *bo;
1998 	int delay;
1999 #ifdef INVARIANTS
2000 	struct bufv *bv;
2001 #endif
2002 
2003 	vp = bp->b_vp;
2004 	bo = bp->b_bufobj;
2005 	++reassignbufcalls;
2006 
2007 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2008 	    bp, bp->b_vp, bp->b_flags);
2009 	/*
2010 	 * B_PAGING flagged buffers cannot be reassigned because their vp
2011 	 * is not fully linked in.
2012 	 */
2013 	if (bp->b_flags & B_PAGING)
2014 		panic("cannot reassign paging buffer");
2015 
2016 	/*
2017 	 * Delete from old vnode list, if on one.
2018 	 */
2019 	BO_LOCK(bo);
2020 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2021 		buf_vlist_remove(bp);
2022 	else
2023 		panic("reassignbuf: Buffer %p not on queue.", bp);
2024 	/*
2025 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
2026 	 * of clean buffers.
2027 	 */
2028 	if (bp->b_flags & B_DELWRI) {
2029 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2030 			switch (vp->v_type) {
2031 			case VDIR:
2032 				delay = dirdelay;
2033 				break;
2034 			case VCHR:
2035 				delay = metadelay;
2036 				break;
2037 			default:
2038 				delay = filedelay;
2039 			}
2040 			vn_syncer_add_to_worklist(bo, delay);
2041 		}
2042 		buf_vlist_add(bp, bo, BX_VNDIRTY);
2043 	} else {
2044 		buf_vlist_add(bp, bo, BX_VNCLEAN);
2045 
2046 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2047 			mtx_lock(&sync_mtx);
2048 			LIST_REMOVE(bo, bo_synclist);
2049 			syncer_worklist_len--;
2050 			mtx_unlock(&sync_mtx);
2051 			bo->bo_flag &= ~BO_ONWORKLST;
2052 		}
2053 	}
2054 #ifdef INVARIANTS
2055 	bv = &bo->bo_clean;
2056 	bp = TAILQ_FIRST(&bv->bv_hd);
2057 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2058 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2059 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2060 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2061 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2062 	bv = &bo->bo_dirty;
2063 	bp = TAILQ_FIRST(&bv->bv_hd);
2064 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2065 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2066 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2067 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2068 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2069 #endif
2070 	BO_UNLOCK(bo);
2071 }
2072 
2073 /*
2074  * A temporary hack until refcount_* APIs are sorted out.
2075  */
2076 static __inline int
2077 vfs_refcount_acquire_if_not_zero(volatile u_int *count)
2078 {
2079 	u_int old;
2080 
2081 	for (;;) {
2082 		old = *count;
2083 		if (old == 0)
2084 			return (0);
2085 		if (atomic_cmpset_int(count, old, old + 1))
2086 			return (1);
2087 	}
2088 }
2089 
2090 static __inline int
2091 vfs_refcount_release_if_not_last(volatile u_int *count)
2092 {
2093 	u_int old;
2094 
2095 	for (;;) {
2096 		old = *count;
2097 		if (old == 1)
2098 			return (0);
2099 		if (atomic_cmpset_int(count, old, old - 1))
2100 			return (1);
2101 	}
2102 }
2103 
2104 static void
2105 v_init_counters(struct vnode *vp)
2106 {
2107 
2108 	VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
2109 	    vp, ("%s called for an initialized vnode", __FUNCTION__));
2110 	ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
2111 
2112 	refcount_init(&vp->v_holdcnt, 1);
2113 	refcount_init(&vp->v_usecount, 1);
2114 }
2115 
2116 /*
2117  * Increment the use and hold counts on the vnode, taking care to reference
2118  * the driver's usecount if this is a chardev.  The _vhold() will remove
2119  * the vnode from the free list if it is presently free.
2120  */
2121 static void
2122 v_incr_usecount(struct vnode *vp)
2123 {
2124 
2125 	ASSERT_VI_UNLOCKED(vp, __func__);
2126 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2127 
2128 	if (vp->v_type == VCHR) {
2129 		VI_LOCK(vp);
2130 		_vhold(vp, true);
2131 		if (vp->v_iflag & VI_OWEINACT) {
2132 			VNASSERT(vp->v_usecount == 0, vp,
2133 			    ("vnode with usecount and VI_OWEINACT set"));
2134 			vp->v_iflag &= ~VI_OWEINACT;
2135 		}
2136 		refcount_acquire(&vp->v_usecount);
2137 		v_incr_devcount(vp);
2138 		VI_UNLOCK(vp);
2139 		return;
2140 	}
2141 
2142 	_vhold(vp, false);
2143 	if (vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
2144 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2145 		    ("vnode with usecount and VI_OWEINACT set"));
2146 	} else {
2147 		VI_LOCK(vp);
2148 		if (vp->v_iflag & VI_OWEINACT)
2149 			vp->v_iflag &= ~VI_OWEINACT;
2150 		refcount_acquire(&vp->v_usecount);
2151 		VI_UNLOCK(vp);
2152 	}
2153 }
2154 
2155 /*
2156  * Increment si_usecount of the associated device, if any.
2157  */
2158 static void
2159 v_incr_devcount(struct vnode *vp)
2160 {
2161 
2162 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2163 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2164 		dev_lock();
2165 		vp->v_rdev->si_usecount++;
2166 		dev_unlock();
2167 	}
2168 }
2169 
2170 /*
2171  * Decrement si_usecount of the associated device, if any.
2172  */
2173 static void
2174 v_decr_devcount(struct vnode *vp)
2175 {
2176 
2177 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2178 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2179 		dev_lock();
2180 		vp->v_rdev->si_usecount--;
2181 		dev_unlock();
2182 	}
2183 }
2184 
2185 /*
2186  * Grab a particular vnode from the free list, increment its
2187  * reference count and lock it.  VI_DOOMED is set if the vnode
2188  * is being destroyed.  Only callers who specify LK_RETRY will
2189  * see doomed vnodes.  If inactive processing was delayed in
2190  * vput, try to do it here.
2191  *
2192  * Notes on lockless counter manipulation:
2193  * _vhold, vputx and other routines make various decisions based
2194  * on either holdcnt or usecount being 0. As long as either counter
2195  * is not transitioning 0->1 or 1->0, the manipulation can be done
2196  * with atomic operations. Otherwise the interlock is taken.
2197  */
2198 int
2199 vget(struct vnode *vp, int flags, struct thread *td)
2200 {
2201 	int error, oweinact;
2202 
2203 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2204 	    ("vget: invalid lock operation"));
2205 
2206 	if ((flags & LK_INTERLOCK) != 0)
2207 		ASSERT_VI_LOCKED(vp, __func__);
2208 	else
2209 		ASSERT_VI_UNLOCKED(vp, __func__);
2210 	if ((flags & LK_VNHELD) != 0)
2211 		VNASSERT((vp->v_holdcnt > 0), vp,
2212 		    ("vget: LK_VNHELD passed but vnode not held"));
2213 
2214 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2215 
2216 	if ((flags & LK_VNHELD) == 0)
2217 		_vhold(vp, (flags & LK_INTERLOCK) != 0);
2218 
2219 	if ((error = vn_lock(vp, flags)) != 0) {
2220 		vdrop(vp);
2221 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2222 		    vp);
2223 		return (error);
2224 	}
2225 	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2226 		panic("vget: vn_lock failed to return ENOENT\n");
2227 	/*
2228 	 * We don't guarantee that any particular close will
2229 	 * trigger inactive processing so just make a best effort
2230 	 * here at preventing a reference to a removed file.  If
2231 	 * we don't succeed no harm is done.
2232 	 *
2233 	 * Upgrade our holdcnt to a usecount.
2234 	 */
2235 	if (vp->v_type != VCHR &&
2236 	    vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
2237 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2238 		    ("vnode with usecount and VI_OWEINACT set"));
2239 	} else {
2240 		VI_LOCK(vp);
2241 		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2242 			oweinact = 0;
2243 		} else {
2244 			oweinact = 1;
2245 			vp->v_iflag &= ~VI_OWEINACT;
2246 		}
2247 		refcount_acquire(&vp->v_usecount);
2248 		v_incr_devcount(vp);
2249 		if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2250 		    (flags & LK_NOWAIT) == 0)
2251 			vinactive(vp, td);
2252 		VI_UNLOCK(vp);
2253 	}
2254 	return (0);
2255 }
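
/*
 * Usage sketch (illustrative; the same pattern appears in vfs_msync()
 * below): a caller that found the vnode while holding its interlock
 * converts that into a locked, use-counted reference in one step:
 *
 *	VI_LOCK(vp);
 *	error = vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread);
 *	if (error != 0)
 *		return (error);
 *	... operate on the exclusively locked vnode ...
 *	vput(vp);
 *
 * LK_INTERLOCK hands the interlock to vget(); LK_RETRY means a doomed
 * vnode is returned locked instead of failing the acquisition.
 */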
2256 
2257 /*
2258  * Increase the reference count of a vnode.
2259  */
2260 void
2261 vref(struct vnode *vp)
2262 {
2263 
2264 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2265 	v_incr_usecount(vp);
2266 }
2267 
2268 /*
2269  * Return reference count of a vnode.
2270  *
2271  * The results of this call are only guaranteed when some mechanism is used to
2272  * stop other processes from gaining references to the vnode.  This may be the
2273  * case if the caller holds the only reference.  This is also useful when stale
2274  * data is acceptable as race conditions may be accounted for by some other
2275  * means.
2276  */
2277 int
2278 vrefcnt(struct vnode *vp)
2279 {
2280 
2281 	return (vp->v_usecount);
2282 }
2283 
2284 #define	VPUTX_VRELE	1
2285 #define	VPUTX_VPUT	2
2286 #define	VPUTX_VUNREF	3
2287 
2288 /*
2289  * Decrement the use and hold counts for a vnode.
2290  *
2291  * See an explanation near vget() as to why atomic operation is safe.
2292  */
2293 static void
2294 vputx(struct vnode *vp, int func)
2295 {
2296 	int error;
2297 
2298 	KASSERT(vp != NULL, ("vputx: null vp"));
2299 	if (func == VPUTX_VUNREF)
2300 		ASSERT_VOP_LOCKED(vp, "vunref");
2301 	else if (func == VPUTX_VPUT)
2302 		ASSERT_VOP_LOCKED(vp, "vput");
2303 	else
2304 		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2305 	ASSERT_VI_UNLOCKED(vp, __func__);
2306 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2307 
2308 	if (vp->v_type != VCHR &&
2309 	    vfs_refcount_release_if_not_last(&vp->v_usecount)) {
2310 		if (func == VPUTX_VPUT)
2311 			VOP_UNLOCK(vp, 0);
2312 		vdrop(vp);
2313 		return;
2314 	}
2315 
2316 	VI_LOCK(vp);
2317 
2318 	/*
2319 	 * We want to hold the vnode until the inactive finishes to
2320 	 * prevent vgone() races.  We drop the use count here and the
2321 	 * hold count below when we're done.
2322 	 */
2323 	if (!refcount_release(&vp->v_usecount) ||
2324 	    (vp->v_iflag & VI_DOINGINACT)) {
2325 		if (func == VPUTX_VPUT)
2326 			VOP_UNLOCK(vp, 0);
2327 		v_decr_devcount(vp);
2328 		vdropl(vp);
2329 		return;
2330 	}
2331 
2332 	v_decr_devcount(vp);
2333 
2334 	error = 0;
2335 
2336 	if (vp->v_usecount != 0) {
2337 		vprint("vputx: usecount not zero", vp);
2338 		panic("vputx: usecount not zero");
2339 	}
2340 
2341 	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2342 
2343 	/*
2344 	 * We must call VOP_INACTIVE with the node locked. Mark
2345 	 * as VI_DOINGINACT to avoid recursion.
2346 	 */
2347 	vp->v_iflag |= VI_OWEINACT;
2348 	switch (func) {
2349 	case VPUTX_VRELE:
2350 		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2351 		VI_LOCK(vp);
2352 		break;
2353 	case VPUTX_VPUT:
2354 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2355 			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2356 			    LK_NOWAIT);
2357 			VI_LOCK(vp);
2358 		}
2359 		break;
2360 	case VPUTX_VUNREF:
2361 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2362 			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
2363 			VI_LOCK(vp);
2364 		}
2365 		break;
2366 	}
2367 	VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
2368 	    ("vnode with usecount and VI_OWEINACT set"));
2369 	if (error == 0) {
2370 		if (vp->v_iflag & VI_OWEINACT)
2371 			vinactive(vp, curthread);
2372 		if (func != VPUTX_VUNREF)
2373 			VOP_UNLOCK(vp, 0);
2374 	}
2375 	vdropl(vp);
2376 }
2377 
2378 /*
2379  * Vnode put/release.
2380  * If count drops to zero, call inactive routine and return to freelist.
2381  */
2382 void
2383 vrele(struct vnode *vp)
2384 {
2385 
2386 	vputx(vp, VPUTX_VRELE);
2387 }
2388 
2389 /*
2390  * Release an already locked vnode.  This gives the same effects as
2391  * unlock+vrele(), but takes less time and avoids releasing and
2392  * re-acquiring the lock (as vrele() acquires the lock internally).
2393  */
2394 void
2395 vput(struct vnode *vp)
2396 {
2397 
2398 	vputx(vp, VPUTX_VPUT);
2399 }
2400 
2401 /*
2402  * Release an exclusively locked vnode. Do not unlock the vnode lock.
2403  */
2404 void
2405 vunref(struct vnode *vp)
2406 {
2407 
2408 	vputx(vp, VPUTX_VUNREF);
2409 }
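
/*
 * Usage summary (illustrative restatement of the three entry points
 * above):
 *
 *	vrele(vp);	caller does not hold the vnode lock
 *	vput(vp);	vnode is locked; the reference and the lock are
 *			both released
 *	vunref(vp);	vnode is exclusively locked; the reference is
 *			released but the lock is kept
 *
 * A lookup that returned the vnode locked typically ends with vput(),
 * while code holding only an unlocked reference uses vrele().
 */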
2410 
2411 /*
2412  * Increase the hold count and activate if this is the first reference.
2413  */
2414 void
2415 _vhold(struct vnode *vp, bool locked)
2416 {
2417 	struct mount *mp;
2418 
2419 	if (locked)
2420 		ASSERT_VI_LOCKED(vp, __func__);
2421 	else
2422 		ASSERT_VI_UNLOCKED(vp, __func__);
2423 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2424 	if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
2425 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2426 		    ("_vhold: vnode with holdcnt is free"));
2427 		return;
2428 	}
2429 
2430 	if (!locked)
2431 		VI_LOCK(vp);
2432 	if ((vp->v_iflag & VI_FREE) == 0) {
2433 		refcount_acquire(&vp->v_holdcnt);
2434 		if (!locked)
2435 			VI_UNLOCK(vp);
2436 		return;
2437 	}
2438 	VNASSERT(vp->v_holdcnt == 0, vp,
2439 	    ("%s: wrong hold count", __func__));
2440 	VNASSERT(vp->v_op != NULL, vp,
2441 	    ("%s: vnode already reclaimed.", __func__));
2442 	/*
2443 	 * Remove a vnode from the free list, mark it as in use,
2444 	 * and put it on the active list.
2445 	 */
2446 	mtx_lock(&vnode_free_list_mtx);
2447 	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2448 	freevnodes--;
2449 	vp->v_iflag &= ~(VI_FREE|VI_AGE);
2450 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2451 	    ("Activating already active vnode"));
2452 	vp->v_iflag |= VI_ACTIVE;
2453 	mp = vp->v_mount;
2454 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2455 	mp->mnt_activevnodelistsize++;
2456 	mtx_unlock(&vnode_free_list_mtx);
2457 	refcount_acquire(&vp->v_holdcnt);
2458 	if (!locked)
2459 		VI_UNLOCK(vp);
2460 }
2461 
2462 /*
2463  * Drop the hold count of the vnode.  If this is the last reference to
2464  * the vnode we place it on the free list unless it has been vgone'd
2465  * (marked VI_DOOMED) in which case we will free it.
2466  */
2467 void
2468 _vdrop(struct vnode *vp, bool locked)
2469 {
2470 	struct bufobj *bo;
2471 	struct mount *mp;
2472 	int active;
2473 
2474 	if (locked)
2475 		ASSERT_VI_LOCKED(vp, __func__);
2476 	else
2477 		ASSERT_VI_UNLOCKED(vp, __func__);
2478 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2479 	if ((int)vp->v_holdcnt <= 0)
2480 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2481 	if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) {
2482 		if (locked)
2483 			VI_UNLOCK(vp);
2484 		return;
2485 	}
2486 
2487 	if (!locked)
2488 		VI_LOCK(vp);
2489 	if (refcount_release(&vp->v_holdcnt) == 0) {
2490 		VI_UNLOCK(vp);
2491 		return;
2492 	}
2493 	if ((vp->v_iflag & VI_DOOMED) == 0) {
2494 		/*
2495 		 * Mark a vnode as free: remove it from its active list
2496 		 * and put it up for recycling on the freelist.
2497 		 */
2498 		VNASSERT(vp->v_op != NULL, vp,
2499 		    ("vdropl: vnode already reclaimed."));
2500 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2501 		    ("vnode already free"));
2502 		VNASSERT(vp->v_holdcnt == 0, vp,
2503 		    ("vdropl: freeing when we shouldn't"));
2504 		active = vp->v_iflag & VI_ACTIVE;
2505 		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2506 			vp->v_iflag &= ~VI_ACTIVE;
2507 			mp = vp->v_mount;
2508 			mtx_lock(&vnode_free_list_mtx);
2509 			if (active) {
2510 				TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2511 				    v_actfreelist);
2512 				mp->mnt_activevnodelistsize--;
2513 			}
2514 			if (vp->v_iflag & VI_AGE) {
2515 				TAILQ_INSERT_HEAD(&vnode_free_list, vp,
2516 				    v_actfreelist);
2517 			} else {
2518 				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
2519 				    v_actfreelist);
2520 			}
2521 			freevnodes++;
2522 			vp->v_iflag &= ~VI_AGE;
2523 			vp->v_iflag |= VI_FREE;
2524 			mtx_unlock(&vnode_free_list_mtx);
2525 		} else {
2526 			atomic_add_long(&free_owe_inact, 1);
2527 		}
2528 		VI_UNLOCK(vp);
2529 		return;
2530 	}
2531 	/*
2532 	 * The vnode has been marked for destruction, so free it.
2533 	 */
2534 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2535 	atomic_subtract_long(&numvnodes, 1);
2536 	bo = &vp->v_bufobj;
2537 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2538 	    ("cleaned vnode still on the free list."));
2539 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2540 	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2541 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2542 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2543 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2544 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2545 	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
2546 	    ("clean blk trie not empty"));
2547 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2548 	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
2549 	    ("dirty blk trie not empty"));
2550 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2551 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2552 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2553 	VI_UNLOCK(vp);
2554 #ifdef MAC
2555 	mac_vnode_destroy(vp);
2556 #endif
2557 	if (vp->v_pollinfo != NULL)
2558 		destroy_vpollinfo(vp->v_pollinfo);
2559 #ifdef INVARIANTS
2560 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2561 	vp->v_op = NULL;
2562 #endif
2563 	rangelock_destroy(&vp->v_rl);
2564 	lockdestroy(vp->v_vnlock);
2565 	mtx_destroy(&vp->v_interlock);
2566 	rw_destroy(BO_LOCKPTR(bo));
2567 	uma_zfree(vnode_zone, vp);
2568 }
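
/*
 * Illustrative sketch (restating the hold/drop contract above): a hold
 * is what keeps a vnode pointer valid across a sleep without keeping
 * the vnode referenced or locked, as sync_vnode() does earlier in this
 * file:
 *
 *	vhold(vp);
 *	VI_UNLOCK(vp);
 *	... block, e.g. waiting for a vnode lock or for I/O ...
 *	vdrop(vp);
 *
 * The vnode may be doomed by the time vdrop() runs, but its memory is
 * not freed until the last hold is released.
 */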
2569 
2570 /*
2571  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2572  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2573  * OWEINACT tracks whether a vnode missed a call to inactive due to a
2574  * failed lock upgrade.
2575  */
2576 void
2577 vinactive(struct vnode *vp, struct thread *td)
2578 {
2579 	struct vm_object *obj;
2580 
2581 	ASSERT_VOP_ELOCKED(vp, "vinactive");
2582 	ASSERT_VI_LOCKED(vp, "vinactive");
2583 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2584 	    ("vinactive: recursed on VI_DOINGINACT"));
2585 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2586 	vp->v_iflag |= VI_DOINGINACT;
2587 	vp->v_iflag &= ~VI_OWEINACT;
2588 	VI_UNLOCK(vp);
2589 	/*
2590 	 * Before moving off the active list, we must be sure that any
2591 	 * modified pages are on the vnode's dirty list since these will
2592 	 * no longer be checked once the vnode is on the inactive list.
2593 	 * Because the vnode vm object keeps a hold reference on the vnode
2594 	 * if there is at least one resident non-cached page, the vnode
2595 	 * cannot leave the active list without the page cleanup done.
2596 	 */
2597 	obj = vp->v_object;
2598 	if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2599 		VM_OBJECT_WLOCK(obj);
2600 		vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2601 		VM_OBJECT_WUNLOCK(obj);
2602 	}
2603 	VOP_INACTIVE(vp, td);
2604 	VI_LOCK(vp);
2605 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2606 	    ("vinactive: lost VI_DOINGINACT"));
2607 	vp->v_iflag &= ~VI_DOINGINACT;
2608 }
2609 
2610 /*
2611  * Remove any vnodes in the vnode table belonging to mount point mp.
2612  *
2613  * If FORCECLOSE is not specified, there should not be any active ones,
2614  * return error if any are found (nb: this is a user error, not a
2615  * system error). If FORCECLOSE is specified, detach any active vnodes
2616  * that are found.
2617  *
2618  * If WRITECLOSE is set, only flush out regular file vnodes open for
2619  * writing.
2620  *
2621  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2622  *
2623  * `rootrefs' specifies the base reference count for the root vnode
2624  * of this filesystem. The root vnode is considered busy if its
2625  * v_usecount exceeds this value. On a successful return, vflush()
2626  * will call vrele() on the root vnode exactly rootrefs times.
2627  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2628  * be zero.
2629  */
2630 #ifdef DIAGNOSTIC
2631 static int busyprt = 0;		/* print out busy vnodes */
2632 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2633 #endif
2634 
2635 int
2636 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2637 {
2638 	struct vnode *vp, *mvp, *rootvp = NULL;
2639 	struct vattr vattr;
2640 	int busy = 0, error;
2641 
2642 	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2643 	    rootrefs, flags);
2644 	if (rootrefs > 0) {
2645 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2646 		    ("vflush: bad args"));
2647 		/*
2648 		 * Get the filesystem root vnode. We can vput() it
2649 		 * immediately, since with rootrefs > 0, it won't go away.
2650 		 */
2651 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2652 			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2653 			    __func__, error);
2654 			return (error);
2655 		}
2656 		vput(rootvp);
2657 	}
2658 loop:
2659 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2660 		vholdl(vp);
2661 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2662 		if (error) {
2663 			vdrop(vp);
2664 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2665 			goto loop;
2666 		}
2667 		/*
2668 		 * Skip over vnodes marked VV_SYSTEM.
2669 		 */
2670 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2671 			VOP_UNLOCK(vp, 0);
2672 			vdrop(vp);
2673 			continue;
2674 		}
2675 		/*
2676 		 * If WRITECLOSE is set, flush out unlinked but still open
2677 		 * files (even if open only for reading) and regular file
2678 		 * vnodes open for writing.
2679 		 */
2680 		if (flags & WRITECLOSE) {
2681 			if (vp->v_object != NULL) {
2682 				VM_OBJECT_WLOCK(vp->v_object);
2683 				vm_object_page_clean(vp->v_object, 0, 0, 0);
2684 				VM_OBJECT_WUNLOCK(vp->v_object);
2685 			}
2686 			error = VOP_FSYNC(vp, MNT_WAIT, td);
2687 			if (error != 0) {
2688 				VOP_UNLOCK(vp, 0);
2689 				vdrop(vp);
2690 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2691 				return (error);
2692 			}
2693 			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2694 			VI_LOCK(vp);
2695 
2696 			if ((vp->v_type == VNON ||
2697 			    (error == 0 && vattr.va_nlink > 0)) &&
2698 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2699 				VOP_UNLOCK(vp, 0);
2700 				vdropl(vp);
2701 				continue;
2702 			}
2703 		} else
2704 			VI_LOCK(vp);
2705 		/*
2706 		 * With v_usecount == 0, all we need to do is clear out the
2707 		 * vnode data structures and we are done.
2708 		 *
2709 		 * If FORCECLOSE is set, forcibly close the vnode.
2710 		 */
2711 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2712 			vgonel(vp);
2713 		} else {
2714 			busy++;
2715 #ifdef DIAGNOSTIC
2716 			if (busyprt)
2717 				vprint("vflush: busy vnode", vp);
2718 #endif
2719 		}
2720 		VOP_UNLOCK(vp, 0);
2721 		vdropl(vp);
2722 	}
2723 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2724 		/*
2725 		 * If just the root vnode is busy, and if its refcount
2726 		 * is equal to `rootrefs', then go ahead and kill it.
2727 		 */
2728 		VI_LOCK(rootvp);
2729 		KASSERT(busy > 0, ("vflush: not busy"));
2730 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2731 		    ("vflush: usecount %d < rootrefs %d",
2732 		     rootvp->v_usecount, rootrefs));
2733 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2734 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2735 			vgone(rootvp);
2736 			VOP_UNLOCK(rootvp, 0);
2737 			busy = 0;
2738 		} else
2739 			VI_UNLOCK(rootvp);
2740 	}
2741 	if (busy) {
2742 		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2743 		    busy);
2744 		return (EBUSY);
2745 	}
2746 	for (; rootrefs > 0; rootrefs--)
2747 		vrele(rootvp);
2748 	return (0);
2749 }
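
/*
 * Illustrative caller sketch (an assumption, not code from this file):
 * a filesystem unmount path typically flushes everything except its
 * internal (VV_SYSTEM) vnodes, adding FORCECLOSE only for a forced
 * unmount:
 *
 *	flags = SKIPSYSTEM;
 *	if (mntflags & MNT_FORCE)
 *		flags |= FORCECLOSE;
 *	error = vflush(mp, 0, flags, td);
 *	if (error != 0)
 *		return (error);
 *
 * rootrefs is 0 here because, as noted above, it must be zero whenever
 * SKIPSYSTEM or WRITECLOSE is passed.
 */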
2750 
2751 /*
2752  * Recycle an unused vnode to the front of the free list.
2753  */
2754 int
2755 vrecycle(struct vnode *vp)
2756 {
2757 	int recycled;
2758 
2759 	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2760 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2761 	recycled = 0;
2762 	VI_LOCK(vp);
2763 	if (vp->v_usecount == 0) {
2764 		recycled = 1;
2765 		vgonel(vp);
2766 	}
2767 	VI_UNLOCK(vp);
2768 	return (recycled);
2769 }
2770 
2771 /*
2772  * Eliminate all activity associated with a vnode
2773  * in preparation for reuse.
2774  */
2775 void
2776 vgone(struct vnode *vp)
2777 {
2778 	VI_LOCK(vp);
2779 	vgonel(vp);
2780 	VI_UNLOCK(vp);
2781 }
2782 
2783 static void
2784 notify_lowervp_vfs_dummy(struct mount *mp __unused,
2785     struct vnode *lowervp __unused)
2786 {
2787 }
2788 
2789 /*
2790  * Notify upper mounts about reclaimed or unlinked vnode.
2791  */
2792 void
2793 vfs_notify_upper(struct vnode *vp, int event)
2794 {
2795 	static struct vfsops vgonel_vfsops = {
2796 		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
2797 		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
2798 	};
2799 	struct mount *mp, *ump, *mmp;
2800 
2801 	mp = vp->v_mount;
2802 	if (mp == NULL)
2803 		return;
2804 
2805 	MNT_ILOCK(mp);
2806 	if (TAILQ_EMPTY(&mp->mnt_uppers))
2807 		goto unlock;
2808 	MNT_IUNLOCK(mp);
2809 	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2810 	mmp->mnt_op = &vgonel_vfsops;
2811 	mmp->mnt_kern_flag |= MNTK_MARKER;
2812 	MNT_ILOCK(mp);
2813 	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
2814 	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
2815 		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
2816 			ump = TAILQ_NEXT(ump, mnt_upper_link);
2817 			continue;
2818 		}
2819 		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
2820 		MNT_IUNLOCK(mp);
2821 		switch (event) {
2822 		case VFS_NOTIFY_UPPER_RECLAIM:
2823 			VFS_RECLAIM_LOWERVP(ump, vp);
2824 			break;
2825 		case VFS_NOTIFY_UPPER_UNLINK:
2826 			VFS_UNLINK_LOWERVP(ump, vp);
2827 			break;
2828 		default:
2829 			KASSERT(0, ("invalid event %d", event));
2830 			break;
2831 		}
2832 		MNT_ILOCK(mp);
2833 		ump = TAILQ_NEXT(mmp, mnt_upper_link);
2834 		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
2835 	}
2836 	free(mmp, M_TEMP);
2837 	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
2838 	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
2839 		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
2840 		wakeup(&mp->mnt_uppers);
2841 	}
2842 unlock:
2843 	MNT_IUNLOCK(mp);
2844 }
2845 
2846 /*
2847  * vgone, with the vp interlock held.
2848  */
2849 static void
2850 vgonel(struct vnode *vp)
2851 {
2852 	struct thread *td;
2853 	int oweinact;
2854 	int active;
2855 	struct mount *mp;
2856 
2857 	ASSERT_VOP_ELOCKED(vp, "vgonel");
2858 	ASSERT_VI_LOCKED(vp, "vgonel");
2859 	VNASSERT(vp->v_holdcnt, vp,
2860 	    ("vgonel: vp %p has no reference.", vp));
2861 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2862 	td = curthread;
2863 
2864 	/*
2865 	 * Don't vgonel if we're already doomed.
2866 	 */
2867 	if (vp->v_iflag & VI_DOOMED)
2868 		return;
2869 	vp->v_iflag |= VI_DOOMED;
2870 
2871 	/*
2872 	 * Check to see if the vnode is in use.  If so, we have to call
2873 	 * VOP_CLOSE() and VOP_INACTIVE().
2874 	 */
2875 	active = vp->v_usecount;
2876 	oweinact = (vp->v_iflag & VI_OWEINACT);
2877 	VI_UNLOCK(vp);
2878 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
2879 
2880 	/*
2881 	 * If purging an active vnode, it must be closed and
2882 	 * deactivated before being reclaimed.
2883 	 */
2884 	if (active)
2885 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2886 	if (oweinact || active) {
2887 		VI_LOCK(vp);
2888 		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2889 			vinactive(vp, td);
2890 		VI_UNLOCK(vp);
2891 	}
2892 	if (vp->v_type == VSOCK)
2893 		vfs_unp_reclaim(vp);
2894 
2895 	/*
2896 	 * Clean out any buffers associated with the vnode.
2897 	 * If the flush fails, just toss the buffers.
2898 	 */
2899 	mp = NULL;
2900 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2901 		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2902 	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
2903 		while (vinvalbuf(vp, 0, 0, 0) != 0)
2904 			;
2905 	}
2906 
2907 	BO_LOCK(&vp->v_bufobj);
2908 	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
2909 	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
2910 	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
2911 	    vp->v_bufobj.bo_clean.bv_cnt == 0,
2912 	    ("vp %p bufobj not invalidated", vp));
2913 	vp->v_bufobj.bo_flag |= BO_DEAD;
2914 	BO_UNLOCK(&vp->v_bufobj);
2915 
2916 	/*
2917 	 * Reclaim the vnode.
2918 	 */
2919 	if (VOP_RECLAIM(vp, td))
2920 		panic("vgone: cannot reclaim");
2921 	if (mp != NULL)
2922 		vn_finished_secondary_write(mp);
2923 	VNASSERT(vp->v_object == NULL, vp,
2924 	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2925 	/*
2926 	 * Clear the advisory locks and wake up waiting threads.
2927 	 */
2928 	(void)VOP_ADVLOCKPURGE(vp);
2929 	/*
2930 	 * Delete from old mount point vnode list.
2931 	 */
2932 	delmntque(vp);
2933 	cache_purge(vp);
2934 	/*
2935 	 * Done with purge, reset to the standard lock and invalidate
2936 	 * the vnode.
2937 	 */
2938 	VI_LOCK(vp);
2939 	vp->v_vnlock = &vp->v_lock;
2940 	vp->v_op = &dead_vnodeops;
2941 	vp->v_tag = "none";
2942 	vp->v_type = VBAD;
2943 }
2944 
2945 /*
2946  * Calculate the total number of references to a special device.
2947  */
2948 int
2949 vcount(struct vnode *vp)
2950 {
2951 	int count;
2952 
2953 	dev_lock();
2954 	count = vp->v_rdev->si_usecount;
2955 	dev_unlock();
2956 	return (count);
2957 }
2958 
2959 /*
2960  * Same as above, but taking a struct cdev * as the argument.
2961  */
2962 int
2963 count_dev(struct cdev *dev)
2964 {
2965 	int count;
2966 
2967 	dev_lock();
2968 	count = dev->si_usecount;
2969 	dev_unlock();
2970 	return(count);
2971 }
2972 
2973 /*
2974  * Print out a description of a vnode.
2975  */
2976 static char *typename[] =
2977 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2978  "VMARKER"};
2979 
2980 void
2981 vn_printf(struct vnode *vp, const char *fmt, ...)
2982 {
2983 	va_list ap;
2984 	char buf[256], buf2[16];
2985 	u_long flags;
2986 
2987 	va_start(ap, fmt);
2988 	vprintf(fmt, ap);
2989 	va_end(ap);
2990 	printf("%p: ", (void *)vp);
2991 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2992 	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2993 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2994 	buf[0] = '\0';
2995 	buf[1] = '\0';
2996 	if (vp->v_vflag & VV_ROOT)
2997 		strlcat(buf, "|VV_ROOT", sizeof(buf));
2998 	if (vp->v_vflag & VV_ISTTY)
2999 		strlcat(buf, "|VV_ISTTY", sizeof(buf));
3000 	if (vp->v_vflag & VV_NOSYNC)
3001 		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
3002 	if (vp->v_vflag & VV_ETERNALDEV)
3003 		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
3004 	if (vp->v_vflag & VV_CACHEDLABEL)
3005 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
3006 	if (vp->v_vflag & VV_TEXT)
3007 		strlcat(buf, "|VV_TEXT", sizeof(buf));
3008 	if (vp->v_vflag & VV_COPYONWRITE)
3009 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
3010 	if (vp->v_vflag & VV_SYSTEM)
3011 		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
3012 	if (vp->v_vflag & VV_PROCDEP)
3013 		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
3014 	if (vp->v_vflag & VV_NOKNOTE)
3015 		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
3016 	if (vp->v_vflag & VV_DELETED)
3017 		strlcat(buf, "|VV_DELETED", sizeof(buf));
3018 	if (vp->v_vflag & VV_MD)
3019 		strlcat(buf, "|VV_MD", sizeof(buf));
3020 	if (vp->v_vflag & VV_FORCEINSMQ)
3021 		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
3022 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
3023 	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
3024 	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
3025 	if (flags != 0) {
3026 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
3027 		strlcat(buf, buf2, sizeof(buf));
3028 	}
3029 	if (vp->v_iflag & VI_MOUNT)
3030 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
3031 	if (vp->v_iflag & VI_AGE)
3032 		strlcat(buf, "|VI_AGE", sizeof(buf));
3033 	if (vp->v_iflag & VI_DOOMED)
3034 		strlcat(buf, "|VI_DOOMED", sizeof(buf));
3035 	if (vp->v_iflag & VI_FREE)
3036 		strlcat(buf, "|VI_FREE", sizeof(buf));
3037 	if (vp->v_iflag & VI_ACTIVE)
3038 		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
3039 	if (vp->v_iflag & VI_DOINGINACT)
3040 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
3041 	if (vp->v_iflag & VI_OWEINACT)
3042 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
3043 	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
3044 	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
3045 	if (flags != 0) {
3046 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
3047 		strlcat(buf, buf2, sizeof(buf));
3048 	}
3049 	printf("    flags (%s)\n", buf + 1);
3050 	if (mtx_owned(VI_MTX(vp)))
3051 		printf(" VI_LOCKed");
3052 	if (vp->v_object != NULL)
3053 		printf("    v_object %p ref %d pages %d "
3054 		    "cleanbuf %d dirtybuf %d\n",
3055 		    vp->v_object, vp->v_object->ref_count,
3056 		    vp->v_object->resident_page_count,
3057 		    vp->v_bufobj.bo_dirty.bv_cnt,
3058 		    vp->v_bufobj.bo_clean.bv_cnt);
3059 	printf("    ");
3060 	lockmgr_printinfo(vp->v_vnlock);
3061 	if (vp->v_data != NULL)
3062 		VOP_PRINT(vp);
3063 }
3064 
3065 #ifdef DDB
3066 /*
3067  * List all of the locked vnodes in the system.
3068  * Called when debugging the kernel.
3069  */
3070 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
3071 {
3072 	struct mount *mp;
3073 	struct vnode *vp;
3074 
3075 	/*
3076 	 * Note: because this is DDB, we can't obey the locking semantics
3077 	 * for these structures, which means we could catch an inconsistent
3078 	 * state and dereference a nasty pointer.  Not much to be done
3079 	 * about that.
3080 	 */
3081 	db_printf("Locked vnodes\n");
3082 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3083 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3084 			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
3085 				vprint("", vp);
3086 		}
3087 	}
3088 }
3089 
3090 /*
3091  * Show details about the given vnode.
3092  */
3093 DB_SHOW_COMMAND(vnode, db_show_vnode)
3094 {
3095 	struct vnode *vp;
3096 
3097 	if (!have_addr)
3098 		return;
3099 	vp = (struct vnode *)addr;
3100 	vn_printf(vp, "vnode ");
3101 }
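
/*
 * Example DDB session (illustrative; the address is made up):
 *
 *	db> show lockedvnods
 *	Locked vnodes
 *	db> show vnode 0xfffff80012345678
 *	vnode 0xfffff80012345678: tag ufs, type VREG
 *	    ...
 *
 * "show mount", defined below, accepts a struct mount address the same
 * way and dumps its flags, statistics and vnode lists.
 */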
3102 
3103 /*
3104  * Show details about the given mount point.
3105  */
3106 DB_SHOW_COMMAND(mount, db_show_mount)
3107 {
3108 	struct mount *mp;
3109 	struct vfsopt *opt;
3110 	struct statfs *sp;
3111 	struct vnode *vp;
3112 	char buf[512];
3113 	uint64_t mflags;
3114 	u_int flags;
3115 
3116 	if (!have_addr) {
3117 		/* No address given, print short info about all mount points. */
3118 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3119 			db_printf("%p %s on %s (%s)\n", mp,
3120 			    mp->mnt_stat.f_mntfromname,
3121 			    mp->mnt_stat.f_mntonname,
3122 			    mp->mnt_stat.f_fstypename);
3123 			if (db_pager_quit)
3124 				break;
3125 		}
3126 		db_printf("\nMore info: show mount <addr>\n");
3127 		return;
3128 	}
3129 
3130 	mp = (struct mount *)addr;
3131 	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3132 	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3133 
3134 	buf[0] = '\0';
3135 	mflags = mp->mnt_flag;
3136 #define	MNT_FLAG(flag)	do {						\
3137 	if (mflags & (flag)) {						\
3138 		if (buf[0] != '\0')					\
3139 			strlcat(buf, ", ", sizeof(buf));		\
3140 		strlcat(buf, (#flag) + 4, sizeof(buf));			\
3141 		mflags &= ~(flag);					\
3142 	}								\
3143 } while (0)
3144 	MNT_FLAG(MNT_RDONLY);
3145 	MNT_FLAG(MNT_SYNCHRONOUS);
3146 	MNT_FLAG(MNT_NOEXEC);
3147 	MNT_FLAG(MNT_NOSUID);
3148 	MNT_FLAG(MNT_NFS4ACLS);
3149 	MNT_FLAG(MNT_UNION);
3150 	MNT_FLAG(MNT_ASYNC);
3151 	MNT_FLAG(MNT_SUIDDIR);
3152 	MNT_FLAG(MNT_SOFTDEP);
3153 	MNT_FLAG(MNT_NOSYMFOLLOW);
3154 	MNT_FLAG(MNT_GJOURNAL);
3155 	MNT_FLAG(MNT_MULTILABEL);
3156 	MNT_FLAG(MNT_ACLS);
3157 	MNT_FLAG(MNT_NOATIME);
3158 	MNT_FLAG(MNT_NOCLUSTERR);
3159 	MNT_FLAG(MNT_NOCLUSTERW);
3160 	MNT_FLAG(MNT_SUJ);
3161 	MNT_FLAG(MNT_EXRDONLY);
3162 	MNT_FLAG(MNT_EXPORTED);
3163 	MNT_FLAG(MNT_DEFEXPORTED);
3164 	MNT_FLAG(MNT_EXPORTANON);
3165 	MNT_FLAG(MNT_EXKERB);
3166 	MNT_FLAG(MNT_EXPUBLIC);
3167 	MNT_FLAG(MNT_LOCAL);
3168 	MNT_FLAG(MNT_QUOTA);
3169 	MNT_FLAG(MNT_ROOTFS);
3170 	MNT_FLAG(MNT_USER);
3171 	MNT_FLAG(MNT_IGNORE);
3172 	MNT_FLAG(MNT_UPDATE);
3173 	MNT_FLAG(MNT_DELEXPORT);
3174 	MNT_FLAG(MNT_RELOAD);
3175 	MNT_FLAG(MNT_FORCE);
3176 	MNT_FLAG(MNT_SNAPSHOT);
3177 	MNT_FLAG(MNT_BYFSID);
3178 #undef MNT_FLAG
3179 	if (mflags != 0) {
3180 		if (buf[0] != '\0')
3181 			strlcat(buf, ", ", sizeof(buf));
3182 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3183 		    "0x%016jx", mflags);
3184 	}
3185 	db_printf("    mnt_flag = %s\n", buf);
3186 
3187 	buf[0] = '\0';
3188 	flags = mp->mnt_kern_flag;
3189 #define	MNT_KERN_FLAG(flag)	do {					\
3190 	if (flags & (flag)) {						\
3191 		if (buf[0] != '\0')					\
3192 			strlcat(buf, ", ", sizeof(buf));		\
3193 		strlcat(buf, (#flag) + 5, sizeof(buf));			\
3194 		flags &= ~(flag);					\
3195 	}								\
3196 } while (0)
3197 	MNT_KERN_FLAG(MNTK_UNMOUNTF);
3198 	MNT_KERN_FLAG(MNTK_ASYNC);
3199 	MNT_KERN_FLAG(MNTK_SOFTDEP);
3200 	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3201 	MNT_KERN_FLAG(MNTK_DRAINING);
3202 	MNT_KERN_FLAG(MNTK_REFEXPIRE);
3203 	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3204 	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3205 	MNT_KERN_FLAG(MNTK_NO_IOPF);
3206 	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3207 	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3208 	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3209 	MNT_KERN_FLAG(MNTK_MARKER);
3210 	MNT_KERN_FLAG(MNTK_USES_BCACHE);
3211 	MNT_KERN_FLAG(MNTK_NOASYNC);
3212 	MNT_KERN_FLAG(MNTK_UNMOUNT);
3213 	MNT_KERN_FLAG(MNTK_MWAIT);
3214 	MNT_KERN_FLAG(MNTK_SUSPEND);
3215 	MNT_KERN_FLAG(MNTK_SUSPEND2);
3216 	MNT_KERN_FLAG(MNTK_SUSPENDED);
3217 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3218 	MNT_KERN_FLAG(MNTK_NOKNOTE);
3219 #undef MNT_KERN_FLAG
3220 	if (flags != 0) {
3221 		if (buf[0] != '\0')
3222 			strlcat(buf, ", ", sizeof(buf));
3223 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3224 		    "0x%08x", flags);
3225 	}
3226 	db_printf("    mnt_kern_flag = %s\n", buf);
3227 
3228 	db_printf("    mnt_opt = ");
3229 	opt = TAILQ_FIRST(mp->mnt_opt);
3230 	if (opt != NULL) {
3231 		db_printf("%s", opt->name);
3232 		opt = TAILQ_NEXT(opt, link);
3233 		while (opt != NULL) {
3234 			db_printf(", %s", opt->name);
3235 			opt = TAILQ_NEXT(opt, link);
3236 		}
3237 	}
3238 	db_printf("\n");
3239 
3240 	sp = &mp->mnt_stat;
3241 	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3242 	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3243 	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3244 	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3245 	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3246 	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3247 	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3248 	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3249 	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3250 	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3251 	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3252 	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3253 
3254 	db_printf("    mnt_cred = { uid=%u ruid=%u",
3255 	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3256 	if (jailed(mp->mnt_cred))
3257 		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3258 	db_printf(" }\n");
3259 	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3260 	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3261 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3262 	db_printf("    mnt_activevnodelistsize = %d\n",
3263 	    mp->mnt_activevnodelistsize);
3264 	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3265 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3266 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3267 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3268 	db_printf("    mnt_lockref = %d\n", mp->mnt_lockref);
3269 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3270 	db_printf("    mnt_secondary_accwrites = %d\n",
3271 	    mp->mnt_secondary_accwrites);
3272 	db_printf("    mnt_gjprovider = %s\n",
3273 	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3274 
3275 	db_printf("\n\nList of active vnodes\n");
3276 	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3277 		if (vp->v_type != VMARKER) {
3278 			vn_printf(vp, "vnode ");
3279 			if (db_pager_quit)
3280 				break;
3281 		}
3282 	}
3283 	db_printf("\n\nList of inactive vnodes\n");
3284 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3285 		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3286 			vn_printf(vp, "vnode ");
3287 			if (db_pager_quit)
3288 				break;
3289 		}
3290 	}
3291 }
3292 #endif	/* DDB */
3293 
3294 /*
3295  * Fill in a struct xvfsconf based on a struct vfsconf.
3296  */
3297 static int
3298 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3299 {
3300 	struct xvfsconf xvfsp;
3301 
3302 	bzero(&xvfsp, sizeof(xvfsp));
3303 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3304 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3305 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3306 	xvfsp.vfc_flags = vfsp->vfc_flags;
3307 	/*
3308 	 * These are unused in userland; we keep them
3309 	 * so as not to break binary compatibility.
3310 	 */
3311 	xvfsp.vfc_vfsops = NULL;
3312 	xvfsp.vfc_next = NULL;
3313 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3314 }
3315 
3316 #ifdef COMPAT_FREEBSD32
3317 struct xvfsconf32 {
3318 	uint32_t	vfc_vfsops;
3319 	char		vfc_name[MFSNAMELEN];
3320 	int32_t		vfc_typenum;
3321 	int32_t		vfc_refcount;
3322 	int32_t		vfc_flags;
3323 	uint32_t	vfc_next;
3324 };
3325 
3326 static int
3327 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
3328 {
3329 	struct xvfsconf32 xvfsp;
3330 
3331 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3332 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3333 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3334 	xvfsp.vfc_flags = vfsp->vfc_flags;
3335 	xvfsp.vfc_vfsops = 0;
3336 	xvfsp.vfc_next = 0;
3337 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3338 }
3339 #endif
3340 
3341 /*
3342  * Top level filesystem related information gathering.
3343  */
3344 static int
3345 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3346 {
3347 	struct vfsconf *vfsp;
3348 	int error;
3349 
3350 	error = 0;
3351 	vfsconf_slock();
3352 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3353 #ifdef COMPAT_FREEBSD32
3354 		if (req->flags & SCTL_MASK32)
3355 			error = vfsconf2x32(req, vfsp);
3356 		else
3357 #endif
3358 			error = vfsconf2x(req, vfsp);
3359 		if (error)
3360 			break;
3361 	}
3362 	vfsconf_sunlock();
3363 	return (error);
3364 }
3365 
3366 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
3367     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
3368     "S,xvfsconf", "List of all configured filesystems");
3369 
3370 #ifndef BURN_BRIDGES
3371 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3372 
3373 static int
3374 vfs_sysctl(SYSCTL_HANDLER_ARGS)
3375 {
3376 	int *name = (int *)arg1 - 1;	/* XXX */
3377 	u_int namelen = arg2 + 1;	/* XXX */
3378 	struct vfsconf *vfsp;
3379 
3380 	log(LOG_WARNING, "userland calling deprecated sysctl, "
3381 	    "please rebuild world\n");
3382 
3383 #if 1 || defined(COMPAT_PRELITE2)
3384 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3385 	if (namelen == 1)
3386 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3387 #endif
3388 
3389 	switch (name[1]) {
3390 	case VFS_MAXTYPENUM:
3391 		if (namelen != 2)
3392 			return (ENOTDIR);
3393 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3394 	case VFS_CONF:
3395 		if (namelen != 3)
3396 			return (ENOTDIR);	/* overloaded */
3397 		vfsconf_slock();
3398 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3399 			if (vfsp->vfc_typenum == name[2])
3400 				break;
3401 		}
3402 		vfsconf_sunlock();
3403 		if (vfsp == NULL)
3404 			return (EOPNOTSUPP);
3405 #ifdef COMPAT_FREEBSD32
3406 		if (req->flags & SCTL_MASK32)
3407 			return (vfsconf2x32(req, vfsp));
3408 		else
3409 #endif
3410 			return (vfsconf2x(req, vfsp));
3411 	}
3412 	return (EOPNOTSUPP);
3413 }
3414 
3415 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
3416     CTLFLAG_MPSAFE, vfs_sysctl,
3417     "Generic filesystem");
3418 
3419 #if 1 || defined(COMPAT_PRELITE2)
3420 
3421 static int
3422 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3423 {
3424 	int error;
3425 	struct vfsconf *vfsp;
3426 	struct ovfsconf ovfs;
3427 
3428 	vfsconf_slock();
3429 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3430 		bzero(&ovfs, sizeof(ovfs));
3431 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3432 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3433 		ovfs.vfc_index = vfsp->vfc_typenum;
3434 		ovfs.vfc_refcount = vfsp->vfc_refcount;
3435 		ovfs.vfc_flags = vfsp->vfc_flags;
3436 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3437 		if (error != 0) {
3438 			vfsconf_sunlock();
3439 			return (error);
3440 		}
3441 	}
3442 	vfsconf_sunlock();
3443 	return (0);
3444 }
3445 
3446 #endif /* 1 || COMPAT_PRELITE2 */
3447 #endif /* !BURN_BRIDGES */
3448 
3449 #define KINFO_VNODESLOP		10
3450 #ifdef notyet
3451 /*
3452  * Dump vnode list (via sysctl).
3453  */
3454 /* ARGSUSED */
3455 static int
3456 sysctl_vnode(SYSCTL_HANDLER_ARGS)
3457 {
3458 	struct xvnode *xvn;
3459 	struct mount *mp;
3460 	struct vnode *vp;
3461 	int error, len, n;
3462 
3463 	/*
3464 	 * Stale numvnodes access is not fatal here.
3465 	 */
3466 	req->lock = 0;
3467 	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3468 	if (!req->oldptr)
3469 		/* Make an estimate */
3470 		return (SYSCTL_OUT(req, 0, len));
3471 
3472 	error = sysctl_wire_old_buffer(req, 0);
3473 	if (error != 0)
3474 		return (error);
3475 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3476 	n = 0;
3477 	mtx_lock(&mountlist_mtx);
3478 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3479 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3480 			continue;
3481 		MNT_ILOCK(mp);
3482 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3483 			if (n == len)
3484 				break;
3485 			vref(vp);
3486 			xvn[n].xv_size = sizeof *xvn;
3487 			xvn[n].xv_vnode = vp;
3488 			xvn[n].xv_id = 0;	/* XXX compat */
3489 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3490 			XV_COPY(usecount);
3491 			XV_COPY(writecount);
3492 			XV_COPY(holdcnt);
3493 			XV_COPY(mount);
3494 			XV_COPY(numoutput);
3495 			XV_COPY(type);
3496 #undef XV_COPY
3497 			xvn[n].xv_flag = vp->v_vflag;
3498 
3499 			switch (vp->v_type) {
3500 			case VREG:
3501 			case VDIR:
3502 			case VLNK:
3503 				break;
3504 			case VBLK:
3505 			case VCHR:
3506 				if (vp->v_rdev == NULL) {
3507 					vrele(vp);
3508 					continue;
3509 				}
3510 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3511 				break;
3512 			case VSOCK:
3513 				xvn[n].xv_socket = vp->v_socket;
3514 				break;
3515 			case VFIFO:
3516 				xvn[n].xv_fifo = vp->v_fifoinfo;
3517 				break;
3518 			case VNON:
3519 			case VBAD:
3520 			default:
3521 				/* shouldn't happen? */
3522 				vrele(vp);
3523 				continue;
3524 			}
3525 			vrele(vp);
3526 			++n;
3527 		}
3528 		MNT_IUNLOCK(mp);
3529 		mtx_lock(&mountlist_mtx);
3530 		vfs_unbusy(mp);
3531 		if (n == len)
3532 			break;
3533 	}
3534 	mtx_unlock(&mountlist_mtx);
3535 
3536 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3537 	free(xvn, M_TEMP);
3538 	return (error);
3539 }
3540 
3541 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
3542     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
3543     "");
3544 #endif
3545 
3546 static void
3547 unmount_or_warn(struct mount *mp)
3548 {
3549 	int error;
3550 
3551 	error = dounmount(mp, MNT_FORCE, curthread);
3552 	if (error != 0) {
3553 		printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
3554 		if (error == EBUSY)
3555 			printf("BUSY)\n");
3556 		else
3557 			printf("%d)\n", error);
3558 	}
3559 }
3560 
3561 /*
3562  * Unmount all filesystems. The list is traversed in reverse order
3563  * of mounting to avoid dependencies.
3564  */
3565 void
3566 vfs_unmountall(void)
3567 {
3568 	struct mount *mp, *tmp;
3569 
3570 	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3571 
3572 	/*
3573 	 * Since this only runs when rebooting, it is not interlocked.
3574 	 */
3575 	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
3576 		vfs_ref(mp);
3577 
3578 		/*
3579 		 * Forcibly unmounting "/dev" before "/" would prevent clean
3580 		 * unmount of the latter.
3581 		 */
3582 		if (mp == rootdevmp)
3583 			continue;
3584 
3585 		unmount_or_warn(mp);
3586 	}
3587 
3588 	if (rootdevmp != NULL)
3589 		unmount_or_warn(rootdevmp);
3590 }
3591 
3592 /*
3593  * Perform msync on all vnodes under a mount point.
3594  * The mount point must be locked.
3595  */
3596 void
3597 vfs_msync(struct mount *mp, int flags)
3598 {
3599 	struct vnode *vp, *mvp;
3600 	struct vm_object *obj;
3601 
3602 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3603 	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3604 		obj = vp->v_object;
3605 		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3606 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3607 			if (!vget(vp,
3608 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3609 			    curthread)) {
3610 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3611 					vput(vp);
3612 					continue;
3613 				}
3614 
3615 				obj = vp->v_object;
3616 				if (obj != NULL) {
3617 					VM_OBJECT_WLOCK(obj);
3618 					vm_object_page_clean(obj, 0, 0,
3619 					    flags == MNT_WAIT ?
3620 					    OBJPC_SYNC : OBJPC_NOSYNC);
3621 					VM_OBJECT_WUNLOCK(obj);
3622 				}
3623 				vput(vp);
3624 			}
3625 		} else
3626 			VI_UNLOCK(vp);
3627 	}
3628 }
3629 
3630 static void
3631 destroy_vpollinfo_free(struct vpollinfo *vi)
3632 {
3633 
3634 	knlist_destroy(&vi->vpi_selinfo.si_note);
3635 	mtx_destroy(&vi->vpi_lock);
3636 	uma_zfree(vnodepoll_zone, vi);
3637 }
3638 
3639 static void
3640 destroy_vpollinfo(struct vpollinfo *vi)
3641 {
3642 
3643 	knlist_clear(&vi->vpi_selinfo.si_note, 1);
3644 	seldrain(&vi->vpi_selinfo);
3645 	destroy_vpollinfo_free(vi);
3646 }
3647 
3648 /*
3649  * Initialize the per-vnode helper structure to hold poll-related state.
3650  */
3651 void
3652 v_addpollinfo(struct vnode *vp)
3653 {
3654 	struct vpollinfo *vi;
3655 
3656 	if (vp->v_pollinfo != NULL)
3657 		return;
3658 	vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
3659 	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3660 	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3661 	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3662 	VI_LOCK(vp);
3663 	if (vp->v_pollinfo != NULL) {
3664 		VI_UNLOCK(vp);
3665 		destroy_vpollinfo_free(vi);
3666 		return;
3667 	}
3668 	vp->v_pollinfo = vi;
3669 	VI_UNLOCK(vp);
3670 }
3671 
3672 /*
3673  * Record a process's interest in events which might happen to
3674  * a vnode.  Because poll uses the historic select-style interface
3675  * internally, this routine serves as both the ``check for any
3676  * pending events'' and the ``record my interest in future events''
3677  * functions.  (These are done together, while the lock is held,
3678  * to avoid race conditions.)
3679  */
3680 int
3681 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3682 {
3683 
3684 	v_addpollinfo(vp);
3685 	mtx_lock(&vp->v_pollinfo->vpi_lock);
3686 	if (vp->v_pollinfo->vpi_revents & events) {
3687 		/*
3688 		 * This leaves events we are not interested
3689 		 * in available for the other process which
3690 		 * presumably had requested them
3691 		 * (otherwise they would never have been
3692 		 * recorded).
3693 		 */
3694 		events &= vp->v_pollinfo->vpi_revents;
3695 		vp->v_pollinfo->vpi_revents &= ~events;
3696 
3697 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3698 		return (events);
3699 	}
3700 	vp->v_pollinfo->vpi_events |= events;
3701 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3702 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3703 	return (0);
3704 }
3705 
3706 /*
3707  * Routine to create and manage a filesystem syncer vnode.
3708  */
3709 #define sync_close ((int (*)(struct vop_close_args *))nullop)
3710 static int	sync_fsync(struct vop_fsync_args *);
3711 static int	sync_inactive(struct vop_inactive_args *);
3712 static int	sync_reclaim(struct vop_reclaim_args *);
3713 
3714 static struct vop_vector sync_vnodeops = {
3715 	.vop_bypass =	VOP_EOPNOTSUPP,
3716 	.vop_close =	sync_close,		/* close */
3717 	.vop_fsync =	sync_fsync,		/* fsync */
3718 	.vop_inactive =	sync_inactive,	/* inactive */
3719 	.vop_reclaim =	sync_reclaim,	/* reclaim */
3720 	.vop_lock1 =	vop_stdlock,	/* lock */
3721 	.vop_unlock =	vop_stdunlock,	/* unlock */
3722 	.vop_islocked =	vop_stdislocked,	/* islocked */
3723 };
3724 
3725 /*
3726  * Create a new filesystem syncer vnode for the specified mount point.
3727  */
3728 void
3729 vfs_allocate_syncvnode(struct mount *mp)
3730 {
3731 	struct vnode *vp;
3732 	struct bufobj *bo;
3733 	static long start, incr, next;
3734 	int error;
3735 
3736 	/* Allocate a new vnode */
3737 	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3738 	if (error != 0)
3739 		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3740 	vp->v_type = VNON;
3741 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3742 	vp->v_vflag |= VV_FORCEINSMQ;
3743 	error = insmntque(vp, mp);
3744 	if (error != 0)
3745 		panic("vfs_allocate_syncvnode: insmntque() failed");
3746 	vp->v_vflag &= ~VV_FORCEINSMQ;
3747 	VOP_UNLOCK(vp, 0);
3748 	/*
3749 	 * Place the vnode onto the syncer worklist. We attempt to
3750 	 * scatter them about on the list so that they will go off
3751 	 * at evenly distributed times even if all the filesystems
3752 	 * are mounted at once.
3753 	 */
3754 	next += incr;
3755 	if (next == 0 || next > syncer_maxdelay) {
3756 		start /= 2;
3757 		incr /= 2;
3758 		if (start == 0) {
3759 			start = syncer_maxdelay / 2;
3760 			incr = syncer_maxdelay;
3761 		}
3762 		next = start;
3763 	}
3764 	bo = &vp->v_bufobj;
3765 	BO_LOCK(bo);
3766 	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3767 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3768 	mtx_lock(&sync_mtx);
3769 	sync_vnode_count++;
3770 	if (mp->mnt_syncer == NULL) {
3771 		mp->mnt_syncer = vp;
3772 		vp = NULL;
3773 	}
3774 	mtx_unlock(&sync_mtx);
3775 	BO_UNLOCK(bo);
3776 	if (vp != NULL) {
3777 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3778 		vgone(vp);
3779 		vput(vp);
3780 	}
3781 }
3782 
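/*
 * Detach the syncer vnode from its mount point and release the
 * reference held through mnt_syncer.
 */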
3783 void
3784 vfs_deallocate_syncvnode(struct mount *mp)
3785 {
3786 	struct vnode *vp;
3787 
3788 	mtx_lock(&sync_mtx);
3789 	vp = mp->mnt_syncer;
3790 	if (vp != NULL)
3791 		mp->mnt_syncer = NULL;
3792 	mtx_unlock(&sync_mtx);
3793 	if (vp != NULL)
3794 		vrele(vp);
3795 }
3796 
3797 /*
3798  * Do a lazy sync of the filesystem.
3799  */
3800 static int
3801 sync_fsync(struct vop_fsync_args *ap)
3802 {
3803 	struct vnode *syncvp = ap->a_vp;
3804 	struct mount *mp = syncvp->v_mount;
3805 	int error, save;
3806 	struct bufobj *bo;
3807 
3808 	/*
3809 	 * We only need to do something if this is a lazy evaluation.
3810 	 */
3811 	if (ap->a_waitfor != MNT_LAZY)
3812 		return (0);
3813 
3814 	/*
3815 	 * Move ourselves to the back of the sync list.
3816 	 */
3817 	bo = &syncvp->v_bufobj;
3818 	BO_LOCK(bo);
3819 	vn_syncer_add_to_worklist(bo, syncdelay);
3820 	BO_UNLOCK(bo);
3821 
3822 	/*
3823 	 * Walk the list of vnodes pushing all that are dirty and
3824 	 * not already on the sync list.
3825 	 */
3826 	if (vfs_busy(mp, MBF_NOWAIT) != 0)
3827 		return (0);
3828 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3829 		vfs_unbusy(mp);
3830 		return (0);
3831 	}
3832 	save = curthread_pflags_set(TDP_SYNCIO);
3833 	vfs_msync(mp, MNT_NOWAIT);
3834 	error = VFS_SYNC(mp, MNT_LAZY);
3835 	curthread_pflags_restore(save);
3836 	vn_finished_write(mp);
3837 	vfs_unbusy(mp);
3838 	return (error);
3839 }
3840 
3841 /*
3842  * The syncer vnode is no longer referenced.
3843  */
3844 static int
3845 sync_inactive(struct vop_inactive_args *ap)
3846 {
3847 
3848 	vgone(ap->a_vp);
3849 	return (0);
3850 }
3851 
3852 /*
3853  * The syncer vnode is no longer needed and is being decommissioned.
3854  *
3855  * Modifications to the worklist must be protected by sync_mtx.
3856  */
3857 static int
3858 sync_reclaim(struct vop_reclaim_args *ap)
3859 {
3860 	struct vnode *vp = ap->a_vp;
3861 	struct bufobj *bo;
3862 
3863 	bo = &vp->v_bufobj;
3864 	BO_LOCK(bo);
3865 	mtx_lock(&sync_mtx);
3866 	if (vp->v_mount->mnt_syncer == vp)
3867 		vp->v_mount->mnt_syncer = NULL;
3868 	if (bo->bo_flag & BO_ONWORKLST) {
3869 		LIST_REMOVE(bo, bo_synclist);
3870 		syncer_worklist_len--;
3871 		sync_vnode_count--;
3872 		bo->bo_flag &= ~BO_ONWORKLST;
3873 	}
3874 	mtx_unlock(&sync_mtx);
3875 	BO_UNLOCK(bo);
3876 
3877 	return (0);
3878 }
3879 
3880 /*
3881  * Check if vnode represents a disk device
3882  */
3883 int
3884 vn_isdisk(struct vnode *vp, int *errp)
3885 {
3886 	int error;
3887 
3888 	if (vp->v_type != VCHR) {
3889 		error = ENOTBLK;
3890 		goto out;
3891 	}
3892 	error = 0;
3893 	dev_lock();
3894 	if (vp->v_rdev == NULL)
3895 		error = ENXIO;
3896 	else if (vp->v_rdev->si_devsw == NULL)
3897 		error = ENXIO;
3898 	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3899 		error = ENOTBLK;
3900 	dev_unlock();
3901 out:
3902 	if (errp != NULL)
3903 		*errp = error;
3904 	return (error == 0);
3905 }
3906 
3907 /*
3908  * Common filesystem object access control check routine.  Accepts a
3909  * vnode's type, "mode", uid and gid, requested access mode, credentials,
3910  * and optional call-by-reference privused argument allowing vaccess()
3911  * to indicate to the caller whether privilege was used to satisfy the
3912  * request (obsoleted).  Returns 0 on success, or an errno on failure.
3913  */
3914 int
3915 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3916     accmode_t accmode, struct ucred *cred, int *privused)
3917 {
3918 	accmode_t dac_granted;
3919 	accmode_t priv_granted;
3920 
3921 	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3922 	    ("invalid bit in accmode"));
3923 	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3924 	    ("VAPPEND without VWRITE"));
3925 
3926 	/*
3927 	 * Look for a normal, non-privileged way to access the file/directory
3928 	 * as requested.  If it exists, go with that.
3929 	 */
3930 
3931 	if (privused != NULL)
3932 		*privused = 0;
3933 
3934 	dac_granted = 0;
3935 
3936 	/* Check the owner. */
3937 	if (cred->cr_uid == file_uid) {
3938 		dac_granted |= VADMIN;
3939 		if (file_mode & S_IXUSR)
3940 			dac_granted |= VEXEC;
3941 		if (file_mode & S_IRUSR)
3942 			dac_granted |= VREAD;
3943 		if (file_mode & S_IWUSR)
3944 			dac_granted |= (VWRITE | VAPPEND);
3945 
3946 		if ((accmode & dac_granted) == accmode)
3947 			return (0);
3948 
3949 		goto privcheck;
3950 	}
3951 
3952 	/* Otherwise, check the groups (first match) */
3953 	if (groupmember(file_gid, cred)) {
3954 		if (file_mode & S_IXGRP)
3955 			dac_granted |= VEXEC;
3956 		if (file_mode & S_IRGRP)
3957 			dac_granted |= VREAD;
3958 		if (file_mode & S_IWGRP)
3959 			dac_granted |= (VWRITE | VAPPEND);
3960 
3961 		if ((accmode & dac_granted) == accmode)
3962 			return (0);
3963 
3964 		goto privcheck;
3965 	}
3966 
3967 	/* Otherwise, check everyone else. */
3968 	if (file_mode & S_IXOTH)
3969 		dac_granted |= VEXEC;
3970 	if (file_mode & S_IROTH)
3971 		dac_granted |= VREAD;
3972 	if (file_mode & S_IWOTH)
3973 		dac_granted |= (VWRITE | VAPPEND);
3974 	if ((accmode & dac_granted) == accmode)
3975 		return (0);
3976 
3977 privcheck:
3978 	/*
3979 	 * Build a privilege mask to determine if the set of privileges
3980 	 * satisfies the requirements when combined with the granted mask
3981 	 * from above.  For each privilege, if the privilege is required,
3982 	 * bitwise or the request type onto the priv_granted mask.
3983 	 */
3984 	priv_granted = 0;
3985 
3986 	if (type == VDIR) {
3987 		/*
3988 		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3989 		 * requests, instead of PRIV_VFS_EXEC.
3990 		 */
3991 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3992 		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3993 			priv_granted |= VEXEC;
3994 	} else {
3995 		/*
3996 		 * Ensure that at least one execute bit is on. Otherwise,
3997 		 * a privileged user will always succeed, and we don't want
3998 		 * this to happen unless the file really is executable.
3999 		 */
4000 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
4001 		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
4002 		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
4003 			priv_granted |= VEXEC;
4004 	}
4005 
4006 	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
4007 	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
4008 		priv_granted |= VREAD;
4009 
4010 	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
4011 	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
4012 		priv_granted |= (VWRITE | VAPPEND);
4013 
4014 	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
4015 	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
4016 		priv_granted |= VADMIN;
4017 
4018 	if ((accmode & (priv_granted | dac_granted)) == accmode) {
4019 		/* XXX audit: privilege used */
4020 		if (privused != NULL)
4021 			*privused = 1;
4022 		return (0);
4023 	}
4024 
4025 	return ((accmode & VADMIN) ? EPERM : EACCES);
4026 }
4027 
4028 /*
4029  * Credential check based on process requesting service, and per-attribute
4030  * permissions.
4031  */
4032 int
4033 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
4034     struct thread *td, accmode_t accmode)
4035 {
4036 
4037 	/*
4038 	 * Kernel-invoked always succeeds.
4039 	 */
4040 	if (cred == NOCRED)
4041 		return (0);
4042 
4043 	/*
4044 	 * Do not allow privileged processes in jail to directly manipulate
4045 	 * system attributes.
4046 	 */
4047 	switch (attrnamespace) {
4048 	case EXTATTR_NAMESPACE_SYSTEM:
4049 		/* Potentially should be: return (EPERM); */
4050 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
4051 	case EXTATTR_NAMESPACE_USER:
4052 		return (VOP_ACCESS(vp, accmode, cred, td));
4053 	default:
4054 		return (EPERM);
4055 	}
4056 }
4057 
4058 #ifdef DEBUG_VFS_LOCKS
4059 /*
4060  * This only exists to suppress warnings from unlocked specfs accesses.  It is
4061  * no longer ok to have an unlocked VFS.
4062  */
4063 #define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
4064 	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
4065 
4066 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
4067 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
4068     "Drop into debugger on lock violation");
4069 
4070 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
4071 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
4072     0, "Check for interlock across VOPs");
4073 
4074 int vfs_badlock_print = 1;	/* Print lock violations. */
4075 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
4076     0, "Print lock violations");
4077 
4078 #ifdef KDB
4079 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
4080 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
4081     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
4082 #endif
4083 
4084 static void
4085 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
4086 {
4087 
4088 #ifdef KDB
4089 	if (vfs_badlock_backtrace)
4090 		kdb_backtrace();
4091 #endif
4092 	if (vfs_badlock_print)
4093 		printf("%s: %p %s\n", str, (void *)vp, msg);
4094 	if (vfs_badlock_ddb)
4095 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4096 }
4097 
4098 void
4099 assert_vi_locked(struct vnode *vp, const char *str)
4100 {
4101 
4102 	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
4103 		vfs_badlock("interlock is not locked but should be", str, vp);
4104 }
4105 
4106 void
4107 assert_vi_unlocked(struct vnode *vp, const char *str)
4108 {
4109 
4110 	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
4111 		vfs_badlock("interlock is locked but should not be", str, vp);
4112 }
4113 
4114 void
4115 assert_vop_locked(struct vnode *vp, const char *str)
4116 {
4117 	int locked;
4118 
4119 	if (!IGNORE_LOCK(vp)) {
4120 		locked = VOP_ISLOCKED(vp);
4121 		if (locked == 0 || locked == LK_EXCLOTHER)
4122 			vfs_badlock("is not locked but should be", str, vp);
4123 	}
4124 }
4125 
4126 void
4127 assert_vop_unlocked(struct vnode *vp, const char *str)
4128 {
4129 
4130 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4131 		vfs_badlock("is locked but should not be", str, vp);
4132 }
4133 
4134 void
4135 assert_vop_elocked(struct vnode *vp, const char *str)
4136 {
4137 
4138 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4139 		vfs_badlock("is not exclusive locked but should be", str, vp);
4140 }
4141 
4142 #if 0
4143 void
4144 assert_vop_elocked_other(struct vnode *vp, const char *str)
4145 {
4146 
4147 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
4148 		vfs_badlock("is not exclusive locked by another thread",
4149 		    str, vp);
4150 }
4151 
4152 void
4153 assert_vop_slocked(struct vnode *vp, const char *str)
4154 {
4155 
4156 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
4157 		vfs_badlock("is not locked shared but should be", str, vp);
4158 }
4159 #endif /* 0 */
4160 #endif /* DEBUG_VFS_LOCKS */
4161 
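/*
 * Helper for filesystems that must abort VOP_RENAME(): release the
 * locks and references on the vnodes involved, as the VOP_RENAME()
 * contract requires on failure.
 */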
4162 void
4163 vop_rename_fail(struct vop_rename_args *ap)
4164 {
4165 
4166 	if (ap->a_tvp != NULL)
4167 		vput(ap->a_tvp);
4168 	if (ap->a_tdvp == ap->a_tvp)
4169 		vrele(ap->a_tdvp);
4170 	else
4171 		vput(ap->a_tdvp);
4172 	vrele(ap->a_fdvp);
4173 	vrele(ap->a_fvp);
4174 }
4175 
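/*
 * Run before VOP_RENAME() is dispatched: verify the expected lock
 * state of the vnodes under DEBUG_VFS_LOCKS and acquire hold
 * references that vop_rename_post() drops once the operation is done.
 */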
4176 void
4177 vop_rename_pre(void *ap)
4178 {
4179 	struct vop_rename_args *a = ap;
4180 
4181 #ifdef DEBUG_VFS_LOCKS
4182 	if (a->a_tvp)
4183 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4184 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4185 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4186 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4187 
4188 	/* Check the source (from). */
4189 	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4190 	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4191 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4192 	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4193 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4194 
4195 	/* Check the target. */
4196 	if (a->a_tvp)
4197 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4198 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4199 #endif
4200 	if (a->a_tdvp != a->a_fdvp)
4201 		vhold(a->a_fdvp);
4202 	if (a->a_tvp != a->a_fvp)
4203 		vhold(a->a_fvp);
4204 	vhold(a->a_tdvp);
4205 	if (a->a_tvp)
4206 		vhold(a->a_tvp);
4207 }
4208 
4209 void
4210 vop_strategy_pre(void *ap)
4211 {
4212 #ifdef DEBUG_VFS_LOCKS
4213 	struct vop_strategy_args *a;
4214 	struct buf *bp;
4215 
4216 	a = ap;
4217 	bp = a->a_bp;
4218 
4219 	/*
4220 	 * Cluster ops lock their component buffers but not the IO container.
4221 	 */
4222 	if ((bp->b_flags & B_CLUSTER) != 0)
4223 		return;
4224 
4225 	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
4226 		if (vfs_badlock_print)
4227 			printf(
4228 			    "VOP_STRATEGY: bp is not locked but should be\n");
4229 		if (vfs_badlock_ddb)
4230 			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4231 	}
4232 #endif
4233 }
4234 
4235 void
4236 vop_lock_pre(void *ap)
4237 {
4238 #ifdef DEBUG_VFS_LOCKS
4239 	struct vop_lock1_args *a = ap;
4240 
4241 	if ((a->a_flags & LK_INTERLOCK) == 0)
4242 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4243 	else
4244 		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
4245 #endif
4246 }
4247 
4248 void
4249 vop_lock_post(void *ap, int rc)
4250 {
4251 #ifdef DEBUG_VFS_LOCKS
4252 	struct vop_lock1_args *a = ap;
4253 
4254 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4255 	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
4256 		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
4257 #endif
4258 }
4259 
4260 void
4261 vop_unlock_pre(void *ap)
4262 {
4263 #ifdef DEBUG_VFS_LOCKS
4264 	struct vop_unlock_args *a = ap;
4265 
4266 	if (a->a_flags & LK_INTERLOCK)
4267 		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
4268 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
4269 #endif
4270 }
4271 
4272 void
4273 vop_unlock_post(void *ap, int rc)
4274 {
4275 #ifdef DEBUG_VFS_LOCKS
4276 	struct vop_unlock_args *a = ap;
4277 
4278 	if (a->a_flags & LK_INTERLOCK)
4279 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
4280 #endif
4281 }
4282 
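/*
 * Post-operation hooks run by the generated VOP wrappers after the
 * corresponding vnode operation returns.  On success they post the
 * relevant kevent notes; vop_rename_post() additionally drops the
 * holds taken in vop_rename_pre().
 */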
4283 void
4284 vop_create_post(void *ap, int rc)
4285 {
4286 	struct vop_create_args *a = ap;
4287 
4288 	if (!rc)
4289 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4290 }
4291 
4292 void
4293 vop_deleteextattr_post(void *ap, int rc)
4294 {
4295 	struct vop_deleteextattr_args *a = ap;
4296 
4297 	if (!rc)
4298 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4299 }
4300 
4301 void
4302 vop_link_post(void *ap, int rc)
4303 {
4304 	struct vop_link_args *a = ap;
4305 
4306 	if (!rc) {
4307 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4308 		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4309 	}
4310 }
4311 
4312 void
4313 vop_mkdir_post(void *ap, int rc)
4314 {
4315 	struct vop_mkdir_args *a = ap;
4316 
4317 	if (!rc)
4318 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4319 }
4320 
4321 void
4322 vop_mknod_post(void *ap, int rc)
4323 {
4324 	struct vop_mknod_args *a = ap;
4325 
4326 	if (!rc)
4327 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4328 }
4329 
4330 void
4331 vop_remove_post(void *ap, int rc)
4332 {
4333 	struct vop_remove_args *a = ap;
4334 
4335 	if (!rc) {
4336 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4337 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4338 	}
4339 }
4340 
4341 void
4342 vop_rename_post(void *ap, int rc)
4343 {
4344 	struct vop_rename_args *a = ap;
4345 
4346 	if (!rc) {
4347 		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4348 		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4349 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4350 		if (a->a_tvp)
4351 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4352 	}
4353 	if (a->a_tdvp != a->a_fdvp)
4354 		vdrop(a->a_fdvp);
4355 	if (a->a_tvp != a->a_fvp)
4356 		vdrop(a->a_fvp);
4357 	vdrop(a->a_tdvp);
4358 	if (a->a_tvp)
4359 		vdrop(a->a_tvp);
4360 }
4361 
4362 void
4363 vop_rmdir_post(void *ap, int rc)
4364 {
4365 	struct vop_rmdir_args *a = ap;
4366 
4367 	if (!rc) {
4368 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4369 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4370 	}
4371 }
4372 
4373 void
4374 vop_setattr_post(void *ap, int rc)
4375 {
4376 	struct vop_setattr_args *a = ap;
4377 
4378 	if (!rc)
4379 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4380 }
4381 
4382 void
4383 vop_setextattr_post(void *ap, int rc)
4384 {
4385 	struct vop_setextattr_args *a = ap;
4386 
4387 	if (!rc)
4388 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4389 }
4390 
4391 void
4392 vop_symlink_post(void *ap, int rc)
4393 {
4394 	struct vop_symlink_args *a = ap;
4395 
4396 	if (!rc)
4397 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4398 }
4399 
4400 static struct knlist fs_knlist;
4401 
4402 static void
4403 vfs_event_init(void *arg)
4404 {
4405 	knlist_init_mtx(&fs_knlist, NULL);
4406 }
4407 /* XXX - correct order? */
4408 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4409 
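/*
 * Post a filesystem event to EVFILT_FS listeners registered on
 * fs_knlist.  Only the event code is delivered; the fsid and data
 * arguments are currently unused.
 */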
4410 void
4411 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4412 {
4413 
4414 	KNOTE_UNLOCKED(&fs_knlist, event);
4415 }
4416 
4417 static int	filt_fsattach(struct knote *kn);
4418 static void	filt_fsdetach(struct knote *kn);
4419 static int	filt_fsevent(struct knote *kn, long hint);
4420 
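/*
 * Filter operations backing the EVFILT_FS kevent filter.  Attached
 * knotes live on the global fs_knlist and receive the events posted
 * by vfs_event_signal() above.
 */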
4421 struct filterops fs_filtops = {
4422 	.f_isfd = 0,
4423 	.f_attach = filt_fsattach,
4424 	.f_detach = filt_fsdetach,
4425 	.f_event = filt_fsevent
4426 };
4427 
4428 static int
4429 filt_fsattach(struct knote *kn)
4430 {
4431 
4432 	kn->kn_flags |= EV_CLEAR;
4433 	knlist_add(&fs_knlist, kn, 0);
4434 	return (0);
4435 }
4436 
4437 static void
4438 filt_fsdetach(struct knote *kn)
4439 {
4440 
4441 	knlist_remove(&fs_knlist, kn, 0);
4442 }
4443 
4444 static int
4445 filt_fsevent(struct knote *kn, long hint)
4446 {
4447 
4448 	kn->kn_fflags |= hint;
4449 	return (kn->kn_fflags != 0);
4450 }
4451 
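/*
 * Handler for the vfs.ctl sysctl: copy in a struct vfsidctl, look up
 * the mount point by fsid, check the filesystem type name and forward
 * the request to the filesystem through VFS_SYSCTL().
 */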
4452 static int
4453 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4454 {
4455 	struct vfsidctl vc;
4456 	int error;
4457 	struct mount *mp;
4458 
4459 	error = SYSCTL_IN(req, &vc, sizeof(vc));
4460 	if (error)
4461 		return (error);
4462 	if (vc.vc_vers != VFS_CTL_VERS1)
4463 		return (EINVAL);
4464 	mp = vfs_getvfs(&vc.vc_fsid);
4465 	if (mp == NULL)
4466 		return (ENOENT);
4467 	/* ensure that a specific sysctl goes to the right filesystem. */
4468 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4469 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4470 		vfs_rel(mp);
4471 		return (EINVAL);
4472 	}
4473 	VCTLTOREQ(&vc, req);
4474 	error = VFS_SYSCTL(mp, vc.vc_op, req);
4475 	vfs_rel(mp);
4476 	return (error);
4477 }
4478 
4479 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4480     NULL, 0, sysctl_vfs_ctl, "",
4481     "Sysctl by fsid");
4482 
4483 /*
4484  * Function to initialize a va_filerev field sensibly.
4485  * XXX: Wouldn't a random number make a lot more sense ??
4486  */
4487 u_quad_t
4488 init_va_filerev(void)
4489 {
4490 	struct bintime bt;
4491 
4492 	getbinuptime(&bt);
4493 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4494 }
4495 
4496 static int	filt_vfsread(struct knote *kn, long hint);
4497 static int	filt_vfswrite(struct knote *kn, long hint);
4498 static int	filt_vfsvnode(struct knote *kn, long hint);
4499 static void	filt_vfsdetach(struct knote *kn);
4500 static struct filterops vfsread_filtops = {
4501 	.f_isfd = 1,
4502 	.f_detach = filt_vfsdetach,
4503 	.f_event = filt_vfsread
4504 };
4505 static struct filterops vfswrite_filtops = {
4506 	.f_isfd = 1,
4507 	.f_detach = filt_vfsdetach,
4508 	.f_event = filt_vfswrite
4509 };
4510 static struct filterops vfsvnode_filtops = {
4511 	.f_isfd = 1,
4512 	.f_detach = filt_vfsdetach,
4513 	.f_event = filt_vfsvnode
4514 };
4515 
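/*
 * Lock, unlock and lock-assertion callbacks for the per-vnode knlist
 * set up in v_addpollinfo(), so that knote operations on a vnode are
 * performed with the vnode lock held.
 */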
4516 static void
4517 vfs_knllock(void *arg)
4518 {
4519 	struct vnode *vp = arg;
4520 
4521 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4522 }
4523 
4524 static void
4525 vfs_knlunlock(void *arg)
4526 {
4527 	struct vnode *vp = arg;
4528 
4529 	VOP_UNLOCK(vp, 0);
4530 }
4531 
4532 static void
4533 vfs_knl_assert_locked(void *arg)
4534 {
4535 #ifdef DEBUG_VFS_LOCKS
4536 	struct vnode *vp = arg;
4537 
4538 	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4539 #endif
4540 }
4541 
4542 static void
4543 vfs_knl_assert_unlocked(void *arg)
4544 {
4545 #ifdef DEBUG_VFS_LOCKS
4546 	struct vnode *vp = arg;
4547 
4548 	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4549 #endif
4550 }
4551 
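/*
 * Implement VOP_KQFILTER() for the EVFILT_READ, EVFILT_WRITE and
 * EVFILT_VNODE filters: pick the matching filterops, hold the vnode
 * and register the knote on the vnode's pollinfo knlist.
 */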
4552 int
4553 vfs_kqfilter(struct vop_kqfilter_args *ap)
4554 {
4555 	struct vnode *vp = ap->a_vp;
4556 	struct knote *kn = ap->a_kn;
4557 	struct knlist *knl;
4558 
4559 	switch (kn->kn_filter) {
4560 	case EVFILT_READ:
4561 		kn->kn_fop = &vfsread_filtops;
4562 		break;
4563 	case EVFILT_WRITE:
4564 		kn->kn_fop = &vfswrite_filtops;
4565 		break;
4566 	case EVFILT_VNODE:
4567 		kn->kn_fop = &vfsvnode_filtops;
4568 		break;
4569 	default:
4570 		return (EINVAL);
4571 	}
4572 
4573 	kn->kn_hook = (caddr_t)vp;
4574 
4575 	v_addpollinfo(vp);
4576 	if (vp->v_pollinfo == NULL)
4577 		return (ENOMEM);
4578 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4579 	vhold(vp);
4580 	knlist_add(knl, kn, 0);
4581 
4582 	return (0);
4583 }
4584 
4585 /*
4586  * Detach knote from vnode
4587  */
4588 static void
4589 filt_vfsdetach(struct knote *kn)
4590 {
4591 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4592 
4593 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4594 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4595 	vdrop(vp);
4596 }
4597 
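/*
 * Event filter for EVFILT_READ on a vnode: report in kn_data how many
 * bytes are available past the current file offset, or signal EOF once
 * the vnode has been revoked.
 */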
4598 /*ARGSUSED*/
4599 static int
4600 filt_vfsread(struct knote *kn, long hint)
4601 {
4602 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4603 	struct vattr va;
4604 	int res;
4605 
4606 	/*
4607 	 * filesystem is gone, so set the EOF flag and schedule
4608 	 * the knote for deletion.
4609 	 */
4610 	if (hint == NOTE_REVOKE) {
4611 		VI_LOCK(vp);
4612 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4613 		VI_UNLOCK(vp);
4614 		return (1);
4615 	}
4616 
4617 	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4618 		return (0);
4619 
4620 	VI_LOCK(vp);
4621 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4622 	res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
4623 	VI_UNLOCK(vp);
4624 	return (res);
4625 }
4626 
4627 /*ARGSUSED*/
4628 static int
4629 filt_vfswrite(struct knote *kn, long hint)
4630 {
4631 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4632 
4633 	VI_LOCK(vp);
4634 
4635 	/*
4636 	 * filesystem is gone, so set the EOF flag and schedule
4637 	 * the knote for deletion.
4638 	 */
4639 	if (hint == NOTE_REVOKE)
4640 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4641 
4642 	kn->kn_data = 0;
4643 	VI_UNLOCK(vp);
4644 	return (1);
4645 }
4646 
4647 static int
4648 filt_vfsvnode(struct knote *kn, long hint)
4649 {
4650 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4651 	int res;
4652 
4653 	VI_LOCK(vp);
4654 	if (kn->kn_sfflags & hint)
4655 		kn->kn_fflags |= hint;
4656 	if (hint == NOTE_REVOKE) {
4657 		kn->kn_flags |= EV_EOF;
4658 		VI_UNLOCK(vp);
4659 		return (1);
4660 	}
4661 	res = (kn->kn_fflags != 0);
4662 	VI_UNLOCK(vp);
4663 	return (res);
4664 }
4665 
4666 int
4667 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4668 {
4669 	int error;
4670 
4671 	if (dp->d_reclen > ap->a_uio->uio_resid)
4672 		return (ENAMETOOLONG);
4673 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4674 	if (error) {
4675 		if (ap->a_ncookies != NULL) {
4676 			if (ap->a_cookies != NULL)
4677 				free(*ap->a_cookies, M_TEMP);
4678 			*ap->a_cookies = NULL;
4679 			*ap->a_ncookies = 0;
4680 		}
4681 		return (error);
4682 	}
4683 	if (ap->a_ncookies == NULL)
4684 		return (0);
4685 
4686 	KASSERT(ap->a_cookies,
4687 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4688 
4689 	*ap->a_cookies = realloc(*ap->a_cookies,
4690 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4691 	(*ap->a_cookies)[*ap->a_ncookies] = off;
	*ap->a_ncookies += 1;
4692 	return (0);
4693 }
4694 
4695 /*
4696  * Mark for update the access time of the file if the filesystem
4697  * supports VOP_MARKATIME.  This functionality is used by execve and
4698  * mmap, so we want to avoid the I/O implied by directly setting
4699  * va_atime for the sake of efficiency.
4700  */
4701 void
4702 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4703 {
4704 	struct mount *mp;
4705 
4706 	mp = vp->v_mount;
4707 	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4708 	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4709 		(void)VOP_MARKATIME(vp);
4710 }
4711 
4712 /*
4713  * The purpose of this routine is to remove granularity from accmode_t,
4714  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4715  * VADMIN and VAPPEND.
4716  *
4717  * If it returns 0, the caller is supposed to continue with the usual
4718  * access checks using 'accmode' as modified by this routine.  If it
4719  * returns nonzero value, the caller is supposed to return that value
4720  * as errno.
4721  *
4722  * Note that after this routine runs, accmode may be zero.
4723  */
4724 int
4725 vfs_unixify_accmode(accmode_t *accmode)
4726 {
4727 	/*
4728 	 * There is no way to specify explicit "deny" rule using
4729 	 * file mode or POSIX.1e ACLs.
4730 	 */
4731 	if (*accmode & VEXPLICIT_DENY) {
4732 		*accmode = 0;
4733 		return (0);
4734 	}
4735 
4736 	/*
4737 	 * None of these can be translated into usual access bits.
4738 	 * Also, the common case for NFSv4 ACLs is to not contain
4739 	 * either of these bits. Caller should check for VWRITE
4740 	 * on the containing directory instead.
4741 	 */
4742 	if (*accmode & (VDELETE_CHILD | VDELETE))
4743 		return (EPERM);
4744 
4745 	if (*accmode & VADMIN_PERMS) {
4746 		*accmode &= ~VADMIN_PERMS;
4747 		*accmode |= VADMIN;
4748 	}
4749 
4750 	/*
4751 	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4752 	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4753 	 */
4754 	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4755 
4756 	return (0);
4757 }
4758 
4759 /*
4760  * These are helper functions for filesystems to traverse all
4761  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4762  *
4763  * This interface replaces MNT_VNODE_FOREACH.
4764  */
4765 
4766 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4767 
4768 struct vnode *
4769 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4770 {
4771 	struct vnode *vp;
4772 
4773 	if (should_yield())
4774 		kern_yield(PRI_USER);
4775 	MNT_ILOCK(mp);
4776 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4777 	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4778 	while (vp != NULL && (vp->v_type == VMARKER ||
4779 	    (vp->v_iflag & VI_DOOMED) != 0))
4780 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4781 
4782 	/* Check if we are done */
4783 	if (vp == NULL) {
4784 		__mnt_vnode_markerfree_all(mvp, mp);
4785 		/* MNT_IUNLOCK(mp); -- done in above function */
4786 		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4787 		return (NULL);
4788 	}
4789 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4790 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4791 	VI_LOCK(vp);
4792 	MNT_IUNLOCK(mp);
4793 	return (vp);
4794 }
4795 
4796 struct vnode *
4797 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4798 {
4799 	struct vnode *vp;
4800 
4801 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4802 	MNT_ILOCK(mp);
4803 	MNT_REF(mp);
4804 	(*mvp)->v_type = VMARKER;
4805 
4806 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4807 	while (vp != NULL && (vp->v_type == VMARKER ||
4808 	    (vp->v_iflag & VI_DOOMED) != 0))
4809 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4810 
4811 	/* Check if we are done */
4812 	if (vp == NULL) {
4813 		MNT_REL(mp);
4814 		MNT_IUNLOCK(mp);
4815 		free(*mvp, M_VNODE_MARKER);
4816 		*mvp = NULL;
4817 		return (NULL);
4818 	}
4819 	(*mvp)->v_mount = mp;
4820 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4821 	VI_LOCK(vp);
4822 	MNT_IUNLOCK(mp);
4823 	return (vp);
4824 }
4825 
4826 
4827 void
4828 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4829 {
4830 
4831 	if (*mvp == NULL) {
4832 		MNT_IUNLOCK(mp);
4833 		return;
4834 	}
4835 
4836 	mtx_assert(MNT_MTX(mp), MA_OWNED);
4837 
4838 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4839 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4840 	MNT_REL(mp);
4841 	MNT_IUNLOCK(mp);
4842 	free(*mvp, M_VNODE_MARKER);
4843 	*mvp = NULL;
4844 }
4845 
4846 /*
4847  * These are helper functions for filesystems to traverse their
4848  * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
4849  */
4850 static void
4851 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4852 {
4853 
4854 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4855 
4856 	MNT_ILOCK(mp);
4857 	MNT_REL(mp);
4858 	MNT_IUNLOCK(mp);
4859 	free(*mvp, M_VNODE_MARKER);
4860 	*mvp = NULL;
4861 }
4862 
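/*
 * Advance the active-vnode iterator: skip markers and doomed or alien
 * vnodes, try-lock each candidate's interlock (pausing briefly when
 * contended on UP or when a yield is due), and leave the marker just
 * after the vnode that is returned with its interlock held.
 * vfs_msync() above is a typical consumer via MNT_VNODE_FOREACH_ACTIVE().
 */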
4863 static struct vnode *
4864 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4865 {
4866 	struct vnode *vp, *nvp;
4867 
4868 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
4869 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4870 restart:
4871 	vp = TAILQ_NEXT(*mvp, v_actfreelist);
4872 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4873 	while (vp != NULL) {
4874 		if (vp->v_type == VMARKER) {
4875 			vp = TAILQ_NEXT(vp, v_actfreelist);
4876 			continue;
4877 		}
4878 		if (!VI_TRYLOCK(vp)) {
4879 			if (mp_ncpus == 1 || should_yield()) {
4880 				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4881 				mtx_unlock(&vnode_free_list_mtx);
4882 				pause("vnacti", 1);
4883 				mtx_lock(&vnode_free_list_mtx);
4884 				goto restart;
4885 			}
4886 			continue;
4887 		}
4888 		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
4889 		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
4890 		    ("alien vnode on the active list %p %p", vp, mp));
4891 		if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
4892 			break;
4893 		nvp = TAILQ_NEXT(vp, v_actfreelist);
4894 		VI_UNLOCK(vp);
4895 		vp = nvp;
4896 	}
4897 
4898 	/* Check if we are done */
4899 	if (vp == NULL) {
4900 		mtx_unlock(&vnode_free_list_mtx);
4901 		mnt_vnode_markerfree_active(mvp, mp);
4902 		return (NULL);
4903 	}
4904 	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
4905 	mtx_unlock(&vnode_free_list_mtx);
4906 	ASSERT_VI_LOCKED(vp, "active iter");
4907 	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
4908 	return (vp);
4909 }
4910 
4911 struct vnode *
4912 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4913 {
4914 
4915 	if (should_yield())
4916 		kern_yield(PRI_USER);
4917 	mtx_lock(&vnode_free_list_mtx);
4918 	return (mnt_vnode_next_active(mvp, mp));
4919 }
4920 
4921 struct vnode *
4922 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
4923 {
4924 	struct vnode *vp;
4925 
4926 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4927 	MNT_ILOCK(mp);
4928 	MNT_REF(mp);
4929 	MNT_IUNLOCK(mp);
4930 	(*mvp)->v_type = VMARKER;
4931 	(*mvp)->v_mount = mp;
4932 
4933 	mtx_lock(&vnode_free_list_mtx);
4934 	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
4935 	if (vp == NULL) {
4936 		mtx_unlock(&vnode_free_list_mtx);
4937 		mnt_vnode_markerfree_active(mvp, mp);
4938 		return (NULL);
4939 	}
4940 	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4941 	return (mnt_vnode_next_active(mvp, mp));
4942 }
4943 
4944 void
4945 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4946 {
4947 
4948 	if (*mvp == NULL)
4949 		return;
4950 
4951 	mtx_lock(&vnode_free_list_mtx);
4952 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4953 	mtx_unlock(&vnode_free_list_mtx);
4954 	mnt_vnode_markerfree_active(mvp, mp);
4955 }
4956