xref: /freebsd/sys/kern/vfs_subr.c (revision 119b75925c562202145d7bac7b676b98029c6cb9)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35  */
36 
37 /*
38  * External virtual filesystem routines
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_compat.h"
45 #include "opt_ddb.h"
46 #include "opt_watchdog.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/bio.h>
51 #include <sys/buf.h>
52 #include <sys/condvar.h>
53 #include <sys/conf.h>
54 #include <sys/dirent.h>
55 #include <sys/event.h>
56 #include <sys/eventhandler.h>
57 #include <sys/extattr.h>
58 #include <sys/file.h>
59 #include <sys/fcntl.h>
60 #include <sys/jail.h>
61 #include <sys/kdb.h>
62 #include <sys/kernel.h>
63 #include <sys/kthread.h>
64 #include <sys/lockf.h>
65 #include <sys/malloc.h>
66 #include <sys/mount.h>
67 #include <sys/namei.h>
68 #include <sys/pctrie.h>
69 #include <sys/priv.h>
70 #include <sys/reboot.h>
71 #include <sys/refcount.h>
72 #include <sys/rwlock.h>
73 #include <sys/sched.h>
74 #include <sys/sleepqueue.h>
75 #include <sys/smp.h>
76 #include <sys/stat.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/vmmeter.h>
80 #include <sys/vnode.h>
81 #include <sys/watchdog.h>
82 
83 #include <machine/stdarg.h>
84 
85 #include <security/mac/mac_framework.h>
86 
87 #include <vm/vm.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_extern.h>
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_kern.h>
94 #include <vm/uma.h>
95 
96 #ifdef DDB
97 #include <ddb/ddb.h>
98 #endif
99 
100 static void	delmntque(struct vnode *vp);
101 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
102 		    int slpflag, int slptimeo);
103 static void	syncer_shutdown(void *arg, int howto);
104 static int	vtryrecycle(struct vnode *vp);
105 static void	v_init_counters(struct vnode *);
106 static void	v_incr_usecount(struct vnode *);
107 static void	v_incr_devcount(struct vnode *);
108 static void	v_decr_devcount(struct vnode *);
109 static void	vnlru_free(int);
110 static void	vgonel(struct vnode *);
111 static void	vfs_knllock(void *arg);
112 static void	vfs_knlunlock(void *arg);
113 static void	vfs_knl_assert_locked(void *arg);
114 static void	vfs_knl_assert_unlocked(void *arg);
115 static void	destroy_vpollinfo(struct vpollinfo *vi);
116 
117 /*
118  * Number of vnodes in existence.  Increased whenever getnewvnode()
119  * allocates a new vnode, decreased in vdropl() for a VI_DOOMED vnode.
120  */
121 static unsigned long	numvnodes;
122 
123 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
124     "Number of vnodes in existence");
125 
126 static u_long vnodes_created;
127 SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
128     0, "Number of vnodes created by getnewvnode");
129 
130 /*
131  * Conversion tables for conversion from vnode types to inode formats
132  * and back.
133  */
134 enum vtype iftovt_tab[16] = {
135 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
136 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
137 };
138 int vttoif_tab[10] = {
139 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
140 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
141 };
142 
143 /*
144  * List of vnodes that are ready for recycling.
145  */
146 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
147 
148 /*
149  * Free vnode target.  Free vnodes may simply be files which have been stat'd
150  * but not read.  This is somewhat common, and a small cache of such files
151  * should be kept to avoid recreation costs.
152  */
153 static u_long wantfreevnodes;
154 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
155 /* Number of vnodes in the free list. */
156 static u_long freevnodes;
157 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
158     "Number of vnodes in the free list");
159 
160 static int vlru_allow_cache_src;
161 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
162     &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
163 
164 static u_long recycles_count;
165 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
166     "Number of vnodes recycled to avoid exceeding kern.maxvnodes");
167 
168 /*
169  * Various variables used for debugging the new implementation of
170  * reassignbuf().
171  * XXX these are probably of (very) limited utility now.
172  */
173 static int reassignbufcalls;
174 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
175     "Number of calls to reassignbuf");
176 
177 static u_long free_owe_inact;
178 SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0,
179     "Number of times free vnodes kept on active list due to VFS "
180     "owing inactivation");
181 
182 /*
183  * Cache for the mount type id assigned to NFS.  This is used for
184  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
185  */
186 int	nfs_mount_type = -1;
187 
188 /* To keep more than one thread at a time from running vfs_getnewfsid */
189 static struct mtx mntid_mtx;
190 
191 /*
192  * Lock for any access to the following:
193  *	vnode_free_list
194  *	numvnodes
195  *	freevnodes
196  */
197 static struct mtx vnode_free_list_mtx;
198 
199 /* Publicly exported FS */
200 struct nfs_public nfs_pub;
201 
202 static uma_zone_t buf_trie_zone;
203 
204 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
205 static uma_zone_t vnode_zone;
206 static uma_zone_t vnodepoll_zone;
207 
208 /*
209  * The workitem queue.
210  *
211  * It is useful to delay writes of file data and filesystem metadata
212  * for tens of seconds so that quickly created and deleted files need
213  * not waste disk bandwidth being created and removed. To realize this,
214  * we append vnodes to a "workitem" queue. When running with a soft
215  * updates implementation, most pending metadata dependencies should
216  * not wait for more than a few seconds. Thus, metadata updates on block
217  * devices are delayed only about half the time that file data is delayed.
218  * Similarly, directory updates are more critical, so they are delayed only
219  * about a third of the time that file data is delayed. Thus, there are
220  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
221  * one each second (driven off the filesystem syncer process). The
222  * syncer_delayno variable indicates the next queue that is to be processed.
223  * Items that need to be processed soon are placed in this queue:
224  *
225  *	syncer_workitem_pending[syncer_delayno]
226  *
227  * A delay of fifteen seconds is done by placing the request fifteen
228  * entries later in the queue:
229  *
230  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
231  *
232  */
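/*
 * Illustrative sketch (assumed values, not a guarantee): with
 * SYNCER_MAXDELAY == 32 the hash mask comes out as syncer_mask == 31, so
 * a fifteen second delay requested while syncer_delayno == 20 lands in
 * slot (20 + 15) & 31 == 3, which the once-per-second scan reaches
 * fifteen iterations later, after wrapping past slot 31.
 */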
233 static int syncer_delayno;
234 static long syncer_mask;
235 LIST_HEAD(synclist, bufobj);
236 static struct synclist *syncer_workitem_pending;
237 /*
238  * The sync_mtx protects:
239  *	bo->bo_synclist
240  *	sync_vnode_count
241  *	syncer_delayno
242  *	syncer_state
243  *	syncer_workitem_pending
244  *	syncer_worklist_len
245  *	rushjob
246  */
247 static struct mtx sync_mtx;
248 static struct cv sync_wakeup;
249 
250 #define SYNCER_MAXDELAY		32
251 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
252 static int syncdelay = 30;		/* max time to delay syncing data */
253 static int filedelay = 30;		/* time to delay syncing files */
254 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
255     "Time to delay syncing files (in seconds)");
256 static int dirdelay = 29;		/* time to delay syncing directories */
257 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
258     "Time to delay syncing directories (in seconds)");
259 static int metadelay = 28;		/* time to delay syncing metadata */
260 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
261     "Time to delay syncing metadata (in seconds)");
262 static int rushjob;		/* number of slots to run ASAP */
263 static int stat_rush_requests;	/* number of times I/O speeded up */
264 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
265     "Number of times I/O speeded up (rush requests)");
266 
267 /*
268  * When shutting down the syncer, run it at four times normal speed.
269  */
270 #define SYNCER_SHUTDOWN_SPEEDUP		4
271 static int sync_vnode_count;
272 static int syncer_worklist_len;
273 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
274     syncer_state;
275 
276 /*
277  * Number of vnodes we want to exist at any one time.  This is mostly used
278  * to size hash tables in vnode-related code.  It is normally not used in
279  * getnewvnode(), as wantfreevnodes is normally nonzero.
280  *
281  * XXX desiredvnodes is historical cruft and should not exist.
282  */
283 int desiredvnodes;
284 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
285     &desiredvnodes, 0, "Maximum number of vnodes");
286 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
287     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
288 static int vnlru_nowhere;
289 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
290     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
291 
292 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
293 static int vnsz2log;
294 
295 /*
296  * Support for the bufobj clean & dirty pctrie.
297  */
298 static void *
299 buf_trie_alloc(struct pctrie *ptree)
300 {
301 
302 	return uma_zalloc(buf_trie_zone, M_NOWAIT);
303 }
304 
305 static void
306 buf_trie_free(struct pctrie *ptree, void *node)
307 {
308 
309 	uma_zfree(buf_trie_zone, node);
310 }
311 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
312 
313 /*
314  * Initialize the vnode management data structures.
315  *
316  * Reevaluate the following cap on the number of vnodes after the physical
317  * memory size exceeds 512GB.  In the limit, as the physical memory size
318  * grows, the ratio of physical pages to vnodes approaches sixteen to one.
319  */
320 #ifndef	MAXVNODES_MAX
321 #define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
322 #endif
323 static void
324 vntblinit(void *dummy __unused)
325 {
326 	u_int i;
327 	int physvnodes, virtvnodes;
328 
329 	/*
330 	 * Desiredvnodes is a function of the physical memory size and the
331 	 * kernel's heap size.  Generally speaking, it scales with the
332 	 * physical memory size.  The ratio of desiredvnodes to physical pages
333 	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
334 	 * marginal ratio of desiredvnodes to physical pages is one to
335 	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
336 	 * size.  The memory required by desiredvnodes vnodes and vm objects
337 	 * may not exceed one seventh of the kernel's heap size.
338 	 */
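	/*
	 * Worked example (illustrative only, assuming 4KB pages): on a
	 * machine with about 4GB of RAM, vm_cnt.v_page_count is roughly
	 * 1048576, so physvnodes is about
	 *	maxproc + 1048576 / 16 + 3 * 393216 / 16 = maxproc + 139264,
	 * which is then clipped by virtvnodes and MAXVNODES_MAX below.
	 */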
339 	physvnodes = maxproc + vm_cnt.v_page_count / 16 + 3 * min(98304 * 4,
340 	    vm_cnt.v_page_count) / 16;
341 	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
342 	    sizeof(struct vnode)));
343 	desiredvnodes = min(physvnodes, virtvnodes);
344 	if (desiredvnodes > MAXVNODES_MAX) {
345 		if (bootverbose)
346 			printf("Reducing kern.maxvnodes %d -> %d\n",
347 			    desiredvnodes, MAXVNODES_MAX);
348 		desiredvnodes = MAXVNODES_MAX;
349 	}
350 	wantfreevnodes = desiredvnodes / 4;
351 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
352 	TAILQ_INIT(&vnode_free_list);
353 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
354 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
355 	    NULL, NULL, UMA_ALIGN_PTR, 0);
356 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
357 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
358 	/*
359 	 * Preallocate enough nodes to support one per buf so that
360 	 * we cannot fail an insert.  reassignbuf() callers cannot
361 	 * tolerate an insertion failure.
362 	 */
363 	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
364 	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
365 	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
366 	uma_prealloc(buf_trie_zone, nbuf);
367 	/*
368 	 * Initialize the filesystem syncer.
369 	 */
370 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
371 	    &syncer_mask);
372 	syncer_maxdelay = syncer_mask + 1;
373 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
374 	cv_init(&sync_wakeup, "syncer");
375 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
376 		vnsz2log++;
377 	vnsz2log--;
378 }
379 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
380 
381 
382 /*
383  * Mark a mount point as busy. Used to synchronize access and to delay
384  * unmounting. Note that mountlist_mtx is not released on failure.
385  *
386  * vfs_busy() is a custom lock, it can block the caller.
387  * vfs_busy() only sleeps if the unmount is active on the mount point.
388  * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
389  * vnode belonging to mp.
390  *
391  * Lookup uses vfs_busy() to traverse mount points.
392  * root fs			var fs
393  * / vnode lock		A	/ vnode lock (/var)		D
394  * /var vnode lock	B	/log vnode lock(/var/log)	E
395  * vfs_busy lock	C	vfs_busy lock			F
396  *
397  * Within each file system, the lock order is C->A->B and F->D->E.
398  *
399  * When traversing across mounts, the system follows that lock order:
400  *
401  *        C->A->B
402  *              |
403  *              +->F->D->E
404  *
405  * The lookup() process for namei("/var") illustrates this order:
406  *  VOP_LOOKUP() obtains B while A is held
407  *  vfs_busy() obtains a shared lock on F while A and B are held
408  *  vput() releases lock on B
409  *  vput() releases lock on A
410  *  VFS_ROOT() obtains lock on D while shared lock on F is held
411  *  vfs_unbusy() releases shared lock on F
412  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
413  *    An attempt to lock A (instead of vp_crossmp) while D is held would
414  *    violate the global order, causing deadlocks.
415  *
416  * dounmount() locks B while F is drained.
417  */
418 int
419 vfs_busy(struct mount *mp, int flags)
420 {
421 
422 	MPASS((flags & ~MBF_MASK) == 0);
423 	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
424 
425 	MNT_ILOCK(mp);
426 	MNT_REF(mp);
427 	/*
428 	 * If the mount point is currently being unmounted, sleep until the
429 	 * mount point's fate is decided.  If the thread doing the unmounting
430 	 * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
431 	 * indicating that this mount point has survived the unmount attempt
432 	 * and vfs_busy should retry.  Otherwise the unmounting thread will
433 	 * set the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating
434 	 * that the mount point is about to be really destroyed.  vfs_busy
435 	 * needs to release its reference on the mount point in this case and
436 	 * return with ENOENT, telling the caller that the mount it tried to
437 	 * busy is no longer valid.
438 	 */
439 	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
440 		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
441 			MNT_REL(mp);
442 			MNT_IUNLOCK(mp);
443 			CTR1(KTR_VFS, "%s: failed busying before sleeping",
444 			    __func__);
445 			return (ENOENT);
446 		}
447 		if (flags & MBF_MNTLSTLOCK)
448 			mtx_unlock(&mountlist_mtx);
449 		mp->mnt_kern_flag |= MNTK_MWAIT;
450 		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
451 		if (flags & MBF_MNTLSTLOCK)
452 			mtx_lock(&mountlist_mtx);
453 		MNT_ILOCK(mp);
454 	}
455 	if (flags & MBF_MNTLSTLOCK)
456 		mtx_unlock(&mountlist_mtx);
457 	mp->mnt_lockref++;
458 	MNT_IUNLOCK(mp);
459 	return (0);
460 }
461 
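/*
 * A minimal caller sketch (illustrative only, error handling elided):
 * code that needs the mount to stay alive while it works on it can do
 *
 *	if (vfs_busy(mp, MBF_NOWAIT) != 0)
 *		return (EBUSY);
 *	... operate on mp, e.g. walk mp->mnt_nvnodelist ...
 *	vfs_unbusy(mp);
 *
 * which keeps dounmount() from draining the mount point out from under
 * the caller for the duration of the busy section.
 */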
462 /*
463  * Free a busy filesystem.
464  */
465 void
466 vfs_unbusy(struct mount *mp)
467 {
468 
469 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
470 	MNT_ILOCK(mp);
471 	MNT_REL(mp);
472 	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
473 	mp->mnt_lockref--;
474 	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
475 		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
476 		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
477 		mp->mnt_kern_flag &= ~MNTK_DRAINING;
478 		wakeup(&mp->mnt_lockref);
479 	}
480 	MNT_IUNLOCK(mp);
481 }
482 
483 /*
484  * Lookup a mount point by filesystem identifier.
485  */
486 struct mount *
487 vfs_getvfs(fsid_t *fsid)
488 {
489 	struct mount *mp;
490 
491 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
492 	mtx_lock(&mountlist_mtx);
493 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
494 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
495 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
496 			vfs_ref(mp);
497 			mtx_unlock(&mountlist_mtx);
498 			return (mp);
499 		}
500 	}
501 	mtx_unlock(&mountlist_mtx);
502 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
503 	return ((struct mount *) 0);
504 }
505 
506 /*
507  * Lookup a mount point by filesystem identifier, busying it before
508  * returning.
509  *
510  * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
511  * cache for popular filesystem identifiers.  The cache is lockless, using
512  * the fact that struct mounts are never freed.  In the worst case we may
513  * get a pointer to an unmounted or even a different filesystem, so we
514  * have to check what we got, and fall back to the slow path if so.
515  */
516 struct mount *
517 vfs_busyfs(fsid_t *fsid)
518 {
519 #define	FSID_CACHE_SIZE	256
520 	typedef struct mount * volatile vmp_t;
521 	static vmp_t cache[FSID_CACHE_SIZE];
522 	struct mount *mp;
523 	int error;
524 	uint32_t hash;
525 
526 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
527 	hash = fsid->val[0] ^ fsid->val[1];
528 	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
529 	mp = cache[hash];
530 	if (mp == NULL ||
531 	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
532 	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
533 		goto slow;
534 	if (vfs_busy(mp, 0) != 0) {
535 		cache[hash] = NULL;
536 		goto slow;
537 	}
538 	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
539 	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
540 		return (mp);
541 	else
542 	    vfs_unbusy(mp);
543 
544 slow:
545 	mtx_lock(&mountlist_mtx);
546 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
547 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
548 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
549 			error = vfs_busy(mp, MBF_MNTLSTLOCK);
550 			if (error) {
551 				cache[hash] = NULL;
552 				mtx_unlock(&mountlist_mtx);
553 				return (NULL);
554 			}
555 			cache[hash] = mp;
556 			return (mp);
557 		}
558 	}
559 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
560 	mtx_unlock(&mountlist_mtx);
561 	return ((struct mount *) 0);
562 }
563 
564 /*
565  * Check if a user can access privileged mount options.
566  */
567 int
568 vfs_suser(struct mount *mp, struct thread *td)
569 {
570 	int error;
571 
572 	/*
573 	 * If the thread is jailed, but this is not a jail-friendly file
574 	 * system, deny immediately.
575 	 */
576 	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
577 		return (EPERM);
578 
579 	/*
580 	 * If the file system was mounted outside the jail of the calling
581 	 * thread, deny immediately.
582 	 */
583 	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
584 		return (EPERM);
585 
586 	/*
587 	 * If file system supports delegated administration, we don't check
588 	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
589 	 * by the file system itself.
590 	 * If this is not the user that did original mount, we check for
591 	 * the PRIV_VFS_MOUNT_OWNER privilege.
592 	 */
593 	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
594 	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
595 		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
596 			return (error);
597 	}
598 	return (0);
599 }
600 
601 /*
602  * Get a new unique fsid.  Try to make its val[0] unique, since this value
603  * will be used to create fake device numbers for stat().  Also try (but
604  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
605  * support 16-bit device numbers.  We end up with unique val[0]'s for the
606  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
607  *
608  * Keep in mind that several mounts may be running in parallel.  Starting
609  * the search one past where the previous search terminated is both a
610  * micro-optimization and a defense against returning the same fsid to
611  * different mounts.
612  */
613 void
614 vfs_getnewfsid(struct mount *mp)
615 {
616 	static uint16_t mntid_base;
617 	struct mount *nmp;
618 	fsid_t tfsid;
619 	int mtype;
620 
621 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
622 	mtx_lock(&mntid_mtx);
623 	mtype = mp->mnt_vfc->vfc_typenum;
624 	tfsid.val[1] = mtype;
625 	mtype = (mtype & 0xFF) << 24;
626 	for (;;) {
627 		tfsid.val[0] = makedev(255,
628 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
629 		mntid_base++;
630 		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
631 			break;
632 		vfs_rel(nmp);
633 	}
634 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
635 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
636 	mtx_unlock(&mntid_mtx);
637 }
638 
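/*
 * Bit-layout sketch of the generated val[0] (illustrative values): with
 * vfc_typenum == 5 and mntid_base == 0x1234, the loop in vfs_getnewfsid()
 * above produces makedev(255, (5 << 24) | (0x1200 << 8) | 0x34), i.e. the
 * low byte of the filesystem type sits in bits 24-31 of the minor number
 * while the 16-bit mount id is split across bits 16-23 and bits 0-7.
 */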
639 /*
640  * Knob to control the precision of file timestamps:
641  *
642  *   0 = seconds only; nanoseconds zeroed.
643  *   1 = seconds and nanoseconds, accurate within 1/HZ.
644  *   2 = seconds and nanoseconds, truncated to microseconds.
645  * >=3 = seconds and nanoseconds, maximum precision.
646  */
647 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
648 
649 static int timestamp_precision = TSP_USEC;
650 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
651     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
652     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
653     "3+: sec + ns (max. precision))");
654 
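/*
 * The knob is writable at run time; for example
 *
 *	sysctl vfs.timestamp_precision=3
 *
 * selects full nanosecond resolution (TSP_NSEC).
 */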
655 /*
656  * Get a current timestamp.
657  */
658 void
659 vfs_timestamp(struct timespec *tsp)
660 {
661 	struct timeval tv;
662 
663 	switch (timestamp_precision) {
664 	case TSP_SEC:
665 		tsp->tv_sec = time_second;
666 		tsp->tv_nsec = 0;
667 		break;
668 	case TSP_HZ:
669 		getnanotime(tsp);
670 		break;
671 	case TSP_USEC:
672 		microtime(&tv);
673 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
674 		break;
675 	case TSP_NSEC:
676 	default:
677 		nanotime(tsp);
678 		break;
679 	}
680 }
681 
682 /*
683  * Set vnode attributes to VNOVAL
684  */
685 void
686 vattr_null(struct vattr *vap)
687 {
688 
689 	vap->va_type = VNON;
690 	vap->va_size = VNOVAL;
691 	vap->va_bytes = VNOVAL;
692 	vap->va_mode = VNOVAL;
693 	vap->va_nlink = VNOVAL;
694 	vap->va_uid = VNOVAL;
695 	vap->va_gid = VNOVAL;
696 	vap->va_fsid = VNOVAL;
697 	vap->va_fileid = VNOVAL;
698 	vap->va_blocksize = VNOVAL;
699 	vap->va_rdev = VNOVAL;
700 	vap->va_atime.tv_sec = VNOVAL;
701 	vap->va_atime.tv_nsec = VNOVAL;
702 	vap->va_mtime.tv_sec = VNOVAL;
703 	vap->va_mtime.tv_nsec = VNOVAL;
704 	vap->va_ctime.tv_sec = VNOVAL;
705 	vap->va_ctime.tv_nsec = VNOVAL;
706 	vap->va_birthtime.tv_sec = VNOVAL;
707 	vap->va_birthtime.tv_nsec = VNOVAL;
708 	vap->va_flags = VNOVAL;
709 	vap->va_gen = VNOVAL;
710 	vap->va_vaflags = 0;
711 }
712 
713 /*
714  * This routine is called when we have too many vnodes.  It attempts
715  * to free <count> vnodes and will potentially free vnodes that still
716  * have VM backing store (VM backing store is typically the cause
717  * of a vnode blowout so we want to do this).  Therefore, this operation
718  * is not considered cheap.
719  *
720  * A number of conditions may prevent a vnode from being reclaimed.
721  * the buffer cache may have references on the vnode, a directory
722  * vnode may still have references due to the namei cache representing
723  * underlying files, or the vnode may be in active use.   It is not
724  * desireable to reuse such vnodes.  These conditions may cause the
725  * number of vnodes to reach some minimum value regardless of what
726  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
727  */
728 static int
729 vlrureclaim(struct mount *mp)
730 {
731 	struct vnode *vp;
732 	int done;
733 	int trigger;
734 	int usevnodes;
735 	int count;
736 
737 	/*
738 	 * Calculate the trigger point, don't allow user
739 	 * screwups to blow us up.   This prevents us from
740 	 * recycling vnodes with lots of resident pages.  We
741 	 * aren't trying to free memory, we are trying to
742 	 * free vnodes.
743 	 */
744 	usevnodes = desiredvnodes;
745 	if (usevnodes <= 0)
746 		usevnodes = 1;
747 	trigger = vm_cnt.v_page_count * 2 / usevnodes;
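	/*
	 * Worked example (illustrative): with about one million physical
	 * pages and desiredvnodes == 100000, trigger is 20, so vnodes
	 * caching more than 20 resident pages are skipped below.
	 */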
748 	done = 0;
749 	vn_start_write(NULL, &mp, V_WAIT);
750 	MNT_ILOCK(mp);
751 	count = mp->mnt_nvnodelistsize / 10 + 1;
752 	while (count != 0) {
753 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
754 		while (vp != NULL && vp->v_type == VMARKER)
755 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
756 		if (vp == NULL)
757 			break;
758 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
759 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
760 		--count;
761 		if (!VI_TRYLOCK(vp))
762 			goto next_iter;
763 		/*
764 		 * If it's been deconstructed already, it's still
765 		 * referenced, or it exceeds the trigger, skip it.
766 		 */
767 		if (vp->v_usecount ||
768 		    (!vlru_allow_cache_src &&
769 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
770 		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
771 		    vp->v_object->resident_page_count > trigger)) {
772 			VI_UNLOCK(vp);
773 			goto next_iter;
774 		}
775 		MNT_IUNLOCK(mp);
776 		vholdl(vp);
777 		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
778 			vdrop(vp);
779 			goto next_iter_mntunlocked;
780 		}
781 		VI_LOCK(vp);
782 		/*
783 		 * v_usecount may have been bumped after VOP_LOCK() dropped
784 		 * the vnode interlock and before it was locked again.
785 		 *
786 		 * It is not necessary to recheck VI_DOOMED because it can
787 		 * only be set by another thread that holds both the vnode
788 		 * lock and vnode interlock.  If another thread has the
789 		 * vnode lock before we get to VOP_LOCK() and obtains the
790 		 * vnode interlock after VOP_LOCK() drops the vnode
791 		 * interlock, the other thread will be unable to drop the
792 		 * vnode lock before our VOP_LOCK() call fails.
793 		 */
794 		if (vp->v_usecount ||
795 		    (!vlru_allow_cache_src &&
796 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
797 		    (vp->v_object != NULL &&
798 		    vp->v_object->resident_page_count > trigger)) {
799 			VOP_UNLOCK(vp, LK_INTERLOCK);
800 			vdrop(vp);
801 			goto next_iter_mntunlocked;
802 		}
803 		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
804 		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
805 		atomic_add_long(&recycles_count, 1);
806 		vgonel(vp);
807 		VOP_UNLOCK(vp, 0);
808 		vdropl(vp);
809 		done++;
810 next_iter_mntunlocked:
811 		if (!should_yield())
812 			goto relock_mnt;
813 		goto yield;
814 next_iter:
815 		if (!should_yield())
816 			continue;
817 		MNT_IUNLOCK(mp);
818 yield:
819 		kern_yield(PRI_USER);
820 relock_mnt:
821 		MNT_ILOCK(mp);
822 	}
823 	MNT_IUNLOCK(mp);
824 	vn_finished_write(mp);
825 	return done;
826 }
827 
828 /*
829  * Attempt to keep the free list at wantfreevnodes length.
830  */
831 static void
832 vnlru_free(int count)
833 {
834 	struct vnode *vp;
835 
836 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
837 	for (; count > 0; count--) {
838 		vp = TAILQ_FIRST(&vnode_free_list);
839 		/*
840 		 * The list can be modified while the free_list_mtx
841 		 * has been dropped and vp could be NULL here.
842 		 */
843 		if (!vp)
844 			break;
845 		VNASSERT(vp->v_op != NULL, vp,
846 		    ("vnlru_free: vnode already reclaimed."));
847 		KASSERT((vp->v_iflag & VI_FREE) != 0,
848 		    ("Removing vnode not on freelist"));
849 		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
850 		    ("Mangling active vnode"));
851 		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
852 		/*
853 		 * Don't recycle if we can't get the interlock.
854 		 */
855 		if (!VI_TRYLOCK(vp)) {
856 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
857 			continue;
858 		}
859 		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
860 		    vp, ("vp inconsistent on freelist"));
861 
862 		/*
863 		 * The clear of VI_FREE prevents activation of the
864 		 * vnode.  There is no sense in putting the vnode on
865 		 * the mount point active list, only to remove it
866 		 * later during recycling.  Inline the relevant part
867 		 * of vholdl(), to avoid triggering assertions or
868 		 * activating.
869 		 */
870 		freevnodes--;
871 		vp->v_iflag &= ~VI_FREE;
872 		refcount_acquire(&vp->v_holdcnt);
873 
874 		mtx_unlock(&vnode_free_list_mtx);
875 		VI_UNLOCK(vp);
876 		vtryrecycle(vp);
877 		/*
878 		 * If the recycle succeeded, this vdrop will actually free
879 		 * the vnode.  If not, it will simply place it back on
880 		 * the free list.
881 		 */
882 		vdrop(vp);
883 		mtx_lock(&vnode_free_list_mtx);
884 	}
885 }
886 /*
887  * Attempt to recycle vnodes in a context that is always safe to block.
888  * Calling vlrureclaim() from the bowels of filesystem code has some
889  * interesting deadlock problems.
890  */
891 static struct proc *vnlruproc;
892 static int vnlruproc_sig;
893 
894 static void
895 vnlru_proc(void)
896 {
897 	struct mount *mp, *nmp;
898 	int done;
899 	struct proc *p = vnlruproc;
900 
901 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
902 	    SHUTDOWN_PRI_FIRST);
903 
904 	for (;;) {
905 		kproc_suspend_check(p);
906 		mtx_lock(&vnode_free_list_mtx);
907 		if (freevnodes > wantfreevnodes)
908 			vnlru_free(freevnodes - wantfreevnodes);
909 		if (numvnodes <= desiredvnodes * 9 / 10) {
910 			vnlruproc_sig = 0;
911 			wakeup(&vnlruproc_sig);
912 			msleep(vnlruproc, &vnode_free_list_mtx,
913 			    PVFS|PDROP, "vlruwt", hz);
914 			continue;
915 		}
916 		mtx_unlock(&vnode_free_list_mtx);
917 		done = 0;
918 		mtx_lock(&mountlist_mtx);
919 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
920 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
921 				nmp = TAILQ_NEXT(mp, mnt_list);
922 				continue;
923 			}
924 			done += vlrureclaim(mp);
925 			mtx_lock(&mountlist_mtx);
926 			nmp = TAILQ_NEXT(mp, mnt_list);
927 			vfs_unbusy(mp);
928 		}
929 		mtx_unlock(&mountlist_mtx);
930 		if (done == 0) {
931 #if 0
932 			/* These messages are temporary debugging aids */
933 			if (vnlru_nowhere < 5)
934 				printf("vnlru process getting nowhere..\n");
935 			else if (vnlru_nowhere == 5)
936 				printf("vnlru process messages stopped.\n");
937 #endif
938 			vnlru_nowhere++;
939 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
940 		} else
941 			kern_yield(PRI_USER);
942 	}
943 }
944 
945 static struct kproc_desc vnlru_kp = {
946 	"vnlru",
947 	vnlru_proc,
948 	&vnlruproc
949 };
950 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
951     &vnlru_kp);
952 
953 /*
954  * Routines having to do with the management of the vnode table.
955  */
956 
957 /*
958  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
959  * before we actually vgone().  This function must be called with the vnode
960  * held to prevent the vnode from being returned to the free list midway
961  * through vgone().
962  */
963 static int
964 vtryrecycle(struct vnode *vp)
965 {
966 	struct mount *vnmp;
967 
968 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
969 	VNASSERT(vp->v_holdcnt, vp,
970 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
971 	/*
972 	 * This vnode may be found and locked via some other list; if so
973 	 * we can't recycle it yet.
974 	 */
975 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
976 		CTR2(KTR_VFS,
977 		    "%s: impossible to recycle, vp %p lock is already held",
978 		    __func__, vp);
979 		return (EWOULDBLOCK);
980 	}
981 	/*
982 	 * Don't recycle if its filesystem is being suspended.
983 	 */
984 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
985 		VOP_UNLOCK(vp, 0);
986 		CTR2(KTR_VFS,
987 		    "%s: impossible to recycle, cannot start the write for %p",
988 		    __func__, vp);
989 		return (EBUSY);
990 	}
991 	/*
992 	 * If we got this far, we need to acquire the interlock and see if
993 	 * anyone picked up this vnode from another list.  If not, we will
994 	 * mark it with DOOMED via vgonel() so that anyone who does find it
995 	 * will skip over it.
996 	 */
997 	VI_LOCK(vp);
998 	if (vp->v_usecount) {
999 		VOP_UNLOCK(vp, LK_INTERLOCK);
1000 		vn_finished_write(vnmp);
1001 		CTR2(KTR_VFS,
1002 		    "%s: impossible to recycle, %p is already referenced",
1003 		    __func__, vp);
1004 		return (EBUSY);
1005 	}
1006 	if ((vp->v_iflag & VI_DOOMED) == 0) {
1007 		atomic_add_long(&recycles_count, 1);
1008 		vgonel(vp);
1009 	}
1010 	VOP_UNLOCK(vp, LK_INTERLOCK);
1011 	vn_finished_write(vnmp);
1012 	return (0);
1013 }
1014 
1015 /*
1016  * Wait for available vnodes.
1017  */
1018 static int
1019 getnewvnode_wait(int suspended)
1020 {
1021 
1022 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
1023 	if (numvnodes > desiredvnodes) {
1024 		if (suspended) {
1025 			/*
1026 			 * The file system is being suspended; we cannot risk a
1027 			 * deadlock here, so allocate the new vnode anyway.
1028 			 */
1029 			if (freevnodes > wantfreevnodes)
1030 				vnlru_free(freevnodes - wantfreevnodes);
1031 			return (0);
1032 		}
1033 		if (vnlruproc_sig == 0) {
1034 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
1035 			wakeup(vnlruproc);
1036 		}
1037 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
1038 		    "vlruwk", hz);
1039 	}
1040 	return (numvnodes > desiredvnodes ? ENFILE : 0);
1041 }
1042 
1043 void
1044 getnewvnode_reserve(u_int count)
1045 {
1046 	struct thread *td;
1047 
1048 	td = curthread;
1049 	/* First try to be quick and racy. */
1050 	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
1051 		td->td_vp_reserv += count;
1052 		return;
1053 	} else
1054 		atomic_subtract_long(&numvnodes, count);
1055 
1056 	mtx_lock(&vnode_free_list_mtx);
1057 	while (count > 0) {
1058 		if (getnewvnode_wait(0) == 0) {
1059 			count--;
1060 			td->td_vp_reserv++;
1061 			atomic_add_long(&numvnodes, 1);
1062 		}
1063 	}
1064 	mtx_unlock(&vnode_free_list_mtx);
1065 }
1066 
1067 void
1068 getnewvnode_drop_reserve(void)
1069 {
1070 	struct thread *td;
1071 
1072 	td = curthread;
1073 	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
1074 	td->td_vp_reserv = 0;
1075 }
1076 
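/*
 * Usage sketch (illustrative only; "somefs" and somefs_vnodeops are
 * placeholders, not real identifiers): a filesystem that must not have
 * vnode allocation throttled while it holds its own locks can pre-pay:
 *
 *	getnewvnode_reserve(1);
 *	... acquire filesystem locks ...
 *	error = getnewvnode("somefs", mp, &somefs_vnodeops, &vp);
 *	... set up and use vp ...
 *	getnewvnode_drop_reserve();
 */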
1077 /*
1078  * Return the next vnode from the free list.
1079  */
1080 int
1081 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
1082     struct vnode **vpp)
1083 {
1084 	struct vnode *vp;
1085 	struct bufobj *bo;
1086 	struct thread *td;
1087 	int error;
1088 
1089 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1090 	vp = NULL;
1091 	td = curthread;
1092 	if (td->td_vp_reserv > 0) {
1093 		td->td_vp_reserv -= 1;
1094 		goto alloc;
1095 	}
1096 	mtx_lock(&vnode_free_list_mtx);
1097 	/*
1098 	 * Lend our context to reclaim vnodes if they've exceeded the max.
1099 	 */
1100 	if (freevnodes > wantfreevnodes)
1101 		vnlru_free(1);
1102 	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1103 	    MNTK_SUSPEND));
1104 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1105 	if (error != 0) {
1106 		mtx_unlock(&vnode_free_list_mtx);
1107 		return (error);
1108 	}
1109 #endif
1110 	atomic_add_long(&numvnodes, 1);
1111 	mtx_unlock(&vnode_free_list_mtx);
1112 alloc:
1113 	atomic_add_long(&vnodes_created, 1);
1114 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
1115 	/*
1116 	 * Setup locks.
1117 	 */
1118 	vp->v_vnlock = &vp->v_lock;
1119 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
1120 	/*
1121 	 * By default, don't allow shared locks unless filesystems
1122 	 * opt-in.
1123 	 */
1124 	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
1125 	/*
1126 	 * Initialize bufobj.
1127 	 */
1128 	bo = &vp->v_bufobj;
1129 	bo->__bo_vnode = vp;
1130 	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
1131 	bo->bo_ops = &buf_ops_bio;
1132 	bo->bo_private = vp;
1133 	TAILQ_INIT(&bo->bo_clean.bv_hd);
1134 	TAILQ_INIT(&bo->bo_dirty.bv_hd);
1135 	/*
1136 	 * Initialize namecache.
1137 	 */
1138 	LIST_INIT(&vp->v_cache_src);
1139 	TAILQ_INIT(&vp->v_cache_dst);
1140 	/*
1141 	 * Finalize various vnode identity bits.
1142 	 */
1143 	vp->v_type = VNON;
1144 	vp->v_tag = tag;
1145 	vp->v_op = vops;
1146 	v_init_counters(vp);
1147 	vp->v_data = NULL;
1148 #ifdef MAC
1149 	mac_vnode_init(vp);
1150 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1151 		mac_vnode_associate_singlelabel(mp, vp);
1152 	else if (mp == NULL && vops != &dead_vnodeops)
1153 		printf("NULL mp in getnewvnode()\n");
1154 #endif
1155 	if (mp != NULL) {
1156 		bo->bo_bsize = mp->mnt_stat.f_iosize;
1157 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1158 			vp->v_vflag |= VV_NOKNOTE;
1159 	}
1160 	rangelock_init(&vp->v_rl);
1161 
1162 	/*
1163 	 * For the filesystems which do not use vfs_hash_insert(),
1164 	 * still initialize v_hash so that vfs_hash_index() remains useful.
1165 	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1166 	 * its own hashing.
1167 	 */
1168 	vp->v_hash = (uintptr_t)vp >> vnsz2log;
1169 
1170 	*vpp = vp;
1171 	return (0);
1172 }
1173 
1174 /*
1175  * Delete from old mount point vnode list, if on one.
1176  */
1177 static void
1178 delmntque(struct vnode *vp)
1179 {
1180 	struct mount *mp;
1181 	int active;
1182 
1183 	mp = vp->v_mount;
1184 	if (mp == NULL)
1185 		return;
1186 	MNT_ILOCK(mp);
1187 	VI_LOCK(vp);
1188 	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1189 	    ("Active vnode list size %d > Vnode list size %d",
1190 	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1191 	active = vp->v_iflag & VI_ACTIVE;
1192 	vp->v_iflag &= ~VI_ACTIVE;
1193 	if (active) {
1194 		mtx_lock(&vnode_free_list_mtx);
1195 		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1196 		mp->mnt_activevnodelistsize--;
1197 		mtx_unlock(&vnode_free_list_mtx);
1198 	}
1199 	vp->v_mount = NULL;
1200 	VI_UNLOCK(vp);
1201 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1202 		("bad mount point vnode list size"));
1203 	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1204 	mp->mnt_nvnodelistsize--;
1205 	MNT_REL(mp);
1206 	MNT_IUNLOCK(mp);
1207 }
1208 
1209 static void
1210 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1211 {
1212 
1213 	vp->v_data = NULL;
1214 	vp->v_op = &dead_vnodeops;
1215 	vgone(vp);
1216 	vput(vp);
1217 }
1218 
1219 /*
1220  * Insert into list of vnodes for the new mount point, if available.
1221  */
1222 int
1223 insmntque1(struct vnode *vp, struct mount *mp,
1224 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1225 {
1226 
1227 	KASSERT(vp->v_mount == NULL,
1228 		("insmntque: vnode already on per mount vnode list"));
1229 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1230 	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1231 
1232 	/*
1233 	 * We acquire the vnode interlock early to ensure that the
1234 	 * vnode cannot be recycled by another process releasing a
1235 	 * holdcnt on it before we get it on both the vnode list
1236 	 * and the active vnode list. The mount mutex protects only
1237 	 * manipulation of the vnode list and the vnode freelist
1238 	 * mutex protects only manipulation of the active vnode list.
1239 	 * Hence the need to hold the vnode interlock throughout.
1240 	 */
1241 	MNT_ILOCK(mp);
1242 	VI_LOCK(vp);
1243 	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1244 	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1245 	    mp->mnt_nvnodelistsize == 0)) &&
1246 	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1247 		VI_UNLOCK(vp);
1248 		MNT_IUNLOCK(mp);
1249 		if (dtr != NULL)
1250 			dtr(vp, dtr_arg);
1251 		return (EBUSY);
1252 	}
1253 	vp->v_mount = mp;
1254 	MNT_REF(mp);
1255 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1256 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1257 		("neg mount point vnode list size"));
1258 	mp->mnt_nvnodelistsize++;
1259 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1260 	    ("Activating already active vnode"));
1261 	vp->v_iflag |= VI_ACTIVE;
1262 	mtx_lock(&vnode_free_list_mtx);
1263 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1264 	mp->mnt_activevnodelistsize++;
1265 	mtx_unlock(&vnode_free_list_mtx);
1266 	VI_UNLOCK(vp);
1267 	MNT_IUNLOCK(mp);
1268 	return (0);
1269 }
1270 
1271 int
1272 insmntque(struct vnode *vp, struct mount *mp)
1273 {
1274 
1275 	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1276 }
1277 
1278 /*
1279  * Flush out and invalidate all buffers associated with a bufobj
1280  * Called with the underlying object locked.
1281  */
1282 int
1283 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1284 {
1285 	int error;
1286 
1287 	BO_LOCK(bo);
1288 	if (flags & V_SAVE) {
1289 		error = bufobj_wwait(bo, slpflag, slptimeo);
1290 		if (error) {
1291 			BO_UNLOCK(bo);
1292 			return (error);
1293 		}
1294 		if (bo->bo_dirty.bv_cnt > 0) {
1295 			BO_UNLOCK(bo);
1296 			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1297 				return (error);
1298 			/*
1299 			 * XXX We could save a lock/unlock if this was only
1300 			 * enabled under INVARIANTS
1301 			 */
1302 			BO_LOCK(bo);
1303 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1304 				panic("vinvalbuf: dirty bufs");
1305 		}
1306 	}
1307 	/*
1308 	 * If you alter this loop please notice that interlock is dropped and
1309 	 * reacquired in flushbuflist.  Special care is needed to ensure that
1310 	 * no race conditions occur from this.
1311 	 */
1312 	do {
1313 		error = flushbuflist(&bo->bo_clean,
1314 		    flags, bo, slpflag, slptimeo);
1315 		if (error == 0 && !(flags & V_CLEANONLY))
1316 			error = flushbuflist(&bo->bo_dirty,
1317 			    flags, bo, slpflag, slptimeo);
1318 		if (error != 0 && error != EAGAIN) {
1319 			BO_UNLOCK(bo);
1320 			return (error);
1321 		}
1322 	} while (error != 0);
1323 
1324 	/*
1325 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1326 	 * have write I/O in-progress but if there is a VM object then the
1327 	 * VM object can also have read-I/O in-progress.
1328 	 */
1329 	do {
1330 		bufobj_wwait(bo, 0, 0);
1331 		BO_UNLOCK(bo);
1332 		if (bo->bo_object != NULL) {
1333 			VM_OBJECT_WLOCK(bo->bo_object);
1334 			vm_object_pip_wait(bo->bo_object, "bovlbx");
1335 			VM_OBJECT_WUNLOCK(bo->bo_object);
1336 		}
1337 		BO_LOCK(bo);
1338 	} while (bo->bo_numoutput > 0);
1339 	BO_UNLOCK(bo);
1340 
1341 	/*
1342 	 * Destroy the copy in the VM cache, too.
1343 	 */
1344 	if (bo->bo_object != NULL &&
1345 	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
1346 		VM_OBJECT_WLOCK(bo->bo_object);
1347 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1348 		    OBJPR_CLEANONLY : 0);
1349 		VM_OBJECT_WUNLOCK(bo->bo_object);
1350 	}
1351 
1352 #ifdef INVARIANTS
1353 	BO_LOCK(bo);
1354 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
1355 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1356 		panic("vinvalbuf: flush failed");
1357 	BO_UNLOCK(bo);
1358 #endif
1359 	return (0);
1360 }
1361 
1362 /*
1363  * Flush out and invalidate all buffers associated with a vnode.
1364  * Called with the underlying object locked.
1365  */
1366 int
1367 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1368 {
1369 
1370 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1371 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1372 	if (vp->v_object != NULL && vp->v_object->handle != vp)
1373 		return (0);
1374 	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1375 }
1376 
1377 /*
1378  * Flush out buffers on the specified list.
1379  *
1380  */
1381 static int
1382 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1383     int slptimeo)
1384 {
1385 	struct buf *bp, *nbp;
1386 	int retval, error;
1387 	daddr_t lblkno;
1388 	b_xflags_t xflags;
1389 
1390 	ASSERT_BO_WLOCKED(bo);
1391 
1392 	retval = 0;
1393 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1394 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1395 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1396 			continue;
1397 		}
1398 		lblkno = 0;
1399 		xflags = 0;
1400 		if (nbp != NULL) {
1401 			lblkno = nbp->b_lblkno;
1402 			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
1403 		}
1404 		retval = EAGAIN;
1405 		error = BUF_TIMELOCK(bp,
1406 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
1407 		    "flushbuf", slpflag, slptimeo);
1408 		if (error) {
1409 			BO_LOCK(bo);
1410 			return (error != ENOLCK ? error : EAGAIN);
1411 		}
1412 		KASSERT(bp->b_bufobj == bo,
1413 		    ("bp %p wrong b_bufobj %p should be %p",
1414 		    bp, bp->b_bufobj, bo));
1415 		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1416 			BUF_UNLOCK(bp);
1417 			BO_LOCK(bo);
1418 			return (EAGAIN);
1419 		}
1420 		/*
1421 		 * XXX Since there are no node locks for NFS, I
1422 		 * believe there is a slight chance that a delayed
1423 		 * write will occur while sleeping just above, so
1424 		 * check for it.
1425 		 */
1426 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1427 		    (flags & V_SAVE)) {
1428 			bremfree(bp);
1429 			bp->b_flags |= B_ASYNC;
1430 			bwrite(bp);
1431 			BO_LOCK(bo);
1432 			return (EAGAIN);	/* XXX: why not loop ? */
1433 		}
1434 		bremfree(bp);
1435 		bp->b_flags |= (B_INVAL | B_RELBUF);
1436 		bp->b_flags &= ~B_ASYNC;
1437 		brelse(bp);
1438 		BO_LOCK(bo);
1439 		if (nbp != NULL &&
1440 		    (nbp->b_bufobj != bo ||
1441 		     nbp->b_lblkno != lblkno ||
1442 		     (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1443 			break;			/* nbp invalid */
1444 	}
1445 	return (retval);
1446 }
1447 
1448 /*
1449  * Truncate a file's buffer and pages to a specified length.  This
1450  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1451  * sync activity.
1452  */
1453 int
1454 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1455 {
1456 	struct buf *bp, *nbp;
1457 	int anyfreed;
1458 	int trunclbn;
1459 	struct bufobj *bo;
1460 
1461 	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1462 	    vp, cred, blksize, (uintmax_t)length);
1463 
1464 	/*
1465 	 * Round up to the *next* lbn.
1466 	 */
1467 	trunclbn = (length + blksize - 1) / blksize;
1468 
1469 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1470 restart:
1471 	bo = &vp->v_bufobj;
1472 	BO_LOCK(bo);
1473 	anyfreed = 1;
1474 	for (;anyfreed;) {
1475 		anyfreed = 0;
1476 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1477 			if (bp->b_lblkno < trunclbn)
1478 				continue;
1479 			if (BUF_LOCK(bp,
1480 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1481 			    BO_LOCKPTR(bo)) == ENOLCK)
1482 				goto restart;
1483 
1484 			bremfree(bp);
1485 			bp->b_flags |= (B_INVAL | B_RELBUF);
1486 			bp->b_flags &= ~B_ASYNC;
1487 			brelse(bp);
1488 			anyfreed = 1;
1489 
1490 			BO_LOCK(bo);
1491 			if (nbp != NULL &&
1492 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1493 			    (nbp->b_vp != vp) ||
1494 			    (nbp->b_flags & B_DELWRI))) {
1495 				BO_UNLOCK(bo);
1496 				goto restart;
1497 			}
1498 		}
1499 
1500 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1501 			if (bp->b_lblkno < trunclbn)
1502 				continue;
1503 			if (BUF_LOCK(bp,
1504 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1505 			    BO_LOCKPTR(bo)) == ENOLCK)
1506 				goto restart;
1507 			bremfree(bp);
1508 			bp->b_flags |= (B_INVAL | B_RELBUF);
1509 			bp->b_flags &= ~B_ASYNC;
1510 			brelse(bp);
1511 			anyfreed = 1;
1512 
1513 			BO_LOCK(bo);
1514 			if (nbp != NULL &&
1515 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1516 			    (nbp->b_vp != vp) ||
1517 			    (nbp->b_flags & B_DELWRI) == 0)) {
1518 				BO_UNLOCK(bo);
1519 				goto restart;
1520 			}
1521 		}
1522 	}
1523 
1524 	if (length > 0) {
1525 restartsync:
1526 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1527 			if (bp->b_lblkno > 0)
1528 				continue;
1529 			/*
1530 			 * Since we hold the vnode lock this should only
1531 			 * fail if we're racing with the buf daemon.
1532 			 */
1533 			if (BUF_LOCK(bp,
1534 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1535 			    BO_LOCKPTR(bo)) == ENOLCK) {
1536 				goto restart;
1537 			}
1538 			VNASSERT((bp->b_flags & B_DELWRI), vp,
1539 			    ("buf(%p) on dirty queue without DELWRI", bp));
1540 
1541 			bremfree(bp);
1542 			bawrite(bp);
1543 			BO_LOCK(bo);
1544 			goto restartsync;
1545 		}
1546 	}
1547 
1548 	bufobj_wwait(bo, 0, 0);
1549 	BO_UNLOCK(bo);
1550 	vnode_pager_setsize(vp, length);
1551 
1552 	return (0);
1553 }
1554 
1555 static void
1556 buf_vlist_remove(struct buf *bp)
1557 {
1558 	struct bufv *bv;
1559 
1560 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1561 	ASSERT_BO_WLOCKED(bp->b_bufobj);
1562 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1563 	    (BX_VNDIRTY|BX_VNCLEAN),
1564 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1565 	if (bp->b_xflags & BX_VNDIRTY)
1566 		bv = &bp->b_bufobj->bo_dirty;
1567 	else
1568 		bv = &bp->b_bufobj->bo_clean;
1569 	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
1570 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1571 	bv->bv_cnt--;
1572 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1573 }
1574 
1575 /*
1576  * Add the buffer to the sorted clean or dirty block list.
1577  *
1578  * NOTE: xflags is passed as a constant, optimizing this inline function!
1579  */
1580 static void
1581 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1582 {
1583 	struct bufv *bv;
1584 	struct buf *n;
1585 	int error;
1586 
1587 	ASSERT_BO_WLOCKED(bo);
1588 	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
1589 	    ("dead bo %p", bo));
1590 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1591 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1592 	bp->b_xflags |= xflags;
1593 	if (xflags & BX_VNDIRTY)
1594 		bv = &bo->bo_dirty;
1595 	else
1596 		bv = &bo->bo_clean;
1597 
1598 	/*
1599 	 * Keep the list ordered.  Optimize empty list insertion.  Assume
1600 	 * we tend to grow at the tail so lookup_le should usually be cheaper
1601 	 * than _ge.
1602 	 */
1603 	if (bv->bv_cnt == 0 ||
1604 	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
1605 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1606 	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
1607 		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
1608 	else
1609 		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
1610 	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
1611 	if (error)
1612 		panic("buf_vlist_add:  Preallocated nodes insufficient.");
1613 	bv->bv_cnt++;
1614 }
1615 
1616 /*
1617  * Look up a buffer using the buffer tries.
1618  */
1619 struct buf *
1620 gbincore(struct bufobj *bo, daddr_t lblkno)
1621 {
1622 	struct buf *bp;
1623 
1624 	ASSERT_BO_LOCKED(bo);
1625 	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
1626 	if (bp != NULL)
1627 		return (bp);
1628 	return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
1629 }
1630 
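/*
 * Usage sketch (illustrative only): callers probe for a cached block
 * while holding the bufobj lock and must tolerate a NULL result, e.g.
 *
 *	BO_LOCK(bo);
 *	bp = gbincore(bo, lblkno);
 *	BO_UNLOCK(bo);
 *	if (bp == NULL)
 *		... block is not cached; read or allocate it ...
 */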
1631 /*
1632  * Associate a buffer with a vnode.
1633  */
1634 void
1635 bgetvp(struct vnode *vp, struct buf *bp)
1636 {
1637 	struct bufobj *bo;
1638 
1639 	bo = &vp->v_bufobj;
1640 	ASSERT_BO_WLOCKED(bo);
1641 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1642 
1643 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1644 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1645 	    ("bgetvp: bp already attached! %p", bp));
1646 
1647 	vhold(vp);
1648 	bp->b_vp = vp;
1649 	bp->b_bufobj = bo;
1650 	/*
1651 	 * Insert onto list for new vnode.
1652 	 */
1653 	buf_vlist_add(bp, bo, BX_VNCLEAN);
1654 }
1655 
1656 /*
1657  * Disassociate a buffer from a vnode.
1658  */
1659 void
1660 brelvp(struct buf *bp)
1661 {
1662 	struct bufobj *bo;
1663 	struct vnode *vp;
1664 
1665 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1666 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1667 
1668 	/*
1669 	 * Delete from old vnode list, if on one.
1670 	 */
1671 	vp = bp->b_vp;		/* XXX */
1672 	bo = bp->b_bufobj;
1673 	BO_LOCK(bo);
1674 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1675 		buf_vlist_remove(bp);
1676 	else
1677 		panic("brelvp: Buffer %p not on queue.", bp);
1678 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1679 		bo->bo_flag &= ~BO_ONWORKLST;
1680 		mtx_lock(&sync_mtx);
1681 		LIST_REMOVE(bo, bo_synclist);
1682 		syncer_worklist_len--;
1683 		mtx_unlock(&sync_mtx);
1684 	}
1685 	bp->b_vp = NULL;
1686 	bp->b_bufobj = NULL;
1687 	BO_UNLOCK(bo);
1688 	vdrop(vp);
1689 }
1690 
1691 /*
1692  * Add an item to the syncer work queue.
1693  */
1694 static void
1695 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1696 {
1697 	int slot;
1698 
1699 	ASSERT_BO_WLOCKED(bo);
1700 
1701 	mtx_lock(&sync_mtx);
1702 	if (bo->bo_flag & BO_ONWORKLST)
1703 		LIST_REMOVE(bo, bo_synclist);
1704 	else {
1705 		bo->bo_flag |= BO_ONWORKLST;
1706 		syncer_worklist_len++;
1707 	}
1708 
1709 	if (delay > syncer_maxdelay - 2)
1710 		delay = syncer_maxdelay - 2;
1711 	slot = (syncer_delayno + delay) & syncer_mask;
1712 
1713 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1714 	mtx_unlock(&sync_mtx);
1715 }
1716 
1717 static int
1718 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1719 {
1720 	int error, len;
1721 
1722 	mtx_lock(&sync_mtx);
1723 	len = syncer_worklist_len - sync_vnode_count;
1724 	mtx_unlock(&sync_mtx);
1725 	error = SYSCTL_OUT(req, &len, sizeof(len));
1726 	return (error);
1727 }
1728 
1729 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1730     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1731 
1732 static struct proc *updateproc;
1733 static void sched_sync(void);
1734 static struct kproc_desc up_kp = {
1735 	"syncer",
1736 	sched_sync,
1737 	&updateproc
1738 };
1739 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1740 
1741 static int
1742 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1743 {
1744 	struct vnode *vp;
1745 	struct mount *mp;
1746 
1747 	*bo = LIST_FIRST(slp);
1748 	if (*bo == NULL)
1749 		return (0);
1750 	vp = (*bo)->__bo_vnode;	/* XXX */
1751 	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1752 		return (1);
1753 	/*
1754 	 * We use vhold in case the vnode does not
1755 	 * successfully sync.  vhold prevents the vnode from
1756 	 * going away when we unlock the sync_mtx so that
1757 	 * we can acquire the vnode interlock.
1758 	 */
1759 	vholdl(vp);
1760 	mtx_unlock(&sync_mtx);
1761 	VI_UNLOCK(vp);
1762 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1763 		vdrop(vp);
1764 		mtx_lock(&sync_mtx);
1765 		return (*bo == LIST_FIRST(slp));
1766 	}
1767 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1768 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1769 	VOP_UNLOCK(vp, 0);
1770 	vn_finished_write(mp);
1771 	BO_LOCK(*bo);
1772 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1773 		/*
1774 		 * Put us back on the worklist.  The worklist
1775 		 * routine will remove us from our current
1776 		 * position and then add us back in at a later
1777 		 * position.
1778 		 */
1779 		vn_syncer_add_to_worklist(*bo, syncdelay);
1780 	}
1781 	BO_UNLOCK(*bo);
1782 	vdrop(vp);
1783 	mtx_lock(&sync_mtx);
1784 	return (0);
1785 }
1786 
1787 static int first_printf = 1;
1788 
1789 /*
1790  * System filesystem synchronizer daemon.
1791  */
1792 static void
1793 sched_sync(void)
1794 {
1795 	struct synclist *next, *slp;
1796 	struct bufobj *bo;
1797 	long starttime;
1798 	struct thread *td = curthread;
1799 	int last_work_seen;
1800 	int net_worklist_len;
1801 	int syncer_final_iter;
1802 	int error;
1803 
1804 	last_work_seen = 0;
1805 	syncer_final_iter = 0;
1806 	syncer_state = SYNCER_RUNNING;
1807 	starttime = time_uptime;
1808 	td->td_pflags |= TDP_NORUNNINGBUF;
1809 
1810 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1811 	    SHUTDOWN_PRI_LAST);
1812 
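	/*
	 * Main loop: each pass advances syncer_delayno to the next bucket
	 * of the syncer wheel and flushes every bufobj queued there, then
	 * sleeps until the next second unless rushjob asks for more work.
	 * During shutdown the wheel is drained until only the per-mount
	 * syncer vnodes remain, after which a few FINAL_DELAY passes are
	 * run before the thread parks in kproc_suspend_check().
	 */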
1813 	mtx_lock(&sync_mtx);
1814 	for (;;) {
1815 		if (syncer_state == SYNCER_FINAL_DELAY &&
1816 		    syncer_final_iter == 0) {
1817 			mtx_unlock(&sync_mtx);
1818 			kproc_suspend_check(td->td_proc);
1819 			mtx_lock(&sync_mtx);
1820 		}
1821 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1822 		if (syncer_state != SYNCER_RUNNING &&
1823 		    starttime != time_uptime) {
1824 			if (first_printf) {
1825 				printf("\nSyncing disks, vnodes remaining...");
1826 				first_printf = 0;
1827 			}
1828 			printf("%d ", net_worklist_len);
1829 		}
1830 		starttime = time_uptime;
1831 
1832 		/*
1833 		 * Push files whose dirty time has expired.  Be careful
1834 		 * of interrupt race on slp queue.
1835 		 *
1836 		 * Skip over empty worklist slots when shutting down.
1837 		 */
1838 		do {
1839 			slp = &syncer_workitem_pending[syncer_delayno];
1840 			syncer_delayno += 1;
1841 			if (syncer_delayno == syncer_maxdelay)
1842 				syncer_delayno = 0;
1843 			next = &syncer_workitem_pending[syncer_delayno];
1844 			/*
1845 			 * If the worklist has wrapped since the
1846 			 * If the worklist has wrapped since it
1847 			 * was emptied of all but syncer vnodes,
1848 			 * for one more second.
1849 			 */
1850 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1851 			    net_worklist_len == 0 &&
1852 			    last_work_seen == syncer_delayno) {
1853 				syncer_state = SYNCER_FINAL_DELAY;
1854 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1855 			}
1856 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1857 		    syncer_worklist_len > 0);
1858 
1859 		/*
1860 		 * Keep track of the last time there was anything
1861 		 * on the worklist other than syncer vnodes.
1862 		 * Return to the SHUTTING_DOWN state if any
1863 		 * new work appears.
1864 		 */
1865 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1866 			last_work_seen = syncer_delayno;
1867 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1868 			syncer_state = SYNCER_SHUTTING_DOWN;
1869 		while (!LIST_EMPTY(slp)) {
1870 			error = sync_vnode(slp, &bo, td);
1871 			if (error == 1) {
1872 				LIST_REMOVE(bo, bo_synclist);
1873 				LIST_INSERT_HEAD(next, bo, bo_synclist);
1874 				continue;
1875 			}
1876 
1877 			if (first_printf == 0) {
1878 				/*
1879 				 * Drop the sync mutex, because some watchdog
1880 				 * drivers need to sleep while patting
1881 				 * drivers need to sleep while patting the watchdog.
1882 				mtx_unlock(&sync_mtx);
1883 				wdog_kern_pat(WD_LASTVAL);
1884 				mtx_lock(&sync_mtx);
1885 			}
1886 
1887 		}
1888 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1889 			syncer_final_iter--;
1890 		/*
1891 		 * The variable rushjob allows the kernel to speed up the
1892 		 * processing of the filesystem syncer process. A rushjob
1893 		 * value of N tells the filesystem syncer to process the next
1894 		 * N seconds worth of work on its queue ASAP. Currently rushjob
1895 		 * is used by the soft update code to speed up the filesystem
1896 		 * syncer process when the incore state is getting so far
1897 		 * ahead of the disk that the kernel memory pool is being
1898 		 * threatened with exhaustion.
1899 		 */
1900 		if (rushjob > 0) {
1901 			rushjob -= 1;
1902 			continue;
1903 		}
1904 		/*
1905 		 * Just sleep for a short period of time between
1906 		 * iterations when shutting down to allow some I/O
1907 		 * to happen.
1908 		 *
1909 		 * If it has taken us less than a second to process the
1910 		 * current work, then wait. Otherwise start right over
1911 		 * again. We can still lose time if any single round
1912 		 * takes more than two seconds, but it does not really
1913 		 * matter as we are just trying to generally pace the
1914 		 * filesystem activity.
1915 		 */
1916 		if (syncer_state != SYNCER_RUNNING ||
1917 		    time_uptime == starttime) {
1918 			thread_lock(td);
1919 			sched_prio(td, PPAUSE);
1920 			thread_unlock(td);
1921 		}
1922 		if (syncer_state != SYNCER_RUNNING)
1923 			cv_timedwait(&sync_wakeup, &sync_mtx,
1924 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1925 		else if (time_uptime == starttime)
1926 			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1927 	}
1928 }
1929 
1930 /*
1931  * Request the syncer daemon to speed up its work.
1932  * We never push it to speed up more than half of its
1933  * normal turn time, otherwise it could take over the cpu.
1934  */
1935 int
1936 speedup_syncer(void)
1937 {
1938 	int ret = 0;
1939 
1940 	mtx_lock(&sync_mtx);
1941 	if (rushjob < syncdelay / 2) {
1942 		rushjob += 1;
1943 		stat_rush_requests += 1;
1944 		ret = 1;
1945 	}
1946 	mtx_unlock(&sync_mtx);
1947 	cv_broadcast(&sync_wakeup);
1948 	return (ret);
1949 }
1950 
1951 /*
1952  * Tell the syncer to speed up its work and run through its work
1953  * list several times, then tell it to shut down.
1954  */
1955 static void
1956 syncer_shutdown(void *arg, int howto)
1957 {
1958 
1959 	if (howto & RB_NOSYNC)
1960 		return;
1961 	mtx_lock(&sync_mtx);
1962 	syncer_state = SYNCER_SHUTTING_DOWN;
1963 	rushjob = 0;
1964 	mtx_unlock(&sync_mtx);
1965 	cv_broadcast(&sync_wakeup);
1966 	kproc_shutdown(arg, howto);
1967 }
1968 
1969 void
1970 syncer_suspend(void)
1971 {
1972 
1973 	syncer_shutdown(updateproc, 0);
1974 }
1975 
1976 void
1977 syncer_resume(void)
1978 {
1979 
1980 	mtx_lock(&sync_mtx);
1981 	first_printf = 1;
1982 	syncer_state = SYNCER_RUNNING;
1983 	mtx_unlock(&sync_mtx);
1984 	cv_broadcast(&sync_wakeup);
1985 	kproc_resume(updateproc);
1986 }
1987 
1988 /*
1989  * Reassign a buffer from one vnode to another.
1990  * Used to assign file specific control information
1991  * (indirect blocks) to the vnode to which they belong.
1992  */
1993 void
1994 reassignbuf(struct buf *bp)
1995 {
1996 	struct vnode *vp;
1997 	struct bufobj *bo;
1998 	int delay;
1999 #ifdef INVARIANTS
2000 	struct bufv *bv;
2001 #endif
2002 
2003 	vp = bp->b_vp;
2004 	bo = bp->b_bufobj;
2005 	++reassignbufcalls;
2006 
2007 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2008 	    bp, bp->b_vp, bp->b_flags);
2009 	/*
2010 	 * B_PAGING flagged buffers cannot be reassigned because their vp
2011 	 * is not fully linked in.
2012 	 */
2013 	if (bp->b_flags & B_PAGING)
2014 		panic("cannot reassign paging buffer");
2015 
2016 	/*
2017 	 * Delete from old vnode list, if on one.
2018 	 */
2019 	BO_LOCK(bo);
2020 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2021 		buf_vlist_remove(bp);
2022 	else
2023 		panic("reassignbuf: Buffer %p not on queue.", bp);
2024 	/*
2025 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
2026 	 * of clean buffers.
2027 	 */
2028 	if (bp->b_flags & B_DELWRI) {
2029 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
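			/*
			 * Choose the syncer delay by vnode type: directories
			 * use dirdelay, character devices (metadata) use
			 * metadelay, and everything else uses filedelay.
			 */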
2030 			switch (vp->v_type) {
2031 			case VDIR:
2032 				delay = dirdelay;
2033 				break;
2034 			case VCHR:
2035 				delay = metadelay;
2036 				break;
2037 			default:
2038 				delay = filedelay;
2039 			}
2040 			vn_syncer_add_to_worklist(bo, delay);
2041 		}
2042 		buf_vlist_add(bp, bo, BX_VNDIRTY);
2043 	} else {
2044 		buf_vlist_add(bp, bo, BX_VNCLEAN);
2045 
2046 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2047 			mtx_lock(&sync_mtx);
2048 			LIST_REMOVE(bo, bo_synclist);
2049 			syncer_worklist_len--;
2050 			mtx_unlock(&sync_mtx);
2051 			bo->bo_flag &= ~BO_ONWORKLST;
2052 		}
2053 	}
2054 #ifdef INVARIANTS
2055 	bv = &bo->bo_clean;
2056 	bp = TAILQ_FIRST(&bv->bv_hd);
2057 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2058 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2059 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2060 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2061 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2062 	bv = &bo->bo_dirty;
2063 	bp = TAILQ_FIRST(&bv->bv_hd);
2064 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2065 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2066 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2067 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2068 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2069 #endif
2070 	BO_UNLOCK(bo);
2071 }
2072 
2073 /*
2074  * A temporary hack until refcount_* APIs are sorted out.
2075  */
2076 static __inline int
2077 vfs_refcount_acquire_if_not_zero(volatile u_int *count)
2078 {
2079 	u_int old;
2080 
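	/*
	 * CAS loop: take a reference only while the count is observed to be
	 * non-zero, so that an object whose last reference is being dropped
	 * is never resurrected.
	 */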
2081 	for (;;) {
2082 		old = *count;
2083 		if (old == 0)
2084 			return (0);
2085 		if (atomic_cmpset_int(count, old, old + 1))
2086 			return (1);
2087 	}
2088 }
2089 
2090 static __inline int
2091 vfs_refcount_release_if_not_last(volatile u_int *count)
2092 {
2093 	u_int old;
2094 
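	/*
	 * CAS loop: drop a reference only if it is not the last one; the
	 * 1->0 transition is left to the caller, which performs it under
	 * the vnode interlock.
	 */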
2095 	for (;;) {
2096 		old = *count;
2097 		if (old == 1)
2098 			return (0);
2099 		if (atomic_cmpset_int(count, old, old - 1))
2100 			return (1);
2101 	}
2102 }
2103 
2104 static void
2105 v_init_counters(struct vnode *vp)
2106 {
2107 
2108 	VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
2109 	    vp, ("%s called for an initialized vnode", __FUNCTION__));
2110 	ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
2111 
2112 	refcount_init(&vp->v_holdcnt, 1);
2113 	refcount_init(&vp->v_usecount, 1);
2114 }
2115 
2116 /*
2117  * Increment the use and hold counts on the vnode, taking care to reference
2118  * the driver's usecount if this is a chardev.  The _vhold() will remove
2119  * the vnode from the free list if it is presently free.
2120  */
2121 static void
2122 v_incr_usecount(struct vnode *vp)
2123 {
2124 
2125 	ASSERT_VI_UNLOCKED(vp, __func__);
2126 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2127 
2128 	if (vp->v_type == VCHR) {
2129 		VI_LOCK(vp);
2130 		_vhold(vp, true);
2131 		if (vp->v_iflag & VI_OWEINACT) {
2132 			VNASSERT(vp->v_usecount == 0, vp,
2133 			    ("vnode with usecount and VI_OWEINACT set"));
2134 			vp->v_iflag &= ~VI_OWEINACT;
2135 		}
2136 		refcount_acquire(&vp->v_usecount);
2137 		v_incr_devcount(vp);
2138 		VI_UNLOCK(vp);
2139 		return;
2140 	}
2141 
2142 	_vhold(vp, false);
2143 	if (vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
2144 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2145 		    ("vnode with usecount and VI_OWEINACT set"));
2146 	} else {
2147 		VI_LOCK(vp);
2148 		if (vp->v_iflag & VI_OWEINACT)
2149 			vp->v_iflag &= ~VI_OWEINACT;
2150 		refcount_acquire(&vp->v_usecount);
2151 		VI_UNLOCK(vp);
2152 	}
2153 }
2154 
2155 /*
2156  * Increment si_usecount of the associated device, if any.
2157  */
2158 static void
2159 v_incr_devcount(struct vnode *vp)
2160 {
2161 
2162 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2163 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2164 		dev_lock();
2165 		vp->v_rdev->si_usecount++;
2166 		dev_unlock();
2167 	}
2168 }
2169 
2170 /*
2171  * Decrement si_usecount of the associated device, if any.
2172  */
2173 static void
2174 v_decr_devcount(struct vnode *vp)
2175 {
2176 
2177 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2178 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2179 		dev_lock();
2180 		vp->v_rdev->si_usecount--;
2181 		dev_unlock();
2182 	}
2183 }
2184 
2185 /*
2186  * Grab a particular vnode from the free list, increment its
2187  * reference count and lock it.  VI_DOOMED is set if the vnode
2188  * is being destroyed.  Only callers who specify LK_RETRY will
2189  * see doomed vnodes.  If inactive processing was delayed in
2190  * vput, try to do it here.
2191  *
2192  * Notes on lockless counter manipulation:
2193  * _vhold, vputx and other routines make various decisions based
2194  * on either holdcnt or usecount being 0. As long as either counter
2195  * is not transitioning 0->1 or 1->0, the manipulation can be done
2196  * with atomic operations. Otherwise the interlock is taken.
2197  */
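/*
 * Illustrative caller sketch (mirroring vfs_msync() later in this file):
 * with the interlock held, LK_INTERLOCK hands it to vget(), which releases
 * it while acquiring the vnode lock:
 *
 *	VI_LOCK(vp);
 *	if (vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread) == 0) {
 *		...use the vnode...
 *		vput(vp);
 *	}
 */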
2198 int
2199 vget(struct vnode *vp, int flags, struct thread *td)
2200 {
2201 	int error, oweinact;
2202 
2203 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2204 	    ("vget: invalid lock operation"));
2205 
2206 	if ((flags & LK_INTERLOCK) != 0)
2207 		ASSERT_VI_LOCKED(vp, __func__);
2208 	else
2209 		ASSERT_VI_UNLOCKED(vp, __func__);
2210 	if ((flags & LK_VNHELD) != 0)
2211 		VNASSERT((vp->v_holdcnt > 0), vp,
2212 		    ("vget: LK_VNHELD passed but vnode not held"));
2213 
2214 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2215 
2216 	if ((flags & LK_VNHELD) == 0)
2217 		_vhold(vp, (flags & LK_INTERLOCK) != 0);
2218 
2219 	if ((error = vn_lock(vp, flags)) != 0) {
2220 		vdrop(vp);
2221 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2222 		    vp);
2223 		return (error);
2224 	}
2225 	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2226 		panic("vget: vn_lock failed to return ENOENT\n");
2227 	/*
2228 	 * We don't guarantee that any particular close will
2229 	 * trigger inactive processing, so just make a best effort
2230 	 * here at preventing a reference to a removed file.  If
2231 	 * we don't succeed, no harm is done.
2232 	 *
2233 	 * Upgrade our holdcnt to a usecount.
2234 	 */
2235 	if (vp->v_type != VCHR &&
2236 	    vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
2237 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2238 		    ("vnode with usecount and VI_OWEINACT set"));
2239 	} else {
2240 		VI_LOCK(vp);
2241 		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2242 			oweinact = 0;
2243 		} else {
2244 			oweinact = 1;
2245 			vp->v_iflag &= ~VI_OWEINACT;
2246 		}
2247 		refcount_acquire(&vp->v_usecount);
2248 		v_incr_devcount(vp);
2249 		if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2250 		    (flags & LK_NOWAIT) == 0)
2251 			vinactive(vp, td);
2252 		VI_UNLOCK(vp);
2253 	}
2254 	return (0);
2255 }
2256 
2257 /*
2258  * Increase the reference count of a vnode.
2259  */
2260 void
2261 vref(struct vnode *vp)
2262 {
2263 
2264 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2265 	v_incr_usecount(vp);
2266 }
2267 
2268 /*
2269  * Return reference count of a vnode.
2270  *
2271  * The results of this call are only guaranteed when some mechanism is used to
2272  * stop other processes from gaining references to the vnode.  This may be the
2273  * case if the caller holds the only reference.  This is also useful when stale
2274  * data is acceptable as race conditions may be accounted for by some other
2275  * means.
2276  */
2277 int
2278 vrefcnt(struct vnode *vp)
2279 {
2280 
2281 	return (vp->v_usecount);
2282 }
2283 
2284 #define	VPUTX_VRELE	1
2285 #define	VPUTX_VPUT	2
2286 #define	VPUTX_VUNREF	3
2287 
2288 /*
2289  * Decrement the use and hold counts for a vnode.
2290  *
2291  * See an explanation near vget() as to why atomic operation is safe.
2292  */
2293 static void
2294 vputx(struct vnode *vp, int func)
2295 {
2296 	int error;
2297 
2298 	KASSERT(vp != NULL, ("vputx: null vp"));
2299 	if (func == VPUTX_VUNREF)
2300 		ASSERT_VOP_LOCKED(vp, "vunref");
2301 	else if (func == VPUTX_VPUT)
2302 		ASSERT_VOP_LOCKED(vp, "vput");
2303 	else
2304 		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2305 	ASSERT_VI_UNLOCKED(vp, __func__);
2306 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2307 
2308 	if (vp->v_type != VCHR &&
2309 	    vfs_refcount_release_if_not_last(&vp->v_usecount)) {
2310 		if (func == VPUTX_VPUT)
2311 			VOP_UNLOCK(vp, 0);
2312 		vdrop(vp);
2313 		return;
2314 	}
2315 
2316 	VI_LOCK(vp);
2317 
2318 	/*
2319 	 * We want to hold the vnode until the inactive finishes to
2320 	 * prevent vgone() races.  We drop the use count here and the
2321 	 * hold count below when we're done.
2322 	 */
2323 	if (!refcount_release(&vp->v_usecount) ||
2324 	    (vp->v_iflag & VI_DOINGINACT)) {
2325 		if (func == VPUTX_VPUT)
2326 			VOP_UNLOCK(vp, 0);
2327 		v_decr_devcount(vp);
2328 		vdropl(vp);
2329 		return;
2330 	}
2331 
2332 	v_decr_devcount(vp);
2333 
2334 	error = 0;
2335 
2336 	if (vp->v_usecount != 0) {
2337 		vprint("vputx: usecount not zero", vp);
2338 		panic("vputx: usecount not zero");
2339 	}
2340 
2341 	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2342 
2343 	/*
2344 	 * We must call VOP_INACTIVE with the node locked.  Set VI_OWEINACT
2345 	 * here; vinactive() itself marks VI_DOINGINACT to avoid recursion.
2346 	 */
2347 	vp->v_iflag |= VI_OWEINACT;
2348 	switch (func) {
2349 	case VPUTX_VRELE:
2350 		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2351 		VI_LOCK(vp);
2352 		break;
2353 	case VPUTX_VPUT:
2354 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2355 			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2356 			    LK_NOWAIT);
2357 			VI_LOCK(vp);
2358 		}
2359 		break;
2360 	case VPUTX_VUNREF:
2361 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2362 			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
2363 			VI_LOCK(vp);
2364 		}
2365 		break;
2366 	}
2367 	VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
2368 	    ("vnode with usecount and VI_OWEINACT set"));
2369 	if (error == 0) {
2370 		if (vp->v_iflag & VI_OWEINACT)
2371 			vinactive(vp, curthread);
2372 		if (func != VPUTX_VUNREF)
2373 			VOP_UNLOCK(vp, 0);
2374 	}
2375 	vdropl(vp);
2376 }
2377 
2378 /*
2379  * Vnode put/release.
2380  * If count drops to zero, call inactive routine and return to freelist.
2381  */
2382 void
2383 vrele(struct vnode *vp)
2384 {
2385 
2386 	vputx(vp, VPUTX_VRELE);
2387 }
2388 
2389 /*
2390  * Release an already locked vnode.  This gives the same effect as
2391  * unlock+vrele(), but takes less time and avoids releasing and
2392  * re-acquiring the lock (as vrele() acquires the lock internally).
2393  */
2394 void
2395 vput(struct vnode *vp)
2396 {
2397 
2398 	vputx(vp, VPUTX_VPUT);
2399 }
2400 
2401 /*
2402  * Release an exclusively locked vnode. Do not unlock the vnode lock.
2403  */
2404 void
2405 vunref(struct vnode *vp)
2406 {
2407 
2408 	vputx(vp, VPUTX_VUNREF);
2409 }
2410 
2411 /*
2412  * Increase the hold count and activate if this is the first reference.
2413  */
2414 void
2415 _vhold(struct vnode *vp, bool locked)
2416 {
2417 	struct mount *mp;
2418 
2419 	if (locked)
2420 		ASSERT_VI_LOCKED(vp, __func__);
2421 	else
2422 		ASSERT_VI_UNLOCKED(vp, __func__);
2423 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
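	/*
	 * Fast path: a vnode that is already held cannot be on the free
	 * list, so the hold count can be bumped without the interlock.
	 */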
2424 	if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
2425 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2426 		    ("_vhold: vnode with holdcnt is free"));
2427 		return;
2428 	}
2429 
2430 	if (!locked)
2431 		VI_LOCK(vp);
2432 	if ((vp->v_iflag & VI_FREE) == 0) {
2433 		refcount_acquire(&vp->v_holdcnt);
2434 		if (!locked)
2435 			VI_UNLOCK(vp);
2436 		return;
2437 	}
2438 	VNASSERT(vp->v_holdcnt == 0, vp,
2439 	    ("%s: wrong hold count", __func__));
2440 	VNASSERT(vp->v_op != NULL, vp,
2441 	    ("%s: vnode already reclaimed.", __func__));
2442 	/*
2443 	 * Remove a vnode from the free list, mark it as in use,
2444 	 * and put it on the active list.
2445 	 */
2446 	mtx_lock(&vnode_free_list_mtx);
2447 	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2448 	freevnodes--;
2449 	vp->v_iflag &= ~(VI_FREE|VI_AGE);
2450 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2451 	    ("Activating already active vnode"));
2452 	vp->v_iflag |= VI_ACTIVE;
2453 	mp = vp->v_mount;
2454 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2455 	mp->mnt_activevnodelistsize++;
2456 	mtx_unlock(&vnode_free_list_mtx);
2457 	refcount_acquire(&vp->v_holdcnt);
2458 	if (!locked)
2459 		VI_UNLOCK(vp);
2460 }
2461 
2462 /*
2463  * Drop the hold count of the vnode.  If this is the last reference to
2464  * the vnode, we place it on the free list unless it has been vgone'd
2465  * (marked VI_DOOMED), in which case we will free it.
2466  */
2467 void
2468 _vdrop(struct vnode *vp, bool locked)
2469 {
2470 	struct bufobj *bo;
2471 	struct mount *mp;
2472 	int active;
2473 
2474 	if (locked)
2475 		ASSERT_VI_LOCKED(vp, __func__);
2476 	else
2477 		ASSERT_VI_UNLOCKED(vp, __func__);
2478 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2479 	if ((int)vp->v_holdcnt <= 0)
2480 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
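	/*
	 * Fast path: dropping one of several hold references requires no
	 * free list manipulation, so the interlock can be avoided.
	 */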
2481 	if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) {
2482 		if (locked)
2483 			VI_UNLOCK(vp);
2484 		return;
2485 	}
2486 
2487 	if (!locked)
2488 		VI_LOCK(vp);
2489 	if (refcount_release(&vp->v_holdcnt) == 0) {
2490 		VI_UNLOCK(vp);
2491 		return;
2492 	}
2493 	if ((vp->v_iflag & VI_DOOMED) == 0) {
2494 		/*
2495 		 * Mark a vnode as free: remove it from its active list
2496 		 * and put it up for recycling on the freelist.
2497 		 */
2498 		VNASSERT(vp->v_op != NULL, vp,
2499 		    ("vdropl: vnode already reclaimed."));
2500 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2501 		    ("vnode already free"));
2502 		VNASSERT(vp->v_holdcnt == 0, vp,
2503 		    ("vdropl: freeing when we shouldn't"));
2504 		active = vp->v_iflag & VI_ACTIVE;
2505 		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2506 			vp->v_iflag &= ~VI_ACTIVE;
2507 			mp = vp->v_mount;
2508 			mtx_lock(&vnode_free_list_mtx);
2509 			if (active) {
2510 				TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2511 				    v_actfreelist);
2512 				mp->mnt_activevnodelistsize--;
2513 			}
2514 			if (vp->v_iflag & VI_AGE) {
2515 				TAILQ_INSERT_HEAD(&vnode_free_list, vp,
2516 				    v_actfreelist);
2517 			} else {
2518 				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
2519 				    v_actfreelist);
2520 			}
2521 			freevnodes++;
2522 			vp->v_iflag &= ~VI_AGE;
2523 			vp->v_iflag |= VI_FREE;
2524 			mtx_unlock(&vnode_free_list_mtx);
2525 		} else {
2526 			atomic_add_long(&free_owe_inact, 1);
2527 		}
2528 		VI_UNLOCK(vp);
2529 		return;
2530 	}
2531 	/*
2532 	 * The vnode has been marked for destruction, so free it.
2533 	 */
2534 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2535 	atomic_subtract_long(&numvnodes, 1);
2536 	bo = &vp->v_bufobj;
2537 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2538 	    ("cleaned vnode still on the free list."));
2539 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2540 	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2541 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2542 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2543 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2544 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2545 	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
2546 	    ("clean blk trie not empty"));
2547 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2548 	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
2549 	    ("dirty blk trie not empty"));
2550 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2551 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2552 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2553 	VI_UNLOCK(vp);
2554 #ifdef MAC
2555 	mac_vnode_destroy(vp);
2556 #endif
2557 	if (vp->v_pollinfo != NULL)
2558 		destroy_vpollinfo(vp->v_pollinfo);
2559 #ifdef INVARIANTS
2560 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2561 	vp->v_op = NULL;
2562 #endif
2563 	rangelock_destroy(&vp->v_rl);
2564 	lockdestroy(vp->v_vnlock);
2565 	mtx_destroy(&vp->v_interlock);
2566 	rw_destroy(BO_LOCKPTR(bo));
2567 	uma_zfree(vnode_zone, vp);
2568 }
2569 
2570 /*
2571  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2572  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2573  * OWEINACT tracks whether a vnode missed a call to inactive due to a
2574  * failed lock upgrade.
2575  */
2576 void
2577 vinactive(struct vnode *vp, struct thread *td)
2578 {
2579 	struct vm_object *obj;
2580 
2581 	ASSERT_VOP_ELOCKED(vp, "vinactive");
2582 	ASSERT_VI_LOCKED(vp, "vinactive");
2583 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2584 	    ("vinactive: recursed on VI_DOINGINACT"));
2585 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2586 	vp->v_iflag |= VI_DOINGINACT;
2587 	vp->v_iflag &= ~VI_OWEINACT;
2588 	VI_UNLOCK(vp);
2589 	/*
2590 	 * Before moving off the active list, we must be sure that any
2591 	 * modified pages are on the vnode's dirty list since these will
2592 	 * no longer be checked once the vnode is on the inactive list.
2593 	 * Because the vnode vm object keeps a hold reference on the vnode
2594 	 * if there is at least one resident non-cached page, the vnode
2595 	 * cannot leave the active list without the page cleanup done.
2596 	 */
2597 	obj = vp->v_object;
2598 	if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2599 		VM_OBJECT_WLOCK(obj);
2600 		vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2601 		VM_OBJECT_WUNLOCK(obj);
2602 	}
2603 	VOP_INACTIVE(vp, td);
2604 	VI_LOCK(vp);
2605 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2606 	    ("vinactive: lost VI_DOINGINACT"));
2607 	vp->v_iflag &= ~VI_DOINGINACT;
2608 }
2609 
2610 /*
2611  * Remove any vnodes in the vnode table belonging to mount point mp.
2612  *
2613  * If FORCECLOSE is not specified, there should not be any active ones,
2614  * return error if any are found (nb: this is a user error, not a
2615  * system error). If FORCECLOSE is specified, detach any active vnodes
2616  * that are found.
2617  *
2618  * If WRITECLOSE is set, only flush out regular file vnodes open for
2619  * writing.
2620  *
2621  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2622  *
2623  * `rootrefs' specifies the base reference count for the root vnode
2624  * of this filesystem. The root vnode is considered busy if its
2625  * v_usecount exceeds this value. On a successful return, vflush()
2626  * will call vrele() on the root vnode exactly rootrefs times.
2627  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2628  * be zero.
2629  */
2630 #ifdef DIAGNOSTIC
2631 static int busyprt = 0;		/* print out busy vnodes */
2632 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2633 #endif
2634 
2635 int
2636 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2637 {
2638 	struct vnode *vp, *mvp, *rootvp = NULL;
2639 	struct vattr vattr;
2640 	int busy = 0, error;
2641 
2642 	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2643 	    rootrefs, flags);
2644 	if (rootrefs > 0) {
2645 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2646 		    ("vflush: bad args"));
2647 		/*
2648 		 * Get the filesystem root vnode. We can vput() it
2649 		 * immediately, since with rootrefs > 0, it won't go away.
2650 		 */
2651 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2652 			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2653 			    __func__, error);
2654 			return (error);
2655 		}
2656 		vput(rootvp);
2657 	}
2658 loop:
2659 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2660 		vholdl(vp);
2661 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2662 		if (error) {
2663 			vdrop(vp);
2664 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2665 			goto loop;
2666 		}
2667 		/*
2668 		 * Skip over vnodes marked VV_SYSTEM.
2669 		 */
2670 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2671 			VOP_UNLOCK(vp, 0);
2672 			vdrop(vp);
2673 			continue;
2674 		}
2675 		/*
2676 		 * If WRITECLOSE is set, flush out unlinked but still open
2677 		 * files (even if open only for reading) and regular file
2678 		 * vnodes open for writing.
2679 		 */
2680 		if (flags & WRITECLOSE) {
2681 			if (vp->v_object != NULL) {
2682 				VM_OBJECT_WLOCK(vp->v_object);
2683 				vm_object_page_clean(vp->v_object, 0, 0, 0);
2684 				VM_OBJECT_WUNLOCK(vp->v_object);
2685 			}
2686 			error = VOP_FSYNC(vp, MNT_WAIT, td);
2687 			if (error != 0) {
2688 				VOP_UNLOCK(vp, 0);
2689 				vdrop(vp);
2690 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2691 				return (error);
2692 			}
2693 			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2694 			VI_LOCK(vp);
2695 
2696 			if ((vp->v_type == VNON ||
2697 			    (error == 0 && vattr.va_nlink > 0)) &&
2698 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2699 				VOP_UNLOCK(vp, 0);
2700 				vdropl(vp);
2701 				continue;
2702 			}
2703 		} else
2704 			VI_LOCK(vp);
2705 		/*
2706 		 * With v_usecount == 0, all we need to do is clear out the
2707 		 * vnode data structures and we are done.
2708 		 *
2709 		 * If FORCECLOSE is set, forcibly close the vnode.
2710 		 */
2711 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2712 			VNASSERT(vp->v_usecount == 0 ||
2713 			    vp->v_op != &devfs_specops ||
2714 			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2715 			    ("device VNODE %p is FORCECLOSED", vp));
2716 			vgonel(vp);
2717 		} else {
2718 			busy++;
2719 #ifdef DIAGNOSTIC
2720 			if (busyprt)
2721 				vprint("vflush: busy vnode", vp);
2722 #endif
2723 		}
2724 		VOP_UNLOCK(vp, 0);
2725 		vdropl(vp);
2726 	}
2727 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2728 		/*
2729 		 * If just the root vnode is busy, and if its refcount
2730 		 * is equal to `rootrefs', then go ahead and kill it.
2731 		 */
2732 		VI_LOCK(rootvp);
2733 		KASSERT(busy > 0, ("vflush: not busy"));
2734 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2735 		    ("vflush: usecount %d < rootrefs %d",
2736 		     rootvp->v_usecount, rootrefs));
2737 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2738 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2739 			vgone(rootvp);
2740 			VOP_UNLOCK(rootvp, 0);
2741 			busy = 0;
2742 		} else
2743 			VI_UNLOCK(rootvp);
2744 	}
2745 	if (busy) {
2746 		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2747 		    busy);
2748 		return (EBUSY);
2749 	}
2750 	for (; rootrefs > 0; rootrefs--)
2751 		vrele(rootvp);
2752 	return (0);
2753 }
2754 
2755 /*
2756  * Recycle an unused vnode to the front of the free list.
2757  */
2758 int
2759 vrecycle(struct vnode *vp)
2760 {
2761 	int recycled;
2762 
2763 	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2764 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2765 	recycled = 0;
2766 	VI_LOCK(vp);
2767 	if (vp->v_usecount == 0) {
2768 		recycled = 1;
2769 		vgonel(vp);
2770 	}
2771 	VI_UNLOCK(vp);
2772 	return (recycled);
2773 }
2774 
2775 /*
2776  * Eliminate all activity associated with a vnode
2777  * in preparation for reuse.
2778  */
2779 void
2780 vgone(struct vnode *vp)
2781 {
2782 	VI_LOCK(vp);
2783 	vgonel(vp);
2784 	VI_UNLOCK(vp);
2785 }
2786 
2787 static void
2788 notify_lowervp_vfs_dummy(struct mount *mp __unused,
2789     struct vnode *lowervp __unused)
2790 {
2791 }
2792 
2793 /*
2794  * Notify upper mounts about reclaimed or unlinked vnode.
2795  */
2796 void
2797 vfs_notify_upper(struct vnode *vp, int event)
2798 {
2799 	static struct vfsops vgonel_vfsops = {
2800 		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
2801 		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
2802 	};
2803 	struct mount *mp, *ump, *mmp;
2804 
2805 	mp = vp->v_mount;
2806 	if (mp == NULL)
2807 		return;
2808 
2809 	MNT_ILOCK(mp);
2810 	if (TAILQ_EMPTY(&mp->mnt_uppers))
2811 		goto unlock;
2812 	MNT_IUNLOCK(mp);
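	/*
	 * Allocate a marker "mount" to keep our place in mnt_uppers while
	 * the interlock is dropped around each VFS_*_LOWERVP() callback.
	 */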
2813 	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2814 	mmp->mnt_op = &vgonel_vfsops;
2815 	mmp->mnt_kern_flag |= MNTK_MARKER;
2816 	MNT_ILOCK(mp);
2817 	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
2818 	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
2819 		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
2820 			ump = TAILQ_NEXT(ump, mnt_upper_link);
2821 			continue;
2822 		}
2823 		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
2824 		MNT_IUNLOCK(mp);
2825 		switch (event) {
2826 		case VFS_NOTIFY_UPPER_RECLAIM:
2827 			VFS_RECLAIM_LOWERVP(ump, vp);
2828 			break;
2829 		case VFS_NOTIFY_UPPER_UNLINK:
2830 			VFS_UNLINK_LOWERVP(ump, vp);
2831 			break;
2832 		default:
2833 			KASSERT(0, ("invalid event %d", event));
2834 			break;
2835 		}
2836 		MNT_ILOCK(mp);
2837 		ump = TAILQ_NEXT(mmp, mnt_upper_link);
2838 		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
2839 	}
2840 	free(mmp, M_TEMP);
2841 	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
2842 	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
2843 		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
2844 		wakeup(&mp->mnt_uppers);
2845 	}
2846 unlock:
2847 	MNT_IUNLOCK(mp);
2848 }
2849 
2850 /*
2851  * vgone, with the vp interlock held.
2852  */
2853 void
2854 vgonel(struct vnode *vp)
2855 {
2856 	struct thread *td;
2857 	int oweinact;
2858 	int active;
2859 	struct mount *mp;
2860 
2861 	ASSERT_VOP_ELOCKED(vp, "vgonel");
2862 	ASSERT_VI_LOCKED(vp, "vgonel");
2863 	VNASSERT(vp->v_holdcnt, vp,
2864 	    ("vgonel: vp %p has no reference.", vp));
2865 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2866 	td = curthread;
2867 
2868 	/*
2869 	 * Don't vgonel if we're already doomed.
2870 	 */
2871 	if (vp->v_iflag & VI_DOOMED)
2872 		return;
2873 	vp->v_iflag |= VI_DOOMED;
2874 
2875 	/*
2876 	 * Check to see if the vnode is in use.  If so, we have to call
2877 	 * VOP_CLOSE() and VOP_INACTIVE().
2878 	 */
2879 	active = vp->v_usecount;
2880 	oweinact = (vp->v_iflag & VI_OWEINACT);
2881 	VI_UNLOCK(vp);
2882 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
2883 
2884 	/*
2885 	 * If purging an active vnode, it must be closed and
2886 	 * deactivated before being reclaimed.
2887 	 */
2888 	if (active)
2889 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2890 	if (oweinact || active) {
2891 		VI_LOCK(vp);
2892 		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2893 			vinactive(vp, td);
2894 		VI_UNLOCK(vp);
2895 	}
2896 	if (vp->v_type == VSOCK)
2897 		vfs_unp_reclaim(vp);
2898 
2899 	/*
2900 	 * Clean out any buffers associated with the vnode.
2901 	 * If the flush fails, just toss the buffers.
2902 	 */
2903 	mp = NULL;
2904 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2905 		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2906 	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
2907 		while (vinvalbuf(vp, 0, 0, 0) != 0)
2908 			;
2909 	}
2910 
2911 	BO_LOCK(&vp->v_bufobj);
2912 	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
2913 	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
2914 	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
2915 	    vp->v_bufobj.bo_clean.bv_cnt == 0,
2916 	    ("vp %p bufobj not invalidated", vp));
2917 	vp->v_bufobj.bo_flag |= BO_DEAD;
2918 	BO_UNLOCK(&vp->v_bufobj);
2919 
2920 	/*
2921 	 * Reclaim the vnode.
2922 	 */
2923 	if (VOP_RECLAIM(vp, td))
2924 		panic("vgone: cannot reclaim");
2925 	if (mp != NULL)
2926 		vn_finished_secondary_write(mp);
2927 	VNASSERT(vp->v_object == NULL, vp,
2928 	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2929 	/*
2930 	 * Clear the advisory locks and wake up waiting threads.
2931 	 */
2932 	(void)VOP_ADVLOCKPURGE(vp);
2933 	/*
2934 	 * Delete from old mount point vnode list.
2935 	 */
2936 	delmntque(vp);
2937 	cache_purge(vp);
2938 	/*
2939 	 * Done with purge, reset to the standard lock and invalidate
2940 	 * the vnode.
2941 	 */
2942 	VI_LOCK(vp);
2943 	vp->v_vnlock = &vp->v_lock;
2944 	vp->v_op = &dead_vnodeops;
2945 	vp->v_tag = "none";
2946 	vp->v_type = VBAD;
2947 }
2948 
2949 /*
2950  * Calculate the total number of references to a special device.
2951  */
2952 int
2953 vcount(struct vnode *vp)
2954 {
2955 	int count;
2956 
2957 	dev_lock();
2958 	count = vp->v_rdev->si_usecount;
2959 	dev_unlock();
2960 	return (count);
2961 }
2962 
2963 /*
2964  * Same as above, but using the struct cdev * as the argument.
2965  */
2966 int
2967 count_dev(struct cdev *dev)
2968 {
2969 	int count;
2970 
2971 	dev_lock();
2972 	count = dev->si_usecount;
2973 	dev_unlock();
2974 	return(count);
2975 }
2976 
2977 /*
2978  * Print out a description of a vnode.
2979  */
2980 static char *typename[] =
2981 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2982  "VMARKER"};
2983 
2984 void
2985 vn_printf(struct vnode *vp, const char *fmt, ...)
2986 {
2987 	va_list ap;
2988 	char buf[256], buf2[16];
2989 	u_long flags;
2990 
2991 	va_start(ap, fmt);
2992 	vprintf(fmt, ap);
2993 	va_end(ap);
2994 	printf("%p: ", (void *)vp);
2995 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2996 	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2997 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2998 	buf[0] = '\0';
2999 	buf[1] = '\0';
3000 	if (vp->v_vflag & VV_ROOT)
3001 		strlcat(buf, "|VV_ROOT", sizeof(buf));
3002 	if (vp->v_vflag & VV_ISTTY)
3003 		strlcat(buf, "|VV_ISTTY", sizeof(buf));
3004 	if (vp->v_vflag & VV_NOSYNC)
3005 		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
3006 	if (vp->v_vflag & VV_ETERNALDEV)
3007 		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
3008 	if (vp->v_vflag & VV_CACHEDLABEL)
3009 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
3010 	if (vp->v_vflag & VV_TEXT)
3011 		strlcat(buf, "|VV_TEXT", sizeof(buf));
3012 	if (vp->v_vflag & VV_COPYONWRITE)
3013 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
3014 	if (vp->v_vflag & VV_SYSTEM)
3015 		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
3016 	if (vp->v_vflag & VV_PROCDEP)
3017 		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
3018 	if (vp->v_vflag & VV_NOKNOTE)
3019 		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
3020 	if (vp->v_vflag & VV_DELETED)
3021 		strlcat(buf, "|VV_DELETED", sizeof(buf));
3022 	if (vp->v_vflag & VV_MD)
3023 		strlcat(buf, "|VV_MD", sizeof(buf));
3024 	if (vp->v_vflag & VV_FORCEINSMQ)
3025 		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
3026 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
3027 	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
3028 	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
3029 	if (flags != 0) {
3030 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
3031 		strlcat(buf, buf2, sizeof(buf));
3032 	}
3033 	if (vp->v_iflag & VI_MOUNT)
3034 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
3035 	if (vp->v_iflag & VI_AGE)
3036 		strlcat(buf, "|VI_AGE", sizeof(buf));
3037 	if (vp->v_iflag & VI_DOOMED)
3038 		strlcat(buf, "|VI_DOOMED", sizeof(buf));
3039 	if (vp->v_iflag & VI_FREE)
3040 		strlcat(buf, "|VI_FREE", sizeof(buf));
3041 	if (vp->v_iflag & VI_ACTIVE)
3042 		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
3043 	if (vp->v_iflag & VI_DOINGINACT)
3044 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
3045 	if (vp->v_iflag & VI_OWEINACT)
3046 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
3047 	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
3048 	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
3049 	if (flags != 0) {
3050 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
3051 		strlcat(buf, buf2, sizeof(buf));
3052 	}
3053 	printf("    flags (%s)\n", buf + 1);
3054 	if (mtx_owned(VI_MTX(vp)))
3055 		printf(" VI_LOCKed");
3056 	if (vp->v_object != NULL)
3057 		printf("    v_object %p ref %d pages %d "
3058 		    "cleanbuf %d dirtybuf %d\n",
3059 		    vp->v_object, vp->v_object->ref_count,
3060 		    vp->v_object->resident_page_count,
3061 		    vp->v_bufobj.bo_dirty.bv_cnt,
3062 		    vp->v_bufobj.bo_clean.bv_cnt);
3063 	printf("    ");
3064 	lockmgr_printinfo(vp->v_vnlock);
3065 	if (vp->v_data != NULL)
3066 		VOP_PRINT(vp);
3067 }
3068 
3069 #ifdef DDB
3070 /*
3071  * List all of the locked vnodes in the system.
3072  * Called when debugging the kernel.
3073  */
3074 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
3075 {
3076 	struct mount *mp;
3077 	struct vnode *vp;
3078 
3079 	/*
3080 	 * Note: because this is DDB, we can't obey the locking semantics
3081 	 * for these structures, which means we could catch an inconsistent
3082 	 * state and dereference a nasty pointer.  Not much to be done
3083 	 * about that.
3084 	 */
3085 	db_printf("Locked vnodes\n");
3086 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3087 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3088 			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
3089 				vprint("", vp);
3090 		}
3091 	}
3092 }
3093 
3094 /*
3095  * Show details about the given vnode.
3096  */
3097 DB_SHOW_COMMAND(vnode, db_show_vnode)
3098 {
3099 	struct vnode *vp;
3100 
3101 	if (!have_addr)
3102 		return;
3103 	vp = (struct vnode *)addr;
3104 	vn_printf(vp, "vnode ");
3105 }
3106 
3107 /*
3108  * Show details about the given mount point.
3109  */
3110 DB_SHOW_COMMAND(mount, db_show_mount)
3111 {
3112 	struct mount *mp;
3113 	struct vfsopt *opt;
3114 	struct statfs *sp;
3115 	struct vnode *vp;
3116 	char buf[512];
3117 	uint64_t mflags;
3118 	u_int flags;
3119 
3120 	if (!have_addr) {
3121 		/* No address given, print short info about all mount points. */
3122 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3123 			db_printf("%p %s on %s (%s)\n", mp,
3124 			    mp->mnt_stat.f_mntfromname,
3125 			    mp->mnt_stat.f_mntonname,
3126 			    mp->mnt_stat.f_fstypename);
3127 			if (db_pager_quit)
3128 				break;
3129 		}
3130 		db_printf("\nMore info: show mount <addr>\n");
3131 		return;
3132 	}
3133 
3134 	mp = (struct mount *)addr;
3135 	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3136 	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3137 
3138 	buf[0] = '\0';
3139 	mflags = mp->mnt_flag;
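	/* Append the flag's name, minus the "MNT_" prefix, and clear it. */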
3140 #define	MNT_FLAG(flag)	do {						\
3141 	if (mflags & (flag)) {						\
3142 		if (buf[0] != '\0')					\
3143 			strlcat(buf, ", ", sizeof(buf));		\
3144 		strlcat(buf, (#flag) + 4, sizeof(buf));			\
3145 		mflags &= ~(flag);					\
3146 	}								\
3147 } while (0)
3148 	MNT_FLAG(MNT_RDONLY);
3149 	MNT_FLAG(MNT_SYNCHRONOUS);
3150 	MNT_FLAG(MNT_NOEXEC);
3151 	MNT_FLAG(MNT_NOSUID);
3152 	MNT_FLAG(MNT_NFS4ACLS);
3153 	MNT_FLAG(MNT_UNION);
3154 	MNT_FLAG(MNT_ASYNC);
3155 	MNT_FLAG(MNT_SUIDDIR);
3156 	MNT_FLAG(MNT_SOFTDEP);
3157 	MNT_FLAG(MNT_NOSYMFOLLOW);
3158 	MNT_FLAG(MNT_GJOURNAL);
3159 	MNT_FLAG(MNT_MULTILABEL);
3160 	MNT_FLAG(MNT_ACLS);
3161 	MNT_FLAG(MNT_NOATIME);
3162 	MNT_FLAG(MNT_NOCLUSTERR);
3163 	MNT_FLAG(MNT_NOCLUSTERW);
3164 	MNT_FLAG(MNT_SUJ);
3165 	MNT_FLAG(MNT_EXRDONLY);
3166 	MNT_FLAG(MNT_EXPORTED);
3167 	MNT_FLAG(MNT_DEFEXPORTED);
3168 	MNT_FLAG(MNT_EXPORTANON);
3169 	MNT_FLAG(MNT_EXKERB);
3170 	MNT_FLAG(MNT_EXPUBLIC);
3171 	MNT_FLAG(MNT_LOCAL);
3172 	MNT_FLAG(MNT_QUOTA);
3173 	MNT_FLAG(MNT_ROOTFS);
3174 	MNT_FLAG(MNT_USER);
3175 	MNT_FLAG(MNT_IGNORE);
3176 	MNT_FLAG(MNT_UPDATE);
3177 	MNT_FLAG(MNT_DELEXPORT);
3178 	MNT_FLAG(MNT_RELOAD);
3179 	MNT_FLAG(MNT_FORCE);
3180 	MNT_FLAG(MNT_SNAPSHOT);
3181 	MNT_FLAG(MNT_BYFSID);
3182 #undef MNT_FLAG
3183 	if (mflags != 0) {
3184 		if (buf[0] != '\0')
3185 			strlcat(buf, ", ", sizeof(buf));
3186 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3187 		    "0x%016jx", mflags);
3188 	}
3189 	db_printf("    mnt_flag = %s\n", buf);
3190 
3191 	buf[0] = '\0';
3192 	flags = mp->mnt_kern_flag;
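	/* As above, but strip the longer "MNTK_" prefix from the name. */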
3193 #define	MNT_KERN_FLAG(flag)	do {					\
3194 	if (flags & (flag)) {						\
3195 		if (buf[0] != '\0')					\
3196 			strlcat(buf, ", ", sizeof(buf));		\
3197 		strlcat(buf, (#flag) + 5, sizeof(buf));			\
3198 		flags &= ~(flag);					\
3199 	}								\
3200 } while (0)
3201 	MNT_KERN_FLAG(MNTK_UNMOUNTF);
3202 	MNT_KERN_FLAG(MNTK_ASYNC);
3203 	MNT_KERN_FLAG(MNTK_SOFTDEP);
3204 	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3205 	MNT_KERN_FLAG(MNTK_DRAINING);
3206 	MNT_KERN_FLAG(MNTK_REFEXPIRE);
3207 	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3208 	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3209 	MNT_KERN_FLAG(MNTK_NO_IOPF);
3210 	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3211 	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3212 	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3213 	MNT_KERN_FLAG(MNTK_MARKER);
3214 	MNT_KERN_FLAG(MNTK_USES_BCACHE);
3215 	MNT_KERN_FLAG(MNTK_NOASYNC);
3216 	MNT_KERN_FLAG(MNTK_UNMOUNT);
3217 	MNT_KERN_FLAG(MNTK_MWAIT);
3218 	MNT_KERN_FLAG(MNTK_SUSPEND);
3219 	MNT_KERN_FLAG(MNTK_SUSPEND2);
3220 	MNT_KERN_FLAG(MNTK_SUSPENDED);
3221 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3222 	MNT_KERN_FLAG(MNTK_NOKNOTE);
3223 #undef MNT_KERN_FLAG
3224 	if (flags != 0) {
3225 		if (buf[0] != '\0')
3226 			strlcat(buf, ", ", sizeof(buf));
3227 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3228 		    "0x%08x", flags);
3229 	}
3230 	db_printf("    mnt_kern_flag = %s\n", buf);
3231 
3232 	db_printf("    mnt_opt = ");
3233 	opt = TAILQ_FIRST(mp->mnt_opt);
3234 	if (opt != NULL) {
3235 		db_printf("%s", opt->name);
3236 		opt = TAILQ_NEXT(opt, link);
3237 		while (opt != NULL) {
3238 			db_printf(", %s", opt->name);
3239 			opt = TAILQ_NEXT(opt, link);
3240 		}
3241 	}
3242 	db_printf("\n");
3243 
3244 	sp = &mp->mnt_stat;
3245 	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3246 	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3247 	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3248 	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3249 	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3250 	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3251 	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3252 	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3253 	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3254 	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3255 	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3256 	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3257 
3258 	db_printf("    mnt_cred = { uid=%u ruid=%u",
3259 	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3260 	if (jailed(mp->mnt_cred))
3261 		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3262 	db_printf(" }\n");
3263 	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3264 	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3265 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3266 	db_printf("    mnt_activevnodelistsize = %d\n",
3267 	    mp->mnt_activevnodelistsize);
3268 	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3269 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3270 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3271 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3272 	db_printf("    mnt_lockref = %d\n", mp->mnt_lockref);
3273 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3274 	db_printf("    mnt_secondary_accwrites = %d\n",
3275 	    mp->mnt_secondary_accwrites);
3276 	db_printf("    mnt_gjprovider = %s\n",
3277 	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3278 
3279 	db_printf("\n\nList of active vnodes\n");
3280 	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3281 		if (vp->v_type != VMARKER) {
3282 			vn_printf(vp, "vnode ");
3283 			if (db_pager_quit)
3284 				break;
3285 		}
3286 	}
3287 	db_printf("\n\nList of inactive vnodes\n");
3288 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3289 		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3290 			vn_printf(vp, "vnode ");
3291 			if (db_pager_quit)
3292 				break;
3293 		}
3294 	}
3295 }
3296 #endif	/* DDB */
3297 
3298 /*
3299  * Fill in a struct xvfsconf based on a struct vfsconf.
3300  */
3301 static int
3302 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3303 {
3304 	struct xvfsconf xvfsp;
3305 
3306 	bzero(&xvfsp, sizeof(xvfsp));
3307 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3308 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3309 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3310 	xvfsp.vfc_flags = vfsp->vfc_flags;
3311 	/*
3312 	 * These are unused in userland; we keep them
3313 	 * so as not to break binary compatibility.
3314 	 */
3315 	xvfsp.vfc_vfsops = NULL;
3316 	xvfsp.vfc_next = NULL;
3317 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3318 }
3319 
3320 #ifdef COMPAT_FREEBSD32
3321 struct xvfsconf32 {
3322 	uint32_t	vfc_vfsops;
3323 	char		vfc_name[MFSNAMELEN];
3324 	int32_t		vfc_typenum;
3325 	int32_t		vfc_refcount;
3326 	int32_t		vfc_flags;
3327 	uint32_t	vfc_next;
3328 };
3329 
3330 static int
3331 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
3332 {
3333 	struct xvfsconf32 xvfsp;
3334 
3335 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3336 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3337 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3338 	xvfsp.vfc_flags = vfsp->vfc_flags;
3339 	xvfsp.vfc_vfsops = 0;
3340 	xvfsp.vfc_next = 0;
3341 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3342 }
3343 #endif
3344 
3345 /*
3346  * Top level filesystem related information gathering.
3347  */
3348 static int
3349 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3350 {
3351 	struct vfsconf *vfsp;
3352 	int error;
3353 
3354 	error = 0;
3355 	vfsconf_slock();
3356 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3357 #ifdef COMPAT_FREEBSD32
3358 		if (req->flags & SCTL_MASK32)
3359 			error = vfsconf2x32(req, vfsp);
3360 		else
3361 #endif
3362 			error = vfsconf2x(req, vfsp);
3363 		if (error)
3364 			break;
3365 	}
3366 	vfsconf_sunlock();
3367 	return (error);
3368 }
3369 
3370 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
3371     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
3372     "S,xvfsconf", "List of all configured filesystems");
3373 
3374 #ifndef BURN_BRIDGES
3375 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3376 
3377 static int
3378 vfs_sysctl(SYSCTL_HANDLER_ARGS)
3379 {
3380 	int *name = (int *)arg1 - 1;	/* XXX */
3381 	u_int namelen = arg2 + 1;	/* XXX */
3382 	struct vfsconf *vfsp;
3383 
3384 	log(LOG_WARNING, "userland calling deprecated sysctl, "
3385 	    "please rebuild world\n");
3386 
3387 #if 1 || defined(COMPAT_PRELITE2)
3388 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3389 	if (namelen == 1)
3390 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3391 #endif
3392 
3393 	switch (name[1]) {
3394 	case VFS_MAXTYPENUM:
3395 		if (namelen != 2)
3396 			return (ENOTDIR);
3397 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3398 	case VFS_CONF:
3399 		if (namelen != 3)
3400 			return (ENOTDIR);	/* overloaded */
3401 		vfsconf_slock();
3402 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3403 			if (vfsp->vfc_typenum == name[2])
3404 				break;
3405 		}
3406 		vfsconf_sunlock();
3407 		if (vfsp == NULL)
3408 			return (EOPNOTSUPP);
3409 #ifdef COMPAT_FREEBSD32
3410 		if (req->flags & SCTL_MASK32)
3411 			return (vfsconf2x32(req, vfsp));
3412 		else
3413 #endif
3414 			return (vfsconf2x(req, vfsp));
3415 	}
3416 	return (EOPNOTSUPP);
3417 }
3418 
3419 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
3420     CTLFLAG_MPSAFE, vfs_sysctl,
3421     "Generic filesystem");
3422 
3423 #if 1 || defined(COMPAT_PRELITE2)
3424 
3425 static int
3426 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3427 {
3428 	int error;
3429 	struct vfsconf *vfsp;
3430 	struct ovfsconf ovfs;
3431 
3432 	vfsconf_slock();
3433 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3434 		bzero(&ovfs, sizeof(ovfs));
3435 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3436 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3437 		ovfs.vfc_index = vfsp->vfc_typenum;
3438 		ovfs.vfc_refcount = vfsp->vfc_refcount;
3439 		ovfs.vfc_flags = vfsp->vfc_flags;
3440 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3441 		if (error != 0) {
3442 			vfsconf_sunlock();
3443 			return (error);
3444 		}
3445 	}
3446 	vfsconf_sunlock();
3447 	return (0);
3448 }
3449 
3450 #endif /* 1 || COMPAT_PRELITE2 */
3451 #endif /* !BURN_BRIDGES */
3452 
3453 #define KINFO_VNODESLOP		10
3454 #ifdef notyet
3455 /*
3456  * Dump vnode list (via sysctl).
3457  */
3458 /* ARGSUSED */
3459 static int
3460 sysctl_vnode(SYSCTL_HANDLER_ARGS)
3461 {
3462 	struct xvnode *xvn;
3463 	struct mount *mp;
3464 	struct vnode *vp;
3465 	int error, len, n;
3466 
3467 	/*
3468 	 * Stale numvnodes access is not fatal here.
3469 	 */
3470 	req->lock = 0;
3471 	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3472 	if (!req->oldptr)
3473 		/* Make an estimate */
3474 		return (SYSCTL_OUT(req, 0, len));
3475 
3476 	error = sysctl_wire_old_buffer(req, 0);
3477 	if (error != 0)
3478 		return (error);
3479 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3480 	n = 0;
3481 	mtx_lock(&mountlist_mtx);
3482 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3483 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3484 			continue;
3485 		MNT_ILOCK(mp);
3486 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3487 			if (n == len)
3488 				break;
3489 			vref(vp);
3490 			xvn[n].xv_size = sizeof *xvn;
3491 			xvn[n].xv_vnode = vp;
3492 			xvn[n].xv_id = 0;	/* XXX compat */
3493 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3494 			XV_COPY(usecount);
3495 			XV_COPY(writecount);
3496 			XV_COPY(holdcnt);
3497 			XV_COPY(mount);
3498 			XV_COPY(numoutput);
3499 			XV_COPY(type);
3500 #undef XV_COPY
3501 			xvn[n].xv_flag = vp->v_vflag;
3502 
3503 			switch (vp->v_type) {
3504 			case VREG:
3505 			case VDIR:
3506 			case VLNK:
3507 				break;
3508 			case VBLK:
3509 			case VCHR:
3510 				if (vp->v_rdev == NULL) {
3511 					vrele(vp);
3512 					continue;
3513 				}
3514 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3515 				break;
3516 			case VSOCK:
3517 				xvn[n].xv_socket = vp->v_socket;
3518 				break;
3519 			case VFIFO:
3520 				xvn[n].xv_fifo = vp->v_fifoinfo;
3521 				break;
3522 			case VNON:
3523 			case VBAD:
3524 			default:
3525 				/* shouldn't happen? */
3526 				vrele(vp);
3527 				continue;
3528 			}
3529 			vrele(vp);
3530 			++n;
3531 		}
3532 		MNT_IUNLOCK(mp);
3533 		mtx_lock(&mountlist_mtx);
3534 		vfs_unbusy(mp);
3535 		if (n == len)
3536 			break;
3537 	}
3538 	mtx_unlock(&mountlist_mtx);
3539 
3540 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3541 	free(xvn, M_TEMP);
3542 	return (error);
3543 }
3544 
3545 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
3546     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
3547     "");
3548 #endif
3549 
3550 /*
3551  * Unmount all filesystems. The list is traversed in reverse order
3552  * of mounting to avoid dependencies.
3553  */
3554 void
3555 vfs_unmountall(void)
3556 {
3557 	struct mount *mp;
3558 	struct thread *td;
3559 	int error;
3560 
3561 	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3562 	td = curthread;
3563 
3564 	/*
3565 	 * Since this only runs when rebooting, it is not interlocked.
3566 	 */
3567 	while (!TAILQ_EMPTY(&mountlist)) {
3568 		mp = TAILQ_LAST(&mountlist, mntlist);
3569 		vfs_ref(mp);
3570 		error = dounmount(mp, MNT_FORCE, td);
3571 		if (error != 0) {
3572 			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3573 			/*
3574 			 * XXX: Due to the way in which we mount the root
3575 			 * file system off of devfs, devfs will generate a
3576 			 * "busy" warning when we try to unmount it before
3577 			 * the root.  Don't print a warning as a result in
3578 			 * order to avoid false positive errors that may
3579 			 * cause needless upset.
3580 			 */
3581 			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3582 				printf("unmount of %s failed (",
3583 				    mp->mnt_stat.f_mntonname);
3584 				if (error == EBUSY)
3585 					printf("BUSY)\n");
3586 				else
3587 					printf("%d)\n", error);
3588 			}
3589 		} else {
3590 			/* The unmount has removed mp from the mountlist */
3591 		}
3592 	}
3593 }
3594 
3595 /*
3596  * Perform msync on all vnodes under a mount point.
3597  * The mount point must be locked.
3598  */
3599 void
3600 vfs_msync(struct mount *mp, int flags)
3601 {
3602 	struct vnode *vp, *mvp;
3603 	struct vm_object *obj;
3604 
3605 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3606 	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3607 		obj = vp->v_object;
3608 		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3609 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3610 			if (!vget(vp,
3611 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3612 			    curthread)) {
3613 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3614 					vput(vp);
3615 					continue;
3616 				}
3617 
3618 				obj = vp->v_object;
3619 				if (obj != NULL) {
3620 					VM_OBJECT_WLOCK(obj);
3621 					vm_object_page_clean(obj, 0, 0,
3622 					    flags == MNT_WAIT ?
3623 					    OBJPC_SYNC : OBJPC_NOSYNC);
3624 					VM_OBJECT_WUNLOCK(obj);
3625 				}
3626 				vput(vp);
3627 			}
3628 		} else
3629 			VI_UNLOCK(vp);
3630 	}
3631 }
3632 
3633 static void
3634 destroy_vpollinfo_free(struct vpollinfo *vi)
3635 {
3636 
3637 	knlist_destroy(&vi->vpi_selinfo.si_note);
3638 	mtx_destroy(&vi->vpi_lock);
3639 	uma_zfree(vnodepoll_zone, vi);
3640 }
3641 
3642 static void
3643 destroy_vpollinfo(struct vpollinfo *vi)
3644 {
3645 
3646 	knlist_clear(&vi->vpi_selinfo.si_note, 1);
3647 	seldrain(&vi->vpi_selinfo);
3648 	destroy_vpollinfo_free(vi);
3649 }
3650 
3651 /*
3652  * Initialize per-vnode helper structure to hold poll-related state.
3653  */
3654 void
3655 v_addpollinfo(struct vnode *vp)
3656 {
3657 	struct vpollinfo *vi;
3658 
3659 	if (vp->v_pollinfo != NULL)
3660 		return;
3661 	vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
3662 	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3663 	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3664 	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3665 	VI_LOCK(vp);
3666 	if (vp->v_pollinfo != NULL) {
3667 		VI_UNLOCK(vp);
3668 		destroy_vpollinfo_free(vi);
3669 		return;
3670 	}
3671 	vp->v_pollinfo = vi;
3672 	VI_UNLOCK(vp);
3673 }
3674 
3675 /*
3676  * Record a process's interest in events which might happen to
3677  * a vnode.  Because poll uses the historic select-style interface
3678  * internally, this routine serves as both the ``check for any
3679  * pending events'' and the ``record my interest in future events''
3680  * functions.  (These are done together, while the lock is held,
3681  * to avoid race conditions.)
3682  */
3683 int
3684 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3685 {
3686 
3687 	v_addpollinfo(vp);
3688 	mtx_lock(&vp->v_pollinfo->vpi_lock);
3689 	if (vp->v_pollinfo->vpi_revents & events) {
3690 		/*
3691 		 * This leaves events we are not interested
3692 		 * in available for the other process which
3693 		 * presumably had requested them
3694 		 * (otherwise they would never have been
3695 		 * recorded).
3696 		 */
3697 		events &= vp->v_pollinfo->vpi_revents;
3698 		vp->v_pollinfo->vpi_revents &= ~events;
3699 
3700 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3701 		return (events);
3702 	}
3703 	vp->v_pollinfo->vpi_events |= events;
3704 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3705 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3706 	return (0);
3707 }
3708 
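/*
 * Example (editorial sketch, not compiled): a VOP_POLL implementation
 * with no filesystem-specific readiness state can report the standard
 * events as immediately ready and rely on vn_pollrecord() for anything
 * else.  "foofs_poll" is a hypothetical name; POLLSTANDARD comes from
 * <sys/poll.h>.
 */
#if 0
static int
foofs_poll(struct vop_poll_args *ap)
{

	if (ap->a_events & ~POLLSTANDARD)
		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
#endif
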
3709 /*
3710  * Routine to create and manage a filesystem syncer vnode.
3711  */
3712 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
3713 static int	sync_fsync(struct  vop_fsync_args *);
3714 static int	sync_inactive(struct  vop_inactive_args *);
3715 static int	sync_reclaim(struct  vop_reclaim_args *);
3716 
3717 static struct vop_vector sync_vnodeops = {
3718 	.vop_bypass =	VOP_EOPNOTSUPP,
3719 	.vop_close =	sync_close,		/* close */
3720 	.vop_fsync =	sync_fsync,		/* fsync */
3721 	.vop_inactive =	sync_inactive,	/* inactive */
3722 	.vop_reclaim =	sync_reclaim,	/* reclaim */
3723 	.vop_lock1 =	vop_stdlock,	/* lock */
3724 	.vop_unlock =	vop_stdunlock,	/* unlock */
3725 	.vop_islocked =	vop_stdislocked,	/* islocked */
3726 };
3727 
3728 /*
3729  * Create a new filesystem syncer vnode for the specified mount point.
3730  */
3731 void
3732 vfs_allocate_syncvnode(struct mount *mp)
3733 {
3734 	struct vnode *vp;
3735 	struct bufobj *bo;
3736 	static long start, incr, next;
3737 	int error;
3738 
3739 	/* Allocate a new vnode */
3740 	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3741 	if (error != 0)
3742 		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3743 	vp->v_type = VNON;
3744 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3745 	vp->v_vflag |= VV_FORCEINSMQ;
3746 	error = insmntque(vp, mp);
3747 	if (error != 0)
3748 		panic("vfs_allocate_syncvnode: insmntque() failed");
3749 	vp->v_vflag &= ~VV_FORCEINSMQ;
3750 	VOP_UNLOCK(vp, 0);
3751 	/*
3752 	 * Place the vnode onto the syncer worklist. We attempt to
3753 	 * scatter them about on the list so that they will go off
3754 	 * at evenly distributed times even if all the filesystems
3755 	 * are mounted at once.
3756 	 */
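	/*
	 * Worked example (editorial note): with syncer_maxdelay of, say,
	 * 32 and start/incr/next all starting at zero, successive calls
	 * yield next values of 16, 8, 24, 4, 12, 20, 28, 2, 6, ...; each
	 * pass bisects the remaining slots, so syncer vnodes end up
	 * spread across the [0, syncer_maxdelay) range rather than
	 * clustered on a single tick.
	 */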
3757 	next += incr;
3758 	if (next == 0 || next > syncer_maxdelay) {
3759 		start /= 2;
3760 		incr /= 2;
3761 		if (start == 0) {
3762 			start = syncer_maxdelay / 2;
3763 			incr = syncer_maxdelay;
3764 		}
3765 		next = start;
3766 	}
3767 	bo = &vp->v_bufobj;
3768 	BO_LOCK(bo);
3769 	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3770 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3771 	mtx_lock(&sync_mtx);
3772 	sync_vnode_count++;
3773 	if (mp->mnt_syncer == NULL) {
3774 		mp->mnt_syncer = vp;
3775 		vp = NULL;
3776 	}
3777 	mtx_unlock(&sync_mtx);
3778 	BO_UNLOCK(bo);
3779 	if (vp != NULL) {
3780 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3781 		vgone(vp);
3782 		vput(vp);
3783 	}
3784 }
3785 
3786 void
3787 vfs_deallocate_syncvnode(struct mount *mp)
3788 {
3789 	struct vnode *vp;
3790 
3791 	mtx_lock(&sync_mtx);
3792 	vp = mp->mnt_syncer;
3793 	if (vp != NULL)
3794 		mp->mnt_syncer = NULL;
3795 	mtx_unlock(&sync_mtx);
3796 	if (vp != NULL)
3797 		vrele(vp);
3798 }
3799 
3800 /*
3801  * Do a lazy sync of the filesystem.
3802  */
3803 static int
3804 sync_fsync(struct vop_fsync_args *ap)
3805 {
3806 	struct vnode *syncvp = ap->a_vp;
3807 	struct mount *mp = syncvp->v_mount;
3808 	int error, save;
3809 	struct bufobj *bo;
3810 
3811 	/*
3812 	 * We only need to do something if this is a lazy evaluation.
3813 	 */
3814 	if (ap->a_waitfor != MNT_LAZY)
3815 		return (0);
3816 
3817 	/*
3818 	 * Move ourselves to the back of the sync list.
3819 	 */
3820 	bo = &syncvp->v_bufobj;
3821 	BO_LOCK(bo);
3822 	vn_syncer_add_to_worklist(bo, syncdelay);
3823 	BO_UNLOCK(bo);
3824 
3825 	/*
3826 	 * Walk the list of vnodes pushing all that are dirty and
3827 	 * not already on the sync list.
3828 	 */
3829 	if (vfs_busy(mp, MBF_NOWAIT) != 0)
3830 		return (0);
3831 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3832 		vfs_unbusy(mp);
3833 		return (0);
3834 	}
3835 	save = curthread_pflags_set(TDP_SYNCIO);
3836 	vfs_msync(mp, MNT_NOWAIT);
3837 	error = VFS_SYNC(mp, MNT_LAZY);
3838 	curthread_pflags_restore(save);
3839 	vn_finished_write(mp);
3840 	vfs_unbusy(mp);
3841 	return (error);
3842 }
3843 
3844 /*
3845  * The syncer vnode is no longer referenced.
3846  */
3847 static int
3848 sync_inactive(struct vop_inactive_args *ap)
3849 {
3850 
3851 	vgone(ap->a_vp);
3852 	return (0);
3853 }
3854 
3855 /*
3856  * The syncer vnode is no longer needed and is being decommissioned.
3857  *
3858  * Modifications to the worklist must be protected by sync_mtx.
3859  */
3860 static int
3861 sync_reclaim(struct vop_reclaim_args *ap)
3862 {
3863 	struct vnode *vp = ap->a_vp;
3864 	struct bufobj *bo;
3865 
3866 	bo = &vp->v_bufobj;
3867 	BO_LOCK(bo);
3868 	mtx_lock(&sync_mtx);
3869 	if (vp->v_mount->mnt_syncer == vp)
3870 		vp->v_mount->mnt_syncer = NULL;
3871 	if (bo->bo_flag & BO_ONWORKLST) {
3872 		LIST_REMOVE(bo, bo_synclist);
3873 		syncer_worklist_len--;
3874 		sync_vnode_count--;
3875 		bo->bo_flag &= ~BO_ONWORKLST;
3876 	}
3877 	mtx_unlock(&sync_mtx);
3878 	BO_UNLOCK(bo);
3879 
3880 	return (0);
3881 }
3882 
3883 /*
3884  * Check if vnode represents a disk device
3885  * Check whether the vnode represents a disk device.
3886 int
3887 vn_isdisk(struct vnode *vp, int *errp)
3888 {
3889 	int error;
3890 
3891 	if (vp->v_type != VCHR) {
3892 		error = ENOTBLK;
3893 		goto out;
3894 	}
3895 	error = 0;
3896 	dev_lock();
3897 	if (vp->v_rdev == NULL)
3898 		error = ENXIO;
3899 	else if (vp->v_rdev->si_devsw == NULL)
3900 		error = ENXIO;
3901 	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3902 		error = ENOTBLK;
3903 	dev_unlock();
3904 out:
3905 	if (errp != NULL)
3906 		*errp = error;
3907 	return (error == 0);
3908 }
3909 
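/*
 * Example (editorial sketch, not compiled): a caller that requires a
 * disk device can use the boolean return value and the errno reported
 * through the second argument.  "foofs_check_disk" is a hypothetical
 * name.
 */
#if 0
static int
foofs_check_disk(struct vnode *vp)
{
	int error;

	if (!vn_isdisk(vp, &error))
		return (error);	/* ENOTBLK or ENXIO */
	/* vp is a VCHR vnode backed by a D_DISK device. */
	return (0);
}
#endif
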
3910 /*
3911  * Common filesystem object access control check routine.  Accepts a
3912  * vnode's type, "mode", uid and gid, requested access mode, credentials,
3913  * and optional call-by-reference privused argument allowing vaccess()
3914  * to indicate to the caller whether privilege was used to satisfy the
3915  * request (obsoleted).  Returns 0 on success, or an errno on failure.
3916  */
3917 int
3918 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3919     accmode_t accmode, struct ucred *cred, int *privused)
3920 {
3921 	accmode_t dac_granted;
3922 	accmode_t priv_granted;
3923 
3924 	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3925 	    ("invalid bit in accmode"));
3926 	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3927 	    ("VAPPEND without VWRITE"));
3928 
3929 	/*
3930 	 * Look for a normal, non-privileged way to access the file/directory
3931 	 * as requested.  If it exists, go with that.
3932 	 */
3933 
3934 	if (privused != NULL)
3935 		*privused = 0;
3936 
3937 	dac_granted = 0;
3938 
3939 	/* Check the owner. */
3940 	if (cred->cr_uid == file_uid) {
3941 		dac_granted |= VADMIN;
3942 		if (file_mode & S_IXUSR)
3943 			dac_granted |= VEXEC;
3944 		if (file_mode & S_IRUSR)
3945 			dac_granted |= VREAD;
3946 		if (file_mode & S_IWUSR)
3947 			dac_granted |= (VWRITE | VAPPEND);
3948 
3949 		if ((accmode & dac_granted) == accmode)
3950 			return (0);
3951 
3952 		goto privcheck;
3953 	}
3954 
3955 	/* Otherwise, check the groups (first match) */
3956 	if (groupmember(file_gid, cred)) {
3957 		if (file_mode & S_IXGRP)
3958 			dac_granted |= VEXEC;
3959 		if (file_mode & S_IRGRP)
3960 			dac_granted |= VREAD;
3961 		if (file_mode & S_IWGRP)
3962 			dac_granted |= (VWRITE | VAPPEND);
3963 
3964 		if ((accmode & dac_granted) == accmode)
3965 			return (0);
3966 
3967 		goto privcheck;
3968 	}
3969 
3970 	/* Otherwise, check everyone else. */
3971 	if (file_mode & S_IXOTH)
3972 		dac_granted |= VEXEC;
3973 	if (file_mode & S_IROTH)
3974 		dac_granted |= VREAD;
3975 	if (file_mode & S_IWOTH)
3976 		dac_granted |= (VWRITE | VAPPEND);
3977 	if ((accmode & dac_granted) == accmode)
3978 		return (0);
3979 
3980 privcheck:
3981 	/*
3982 	 * Build a privilege mask to determine if the set of privileges
3983 	 * satisfies the requirements when combined with the granted mask
3984 	 * from above.  For each privilege, if the privilege is required,
3985 	 * bitwise or the request type onto the priv_granted mask.
3986 	 */
3987 	priv_granted = 0;
3988 
3989 	if (type == VDIR) {
3990 		/*
3991 		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3992 		 * requests, instead of PRIV_VFS_EXEC.
3993 		 */
3994 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3995 		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3996 			priv_granted |= VEXEC;
3997 	} else {
3998 		/*
3999 		 * Ensure that at least one execute bit is on. Otherwise,
4000 		 * a privileged user will always succeed, and we don't want
4001 		 * this to happen unless the file really is executable.
4002 		 */
4003 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
4004 		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
4005 		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
4006 			priv_granted |= VEXEC;
4007 	}
4008 
4009 	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
4010 	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
4011 		priv_granted |= VREAD;
4012 
4013 	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
4014 	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
4015 		priv_granted |= (VWRITE | VAPPEND);
4016 
4017 	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
4018 	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
4019 		priv_granted |= VADMIN;
4020 
4021 	if ((accmode & (priv_granted | dac_granted)) == accmode) {
4022 		/* XXX audit: privilege used */
4023 		if (privused != NULL)
4024 			*privused = 1;
4025 		return (0);
4026 	}
4027 
4028 	return ((accmode & VADMIN) ? EPERM : EACCES);
4029 }
4030 
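/*
 * Example (editorial sketch, not compiled): a typical VOP_ACCESS
 * implementation hands its on-disk ownership and mode bits to
 * vaccess().  "foofs_access" and "struct foofs_node" are hypothetical
 * names.
 */
#if 0
static int
foofs_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct foofs_node *np = vp->v_data;

	return (vaccess(vp->v_type, np->fn_mode, np->fn_uid, np->fn_gid,
	    ap->a_accmode, ap->a_cred, NULL));
}
#endif
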
4031 /*
4032  * Credential check based on process requesting service, and per-attribute
4033  * permissions.
4034  */
4035 int
4036 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
4037     struct thread *td, accmode_t accmode)
4038 {
4039 
4040 	/*
4041 	 * Kernel-invoked requests always succeed.
4042 	 */
4043 	if (cred == NOCRED)
4044 		return (0);
4045 
4046 	/*
4047 	 * Do not allow privileged processes in jail to directly manipulate
4048 	 * system attributes.
4049 	 */
4050 	switch (attrnamespace) {
4051 	case EXTATTR_NAMESPACE_SYSTEM:
4052 		/* Potentially should be: return (EPERM); */
4053 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
4054 	case EXTATTR_NAMESPACE_USER:
4055 		return (VOP_ACCESS(vp, accmode, cred, td));
4056 	default:
4057 		return (EPERM);
4058 	}
4059 }
4060 
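/*
 * Example (editorial sketch, not compiled): a VOP_GETEXTATTR
 * implementation would normally gate the request with
 * extattr_check_cred() before touching attribute data; a setter would
 * pass VWRITE instead of VREAD.  "foofs_getextattr" is a hypothetical
 * name.
 */
#if 0
static int
foofs_getextattr(struct vop_getextattr_args *ap)
{
	int error;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);
	/* ... copy the attribute named ap->a_name into ap->a_uio ... */
	return (0);
}
#endif
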
4061 #ifdef DEBUG_VFS_LOCKS
4062 /*
4063  * This only exists to suppress warnings from unlocked specfs accesses.  It is
4064  * no longer ok to have an unlocked VFS.
4065  */
4066 #define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
4067 	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
4068 
4069 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
4070 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
4071     "Drop into debugger on lock violation");
4072 
4073 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
4074 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
4075     0, "Check for interlock across VOPs");
4076 
4077 int vfs_badlock_print = 1;	/* Print lock violations. */
4078 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
4079     0, "Print lock violations");
4080 
4081 #ifdef KDB
4082 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
4083 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
4084     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
4085 #endif
4086 
4087 static void
4088 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
4089 {
4090 
4091 #ifdef KDB
4092 	if (vfs_badlock_backtrace)
4093 		kdb_backtrace();
4094 #endif
4095 	if (vfs_badlock_print)
4096 		printf("%s: %p %s\n", str, (void *)vp, msg);
4097 	if (vfs_badlock_ddb)
4098 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4099 }
4100 
4101 void
4102 assert_vi_locked(struct vnode *vp, const char *str)
4103 {
4104 
4105 	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
4106 		vfs_badlock("interlock is not locked but should be", str, vp);
4107 }
4108 
4109 void
4110 assert_vi_unlocked(struct vnode *vp, const char *str)
4111 {
4112 
4113 	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
4114 		vfs_badlock("interlock is locked but should not be", str, vp);
4115 }
4116 
4117 void
4118 assert_vop_locked(struct vnode *vp, const char *str)
4119 {
4120 	int locked;
4121 
4122 	if (!IGNORE_LOCK(vp)) {
4123 		locked = VOP_ISLOCKED(vp);
4124 		if (locked == 0 || locked == LK_EXCLOTHER)
4125 			vfs_badlock("is not locked but should be", str, vp);
4126 	}
4127 }
4128 
4129 void
4130 assert_vop_unlocked(struct vnode *vp, const char *str)
4131 {
4132 
4133 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4134 		vfs_badlock("is locked but should not be", str, vp);
4135 }
4136 
4137 void
4138 assert_vop_elocked(struct vnode *vp, const char *str)
4139 {
4140 
4141 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4142 		vfs_badlock("is not exclusive locked but should be", str, vp);
4143 }
4144 
4145 #if 0
4146 void
4147 assert_vop_elocked_other(struct vnode *vp, const char *str)
4148 {
4149 
4150 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
4151 		vfs_badlock("is not exclusive locked by another thread",
4152 		    str, vp);
4153 }
4154 
4155 void
4156 assert_vop_slocked(struct vnode *vp, const char *str)
4157 {
4158 
4159 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
4160 		vfs_badlock("is not locked shared but should be", str, vp);
4161 }
4162 #endif /* 0 */
4163 #endif /* DEBUG_VFS_LOCKS */
4164 
4165 void
4166 vop_rename_fail(struct vop_rename_args *ap)
4167 {
4168 
4169 	if (ap->a_tvp != NULL)
4170 		vput(ap->a_tvp);
4171 	if (ap->a_tdvp == ap->a_tvp)
4172 		vrele(ap->a_tdvp);
4173 	else
4174 		vput(ap->a_tdvp);
4175 	vrele(ap->a_fdvp);
4176 	vrele(ap->a_fvp);
4177 }
4178 
4179 void
4180 vop_rename_pre(void *ap)
4181 {
4182 	struct vop_rename_args *a = ap;
4183 
4184 #ifdef DEBUG_VFS_LOCKS
4185 	if (a->a_tvp)
4186 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4187 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4188 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4189 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4190 
4191 	/* Check the source (from). */
4192 	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4193 	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4194 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4195 	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4196 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4197 
4198 	/* Check the target. */
4199 	if (a->a_tvp)
4200 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4201 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4202 #endif
4203 	if (a->a_tdvp != a->a_fdvp)
4204 		vhold(a->a_fdvp);
4205 	if (a->a_tvp != a->a_fvp)
4206 		vhold(a->a_fvp);
4207 	vhold(a->a_tdvp);
4208 	if (a->a_tvp)
4209 		vhold(a->a_tvp);
4210 }
4211 
4212 void
4213 vop_strategy_pre(void *ap)
4214 {
4215 #ifdef DEBUG_VFS_LOCKS
4216 	struct vop_strategy_args *a;
4217 	struct buf *bp;
4218 
4219 	a = ap;
4220 	bp = a->a_bp;
4221 
4222 	/*
4223 	 * Cluster ops lock their component buffers but not the IO container.
4224 	 */
4225 	if ((bp->b_flags & B_CLUSTER) != 0)
4226 		return;
4227 
4228 	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
4229 		if (vfs_badlock_print)
4230 			printf(
4231 			    "VOP_STRATEGY: bp is not locked but should be\n");
4232 		if (vfs_badlock_ddb)
4233 			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4234 	}
4235 #endif
4236 }
4237 
4238 void
4239 vop_lock_pre(void *ap)
4240 {
4241 #ifdef DEBUG_VFS_LOCKS
4242 	struct vop_lock1_args *a = ap;
4243 
4244 	if ((a->a_flags & LK_INTERLOCK) == 0)
4245 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4246 	else
4247 		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
4248 #endif
4249 }
4250 
4251 void
4252 vop_lock_post(void *ap, int rc)
4253 {
4254 #ifdef DEBUG_VFS_LOCKS
4255 	struct vop_lock1_args *a = ap;
4256 
4257 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4258 	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
4259 		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
4260 #endif
4261 }
4262 
4263 void
4264 vop_unlock_pre(void *ap)
4265 {
4266 #ifdef DEBUG_VFS_LOCKS
4267 	struct vop_unlock_args *a = ap;
4268 
4269 	if (a->a_flags & LK_INTERLOCK)
4270 		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
4271 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
4272 #endif
4273 }
4274 
4275 void
4276 vop_unlock_post(void *ap, int rc)
4277 {
4278 #ifdef DEBUG_VFS_LOCKS
4279 	struct vop_unlock_args *a = ap;
4280 
4281 	if (a->a_flags & LK_INTERLOCK)
4282 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
4283 #endif
4284 }
4285 
4286 void
4287 vop_create_post(void *ap, int rc)
4288 {
4289 	struct vop_create_args *a = ap;
4290 
4291 	if (!rc)
4292 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4293 }
4294 
4295 void
4296 vop_deleteextattr_post(void *ap, int rc)
4297 {
4298 	struct vop_deleteextattr_args *a = ap;
4299 
4300 	if (!rc)
4301 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4302 }
4303 
4304 void
4305 vop_link_post(void *ap, int rc)
4306 {
4307 	struct vop_link_args *a = ap;
4308 
4309 	if (!rc) {
4310 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4311 		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4312 	}
4313 }
4314 
4315 void
4316 vop_mkdir_post(void *ap, int rc)
4317 {
4318 	struct vop_mkdir_args *a = ap;
4319 
4320 	if (!rc)
4321 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4322 }
4323 
4324 void
4325 vop_mknod_post(void *ap, int rc)
4326 {
4327 	struct vop_mknod_args *a = ap;
4328 
4329 	if (!rc)
4330 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4331 }
4332 
4333 void
4334 vop_remove_post(void *ap, int rc)
4335 {
4336 	struct vop_remove_args *a = ap;
4337 
4338 	if (!rc) {
4339 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4340 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4341 	}
4342 }
4343 
4344 void
4345 vop_rename_post(void *ap, int rc)
4346 {
4347 	struct vop_rename_args *a = ap;
4348 
4349 	if (!rc) {
4350 		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4351 		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4352 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4353 		if (a->a_tvp)
4354 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4355 	}
4356 	if (a->a_tdvp != a->a_fdvp)
4357 		vdrop(a->a_fdvp);
4358 	if (a->a_tvp != a->a_fvp)
4359 		vdrop(a->a_fvp);
4360 	vdrop(a->a_tdvp);
4361 	if (a->a_tvp)
4362 		vdrop(a->a_tvp);
4363 }
4364 
4365 void
4366 vop_rmdir_post(void *ap, int rc)
4367 {
4368 	struct vop_rmdir_args *a = ap;
4369 
4370 	if (!rc) {
4371 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4372 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4373 	}
4374 }
4375 
4376 void
4377 vop_setattr_post(void *ap, int rc)
4378 {
4379 	struct vop_setattr_args *a = ap;
4380 
4381 	if (!rc)
4382 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4383 }
4384 
4385 void
4386 vop_setextattr_post(void *ap, int rc)
4387 {
4388 	struct vop_setextattr_args *a = ap;
4389 
4390 	if (!rc)
4391 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4392 }
4393 
4394 void
4395 vop_symlink_post(void *ap, int rc)
4396 {
4397 	struct vop_symlink_args *a = ap;
4398 
4399 	if (!rc)
4400 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4401 }
4402 
4403 static struct knlist fs_knlist;
4404 
4405 static void
4406 vfs_event_init(void *arg)
4407 {
4408 	knlist_init_mtx(&fs_knlist, NULL);
4409 }
4410 /* XXX - correct order? */
4411 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4412 
4413 void
4414 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4415 {
4416 
4417 	KNOTE_UNLOCKED(&fs_knlist, event);
4418 }
4419 
4420 static int	filt_fsattach(struct knote *kn);
4421 static void	filt_fsdetach(struct knote *kn);
4422 static int	filt_fsevent(struct knote *kn, long hint);
4423 
4424 struct filterops fs_filtops = {
4425 	.f_isfd = 0,
4426 	.f_attach = filt_fsattach,
4427 	.f_detach = filt_fsdetach,
4428 	.f_event = filt_fsevent
4429 };
4430 
4431 static int
4432 filt_fsattach(struct knote *kn)
4433 {
4434 
4435 	kn->kn_flags |= EV_CLEAR;
4436 	knlist_add(&fs_knlist, kn, 0);
4437 	return (0);
4438 }
4439 
4440 static void
4441 filt_fsdetach(struct knote *kn)
4442 {
4443 
4444 	knlist_remove(&fs_knlist, kn, 0);
4445 }
4446 
4447 static int
4448 filt_fsevent(struct knote *kn, long hint)
4449 {
4450 
4451 	kn->kn_fflags |= hint;
4452 	return (kn->kn_fflags != 0);
4453 }
4454 
4455 static int
4456 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4457 {
4458 	struct vfsidctl vc;
4459 	int error;
4460 	struct mount *mp;
4461 
4462 	error = SYSCTL_IN(req, &vc, sizeof(vc));
4463 	if (error)
4464 		return (error);
4465 	if (vc.vc_vers != VFS_CTL_VERS1)
4466 		return (EINVAL);
4467 	mp = vfs_getvfs(&vc.vc_fsid);
4468 	if (mp == NULL)
4469 		return (ENOENT);
4470 	/* ensure that a specific sysctl goes to the right filesystem. */
4471 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4472 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4473 		vfs_rel(mp);
4474 		return (EINVAL);
4475 	}
4476 	VCTLTOREQ(&vc, req);
4477 	error = VFS_SYSCTL(mp, vc.vc_op, req);
4478 	vfs_rel(mp);
4479 	return (error);
4480 }
4481 
4482 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4483     NULL, 0, sysctl_vfs_ctl, "",
4484     "Sysctl by fsid");
4485 
4486 /*
4487  * Function to initialize a va_filerev field sensibly.
4488  * XXX: Wouldn't a random number make a lot more sense ??
4489  */
4490 u_quad_t
4491 init_va_filerev(void)
4492 {
4493 	struct bintime bt;
4494 
4495 	getbinuptime(&bt);
4496 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4497 }
4498 
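/*
 * Example (editorial sketch, not compiled): a filesystem with no
 * persistent change counter can seed a per-node revision when the
 * in-memory node is created, bump it on each modification, and report
 * it through va_filerev.  "struct foofs_node" is a hypothetical
 * structure.
 */
#if 0
struct foofs_node {
	u_quad_t	fn_modrev;	/* returned as va_filerev */
};

static void
foofs_node_init(struct foofs_node *np)
{

	np->fn_modrev = init_va_filerev();
}
#endif
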
4499 static int	filt_vfsread(struct knote *kn, long hint);
4500 static int	filt_vfswrite(struct knote *kn, long hint);
4501 static int	filt_vfsvnode(struct knote *kn, long hint);
4502 static void	filt_vfsdetach(struct knote *kn);
4503 static struct filterops vfsread_filtops = {
4504 	.f_isfd = 1,
4505 	.f_detach = filt_vfsdetach,
4506 	.f_event = filt_vfsread
4507 };
4508 static struct filterops vfswrite_filtops = {
4509 	.f_isfd = 1,
4510 	.f_detach = filt_vfsdetach,
4511 	.f_event = filt_vfswrite
4512 };
4513 static struct filterops vfsvnode_filtops = {
4514 	.f_isfd = 1,
4515 	.f_detach = filt_vfsdetach,
4516 	.f_event = filt_vfsvnode
4517 };
4518 
4519 static void
4520 vfs_knllock(void *arg)
4521 {
4522 	struct vnode *vp = arg;
4523 
4524 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4525 }
4526 
4527 static void
4528 vfs_knlunlock(void *arg)
4529 {
4530 	struct vnode *vp = arg;
4531 
4532 	VOP_UNLOCK(vp, 0);
4533 }
4534 
4535 static void
4536 vfs_knl_assert_locked(void *arg)
4537 {
4538 #ifdef DEBUG_VFS_LOCKS
4539 	struct vnode *vp = arg;
4540 
4541 	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4542 #endif
4543 }
4544 
4545 static void
4546 vfs_knl_assert_unlocked(void *arg)
4547 {
4548 #ifdef DEBUG_VFS_LOCKS
4549 	struct vnode *vp = arg;
4550 
4551 	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4552 #endif
4553 }
4554 
4555 int
4556 vfs_kqfilter(struct vop_kqfilter_args *ap)
4557 {
4558 	struct vnode *vp = ap->a_vp;
4559 	struct knote *kn = ap->a_kn;
4560 	struct knlist *knl;
4561 
4562 	switch (kn->kn_filter) {
4563 	case EVFILT_READ:
4564 		kn->kn_fop = &vfsread_filtops;
4565 		break;
4566 	case EVFILT_WRITE:
4567 		kn->kn_fop = &vfswrite_filtops;
4568 		break;
4569 	case EVFILT_VNODE:
4570 		kn->kn_fop = &vfsvnode_filtops;
4571 		break;
4572 	default:
4573 		return (EINVAL);
4574 	}
4575 
4576 	kn->kn_hook = (caddr_t)vp;
4577 
4578 	v_addpollinfo(vp);
4579 	if (vp->v_pollinfo == NULL)
4580 		return (ENOMEM);
4581 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4582 	vhold(vp);
4583 	knlist_add(knl, kn, 0);
4584 
4585 	return (0);
4586 }
4587 
4588 /*
4589  * Detach knote from vnode
4590  */
4591 static void
4592 filt_vfsdetach(struct knote *kn)
4593 {
4594 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4595 
4596 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4597 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4598 	vdrop(vp);
4599 }
4600 
4601 /*ARGSUSED*/
4602 static int
4603 filt_vfsread(struct knote *kn, long hint)
4604 {
4605 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4606 	struct vattr va;
4607 	int res;
4608 
4609 	/*
4610 	 * The filesystem is gone, so set the EOF flag and schedule
4611 	 * the knote for deletion.
4612 	 */
4613 	if (hint == NOTE_REVOKE) {
4614 		VI_LOCK(vp);
4615 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4616 		VI_UNLOCK(vp);
4617 		return (1);
4618 	}
4619 
4620 	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4621 		return (0);
4622 
4623 	VI_LOCK(vp);
4624 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4625 	res = (kn->kn_data != 0);
4626 	VI_UNLOCK(vp);
4627 	return (res);
4628 }
4629 
4630 /*ARGSUSED*/
4631 static int
4632 filt_vfswrite(struct knote *kn, long hint)
4633 {
4634 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4635 
4636 	VI_LOCK(vp);
4637 
4638 	/*
4639 	 * The filesystem is gone, so set the EOF flag and schedule
4640 	 * the knote for deletion.
4641 	 */
4642 	if (hint == NOTE_REVOKE)
4643 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4644 
4645 	kn->kn_data = 0;
4646 	VI_UNLOCK(vp);
4647 	return (1);
4648 }
4649 
4650 static int
4651 filt_vfsvnode(struct knote *kn, long hint)
4652 {
4653 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4654 	int res;
4655 
4656 	VI_LOCK(vp);
4657 	if (kn->kn_sfflags & hint)
4658 		kn->kn_fflags |= hint;
4659 	if (hint == NOTE_REVOKE) {
4660 		kn->kn_flags |= EV_EOF;
4661 		VI_UNLOCK(vp);
4662 		return (1);
4663 	}
4664 	res = (kn->kn_fflags != 0);
4665 	VI_UNLOCK(vp);
4666 	return (res);
4667 }
4668 
4669 int
4670 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4671 {
4672 	int error;
4673 
4674 	if (dp->d_reclen > ap->a_uio->uio_resid)
4675 		return (ENAMETOOLONG);
4676 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4677 	if (error) {
4678 		if (ap->a_ncookies != NULL) {
4679 			if (ap->a_cookies != NULL)
4680 				free(ap->a_cookies, M_TEMP);
4681 			ap->a_cookies = NULL;
4682 			*ap->a_ncookies = 0;
4683 		}
4684 		return (error);
4685 	}
4686 	if (ap->a_ncookies == NULL)
4687 		return (0);
4688 
4689 	KASSERT(ap->a_cookies,
4690 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4691 
4692 	*ap->a_cookies = realloc(*ap->a_cookies,
4693 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4694 	(*ap->a_cookies)[*ap->a_ncookies] = off;
4695 	return (0);
4696 }
4697 
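/*
 * Example (editorial sketch, not compiled): a VOP_READDIR
 * implementation can feed each entry to vfs_read_dirent(), which
 * copies it out and records the matching seek cookie when cookies were
 * requested.  "foofs_readdir" and "foofs_next_entry" are hypothetical
 * names.
 */
#if 0
static int
foofs_readdir(struct vop_readdir_args *ap)
{
	struct dirent de;
	off_t off;

	off = ap->a_uio->uio_offset;
	while (foofs_next_entry(ap->a_vp, &off, &de) == 0) {
		/* Typically fails with ENAMETOOLONG once the buffer fills. */
		if (vfs_read_dirent(ap, &de, off) != 0)
			break;
		ap->a_uio->uio_offset = off;
	}
	return (0);
}
#endif
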
4698 /*
4699  * Mark for update the access time of the file if the filesystem
4700  * Mark the file's access time for update if the filesystem
4701  * mmap, so we want to avoid the I/O implied by directly setting
4702  * va_atime for the sake of efficiency.
4703  */
4704 void
4705 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4706 {
4707 	struct mount *mp;
4708 
4709 	mp = vp->v_mount;
4710 	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4711 	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4712 		(void)VOP_MARKATIME(vp);
4713 }
4714 
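/*
 * Example (editorial sketch, not compiled): a mapping-style consumer
 * marks the access time on the locked vnode instead of issuing a
 * synchronous VOP_SETATTR.  "foofs_note_mapping" is a hypothetical
 * name.
 */
#if 0
static void
foofs_note_mapping(struct vnode *vp, struct ucred *cred)
{

	vn_lock(vp, LK_SHARED | LK_RETRY);
	vfs_mark_atime(vp, cred);
	VOP_UNLOCK(vp, 0);
}
#endif
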
4715 /*
4716  * The purpose of this routine is to remove granularity from accmode_t,
4717  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4718  * VADMIN and VAPPEND.
4719  *
4720  * If it returns 0, the caller is supposed to continue with the usual
4721  * access checks using 'accmode' as modified by this routine.  If it
4722  * returns a nonzero value, the caller is supposed to return that value
4723  * as errno.
4724  *
4725  * Note that after this routine runs, accmode may be zero.
4726  */
4727 int
4728 vfs_unixify_accmode(accmode_t *accmode)
4729 {
4730 	/*
4731 	 * There is no way to specify explicit "deny" rule using
4732 	 * file mode or POSIX.1e ACLs.
4733 	 */
4734 	if (*accmode & VEXPLICIT_DENY) {
4735 		*accmode = 0;
4736 		return (0);
4737 	}
4738 
4739 	/*
4740 	 * None of these can be translated into usual access bits.
4741 	 * Also, the common case for NFSv4 ACLs is to not contain
4742 	 * either of these bits. Caller should check for VWRITE
4743 	 * on the containing directory instead.
4744 	 */
4745 	if (*accmode & (VDELETE_CHILD | VDELETE))
4746 		return (EPERM);
4747 
4748 	if (*accmode & VADMIN_PERMS) {
4749 		*accmode &= ~VADMIN_PERMS;
4750 		*accmode |= VADMIN;
4751 	}
4752 
4753 	/*
4754 	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4755 	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4756 	 */
4757 	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4758 
4759 	return (0);
4760 }
4761 
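/*
 * Example (editorial sketch, not compiled): the calling convention
 * described above.  An access check that accepts NFSv4-style bits
 * first reduces them, returns any hard error, short-circuits when
 * nothing is left to check, and otherwise continues with the usual
 * UNIX-style evaluation.  "foofs_check_access" is a hypothetical name.
 */
#if 0
static int
foofs_check_access(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid,
    accmode_t accmode, struct ucred *cred)
{
	int error;

	error = vfs_unixify_accmode(&accmode);
	if (error != 0)
		return (error);
	if (accmode == 0)
		return (0);
	return (vaccess(vp->v_type, mode, uid, gid, accmode, cred, NULL));
}
#endif
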
4762 /*
4763  * These are helper functions for filesystems to traverse all
4764  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4765  *
4766  * This interface replaces MNT_VNODE_FOREACH.
4767  */
4768 
4769 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4770 
4771 struct vnode *
4772 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4773 {
4774 	struct vnode *vp;
4775 
4776 	if (should_yield())
4777 		kern_yield(PRI_USER);
4778 	MNT_ILOCK(mp);
4779 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4780 	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4781 	while (vp != NULL && (vp->v_type == VMARKER ||
4782 	    (vp->v_iflag & VI_DOOMED) != 0))
4783 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4784 
4785 	/* Check if we are done */
4786 	if (vp == NULL) {
4787 		__mnt_vnode_markerfree_all(mvp, mp);
4788 		/* MNT_IUNLOCK(mp); -- done in above function */
4789 		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4790 		return (NULL);
4791 	}
4792 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4793 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4794 	VI_LOCK(vp);
4795 	MNT_IUNLOCK(mp);
4796 	return (vp);
4797 }
4798 
4799 struct vnode *
4800 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4801 {
4802 	struct vnode *vp;
4803 
4804 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4805 	MNT_ILOCK(mp);
4806 	MNT_REF(mp);
4807 	(*mvp)->v_type = VMARKER;
4808 
4809 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4810 	while (vp != NULL && (vp->v_type == VMARKER ||
4811 	    (vp->v_iflag & VI_DOOMED) != 0))
4812 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4813 
4814 	/* Check if we are done */
4815 	if (vp == NULL) {
4816 		MNT_REL(mp);
4817 		MNT_IUNLOCK(mp);
4818 		free(*mvp, M_VNODE_MARKER);
4819 		*mvp = NULL;
4820 		return (NULL);
4821 	}
4822 	(*mvp)->v_mount = mp;
4823 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4824 	VI_LOCK(vp);
4825 	MNT_IUNLOCK(mp);
4826 	return (vp);
4827 }
4828 
4829 
4830 void
4831 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4832 {
4833 
4834 	if (*mvp == NULL) {
4835 		MNT_IUNLOCK(mp);
4836 		return;
4837 	}
4838 
4839 	mtx_assert(MNT_MTX(mp), MA_OWNED);
4840 
4841 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4842 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4843 	MNT_REL(mp);
4844 	MNT_IUNLOCK(mp);
4845 	free(*mvp, M_VNODE_MARKER);
4846 	*mvp = NULL;
4847 }
4848 
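/*
 * Example (editorial sketch, not compiled): the iteration pattern the
 * helpers above support.  Each vnode is returned with its interlock
 * held, so the loop body must either VI_UNLOCK() it or pass
 * LK_INTERLOCK to vget().  "foofs_scan" is a hypothetical name.
 */
#if 0
static void
foofs_scan(struct mount *mp)
{
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		if (vp->v_type != VREG) {
			VI_UNLOCK(vp);
			continue;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) != 0)
			continue;
		/* ... work on the locked, referenced vnode ... */
		vput(vp);
	}
}
#endif
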
4849 /*
4850  * These are helper functions for filesystems to traverse their
4851  * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
4852  */
4853 static void
4854 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4855 {
4856 
4857 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4858 
4859 	MNT_ILOCK(mp);
4860 	MNT_REL(mp);
4861 	MNT_IUNLOCK(mp);
4862 	free(*mvp, M_VNODE_MARKER);
4863 	*mvp = NULL;
4864 }
4865 
4866 static struct vnode *
4867 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4868 {
4869 	struct vnode *vp, *nvp;
4870 
4871 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
4872 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4873 restart:
4874 	vp = TAILQ_NEXT(*mvp, v_actfreelist);
4875 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4876 	while (vp != NULL) {
4877 		if (vp->v_type == VMARKER) {
4878 			vp = TAILQ_NEXT(vp, v_actfreelist);
4879 			continue;
4880 		}
4881 		if (!VI_TRYLOCK(vp)) {
4882 			if (mp_ncpus == 1 || should_yield()) {
4883 				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4884 				mtx_unlock(&vnode_free_list_mtx);
4885 				pause("vnacti", 1);
4886 				mtx_lock(&vnode_free_list_mtx);
4887 				goto restart;
4888 			}
4889 			continue;
4890 		}
4891 		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
4892 		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
4893 		    ("alien vnode on the active list %p %p", vp, mp));
4894 		if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
4895 			break;
4896 		nvp = TAILQ_NEXT(vp, v_actfreelist);
4897 		VI_UNLOCK(vp);
4898 		vp = nvp;
4899 	}
4900 
4901 	/* Check if we are done */
4902 	if (vp == NULL) {
4903 		mtx_unlock(&vnode_free_list_mtx);
4904 		mnt_vnode_markerfree_active(mvp, mp);
4905 		return (NULL);
4906 	}
4907 	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
4908 	mtx_unlock(&vnode_free_list_mtx);
4909 	ASSERT_VI_LOCKED(vp, "active iter");
4910 	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
4911 	return (vp);
4912 }
4913 
4914 struct vnode *
4915 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4916 {
4917 
4918 	if (should_yield())
4919 		kern_yield(PRI_USER);
4920 	mtx_lock(&vnode_free_list_mtx);
4921 	return (mnt_vnode_next_active(mvp, mp));
4922 }
4923 
4924 struct vnode *
4925 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
4926 {
4927 	struct vnode *vp;
4928 
4929 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4930 	MNT_ILOCK(mp);
4931 	MNT_REF(mp);
4932 	MNT_IUNLOCK(mp);
4933 	(*mvp)->v_type = VMARKER;
4934 	(*mvp)->v_mount = mp;
4935 
4936 	mtx_lock(&vnode_free_list_mtx);
4937 	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
4938 	if (vp == NULL) {
4939 		mtx_unlock(&vnode_free_list_mtx);
4940 		mnt_vnode_markerfree_active(mvp, mp);
4941 		return (NULL);
4942 	}
4943 	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4944 	return (mnt_vnode_next_active(mvp, mp));
4945 }
4946 
4947 void
4948 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4949 {
4950 
4951 	if (*mvp == NULL)
4952 		return;
4953 
4954 	mtx_lock(&vnode_free_list_mtx);
4955 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4956 	mtx_unlock(&vnode_free_list_mtx);
4957 	mnt_vnode_markerfree_active(mvp, mp);
4958 }
4959