xref: /freebsd/sys/kern/vfs_subr.c (revision fe50a38eb029e6e551fef9f6cf6ffa515f1897fa)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35  */
36 
37 /*
38  * External virtual filesystem routines
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_compat.h"
45 #include "opt_ddb.h"
46 #include "opt_watchdog.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/bio.h>
51 #include <sys/buf.h>
52 #include <sys/condvar.h>
53 #include <sys/conf.h>
54 #include <sys/dirent.h>
55 #include <sys/event.h>
56 #include <sys/eventhandler.h>
57 #include <sys/extattr.h>
58 #include <sys/file.h>
59 #include <sys/fcntl.h>
60 #include <sys/jail.h>
61 #include <sys/kdb.h>
62 #include <sys/kernel.h>
63 #include <sys/kthread.h>
64 #include <sys/lockf.h>
65 #include <sys/malloc.h>
66 #include <sys/mount.h>
67 #include <sys/namei.h>
68 #include <sys/pctrie.h>
69 #include <sys/priv.h>
70 #include <sys/reboot.h>
71 #include <sys/refcount.h>
72 #include <sys/rwlock.h>
73 #include <sys/sched.h>
74 #include <sys/sleepqueue.h>
75 #include <sys/smp.h>
76 #include <sys/stat.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/vmmeter.h>
80 #include <sys/vnode.h>
81 #include <sys/watchdog.h>
82 
83 #include <machine/stdarg.h>
84 
85 #include <security/mac/mac_framework.h>
86 
87 #include <vm/vm.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_extern.h>
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_kern.h>
94 #include <vm/uma.h>
95 
96 #ifdef DDB
97 #include <ddb/ddb.h>
98 #endif
99 
100 static void	delmntque(struct vnode *vp);
101 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
102 		    int slpflag, int slptimeo);
103 static void	syncer_shutdown(void *arg, int howto);
104 static int	vtryrecycle(struct vnode *vp);
105 static void	v_init_counters(struct vnode *);
106 static void	v_incr_usecount(struct vnode *);
107 static void	v_incr_devcount(struct vnode *);
108 static void	v_decr_devcount(struct vnode *);
109 static void	vnlru_free(int);
110 static void	vgonel(struct vnode *);
111 static void	vfs_knllock(void *arg);
112 static void	vfs_knlunlock(void *arg);
113 static void	vfs_knl_assert_locked(void *arg);
114 static void	vfs_knl_assert_unlocked(void *arg);
115 static void	destroy_vpollinfo(struct vpollinfo *vi);
116 
117 /*
118  * Number of vnodes in existence.  Increased whenever getnewvnode()
119  * allocates a new vnode, decreased in vdropl() for a VI_DOOMED vnode.
120  */
121 static unsigned long	numvnodes;
122 
123 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
124     "Number of vnodes in existence");
125 
126 static u_long vnodes_created;
127 SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
128     0, "Number of vnodes created by getnewvnode");
129 
130 /*
131  * Conversion tables for conversion from vnode types to inode formats
132  * and back.
133  */
134 enum vtype iftovt_tab[16] = {
135 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
136 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
137 };
138 int vttoif_tab[10] = {
139 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
140 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
141 };
142 
143 /*
144  * List of vnodes that are ready for recycling.
145  */
146 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
147 
148 /*
149  * Free vnode target.  Free vnodes may simply be files which have been stat'd
150  * but not read.  This is somewhat common, and a small cache of such files
151  * should be kept to avoid recreation costs.
152  */
153 static u_long wantfreevnodes;
154 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
155 /* Number of vnodes in the free list. */
156 static u_long freevnodes;
157 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
158     "Number of vnodes in the free list");
159 
160 static int vlru_allow_cache_src;
161 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
162     &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
163 
164 static u_long recycles_count;
165 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
166     "Number of vnodes recycled to avoid exceding kern.maxvnodes");
167 
168 /*
169  * Various variables used for debugging the new implementation of
170  * reassignbuf().
171  * XXX these are probably of (very) limited utility now.
172  */
173 static int reassignbufcalls;
174 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
175     "Number of calls to reassignbuf");
176 
177 static u_long free_owe_inact;
178 SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0,
179     "Number of times free vnodes kept on active list due to VFS "
180     "owing inactivation");
181 
182 /* To keep more than one thread at a time from running vfs_getnewfsid */
183 static struct mtx mntid_mtx;
184 
185 /*
186  * Lock for any access to the following:
187  *	vnode_free_list
188  *	numvnodes
189  *	freevnodes
190  */
191 static struct mtx vnode_free_list_mtx;
192 
193 /* Publicly exported FS */
194 struct nfs_public nfs_pub;
195 
196 static uma_zone_t buf_trie_zone;
197 
198 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
199 static uma_zone_t vnode_zone;
200 static uma_zone_t vnodepoll_zone;
201 
202 /*
203  * The workitem queue.
204  *
205  * It is useful to delay writes of file data and filesystem metadata
206  * for tens of seconds so that quickly created and deleted files need
207  * not waste disk bandwidth being created and removed. To realize this,
208  * we append vnodes to a "workitem" queue. When running with a soft
209  * updates implementation, most pending metadata dependencies should
210  * not wait for more than a few seconds.  Thus, filesystems mounted on
211  * block devices are delayed only about half the time that file data is
212  * delayed.  Similarly, directory updates are more critical, so they are
213  * delayed only about a third of that time.  Thus, there are
214  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
215  * one each second (driven off the filesystem syncer process). The
216  * syncer_delayno variable indicates the next queue that is to be processed.
217  * Items that need to be processed soon are placed in this queue:
218  *
219  *	syncer_workitem_pending[syncer_delayno]
220  *
221  * A delay of fifteen seconds is done by placing the request fifteen
222  * entries later in the queue:
223  *
224  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
225  *
226  */
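/*
 * For example (illustrative arithmetic, assuming the default SYNCER_MAXDELAY
 * of 32 so that syncer_mask ends up as 31): a request delayed by 15 seconds
 * while syncer_delayno is 20 lands in slot (20 + 15) & 31 == 3.
 */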
227 static int syncer_delayno;
228 static long syncer_mask;
229 LIST_HEAD(synclist, bufobj);
230 static struct synclist *syncer_workitem_pending;
231 /*
232  * The sync_mtx protects:
233  *	bo->bo_synclist
234  *	sync_vnode_count
235  *	syncer_delayno
236  *	syncer_state
237  *	syncer_workitem_pending
238  *	syncer_worklist_len
239  *	rushjob
240  */
241 static struct mtx sync_mtx;
242 static struct cv sync_wakeup;
243 
244 #define SYNCER_MAXDELAY		32
245 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
246 static int syncdelay = 30;		/* max time to delay syncing data */
247 static int filedelay = 30;		/* time to delay syncing files */
248 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
249     "Time to delay syncing files (in seconds)");
250 static int dirdelay = 29;		/* time to delay syncing directories */
251 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
252     "Time to delay syncing directories (in seconds)");
253 static int metadelay = 28;		/* time to delay syncing metadata */
254 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
255     "Time to delay syncing metadata (in seconds)");
256 static int rushjob;		/* number of slots to run ASAP */
257 static int stat_rush_requests;	/* number of times I/O speeded up */
258 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
259     "Number of times I/O speeded up (rush requests)");
260 
261 /*
262  * When shutting down the syncer, run it at four times normal speed.
263  */
264 #define SYNCER_SHUTDOWN_SPEEDUP		4
265 static int sync_vnode_count;
266 static int syncer_worklist_len;
267 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
268     syncer_state;
269 
270 /*
271  * Number of vnodes we want to exist at any one time.  This is mostly used
272  * to size hash tables in vnode-related code.  It is normally not used in
273  * getnewvnode(), as wantfreevnodes is normally nonzero.
274  *
275  * XXX desiredvnodes is historical cruft and should not exist.
276  */
277 int desiredvnodes;
278 
279 static int
280 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
281 {
282 	int error, old_desiredvnodes;
283 
284 	old_desiredvnodes = desiredvnodes;
285 	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
286 		return (error);
287 	if (old_desiredvnodes != desiredvnodes) {
288 		vfs_hash_changesize(desiredvnodes);
289 		cache_changesize(desiredvnodes);
290 	}
291 	return (0);
292 }
293 
294 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
295     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
296     sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
297 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
298     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
299 static int vnlru_nowhere;
300 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
301     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
302 
303 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
304 static int vnsz2log;
305 
306 /*
307  * Support for the bufobj clean & dirty pctrie.
308  */
309 static void *
310 buf_trie_alloc(struct pctrie *ptree)
311 {
312 
313 	return uma_zalloc(buf_trie_zone, M_NOWAIT);
314 }
315 
316 static void
317 buf_trie_free(struct pctrie *ptree, void *node)
318 {
319 
320 	uma_zfree(buf_trie_zone, node);
321 }
322 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
323 
324 /*
325  * Initialize the vnode management data structures.
326  *
327  * Reevaluate the following cap on the number of vnodes after the physical
328  * memory size exceeds 512GB.  In the limit, as the physical memory size
329  * grows, the ratio of physical pages to vnodes approaches sixteen to one.
330  */
331 #ifndef	MAXVNODES_MAX
332 #define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
333 #endif
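/*
 * Illustrative arithmetic (assuming 4 KB pages): the default cap is
 * 512 * (1024 * 1024 * 1024 / 4096 / 16) = 512 * 16384 = 8,388,608 vnodes,
 * i.e. the value that the sixteen-pages-per-vnode ratio yields at 512GB of
 * physical memory.
 */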
334 static void
335 vntblinit(void *dummy __unused)
336 {
337 	u_int i;
338 	int physvnodes, virtvnodes;
339 
340 	/*
341 	 * Desiredvnodes is a function of the physical memory size and the
342 	 * kernel's heap size.  Generally speaking, it scales with the
343 	 * physical memory size.  The ratio of desiredvnodes to physical pages
344 	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
345 	 * marginal ratio of desiredvnodes to physical pages is one to
346 	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
347 	 * size.  The memory required by desiredvnodes vnodes and vm objects
348 	 * may not exceed one seventh of the kernel's heap size.
349 	 */
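	/*
	 * Illustrative arithmetic (assuming 4 KB pages and 16 GB of RAM,
	 * i.e. 4,194,304 pages): the page-based limit below is roughly
	 * 4194304 / 16 + 3 * min(98304 * 4, 4194304) / 16
	 * = 262144 + 73728 = 335872 vnodes (plus maxproc), before the
	 * kernel-heap limit (virtvnodes) is applied.
	 */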
350 	physvnodes = maxproc + vm_cnt.v_page_count / 16 + 3 * min(98304 * 4,
351 	    vm_cnt.v_page_count) / 16;
352 	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
353 	    sizeof(struct vnode)));
354 	desiredvnodes = min(physvnodes, virtvnodes);
355 	if (desiredvnodes > MAXVNODES_MAX) {
356 		if (bootverbose)
357 			printf("Reducing kern.maxvnodes %d -> %d\n",
358 			    desiredvnodes, MAXVNODES_MAX);
359 		desiredvnodes = MAXVNODES_MAX;
360 	}
361 	wantfreevnodes = desiredvnodes / 4;
362 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
363 	TAILQ_INIT(&vnode_free_list);
364 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
365 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
366 	    NULL, NULL, UMA_ALIGN_PTR, 0);
367 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
368 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
369 	/*
370 	 * Preallocate enough nodes to support one per buf so that
371 	 * we cannot fail an insert.  reassignbuf() callers cannot
372 	 * tolerate insertion failure.
373 	 */
374 	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
375 	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
376 	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
377 	uma_prealloc(buf_trie_zone, nbuf);
378 	/*
379 	 * Initialize the filesystem syncer.
380 	 */
381 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
382 	    &syncer_mask);
383 	syncer_maxdelay = syncer_mask + 1;
384 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
385 	cv_init(&sync_wakeup, "syncer");
386 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
387 		vnsz2log++;
388 	vnsz2log--;
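	/*
	 * Illustrative arithmetic: if sizeof(struct vnode) were 480 bytes,
	 * the loop above would leave vnsz2log at 8 (2^8 <= 480 < 2^9), so
	 * vp->v_hash in getnewvnode() discards the low 8 bits of the vnode
	 * address.
	 */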
389 }
390 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
391 
392 
393 /*
394  * Mark a mount point as busy. Used to synchronize access and to delay
395  * unmounting. Note that mountlist_mtx is not released on failure.
396  *
397  * vfs_busy() is a custom lock; it can block the caller.
398  * vfs_busy() only sleeps if an unmount is active on the mount point.
399  * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
400  * lock of any vnode belonging to mp.
401  *
402  * Lookup uses vfs_busy() to traverse mount points.
403  * root fs			var fs
404  * / vnode lock		A	/ vnode lock (/var)		D
405  * /var vnode lock	B	/log vnode lock(/var/log)	E
406  * vfs_busy lock	C	vfs_busy lock			F
407  *
408  * Within each file system, the lock order is C->A->B and F->D->E.
409  *
410  * When traversing across mounts, the system follows that lock order:
411  *
412  *        C->A->B
413  *              |
414  *              +->F->D->E
415  *
416  * The lookup() process for namei("/var") illustrates the sequence:
417  *  VOP_LOOKUP() obtains B while A is held
418  *  vfs_busy() obtains a shared lock on F while A and B are held
419  *  vput() releases lock on B
420  *  vput() releases lock on A
421  *  VFS_ROOT() obtains lock on D while shared lock on F is held
422  *  vfs_unbusy() releases shared lock on F
423  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
424  *    An attempt to lock A (instead of vp_crossmp) while D is held would
425  *    violate the global order, causing deadlocks.
426  *
427  * dounmount() locks B while F is drained.
428  */
429 int
430 vfs_busy(struct mount *mp, int flags)
431 {
432 
433 	MPASS((flags & ~MBF_MASK) == 0);
434 	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
435 
436 	MNT_ILOCK(mp);
437 	MNT_REF(mp);
438 	/*
439 	 * If the mount point is currently being unmounted, sleep until the
440 	 * mount point's fate is decided.  If the thread doing the unmounting
441 	 * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
442 	 * indicating that this mount point has survived the unmount attempt
443 	 * and vfs_busy should retry.  Otherwise the unmounter thread will set
444 	 * the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating that
445 	 * the mount point is about to be really destroyed.  vfs_busy needs to
446 	 * release its reference on the mount point in this case and return
447 	 * with ENOENT, telling the caller that the mount point it tried to
448 	 * busy is no longer valid.
449 	 */
450 	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
451 		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
452 			MNT_REL(mp);
453 			MNT_IUNLOCK(mp);
454 			CTR1(KTR_VFS, "%s: failed busying before sleeping",
455 			    __func__);
456 			return (ENOENT);
457 		}
458 		if (flags & MBF_MNTLSTLOCK)
459 			mtx_unlock(&mountlist_mtx);
460 		mp->mnt_kern_flag |= MNTK_MWAIT;
461 		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
462 		if (flags & MBF_MNTLSTLOCK)
463 			mtx_lock(&mountlist_mtx);
464 		MNT_ILOCK(mp);
465 	}
466 	if (flags & MBF_MNTLSTLOCK)
467 		mtx_unlock(&mountlist_mtx);
468 	mp->mnt_lockref++;
469 	MNT_IUNLOCK(mp);
470 	return (0);
471 }
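
/*
 * Illustrative sketch of a typical caller (an assumption about usage, not
 * code from this file): busy the mount point, obtain its root vnode, then
 * drop the busy reference.
 *
 *	error = vfs_busy(mp, 0);
 *	if (error == 0) {
 *		error = VFS_ROOT(mp, LK_EXCLUSIVE, &vp);
 *		vfs_unbusy(mp);
 *	}
 */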
472 
473 /*
474  * Free a busy filesystem.
475  */
476 void
477 vfs_unbusy(struct mount *mp)
478 {
479 
480 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
481 	MNT_ILOCK(mp);
482 	MNT_REL(mp);
483 	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
484 	mp->mnt_lockref--;
485 	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
486 		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
487 		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
488 		mp->mnt_kern_flag &= ~MNTK_DRAINING;
489 		wakeup(&mp->mnt_lockref);
490 	}
491 	MNT_IUNLOCK(mp);
492 }
493 
494 /*
495  * Lookup a mount point by filesystem identifier.
496  */
497 struct mount *
498 vfs_getvfs(fsid_t *fsid)
499 {
500 	struct mount *mp;
501 
502 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
503 	mtx_lock(&mountlist_mtx);
504 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
505 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
506 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
507 			vfs_ref(mp);
508 			mtx_unlock(&mountlist_mtx);
509 			return (mp);
510 		}
511 	}
512 	mtx_unlock(&mountlist_mtx);
513 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
514 	return ((struct mount *) 0);
515 }
516 
517 /*
518  * Lookup a mount point by filesystem identifier, busying it before
519  * returning.
520  *
521  * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
522  * cache for popular filesystem identifiers.  The cache is lockless, relying
523  * on the fact that struct mount's are never freed.  In the worst case we may
524  * get a pointer to an unmounted or even a different filesystem, so we have
525  * to check what we got, and take the slow path if so.
526  */
527 struct mount *
528 vfs_busyfs(fsid_t *fsid)
529 {
530 #define	FSID_CACHE_SIZE	256
531 	typedef struct mount * volatile vmp_t;
532 	static vmp_t cache[FSID_CACHE_SIZE];
533 	struct mount *mp;
534 	int error;
535 	uint32_t hash;
536 
537 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
538 	hash = fsid->val[0] ^ fsid->val[1];
539 	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
540 	mp = cache[hash];
541 	if (mp == NULL ||
542 	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
543 	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
544 		goto slow;
545 	if (vfs_busy(mp, 0) != 0) {
546 		cache[hash] = NULL;
547 		goto slow;
548 	}
549 	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
550 	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
551 		return (mp);
552 	else
553 	    vfs_unbusy(mp);
554 
555 slow:
556 	mtx_lock(&mountlist_mtx);
557 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
558 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
559 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
560 			error = vfs_busy(mp, MBF_MNTLSTLOCK);
561 			if (error) {
562 				cache[hash] = NULL;
563 				mtx_unlock(&mountlist_mtx);
564 				return (NULL);
565 			}
566 			cache[hash] = mp;
567 			return (mp);
568 		}
569 	}
570 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
571 	mtx_unlock(&mountlist_mtx);
572 	return ((struct mount *) 0);
573 }
574 
575 /*
576  * Check if a user can access privileged mount options.
577  */
578 int
579 vfs_suser(struct mount *mp, struct thread *td)
580 {
581 	int error;
582 
583 	/*
584 	 * If the thread is jailed, but this is not a jail-friendly file
585 	 * system, deny immediately.
586 	 */
587 	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
588 		return (EPERM);
589 
590 	/*
591 	 * If the file system was mounted outside the jail of the calling
592 	 * thread, deny immediately.
593 	 */
594 	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
595 		return (EPERM);
596 
597 	/*
598 	 * If the file system supports delegated administration, we don't check
599 	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
600 	 * by the file system itself.
601 	 * If this is not the user that did the original mount, we check for
602 	 * the PRIV_VFS_MOUNT_OWNER privilege.
603 	 */
604 	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
605 	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
606 		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
607 			return (error);
608 	}
609 	return (0);
610 }
611 
612 /*
613  * Get a new unique fsid.  Try to make its val[0] unique, since this value
614  * will be used to create fake device numbers for stat().  Also try (but
615  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
616  * support 16-bit device numbers.  We end up with unique val[0]'s for the
617  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
618  *
619  * Keep in mind that several mounts may be running in parallel.  Starting
620  * the search one past where the previous search terminated is both a
621  * micro-optimization and a defense against returning the same fsid to
622  * different mounts.
623  */
624 void
625 vfs_getnewfsid(struct mount *mp)
626 {
627 	static uint16_t mntid_base;
628 	struct mount *nmp;
629 	fsid_t tfsid;
630 	int mtype;
631 
632 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
633 	mtx_lock(&mntid_mtx);
634 	mtype = mp->mnt_vfc->vfc_typenum;
635 	tfsid.val[1] = mtype;
636 	mtype = (mtype & 0xFF) << 24;
637 	for (;;) {
638 		tfsid.val[0] = makedev(255,
639 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
640 		mntid_base++;
641 		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
642 			break;
643 		vfs_rel(nmp);
644 	}
645 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
646 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
647 	mtx_unlock(&mntid_mtx);
648 }
649 
650 /*
651  * Knob to control the precision of file timestamps:
652  *
653  *   0 = seconds only; nanoseconds zeroed.
654  *   1 = seconds and nanoseconds, accurate within 1/HZ.
655  *   2 = seconds and nanoseconds, truncated to microseconds.
656  * >=3 = seconds and nanoseconds, maximum precision.
657  */
658 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
659 
660 static int timestamp_precision = TSP_USEC;
661 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
662     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
663     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
664     "3+: sec + ns (max. precision))");
665 
666 /*
667  * Get a current timestamp.
668  */
669 void
670 vfs_timestamp(struct timespec *tsp)
671 {
672 	struct timeval tv;
673 
674 	switch (timestamp_precision) {
675 	case TSP_SEC:
676 		tsp->tv_sec = time_second;
677 		tsp->tv_nsec = 0;
678 		break;
679 	case TSP_HZ:
680 		getnanotime(tsp);
681 		break;
682 	case TSP_USEC:
683 		microtime(&tv);
684 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
685 		break;
686 	case TSP_NSEC:
687 	default:
688 		nanotime(tsp);
689 		break;
690 	}
691 }
692 
693 /*
694  * Set vnode attributes to VNOVAL
695  */
696 void
697 vattr_null(struct vattr *vap)
698 {
699 
700 	vap->va_type = VNON;
701 	vap->va_size = VNOVAL;
702 	vap->va_bytes = VNOVAL;
703 	vap->va_mode = VNOVAL;
704 	vap->va_nlink = VNOVAL;
705 	vap->va_uid = VNOVAL;
706 	vap->va_gid = VNOVAL;
707 	vap->va_fsid = VNOVAL;
708 	vap->va_fileid = VNOVAL;
709 	vap->va_blocksize = VNOVAL;
710 	vap->va_rdev = VNOVAL;
711 	vap->va_atime.tv_sec = VNOVAL;
712 	vap->va_atime.tv_nsec = VNOVAL;
713 	vap->va_mtime.tv_sec = VNOVAL;
714 	vap->va_mtime.tv_nsec = VNOVAL;
715 	vap->va_ctime.tv_sec = VNOVAL;
716 	vap->va_ctime.tv_nsec = VNOVAL;
717 	vap->va_birthtime.tv_sec = VNOVAL;
718 	vap->va_birthtime.tv_nsec = VNOVAL;
719 	vap->va_flags = VNOVAL;
720 	vap->va_gen = VNOVAL;
721 	vap->va_vaflags = 0;
722 }
723 
724 /*
725  * This routine is called when we have too many vnodes.  It attempts
726  * to free <count> vnodes and will potentially free vnodes that still
727  * have VM backing store (VM backing store is typically the cause
728  * of a vnode blowout so we want to do this).  Therefore, this operation
729  * is not considered cheap.
730  *
731  * A number of conditions may prevent a vnode from being reclaimed:
732  * the buffer cache may have references on the vnode, a directory
733  * vnode may still have references due to the namei cache representing
734  * underlying files, or the vnode may be in active use.  It is not
735  * desirable to reuse such vnodes.  These conditions may cause the
736  * number of vnodes to reach some minimum value regardless of what
737  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
738  */
739 static int
740 vlrureclaim(struct mount *mp)
741 {
742 	struct vnode *vp;
743 	int done;
744 	int trigger;
745 	int usevnodes;
746 	int count;
747 
748 	/*
749 	 * Calculate the trigger point; don't allow user
750 	 * screwups to blow us up.  This prevents us from
751 	 * recycling vnodes with lots of resident pages.  We
752 	 * aren't trying to free memory, we are trying to
753 	 * free vnodes.
754 	 */
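	/*
	 * Illustrative arithmetic: with 4,194,304 physical pages and
	 * kern.maxvnodes at 500,000, the trigger is 4194304 * 2 / 500000,
	 * i.e. about 16 resident pages; vnodes caching more pages than
	 * that are skipped below.
	 */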
755 	usevnodes = desiredvnodes;
756 	if (usevnodes <= 0)
757 		usevnodes = 1;
758 	trigger = vm_cnt.v_page_count * 2 / usevnodes;
759 	done = 0;
760 	vn_start_write(NULL, &mp, V_WAIT);
761 	MNT_ILOCK(mp);
762 	count = mp->mnt_nvnodelistsize / 10 + 1;
763 	while (count != 0) {
764 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
765 		while (vp != NULL && vp->v_type == VMARKER)
766 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
767 		if (vp == NULL)
768 			break;
769 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
770 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
771 		--count;
772 		if (!VI_TRYLOCK(vp))
773 			goto next_iter;
774 		/*
775 		 * If it's been deconstructed already, it's still
776 		 * referenced, or it exceeds the trigger, skip it.
777 		 */
778 		if (vp->v_usecount ||
779 		    (!vlru_allow_cache_src &&
780 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
781 		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
782 		    vp->v_object->resident_page_count > trigger)) {
783 			VI_UNLOCK(vp);
784 			goto next_iter;
785 		}
786 		MNT_IUNLOCK(mp);
787 		vholdl(vp);
788 		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
789 			vdrop(vp);
790 			goto next_iter_mntunlocked;
791 		}
792 		VI_LOCK(vp);
793 		/*
794 		 * v_usecount may have been bumped after VOP_LOCK() dropped
795 		 * the vnode interlock and before it was locked again.
796 		 *
797 		 * It is not necessary to recheck VI_DOOMED because it can
798 		 * only be set by another thread that holds both the vnode
799 		 * lock and vnode interlock.  If another thread has the
800 		 * vnode lock before we get to VOP_LOCK() and obtains the
801 		 * vnode interlock after VOP_LOCK() drops the vnode
802 		 * interlock, the other thread will be unable to drop the
803 		 * vnode lock before our VOP_LOCK() call fails.
804 		 */
805 		if (vp->v_usecount ||
806 		    (!vlru_allow_cache_src &&
807 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
808 		    (vp->v_object != NULL &&
809 		    vp->v_object->resident_page_count > trigger)) {
810 			VOP_UNLOCK(vp, LK_INTERLOCK);
811 			vdrop(vp);
812 			goto next_iter_mntunlocked;
813 		}
814 		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
815 		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
816 		atomic_add_long(&recycles_count, 1);
817 		vgonel(vp);
818 		VOP_UNLOCK(vp, 0);
819 		vdropl(vp);
820 		done++;
821 next_iter_mntunlocked:
822 		if (!should_yield())
823 			goto relock_mnt;
824 		goto yield;
825 next_iter:
826 		if (!should_yield())
827 			continue;
828 		MNT_IUNLOCK(mp);
829 yield:
830 		kern_yield(PRI_USER);
831 relock_mnt:
832 		MNT_ILOCK(mp);
833 	}
834 	MNT_IUNLOCK(mp);
835 	vn_finished_write(mp);
836 	return (done);
837 }
838 
839 /*
840  * Attempt to keep the free list at wantfreevnodes length.
841  */
842 static void
843 vnlru_free(int count)
844 {
845 	struct vnode *vp;
846 
847 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
848 	for (; count > 0; count--) {
849 		vp = TAILQ_FIRST(&vnode_free_list);
850 		/*
851 		 * The list can be modified while the free_list_mtx
852 		 * has been dropped and vp could be NULL here.
853 		 */
854 		if (!vp)
855 			break;
856 		VNASSERT(vp->v_op != NULL, vp,
857 		    ("vnlru_free: vnode already reclaimed."));
858 		KASSERT((vp->v_iflag & VI_FREE) != 0,
859 		    ("Removing vnode not on freelist"));
860 		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
861 		    ("Mangling active vnode"));
862 		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
863 		/*
864 		 * Don't recycle if we can't get the interlock.
865 		 */
866 		if (!VI_TRYLOCK(vp)) {
867 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
868 			continue;
869 		}
870 		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
871 		    vp, ("vp inconsistent on freelist"));
872 
873 		/*
874 		 * The clear of VI_FREE prevents activation of the
875 		 * vnode.  There is no sense in putting the vnode on
876 		 * the mount point active list, only to remove it
877 		 * later during recycling.  Inline the relevant part
878 		 * of vholdl(), to avoid triggering assertions or
879 		 * activating.
880 		 */
881 		freevnodes--;
882 		vp->v_iflag &= ~VI_FREE;
883 		refcount_acquire(&vp->v_holdcnt);
884 
885 		mtx_unlock(&vnode_free_list_mtx);
886 		VI_UNLOCK(vp);
887 		vtryrecycle(vp);
888 		/*
889 		 * If the recycle succeeded, this vdrop will actually free
890 		 * the vnode.  If not, it will simply place it back on
891 		 * the free list.
892 		 */
893 		vdrop(vp);
894 		mtx_lock(&vnode_free_list_mtx);
895 	}
896 }
897 /*
898  * Attempt to recycle vnodes in a context that is always safe to block.
899  * Calling vlrureclaim() from the bowels of filesystem code has some
900  * interesting deadlock problems.
901  */
902 static struct proc *vnlruproc;
903 static int vnlruproc_sig;
904 
905 static void
906 vnlru_proc(void)
907 {
908 	struct mount *mp, *nmp;
909 	int done;
910 	struct proc *p = vnlruproc;
911 
912 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
913 	    SHUTDOWN_PRI_FIRST);
914 
915 	for (;;) {
916 		kproc_suspend_check(p);
917 		mtx_lock(&vnode_free_list_mtx);
918 		if (freevnodes > wantfreevnodes)
919 			vnlru_free(freevnodes - wantfreevnodes);
920 		if (numvnodes <= desiredvnodes * 9 / 10) {
921 			vnlruproc_sig = 0;
922 			wakeup(&vnlruproc_sig);
923 			msleep(vnlruproc, &vnode_free_list_mtx,
924 			    PVFS|PDROP, "vlruwt", hz);
925 			continue;
926 		}
927 		mtx_unlock(&vnode_free_list_mtx);
928 		done = 0;
929 		mtx_lock(&mountlist_mtx);
930 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
931 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
932 				nmp = TAILQ_NEXT(mp, mnt_list);
933 				continue;
934 			}
935 			done += vlrureclaim(mp);
936 			mtx_lock(&mountlist_mtx);
937 			nmp = TAILQ_NEXT(mp, mnt_list);
938 			vfs_unbusy(mp);
939 		}
940 		mtx_unlock(&mountlist_mtx);
941 		if (done == 0) {
942 #if 0
943 			/* These messages are temporary debugging aids */
944 			if (vnlru_nowhere < 5)
945 				printf("vnlru process getting nowhere..\n");
946 			else if (vnlru_nowhere == 5)
947 				printf("vnlru process messages stopped.\n");
948 #endif
949 			vnlru_nowhere++;
950 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
951 		} else
952 			kern_yield(PRI_USER);
953 	}
954 }
955 
956 static struct kproc_desc vnlru_kp = {
957 	"vnlru",
958 	vnlru_proc,
959 	&vnlruproc
960 };
961 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
962     &vnlru_kp);
963 
964 /*
965  * Routines having to do with the management of the vnode table.
966  */
967 
968 /*
969  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
970  * before we actually vgone().  This function must be called with the vnode
971  * held to prevent the vnode from being returned to the free list midway
972  * through vgone().
973  */
974 static int
975 vtryrecycle(struct vnode *vp)
976 {
977 	struct mount *vnmp;
978 
979 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
980 	VNASSERT(vp->v_holdcnt, vp,
981 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
982 	/*
983 	 * This vnode may be found and locked via some other list; if so we
984 	 * can't recycle it yet.
985 	 */
986 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
987 		CTR2(KTR_VFS,
988 		    "%s: impossible to recycle, vp %p lock is already held",
989 		    __func__, vp);
990 		return (EWOULDBLOCK);
991 	}
992 	/*
993 	 * Don't recycle if its filesystem is being suspended.
994 	 */
995 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
996 		VOP_UNLOCK(vp, 0);
997 		CTR2(KTR_VFS,
998 		    "%s: impossible to recycle, cannot start the write for %p",
999 		    __func__, vp);
1000 		return (EBUSY);
1001 	}
1002 	/*
1003 	 * If we got this far, we need to acquire the interlock and see if
1004 	 * anyone picked up this vnode from another list.  If not, we will
1005 	 * mark it with DOOMED via vgonel() so that anyone who does find it
1006 	 * will skip over it.
1007 	 */
1008 	VI_LOCK(vp);
1009 	if (vp->v_usecount) {
1010 		VOP_UNLOCK(vp, LK_INTERLOCK);
1011 		vn_finished_write(vnmp);
1012 		CTR2(KTR_VFS,
1013 		    "%s: impossible to recycle, %p is already referenced",
1014 		    __func__, vp);
1015 		return (EBUSY);
1016 	}
1017 	if ((vp->v_iflag & VI_DOOMED) == 0) {
1018 		atomic_add_long(&recycles_count, 1);
1019 		vgonel(vp);
1020 	}
1021 	VOP_UNLOCK(vp, LK_INTERLOCK);
1022 	vn_finished_write(vnmp);
1023 	return (0);
1024 }
1025 
1026 /*
1027  * Wait for available vnodes.
1028  */
1029 static int
1030 getnewvnode_wait(int suspended)
1031 {
1032 
1033 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
1034 	if (numvnodes > desiredvnodes) {
1035 		if (suspended) {
1036 			/*
1037 			 * The file system is being suspended; we cannot risk a
1038 			 * deadlock here, so allocate a new vnode anyway.
1039 			 */
1040 			if (freevnodes > wantfreevnodes)
1041 				vnlru_free(freevnodes - wantfreevnodes);
1042 			return (0);
1043 		}
1044 		if (vnlruproc_sig == 0) {
1045 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
1046 			wakeup(vnlruproc);
1047 		}
1048 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
1049 		    "vlruwk", hz);
1050 	}
1051 	return (numvnodes > desiredvnodes ? ENFILE : 0);
1052 }
1053 
1054 void
1055 getnewvnode_reserve(u_int count)
1056 {
1057 	struct thread *td;
1058 
1059 	td = curthread;
1060 	/* First try to be quick and racy. */
1061 	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
1062 		td->td_vp_reserv += count;
1063 		return;
1064 	} else
1065 		atomic_subtract_long(&numvnodes, count);
1066 
1067 	mtx_lock(&vnode_free_list_mtx);
1068 	while (count > 0) {
1069 		if (getnewvnode_wait(0) == 0) {
1070 			count--;
1071 			td->td_vp_reserv++;
1072 			atomic_add_long(&numvnodes, 1);
1073 		}
1074 	}
1075 	mtx_unlock(&vnode_free_list_mtx);
1076 }
1077 
1078 void
1079 getnewvnode_drop_reserve(void)
1080 {
1081 	struct thread *td;
1082 
1083 	td = curthread;
1084 	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
1085 	td->td_vp_reserv = 0;
1086 }
1087 
1088 /*
1089  * Return the next vnode from the free list.
1090  */
1091 int
1092 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
1093     struct vnode **vpp)
1094 {
1095 	struct vnode *vp;
1096 	struct bufobj *bo;
1097 	struct thread *td;
1098 	int error;
1099 
1100 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1101 	vp = NULL;
1102 	td = curthread;
1103 	if (td->td_vp_reserv > 0) {
1104 		td->td_vp_reserv -= 1;
1105 		goto alloc;
1106 	}
1107 	mtx_lock(&vnode_free_list_mtx);
1108 	/*
1109 	 * Lend our context to reclaim vnodes if they've exceeded the max.
1110 	 */
1111 	if (freevnodes > wantfreevnodes)
1112 		vnlru_free(1);
1113 	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1114 	    MNTK_SUSPEND));
1115 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1116 	if (error != 0) {
1117 		mtx_unlock(&vnode_free_list_mtx);
1118 		return (error);
1119 	}
1120 #endif
1121 	atomic_add_long(&numvnodes, 1);
1122 	mtx_unlock(&vnode_free_list_mtx);
1123 alloc:
1124 	atomic_add_long(&vnodes_created, 1);
1125 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
1126 	/*
1127 	 * Setup locks.
1128 	 */
1129 	vp->v_vnlock = &vp->v_lock;
1130 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
1131 	/*
1132 	 * By default, don't allow shared locks unless filesystems
1133 	 * opt-in.
1134 	 */
1135 	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
1136 	/*
1137 	 * Initialize bufobj.
1138 	 */
1139 	bo = &vp->v_bufobj;
1140 	bo->__bo_vnode = vp;
1141 	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
1142 	bo->bo_ops = &buf_ops_bio;
1143 	bo->bo_private = vp;
1144 	TAILQ_INIT(&bo->bo_clean.bv_hd);
1145 	TAILQ_INIT(&bo->bo_dirty.bv_hd);
1146 	/*
1147 	 * Initialize namecache.
1148 	 */
1149 	LIST_INIT(&vp->v_cache_src);
1150 	TAILQ_INIT(&vp->v_cache_dst);
1151 	/*
1152 	 * Finalize various vnode identity bits.
1153 	 */
1154 	vp->v_type = VNON;
1155 	vp->v_tag = tag;
1156 	vp->v_op = vops;
1157 	v_init_counters(vp);
1158 	vp->v_data = NULL;
1159 #ifdef MAC
1160 	mac_vnode_init(vp);
1161 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1162 		mac_vnode_associate_singlelabel(mp, vp);
1163 	else if (mp == NULL && vops != &dead_vnodeops)
1164 		printf("NULL mp in getnewvnode()\n");
1165 #endif
1166 	if (mp != NULL) {
1167 		bo->bo_bsize = mp->mnt_stat.f_iosize;
1168 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1169 			vp->v_vflag |= VV_NOKNOTE;
1170 	}
1171 	rangelock_init(&vp->v_rl);
1172 
1173 	/*
1174 	 * For filesystems which do not use vfs_hash_insert(),
1175 	 * still initialize v_hash so that vfs_hash_index() is useful.
1176 	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1177 	 * its own hashing.
1178 	 */
1179 	vp->v_hash = (uintptr_t)vp >> vnsz2log;
1180 
1181 	*vpp = vp;
1182 	return (0);
1183 }
1184 
1185 /*
1186  * Delete from old mount point vnode list, if on one.
1187  */
1188 static void
1189 delmntque(struct vnode *vp)
1190 {
1191 	struct mount *mp;
1192 	int active;
1193 
1194 	mp = vp->v_mount;
1195 	if (mp == NULL)
1196 		return;
1197 	MNT_ILOCK(mp);
1198 	VI_LOCK(vp);
1199 	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1200 	    ("Active vnode list size %d > Vnode list size %d",
1201 	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1202 	active = vp->v_iflag & VI_ACTIVE;
1203 	vp->v_iflag &= ~VI_ACTIVE;
1204 	if (active) {
1205 		mtx_lock(&vnode_free_list_mtx);
1206 		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1207 		mp->mnt_activevnodelistsize--;
1208 		mtx_unlock(&vnode_free_list_mtx);
1209 	}
1210 	vp->v_mount = NULL;
1211 	VI_UNLOCK(vp);
1212 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1213 		("bad mount point vnode list size"));
1214 	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1215 	mp->mnt_nvnodelistsize--;
1216 	MNT_REL(mp);
1217 	MNT_IUNLOCK(mp);
1218 }
1219 
1220 static void
1221 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1222 {
1223 
1224 	vp->v_data = NULL;
1225 	vp->v_op = &dead_vnodeops;
1226 	vgone(vp);
1227 	vput(vp);
1228 }
1229 
1230 /*
1231  * Insert into list of vnodes for the new mount point, if available.
1232  */
1233 int
1234 insmntque1(struct vnode *vp, struct mount *mp,
1235 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1236 {
1237 
1238 	KASSERT(vp->v_mount == NULL,
1239 		("insmntque: vnode already on per mount vnode list"));
1240 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1241 	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1242 
1243 	/*
1244 	 * We acquire the vnode interlock early to ensure that the
1245 	 * vnode cannot be recycled by another process releasing a
1246 	 * holdcnt on it before we get it on both the vnode list
1247 	 * and the active vnode list. The mount mutex protects only
1248 	 * manipulation of the vnode list and the vnode freelist
1249 	 * mutex protects only manipulation of the active vnode list.
1250 	 * Hence the need to hold the vnode interlock throughout.
1251 	 */
1252 	MNT_ILOCK(mp);
1253 	VI_LOCK(vp);
1254 	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1255 	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1256 	    mp->mnt_nvnodelistsize == 0)) &&
1257 	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1258 		VI_UNLOCK(vp);
1259 		MNT_IUNLOCK(mp);
1260 		if (dtr != NULL)
1261 			dtr(vp, dtr_arg);
1262 		return (EBUSY);
1263 	}
1264 	vp->v_mount = mp;
1265 	MNT_REF(mp);
1266 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1267 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1268 		("neg mount point vnode list size"));
1269 	mp->mnt_nvnodelistsize++;
1270 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1271 	    ("Activating already active vnode"));
1272 	vp->v_iflag |= VI_ACTIVE;
1273 	mtx_lock(&vnode_free_list_mtx);
1274 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1275 	mp->mnt_activevnodelistsize++;
1276 	mtx_unlock(&vnode_free_list_mtx);
1277 	VI_UNLOCK(vp);
1278 	MNT_IUNLOCK(mp);
1279 	return (0);
1280 }
1281 
1282 int
1283 insmntque(struct vnode *vp, struct mount *mp)
1284 {
1285 
1286 	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1287 }
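
/*
 * Illustrative sketch of the usual allocation sequence in a filesystem's
 * vget-style routine; the "myfs" tag and myfs_vnodeops are hypothetical
 * placeholders.  On insmntque() failure the default destructor has already
 * vgone()d and vput()ed the vnode, so the caller just returns the error.
 *
 *	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
 *	if (error != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = insmntque(vp, mp);
 *	if (error != 0)
 *		return (error);
 */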
1288 
1289 /*
1290  * Flush out and invalidate all buffers associated with a bufobj
1291  * Called with the underlying object locked.
1292  */
1293 int
1294 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1295 {
1296 	int error;
1297 
1298 	BO_LOCK(bo);
1299 	if (flags & V_SAVE) {
1300 		error = bufobj_wwait(bo, slpflag, slptimeo);
1301 		if (error) {
1302 			BO_UNLOCK(bo);
1303 			return (error);
1304 		}
1305 		if (bo->bo_dirty.bv_cnt > 0) {
1306 			BO_UNLOCK(bo);
1307 			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1308 				return (error);
1309 			/*
1310 			 * XXX We could save a lock/unlock if this was only
1311 			 * enabled under INVARIANTS
1312 			 */
1313 			BO_LOCK(bo);
1314 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1315 				panic("vinvalbuf: dirty bufs");
1316 		}
1317 	}
1318 	/*
1319 	 * If you alter this loop please notice that interlock is dropped and
1320 	 * reacquired in flushbuflist.  Special care is needed to ensure that
1321 	 * no race conditions occur from this.
1322 	 */
1323 	do {
1324 		error = flushbuflist(&bo->bo_clean,
1325 		    flags, bo, slpflag, slptimeo);
1326 		if (error == 0 && !(flags & V_CLEANONLY))
1327 			error = flushbuflist(&bo->bo_dirty,
1328 			    flags, bo, slpflag, slptimeo);
1329 		if (error != 0 && error != EAGAIN) {
1330 			BO_UNLOCK(bo);
1331 			return (error);
1332 		}
1333 	} while (error != 0);
1334 
1335 	/*
1336 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1337 	 * have write I/O in-progress but if there is a VM object then the
1338 	 * VM object can also have read-I/O in-progress.
1339 	 */
1340 	do {
1341 		bufobj_wwait(bo, 0, 0);
1342 		BO_UNLOCK(bo);
1343 		if (bo->bo_object != NULL) {
1344 			VM_OBJECT_WLOCK(bo->bo_object);
1345 			vm_object_pip_wait(bo->bo_object, "bovlbx");
1346 			VM_OBJECT_WUNLOCK(bo->bo_object);
1347 		}
1348 		BO_LOCK(bo);
1349 	} while (bo->bo_numoutput > 0);
1350 	BO_UNLOCK(bo);
1351 
1352 	/*
1353 	 * Destroy the copy in the VM cache, too.
1354 	 */
1355 	if (bo->bo_object != NULL &&
1356 	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
1357 		VM_OBJECT_WLOCK(bo->bo_object);
1358 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1359 		    OBJPR_CLEANONLY : 0);
1360 		VM_OBJECT_WUNLOCK(bo->bo_object);
1361 	}
1362 
1363 #ifdef INVARIANTS
1364 	BO_LOCK(bo);
1365 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
1366 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1367 		panic("vinvalbuf: flush failed");
1368 	BO_UNLOCK(bo);
1369 #endif
1370 	return (0);
1371 }
1372 
1373 /*
1374  * Flush out and invalidate all buffers associated with a vnode.
1375  * Called with the underlying object locked.
1376  */
1377 int
1378 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1379 {
1380 
1381 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1382 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1383 	if (vp->v_object != NULL && vp->v_object->handle != vp)
1384 		return (0);
1385 	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1386 }
1387 
1388 /*
1389  * Flush out buffers on the specified list.
1390  *
1391  */
1392 static int
1393 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1394     int slptimeo)
1395 {
1396 	struct buf *bp, *nbp;
1397 	int retval, error;
1398 	daddr_t lblkno;
1399 	b_xflags_t xflags;
1400 
1401 	ASSERT_BO_WLOCKED(bo);
1402 
1403 	retval = 0;
1404 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1405 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1406 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1407 			continue;
1408 		}
1409 		lblkno = 0;
1410 		xflags = 0;
1411 		if (nbp != NULL) {
1412 			lblkno = nbp->b_lblkno;
1413 			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
1414 		}
1415 		retval = EAGAIN;
1416 		error = BUF_TIMELOCK(bp,
1417 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
1418 		    "flushbuf", slpflag, slptimeo);
1419 		if (error) {
1420 			BO_LOCK(bo);
1421 			return (error != ENOLCK ? error : EAGAIN);
1422 		}
1423 		KASSERT(bp->b_bufobj == bo,
1424 		    ("bp %p wrong b_bufobj %p should be %p",
1425 		    bp, bp->b_bufobj, bo));
1426 		/*
1427 		 * XXX Since there are no node locks for NFS, I
1428 		 * believe there is a slight chance that a delayed
1429 		 * write will occur while sleeping just above, so
1430 		 * check for it.
1431 		 */
1432 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1433 		    (flags & V_SAVE)) {
1434 			bremfree(bp);
1435 			bp->b_flags |= B_ASYNC;
1436 			bwrite(bp);
1437 			BO_LOCK(bo);
1438 			return (EAGAIN);	/* XXX: why not loop ? */
1439 		}
1440 		bremfree(bp);
1441 		bp->b_flags |= (B_INVAL | B_RELBUF);
1442 		bp->b_flags &= ~B_ASYNC;
1443 		brelse(bp);
1444 		BO_LOCK(bo);
1445 		if (nbp != NULL &&
1446 		    (nbp->b_bufobj != bo ||
1447 		     nbp->b_lblkno != lblkno ||
1448 		     (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1449 			break;			/* nbp invalid */
1450 	}
1451 	return (retval);
1452 }
1453 
1454 /*
1455  * Truncate a file's buffer and pages to a specified length.  This
1456  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1457  * sync activity.
1458  */
1459 int
1460 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1461 {
1462 	struct buf *bp, *nbp;
1463 	int anyfreed;
1464 	int trunclbn;
1465 	struct bufobj *bo;
1466 
1467 	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1468 	    vp, cred, blksize, (uintmax_t)length);
1469 
1470 	/*
1471 	 * Round up to the *next* lbn.
1472 	 */
1473 	trunclbn = (length + blksize - 1) / blksize;
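	/*
	 * Illustrative arithmetic: truncating to length 10000 with a 4096
	 * byte block size gives trunclbn = (10000 + 4095) / 4096 = 3, so
	 * buffers at logical blocks 3 and beyond are invalidated below.
	 */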
1474 
1475 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1476 restart:
1477 	bo = &vp->v_bufobj;
1478 	BO_LOCK(bo);
1479 	anyfreed = 1;
1480 	for (;anyfreed;) {
1481 		anyfreed = 0;
1482 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1483 			if (bp->b_lblkno < trunclbn)
1484 				continue;
1485 			if (BUF_LOCK(bp,
1486 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1487 			    BO_LOCKPTR(bo)) == ENOLCK)
1488 				goto restart;
1489 
1490 			bremfree(bp);
1491 			bp->b_flags |= (B_INVAL | B_RELBUF);
1492 			bp->b_flags &= ~B_ASYNC;
1493 			brelse(bp);
1494 			anyfreed = 1;
1495 
1496 			BO_LOCK(bo);
1497 			if (nbp != NULL &&
1498 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1499 			    (nbp->b_vp != vp) ||
1500 			    (nbp->b_flags & B_DELWRI))) {
1501 				BO_UNLOCK(bo);
1502 				goto restart;
1503 			}
1504 		}
1505 
1506 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1507 			if (bp->b_lblkno < trunclbn)
1508 				continue;
1509 			if (BUF_LOCK(bp,
1510 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1511 			    BO_LOCKPTR(bo)) == ENOLCK)
1512 				goto restart;
1513 			bremfree(bp);
1514 			bp->b_flags |= (B_INVAL | B_RELBUF);
1515 			bp->b_flags &= ~B_ASYNC;
1516 			brelse(bp);
1517 			anyfreed = 1;
1518 
1519 			BO_LOCK(bo);
1520 			if (nbp != NULL &&
1521 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1522 			    (nbp->b_vp != vp) ||
1523 			    (nbp->b_flags & B_DELWRI) == 0)) {
1524 				BO_UNLOCK(bo);
1525 				goto restart;
1526 			}
1527 		}
1528 	}
1529 
1530 	if (length > 0) {
1531 restartsync:
1532 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1533 			if (bp->b_lblkno > 0)
1534 				continue;
1535 			/*
1536 			 * Since we hold the vnode lock this should only
1537 			 * fail if we're racing with the buf daemon.
1538 			 */
1539 			if (BUF_LOCK(bp,
1540 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1541 			    BO_LOCKPTR(bo)) == ENOLCK) {
1542 				goto restart;
1543 			}
1544 			VNASSERT((bp->b_flags & B_DELWRI), vp,
1545 			    ("buf(%p) on dirty queue without DELWRI", bp));
1546 
1547 			bremfree(bp);
1548 			bawrite(bp);
1549 			BO_LOCK(bo);
1550 			goto restartsync;
1551 		}
1552 	}
1553 
1554 	bufobj_wwait(bo, 0, 0);
1555 	BO_UNLOCK(bo);
1556 	vnode_pager_setsize(vp, length);
1557 
1558 	return (0);
1559 }
1560 
1561 static void
1562 buf_vlist_remove(struct buf *bp)
1563 {
1564 	struct bufv *bv;
1565 
1566 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1567 	ASSERT_BO_WLOCKED(bp->b_bufobj);
1568 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1569 	    (BX_VNDIRTY|BX_VNCLEAN),
1570 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1571 	if (bp->b_xflags & BX_VNDIRTY)
1572 		bv = &bp->b_bufobj->bo_dirty;
1573 	else
1574 		bv = &bp->b_bufobj->bo_clean;
1575 	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
1576 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1577 	bv->bv_cnt--;
1578 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1579 }
1580 
1581 /*
1582  * Add the buffer to the sorted clean or dirty block list.
1583  *
1584  * NOTE: xflags is passed as a constant, optimizing this inline function!
1585  */
1586 static void
1587 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1588 {
1589 	struct bufv *bv;
1590 	struct buf *n;
1591 	int error;
1592 
1593 	ASSERT_BO_WLOCKED(bo);
1594 	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
1595 	    ("dead bo %p", bo));
1596 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1597 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1598 	bp->b_xflags |= xflags;
1599 	if (xflags & BX_VNDIRTY)
1600 		bv = &bo->bo_dirty;
1601 	else
1602 		bv = &bo->bo_clean;
1603 
1604 	/*
1605 	 * Keep the list ordered.  Optimize empty list insertion.  Assume
1606 	 * we tend to grow at the tail so lookup_le should usually be cheaper
1607 	 * than _ge.
1608 	 */
1609 	if (bv->bv_cnt == 0 ||
1610 	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
1611 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1612 	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
1613 		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
1614 	else
1615 		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
1616 	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
1617 	if (error)
1618 		panic("buf_vlist_add:  Preallocated nodes insufficient.");
1619 	bv->bv_cnt++;
1620 }
1621 
1622 /*
1623  * Look up a buffer using the buffer tries.
1624  */
1625 struct buf *
1626 gbincore(struct bufobj *bo, daddr_t lblkno)
1627 {
1628 	struct buf *bp;
1629 
1630 	ASSERT_BO_LOCKED(bo);
1631 	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
1632 	if (bp != NULL)
1633 		return (bp);
1634 	return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno));
1635 }
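
/*
 * Illustrative usage (an assumed caller, not code from this file): the
 * bufobj lock must be held across the lookup.
 *
 *	BO_LOCK(bo);
 *	bp = gbincore(bo, lblkno);
 *	if (bp != NULL)
 *		... the buffer is on the clean or dirty list ...
 *	BO_UNLOCK(bo);
 */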
1636 
1637 /*
1638  * Associate a buffer with a vnode.
1639  */
1640 void
1641 bgetvp(struct vnode *vp, struct buf *bp)
1642 {
1643 	struct bufobj *bo;
1644 
1645 	bo = &vp->v_bufobj;
1646 	ASSERT_BO_WLOCKED(bo);
1647 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1648 
1649 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1650 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1651 	    ("bgetvp: bp already attached! %p", bp));
1652 
1653 	vhold(vp);
1654 	bp->b_vp = vp;
1655 	bp->b_bufobj = bo;
1656 	/*
1657 	 * Insert onto list for new vnode.
1658 	 */
1659 	buf_vlist_add(bp, bo, BX_VNCLEAN);
1660 }
1661 
1662 /*
1663  * Disassociate a buffer from a vnode.
1664  */
1665 void
1666 brelvp(struct buf *bp)
1667 {
1668 	struct bufobj *bo;
1669 	struct vnode *vp;
1670 
1671 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1672 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1673 
1674 	/*
1675 	 * Delete from old vnode list, if on one.
1676 	 */
1677 	vp = bp->b_vp;		/* XXX */
1678 	bo = bp->b_bufobj;
1679 	BO_LOCK(bo);
1680 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1681 		buf_vlist_remove(bp);
1682 	else
1683 		panic("brelvp: Buffer %p not on queue.", bp);
1684 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1685 		bo->bo_flag &= ~BO_ONWORKLST;
1686 		mtx_lock(&sync_mtx);
1687 		LIST_REMOVE(bo, bo_synclist);
1688 		syncer_worklist_len--;
1689 		mtx_unlock(&sync_mtx);
1690 	}
1691 	bp->b_vp = NULL;
1692 	bp->b_bufobj = NULL;
1693 	BO_UNLOCK(bo);
1694 	vdrop(vp);
1695 }
1696 
1697 /*
1698  * Add an item to the syncer work queue.
1699  */
1700 static void
1701 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1702 {
1703 	int slot;
1704 
1705 	ASSERT_BO_WLOCKED(bo);
1706 
1707 	mtx_lock(&sync_mtx);
1708 	if (bo->bo_flag & BO_ONWORKLST)
1709 		LIST_REMOVE(bo, bo_synclist);
1710 	else {
1711 		bo->bo_flag |= BO_ONWORKLST;
1712 		syncer_worklist_len++;
1713 	}
1714 
1715 	if (delay > syncer_maxdelay - 2)
1716 		delay = syncer_maxdelay - 2;
1717 	slot = (syncer_delayno + delay) & syncer_mask;
1718 
1719 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1720 	mtx_unlock(&sync_mtx);
1721 }
1722 
1723 static int
1724 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1725 {
1726 	int error, len;
1727 
1728 	mtx_lock(&sync_mtx);
1729 	len = syncer_worklist_len - sync_vnode_count;
1730 	mtx_unlock(&sync_mtx);
1731 	error = SYSCTL_OUT(req, &len, sizeof(len));
1732 	return (error);
1733 }
1734 
1735 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1736     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1737 
1738 static struct proc *updateproc;
1739 static void sched_sync(void);
1740 static struct kproc_desc up_kp = {
1741 	"syncer",
1742 	sched_sync,
1743 	&updateproc
1744 };
1745 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1746 
1747 static int
1748 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1749 {
1750 	struct vnode *vp;
1751 	struct mount *mp;
1752 
1753 	*bo = LIST_FIRST(slp);
1754 	if (*bo == NULL)
1755 		return (0);
1756 	vp = (*bo)->__bo_vnode;	/* XXX */
1757 	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1758 		return (1);
1759 	/*
1760 	 * We use vhold in case the vnode does not
1761 	 * successfully sync.  vhold prevents the vnode from
1762 	 * going away when we unlock the sync_mtx so that
1763 	 * we can acquire the vnode interlock.
1764 	 */
1765 	vholdl(vp);
1766 	mtx_unlock(&sync_mtx);
1767 	VI_UNLOCK(vp);
1768 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1769 		vdrop(vp);
1770 		mtx_lock(&sync_mtx);
1771 		return (*bo == LIST_FIRST(slp));
1772 	}
1773 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1774 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1775 	VOP_UNLOCK(vp, 0);
1776 	vn_finished_write(mp);
1777 	BO_LOCK(*bo);
1778 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1779 		/*
1780 		 * Put us back on the worklist.  The worklist
1781 		 * routine will remove us from our current
1782 		 * position and then add us back in at a later
1783 		 * position.
1784 		 */
1785 		vn_syncer_add_to_worklist(*bo, syncdelay);
1786 	}
1787 	BO_UNLOCK(*bo);
1788 	vdrop(vp);
1789 	mtx_lock(&sync_mtx);
1790 	return (0);
1791 }
1792 
1793 static int first_printf = 1;
1794 
1795 /*
1796  * System filesystem synchronizer daemon.
1797  */
1798 static void
1799 sched_sync(void)
1800 {
1801 	struct synclist *next, *slp;
1802 	struct bufobj *bo;
1803 	long starttime;
1804 	struct thread *td = curthread;
1805 	int last_work_seen;
1806 	int net_worklist_len;
1807 	int syncer_final_iter;
1808 	int error;
1809 
1810 	last_work_seen = 0;
1811 	syncer_final_iter = 0;
1812 	syncer_state = SYNCER_RUNNING;
1813 	starttime = time_uptime;
1814 	td->td_pflags |= TDP_NORUNNINGBUF;
1815 
1816 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1817 	    SHUTDOWN_PRI_LAST);
1818 
1819 	mtx_lock(&sync_mtx);
1820 	for (;;) {
1821 		if (syncer_state == SYNCER_FINAL_DELAY &&
1822 		    syncer_final_iter == 0) {
1823 			mtx_unlock(&sync_mtx);
1824 			kproc_suspend_check(td->td_proc);
1825 			mtx_lock(&sync_mtx);
1826 		}
1827 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1828 		if (syncer_state != SYNCER_RUNNING &&
1829 		    starttime != time_uptime) {
1830 			if (first_printf) {
1831 				printf("\nSyncing disks, vnodes remaining...");
1832 				first_printf = 0;
1833 			}
1834 			printf("%d ", net_worklist_len);
1835 		}
1836 		starttime = time_uptime;
1837 
1838 		/*
1839 		 * Push files whose dirty time has expired.  Be careful
1840 		 * of interrupt race on slp queue.
1841 		 *
1842 		 * Skip over empty worklist slots when shutting down.
1843 		 */
1844 		do {
1845 			slp = &syncer_workitem_pending[syncer_delayno];
1846 			syncer_delayno += 1;
1847 			if (syncer_delayno == syncer_maxdelay)
1848 				syncer_delayno = 0;
1849 			next = &syncer_workitem_pending[syncer_delayno];
1850 			/*
1851 			 * If the worklist has wrapped since it was
1852 			 * emptied of all but syncer vnodes,
1853 			 * switch to the FINAL_DELAY state and run
1854 			 * for one more second.
1855 			 */
1856 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1857 			    net_worklist_len == 0 &&
1858 			    last_work_seen == syncer_delayno) {
1859 				syncer_state = SYNCER_FINAL_DELAY;
1860 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1861 			}
1862 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1863 		    syncer_worklist_len > 0);
1864 
1865 		/*
1866 		 * Keep track of the last time there was anything
1867 		 * on the worklist other than syncer vnodes.
1868 		 * Return to the SHUTTING_DOWN state if any
1869 		 * new work appears.
1870 		 */
1871 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1872 			last_work_seen = syncer_delayno;
1873 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1874 			syncer_state = SYNCER_SHUTTING_DOWN;
1875 		while (!LIST_EMPTY(slp)) {
1876 			error = sync_vnode(slp, &bo, td);
1877 			if (error == 1) {
1878 				LIST_REMOVE(bo, bo_synclist);
1879 				LIST_INSERT_HEAD(next, bo, bo_synclist);
1880 				continue;
1881 			}
1882 
1883 			if (first_printf == 0) {
1884 				/*
1885 				 * Drop the sync mutex, because some watchdog
1886 				 * drivers need to sleep while patting
1887 				 * drivers need to sleep while patting the watchdog.
1888 				mtx_unlock(&sync_mtx);
1889 				wdog_kern_pat(WD_LASTVAL);
1890 				mtx_lock(&sync_mtx);
1891 			}
1892 
1893 		}
1894 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1895 			syncer_final_iter--;
1896 		/*
1897 		 * The variable rushjob allows the kernel to speed up the
1898 		 * processing of the filesystem syncer process. A rushjob
1899 		 * value of N tells the filesystem syncer to process the next
1900 		 * N seconds worth of work on its queue ASAP. Currently rushjob
1901 		 * is used by the soft update code to speed up the filesystem
1902 		 * syncer process when the incore state is getting so far
1903 		 * ahead of the disk that the kernel memory pool is being
1904 		 * threatened with exhaustion.
1905 		 */
1906 		if (rushjob > 0) {
1907 			rushjob -= 1;
1908 			continue;
1909 		}
1910 		/*
1911 		 * Just sleep for a short period of time between
1912 		 * iterations when shutting down to allow some I/O
1913 		 * to happen.
1914 		 *
1915 		 * If it has taken us less than a second to process the
1916 		 * current work, then wait. Otherwise start right over
1917 		 * again. We can still lose time if any single round
1918 		 * takes more than two seconds, but it does not really
1919 		 * matter as we are just trying to generally pace the
1920 		 * filesystem activity.
1921 		 */
1922 		if (syncer_state != SYNCER_RUNNING ||
1923 		    time_uptime == starttime) {
1924 			thread_lock(td);
1925 			sched_prio(td, PPAUSE);
1926 			thread_unlock(td);
1927 		}
1928 		if (syncer_state != SYNCER_RUNNING)
1929 			cv_timedwait(&sync_wakeup, &sync_mtx,
1930 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1931 		else if (time_uptime == starttime)
1932 			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1933 	}
1934 }
1935 
1936 /*
1937  * Request the syncer daemon to speed up its work.
1938  * We never push it to speed up more than half of its
1939  * normal turn time; otherwise it could take over the CPU.
1940  */
1941 int
1942 speedup_syncer(void)
1943 {
1944 	int ret = 0;
1945 
1946 	mtx_lock(&sync_mtx);
1947 	if (rushjob < syncdelay / 2) {
1948 		rushjob += 1;
1949 		stat_rush_requests += 1;
1950 		ret = 1;
1951 	}
1952 	mtx_unlock(&sync_mtx);
1953 	cv_broadcast(&sync_wakeup);
1954 	return (ret);
1955 }
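
/*
 * A minimal caller sketch; the caller is hypothetical, and only the
 * return-value convention is taken from speedup_syncer() above.  The
 * block is compiled out.
 */
#if 0
static void
example_request_sync_boost(void)
{

	/*
	 * Ask the syncer to process one extra second of its queue now.
	 * A return of 0 means rushjob is already at its cap
	 * (syncdelay / 2) and no further boost was granted.
	 */
	(void)speedup_syncer();
}
#endif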
1956 
1957 /*
1958  * Tell the syncer to speed up its work and run through its work
1959  * list several times, then tell it to shut down.
1960  */
1961 static void
1962 syncer_shutdown(void *arg, int howto)
1963 {
1964 
1965 	if (howto & RB_NOSYNC)
1966 		return;
1967 	mtx_lock(&sync_mtx);
1968 	syncer_state = SYNCER_SHUTTING_DOWN;
1969 	rushjob = 0;
1970 	mtx_unlock(&sync_mtx);
1971 	cv_broadcast(&sync_wakeup);
1972 	kproc_shutdown(arg, howto);
1973 }
1974 
1975 void
1976 syncer_suspend(void)
1977 {
1978 
1979 	syncer_shutdown(updateproc, 0);
1980 }
1981 
1982 void
1983 syncer_resume(void)
1984 {
1985 
1986 	mtx_lock(&sync_mtx);
1987 	first_printf = 1;
1988 	syncer_state = SYNCER_RUNNING;
1989 	mtx_unlock(&sync_mtx);
1990 	cv_broadcast(&sync_wakeup);
1991 	kproc_resume(updateproc);
1992 }
1993 
1994 /*
1995  * Reassign a buffer from one vnode to another.
1996  * Used to assign file specific control information
1997  * (indirect blocks) to the vnode to which they belong.
1998  */
1999 void
2000 reassignbuf(struct buf *bp)
2001 {
2002 	struct vnode *vp;
2003 	struct bufobj *bo;
2004 	int delay;
2005 #ifdef INVARIANTS
2006 	struct bufv *bv;
2007 #endif
2008 
2009 	vp = bp->b_vp;
2010 	bo = bp->b_bufobj;
2011 	++reassignbufcalls;
2012 
2013 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2014 	    bp, bp->b_vp, bp->b_flags);
2015 	/*
2016 	 * B_PAGING flagged buffers cannot be reassigned because their vp
2017 	 * is not fully linked in.
2018 	 */
2019 	if (bp->b_flags & B_PAGING)
2020 		panic("cannot reassign paging buffer");
2021 
2022 	/*
2023 	 * Delete from old vnode list, if on one.
2024 	 */
2025 	BO_LOCK(bo);
2026 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2027 		buf_vlist_remove(bp);
2028 	else
2029 		panic("reassignbuf: Buffer %p not on queue.", bp);
2030 	/*
2031 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
2032 	 * of clean buffers.
2033 	 */
2034 	if (bp->b_flags & B_DELWRI) {
2035 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2036 			switch (vp->v_type) {
2037 			case VDIR:
2038 				delay = dirdelay;
2039 				break;
2040 			case VCHR:
2041 				delay = metadelay;
2042 				break;
2043 			default:
2044 				delay = filedelay;
2045 			}
2046 			vn_syncer_add_to_worklist(bo, delay);
2047 		}
2048 		buf_vlist_add(bp, bo, BX_VNDIRTY);
2049 	} else {
2050 		buf_vlist_add(bp, bo, BX_VNCLEAN);
2051 
2052 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2053 			mtx_lock(&sync_mtx);
2054 			LIST_REMOVE(bo, bo_synclist);
2055 			syncer_worklist_len--;
2056 			mtx_unlock(&sync_mtx);
2057 			bo->bo_flag &= ~BO_ONWORKLST;
2058 		}
2059 	}
2060 #ifdef INVARIANTS
2061 	bv = &bo->bo_clean;
2062 	bp = TAILQ_FIRST(&bv->bv_hd);
2063 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2064 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2065 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2066 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2067 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2068 	bv = &bo->bo_dirty;
2069 	bp = TAILQ_FIRST(&bv->bv_hd);
2070 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2071 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2072 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2073 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2074 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2075 #endif
2076 	BO_UNLOCK(bo);
2077 }
2078 
2079 /*
2080  * A temporary hack until refcount_* APIs are sorted out.
2081  */
2082 static __inline int
2083 vfs_refcount_acquire_if_not_zero(volatile u_int *count)
2084 {
2085 	u_int old;
2086 
2087 	for (;;) {
2088 		old = *count;
2089 		if (old == 0)
2090 			return (0);
2091 		if (atomic_cmpset_int(count, old, old + 1))
2092 			return (1);
2093 	}
2094 }
2095 
2096 static __inline int
2097 vfs_refcount_release_if_not_last(volatile u_int *count)
2098 {
2099 	u_int old;
2100 
2101 	for (;;) {
2102 		old = *count;
2103 		if (old == 1)
2104 			return (0);
2105 		if (atomic_cmpset_int(count, old, old - 1))
2106 			return (1);
2107 	}
2108 }
2109 
2110 static void
2111 v_init_counters(struct vnode *vp)
2112 {
2113 
2114 	VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
2115 	    vp, ("%s called for an initialized vnode", __FUNCTION__));
2116 	ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
2117 
2118 	refcount_init(&vp->v_holdcnt, 1);
2119 	refcount_init(&vp->v_usecount, 1);
2120 }
2121 
2122 /*
2123  * Increment the use and hold counts on the vnode, taking care to reference
2124  * the driver's usecount if this is a chardev.  The _vhold() will remove
2125  * the vnode from the free list if it is presently free.
2126  */
2127 static void
2128 v_incr_usecount(struct vnode *vp)
2129 {
2130 
2131 	ASSERT_VI_UNLOCKED(vp, __func__);
2132 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2133 
2134 	if (vp->v_type == VCHR) {
2135 		VI_LOCK(vp);
2136 		_vhold(vp, true);
2137 		if (vp->v_iflag & VI_OWEINACT) {
2138 			VNASSERT(vp->v_usecount == 0, vp,
2139 			    ("vnode with usecount and VI_OWEINACT set"));
2140 			vp->v_iflag &= ~VI_OWEINACT;
2141 		}
2142 		refcount_acquire(&vp->v_usecount);
2143 		v_incr_devcount(vp);
2144 		VI_UNLOCK(vp);
2145 		return;
2146 	}
2147 
2148 	_vhold(vp, false);
2149 	if (vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
2150 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2151 		    ("vnode with usecount and VI_OWEINACT set"));
2152 	} else {
2153 		VI_LOCK(vp);
2154 		if (vp->v_iflag & VI_OWEINACT)
2155 			vp->v_iflag &= ~VI_OWEINACT;
2156 		refcount_acquire(&vp->v_usecount);
2157 		VI_UNLOCK(vp);
2158 	}
2159 }
2160 
2161 /*
2162  * Increment si_usecount of the associated device, if any.
2163  */
2164 static void
2165 v_incr_devcount(struct vnode *vp)
2166 {
2167 
2168 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2169 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2170 		dev_lock();
2171 		vp->v_rdev->si_usecount++;
2172 		dev_unlock();
2173 	}
2174 }
2175 
2176 /*
2177  * Decrement si_usecount of the associated device, if any.
2178  */
2179 static void
2180 v_decr_devcount(struct vnode *vp)
2181 {
2182 
2183 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2184 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2185 		dev_lock();
2186 		vp->v_rdev->si_usecount--;
2187 		dev_unlock();
2188 	}
2189 }
2190 
2191 /*
2192  * Grab a particular vnode from the free list, increment its
2193  * reference count and lock it.  VI_DOOMED is set if the vnode
2194  * is being destroyed.  Only callers who specify LK_RETRY will
2195  * see doomed vnodes.  If inactive processing was delayed in
2196  * vput try to do it here.
2197  *
2198  * Notes on lockless counter manipulation:
2199  * _vhold, vputx and other routines make various decisions based
2200  * on either holdcnt or usecount being 0. As long as either counter
2201  * is not transitioning 0->1 nor 1->0, the manipulation can be done
2202  * with atomic operations. Otherwise the interlock is taken.
2203  */
2204 int
2205 vget(struct vnode *vp, int flags, struct thread *td)
2206 {
2207 	int error, oweinact;
2208 
2209 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2210 	    ("vget: invalid lock operation"));
2211 
2212 	if ((flags & LK_INTERLOCK) != 0)
2213 		ASSERT_VI_LOCKED(vp, __func__);
2214 	else
2215 		ASSERT_VI_UNLOCKED(vp, __func__);
2216 	if ((flags & LK_VNHELD) != 0)
2217 		VNASSERT((vp->v_holdcnt > 0), vp,
2218 		    ("vget: LK_VNHELD passed but vnode not held"));
2219 
2220 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2221 
2222 	if ((flags & LK_VNHELD) == 0)
2223 		_vhold(vp, (flags & LK_INTERLOCK) != 0);
2224 
2225 	if ((error = vn_lock(vp, flags)) != 0) {
2226 		vdrop(vp);
2227 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2228 		    vp);
2229 		return (error);
2230 	}
2231 	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2232 		panic("vget: vn_lock failed to return ENOENT\n");
2233 	/*
2234 	 * We don't guarantee that any particular close will
2235 	 * trigger inactive processing, so just make a best effort
2236 	 * here at preventing a reference to a removed file.  If
2237 	 * we don't succeed, no harm is done.
2238 	 *
2239 	 * Upgrade our holdcnt to a usecount.
2240 	 */
2241 	if (vp->v_type != VCHR &&
2242 	    vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
2243 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2244 		    ("vnode with usecount and VI_OWEINACT set"));
2245 	} else {
2246 		VI_LOCK(vp);
2247 		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2248 			oweinact = 0;
2249 		} else {
2250 			oweinact = 1;
2251 			vp->v_iflag &= ~VI_OWEINACT;
2252 		}
2253 		refcount_acquire(&vp->v_usecount);
2254 		v_incr_devcount(vp);
2255 		if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2256 		    (flags & LK_NOWAIT) == 0)
2257 			vinactive(vp, td);
2258 		VI_UNLOCK(vp);
2259 	}
2260 	return (0);
2261 }
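
/*
 * A minimal caller sketch, assuming the caller already has some guarantee
 * that vp points at a valid vnode; the block is compiled out and the
 * helper name is illustrative only.
 */
#if 0
static int
example_use_vnode(struct vnode *vp, struct thread *td)
{
	int error;

	/*
	 * Take a use reference and an exclusive lock in one step.
	 * Without LK_RETRY this fails for a doomed vnode.
	 */
	error = vget(vp, LK_EXCLUSIVE, td);
	if (error != 0)
		return (error);
	/* ... operate on the locked, referenced vnode here ... */
	vput(vp);		/* drop both the lock and the reference */
	return (0);
}
#endif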
2262 
2263 /*
2264  * Increase the reference count of a vnode.
2265  */
2266 void
2267 vref(struct vnode *vp)
2268 {
2269 
2270 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2271 	v_incr_usecount(vp);
2272 }
2273 
2274 /*
2275  * Return reference count of a vnode.
2276  *
2277  * The results of this call are only guaranteed when some mechanism is used to
2278  * stop other processes from gaining references to the vnode.  This may be the
2279  * case if the caller holds the only reference.  This is also useful when stale
2280  * data is acceptable as race conditions may be accounted for by some other
2281  * means.
2282  */
2283 int
2284 vrefcnt(struct vnode *vp)
2285 {
2286 
2287 	return (vp->v_usecount);
2288 }
2289 
2290 #define	VPUTX_VRELE	1
2291 #define	VPUTX_VPUT	2
2292 #define	VPUTX_VUNREF	3
2293 
2294 /*
2295  * Decrement the use and hold counts for a vnode.
2296  *
2297  * See an explanation near vget() as to why atomic operation is safe.
2298  */
2299 static void
2300 vputx(struct vnode *vp, int func)
2301 {
2302 	int error;
2303 
2304 	KASSERT(vp != NULL, ("vputx: null vp"));
2305 	if (func == VPUTX_VUNREF)
2306 		ASSERT_VOP_LOCKED(vp, "vunref");
2307 	else if (func == VPUTX_VPUT)
2308 		ASSERT_VOP_LOCKED(vp, "vput");
2309 	else
2310 		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2311 	ASSERT_VI_UNLOCKED(vp, __func__);
2312 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2313 
2314 	if (vp->v_type != VCHR &&
2315 	    vfs_refcount_release_if_not_last(&vp->v_usecount)) {
2316 		if (func == VPUTX_VPUT)
2317 			VOP_UNLOCK(vp, 0);
2318 		vdrop(vp);
2319 		return;
2320 	}
2321 
2322 	VI_LOCK(vp);
2323 
2324 	/*
2325 	 * We want to hold the vnode until the inactive finishes to
2326 	 * prevent vgone() races.  We drop the use count here and the
2327 	 * hold count below when we're done.
2328 	 */
2329 	if (!refcount_release(&vp->v_usecount) ||
2330 	    (vp->v_iflag & VI_DOINGINACT)) {
2331 		if (func == VPUTX_VPUT)
2332 			VOP_UNLOCK(vp, 0);
2333 		v_decr_devcount(vp);
2334 		vdropl(vp);
2335 		return;
2336 	}
2337 
2338 	v_decr_devcount(vp);
2339 
2340 	error = 0;
2341 
2342 	if (vp->v_usecount != 0) {
2343 		vprint("vputx: usecount not zero", vp);
2344 		panic("vputx: usecount not zero");
2345 	}
2346 
2347 	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2348 
2349 	/*
2350 	 * We must call VOP_INACTIVE with the node locked. Mark
2351 	 * as VI_DOINGINACT to avoid recursion.
2352 	 */
2353 	vp->v_iflag |= VI_OWEINACT;
2354 	switch (func) {
2355 	case VPUTX_VRELE:
2356 		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2357 		VI_LOCK(vp);
2358 		break;
2359 	case VPUTX_VPUT:
2360 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2361 			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2362 			    LK_NOWAIT);
2363 			VI_LOCK(vp);
2364 		}
2365 		break;
2366 	case VPUTX_VUNREF:
2367 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2368 			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
2369 			VI_LOCK(vp);
2370 		}
2371 		break;
2372 	}
2373 	VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
2374 	    ("vnode with usecount and VI_OWEINACT set"));
2375 	if (error == 0) {
2376 		if (vp->v_iflag & VI_OWEINACT)
2377 			vinactive(vp, curthread);
2378 		if (func != VPUTX_VUNREF)
2379 			VOP_UNLOCK(vp, 0);
2380 	}
2381 	vdropl(vp);
2382 }
2383 
2384 /*
2385  * Vnode put/release.
2386  * If count drops to zero, call inactive routine and return to freelist.
2387  */
2388 void
2389 vrele(struct vnode *vp)
2390 {
2391 
2392 	vputx(vp, VPUTX_VRELE);
2393 }
2394 
2395 /*
2396  * Release an already locked vnode.  This gives the same effect as
2397  * unlock+vrele(), but takes less time and avoids releasing and
2398  * re-acquiring the lock (as vrele() acquires the lock internally.)
2399  */
2400 void
2401 vput(struct vnode *vp)
2402 {
2403 
2404 	vputx(vp, VPUTX_VPUT);
2405 }
2406 
2407 /*
2408  * Release an exclusively locked vnode. Do not unlock the vnode lock.
2409  */
2410 void
2411 vunref(struct vnode *vp)
2412 {
2413 
2414 	vputx(vp, VPUTX_VUNREF);
2415 }
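
/*
 * A minimal sketch contrasting the three release paths above; what
 * differs is the expected lock state, not the reference drop itself.
 * The block is compiled out and the helper name is illustrative only.
 */
#if 0
static void
example_release_paths(struct vnode *a, struct vnode *b, struct vnode *c)
{

	vrele(a);	/* 'a' is unlocked; just drop the use reference */
	vput(b);	/* 'b' is locked; drop the reference and unlock */
	vunref(c);	/* 'c' is locked; drop the reference, keep the lock */
}
#endif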
2416 
2417 /*
2418  * Increase the hold count and activate if this is the first reference.
2419  */
2420 void
2421 _vhold(struct vnode *vp, bool locked)
2422 {
2423 	struct mount *mp;
2424 
2425 	if (locked)
2426 		ASSERT_VI_LOCKED(vp, __func__);
2427 	else
2428 		ASSERT_VI_UNLOCKED(vp, __func__);
2429 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2430 	if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
2431 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2432 		    ("_vhold: vnode with holdcnt is free"));
2433 		return;
2434 	}
2435 
2436 	if (!locked)
2437 		VI_LOCK(vp);
2438 	if ((vp->v_iflag & VI_FREE) == 0) {
2439 		refcount_acquire(&vp->v_holdcnt);
2440 		if (!locked)
2441 			VI_UNLOCK(vp);
2442 		return;
2443 	}
2444 	VNASSERT(vp->v_holdcnt == 0, vp,
2445 	    ("%s: wrong hold count", __func__));
2446 	VNASSERT(vp->v_op != NULL, vp,
2447 	    ("%s: vnode already reclaimed.", __func__));
2448 	/*
2449 	 * Remove a vnode from the free list, mark it as in use,
2450 	 * and put it on the active list.
2451 	 */
2452 	mtx_lock(&vnode_free_list_mtx);
2453 	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2454 	freevnodes--;
2455 	vp->v_iflag &= ~(VI_FREE|VI_AGE);
2456 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2457 	    ("Activating already active vnode"));
2458 	vp->v_iflag |= VI_ACTIVE;
2459 	mp = vp->v_mount;
2460 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2461 	mp->mnt_activevnodelistsize++;
2462 	mtx_unlock(&vnode_free_list_mtx);
2463 	refcount_acquire(&vp->v_holdcnt);
2464 	if (!locked)
2465 		VI_UNLOCK(vp);
2466 }
2467 
2468 /*
2469  * Drop the hold count of the vnode.  If this is the last reference to
2470  * the vnode we place it on the free list unless it has been vgone'd
2471  * (marked VI_DOOMED) in which case we will free it.
2472  */
2473 void
2474 _vdrop(struct vnode *vp, bool locked)
2475 {
2476 	struct bufobj *bo;
2477 	struct mount *mp;
2478 	int active;
2479 
2480 	if (locked)
2481 		ASSERT_VI_LOCKED(vp, __func__);
2482 	else
2483 		ASSERT_VI_UNLOCKED(vp, __func__);
2484 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2485 	if ((int)vp->v_holdcnt <= 0)
2486 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2487 	if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) {
2488 		if (locked)
2489 			VI_UNLOCK(vp);
2490 		return;
2491 	}
2492 
2493 	if (!locked)
2494 		VI_LOCK(vp);
2495 	if (refcount_release(&vp->v_holdcnt) == 0) {
2496 		VI_UNLOCK(vp);
2497 		return;
2498 	}
2499 	if ((vp->v_iflag & VI_DOOMED) == 0) {
2500 		/*
2501 		 * Mark a vnode as free: remove it from its active list
2502 		 * and put it up for recycling on the freelist.
2503 		 */
2504 		VNASSERT(vp->v_op != NULL, vp,
2505 		    ("vdropl: vnode already reclaimed."));
2506 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2507 		    ("vnode already free"));
2508 		VNASSERT(vp->v_holdcnt == 0, vp,
2509 		    ("vdropl: freeing when we shouldn't"));
2510 		active = vp->v_iflag & VI_ACTIVE;
2511 		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2512 			vp->v_iflag &= ~VI_ACTIVE;
2513 			mp = vp->v_mount;
2514 			mtx_lock(&vnode_free_list_mtx);
2515 			if (active) {
2516 				TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2517 				    v_actfreelist);
2518 				mp->mnt_activevnodelistsize--;
2519 			}
2520 			if (vp->v_iflag & VI_AGE) {
2521 				TAILQ_INSERT_HEAD(&vnode_free_list, vp,
2522 				    v_actfreelist);
2523 			} else {
2524 				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
2525 				    v_actfreelist);
2526 			}
2527 			freevnodes++;
2528 			vp->v_iflag &= ~VI_AGE;
2529 			vp->v_iflag |= VI_FREE;
2530 			mtx_unlock(&vnode_free_list_mtx);
2531 		} else {
2532 			atomic_add_long(&free_owe_inact, 1);
2533 		}
2534 		VI_UNLOCK(vp);
2535 		return;
2536 	}
2537 	/*
2538 	 * The vnode has been marked for destruction, so free it.
2539 	 */
2540 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2541 	atomic_subtract_long(&numvnodes, 1);
2542 	bo = &vp->v_bufobj;
2543 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2544 	    ("cleaned vnode still on the free list."));
2545 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2546 	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2547 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2548 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2549 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2550 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2551 	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
2552 	    ("clean blk trie not empty"));
2553 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2554 	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
2555 	    ("dirty blk trie not empty"));
2556 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2557 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2558 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2559 	VI_UNLOCK(vp);
2560 #ifdef MAC
2561 	mac_vnode_destroy(vp);
2562 #endif
2563 	if (vp->v_pollinfo != NULL)
2564 		destroy_vpollinfo(vp->v_pollinfo);
2565 #ifdef INVARIANTS
2566 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2567 	vp->v_op = NULL;
2568 #endif
2569 	rangelock_destroy(&vp->v_rl);
2570 	lockdestroy(vp->v_vnlock);
2571 	mtx_destroy(&vp->v_interlock);
2572 	rw_destroy(BO_LOCKPTR(bo));
2573 	uma_zfree(vnode_zone, vp);
2574 }
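
/*
 * A minimal sketch of the hold/drop pattern served by _vhold() and
 * _vdrop() above, mirroring what sync_vnode() does with vholdl()/vdrop();
 * the block is compiled out and the helper name is illustrative only.
 */
#if 0
static void
example_hold_across_block(struct vnode *vp)
{

	/*
	 * A hold keeps the vnode from being recycled or freed while we
	 * block, but unlike a use reference it does not delay inactive
	 * processing.
	 */
	vhold(vp);
	/* ... potentially blocking work that must not lose vp ... */
	vdrop(vp);
}
#endif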
2575 
2576 /*
2577  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2578  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2579  * OWEINACT tracks whether a vnode missed a call to inactive due to a
2580  * failed lock upgrade.
2581  */
2582 void
2583 vinactive(struct vnode *vp, struct thread *td)
2584 {
2585 	struct vm_object *obj;
2586 
2587 	ASSERT_VOP_ELOCKED(vp, "vinactive");
2588 	ASSERT_VI_LOCKED(vp, "vinactive");
2589 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2590 	    ("vinactive: recursed on VI_DOINGINACT"));
2591 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2592 	vp->v_iflag |= VI_DOINGINACT;
2593 	vp->v_iflag &= ~VI_OWEINACT;
2594 	VI_UNLOCK(vp);
2595 	/*
2596 	 * Before moving off the active list, we must be sure that any
2597 	 * modified pages are on the vnode's dirty list since these will
2598 	 * no longer be checked once the vnode is on the inactive list.
2599 	 * Because the vnode vm object keeps a hold reference on the vnode
2600 	 * if there is at least one resident non-cached page, the vnode
2601 	 * cannot leave the active list without the page cleanup done.
2602 	 */
2603 	obj = vp->v_object;
2604 	if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2605 		VM_OBJECT_WLOCK(obj);
2606 		vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2607 		VM_OBJECT_WUNLOCK(obj);
2608 	}
2609 	VOP_INACTIVE(vp, td);
2610 	VI_LOCK(vp);
2611 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2612 	    ("vinactive: lost VI_DOINGINACT"));
2613 	vp->v_iflag &= ~VI_DOINGINACT;
2614 }
2615 
2616 /*
2617  * Remove any vnodes in the vnode table belonging to mount point mp.
2618  *
2619  * If FORCECLOSE is not specified, there should not be any active ones;
2620  * return an error if any are found (nb: this is a user error, not a
2621  * system error). If FORCECLOSE is specified, detach any active vnodes
2622  * that are found.
2623  *
2624  * If WRITECLOSE is set, only flush out regular file vnodes open for
2625  * writing.
2626  *
2627  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2628  *
2629  * `rootrefs' specifies the base reference count for the root vnode
2630  * of this filesystem. The root vnode is considered busy if its
2631  * v_usecount exceeds this value. On a successful return, vflush(, td)
2632  * will call vrele() on the root vnode exactly rootrefs times.
2633  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2634  * be zero.
2635  */
2636 #ifdef DIAGNOSTIC
2637 static int busyprt = 0;		/* print out busy vnodes */
2638 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2639 #endif
2640 
2641 int
2642 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2643 {
2644 	struct vnode *vp, *mvp, *rootvp = NULL;
2645 	struct vattr vattr;
2646 	int busy = 0, error;
2647 
2648 	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2649 	    rootrefs, flags);
2650 	if (rootrefs > 0) {
2651 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2652 		    ("vflush: bad args"));
2653 		/*
2654 		 * Get the filesystem root vnode. We can vput() it
2655 		 * immediately, since with rootrefs > 0, it won't go away.
2656 		 */
2657 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2658 			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2659 			    __func__, error);
2660 			return (error);
2661 		}
2662 		vput(rootvp);
2663 	}
2664 loop:
2665 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2666 		vholdl(vp);
2667 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2668 		if (error) {
2669 			vdrop(vp);
2670 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2671 			goto loop;
2672 		}
2673 		/*
2674 		 * Skip over vnodes marked VV_SYSTEM.
2675 		 */
2676 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2677 			VOP_UNLOCK(vp, 0);
2678 			vdrop(vp);
2679 			continue;
2680 		}
2681 		/*
2682 		 * If WRITECLOSE is set, flush out unlinked but still open
2683 		 * files (even if open only for reading) and regular file
2684 		 * vnodes open for writing.
2685 		 */
2686 		if (flags & WRITECLOSE) {
2687 			if (vp->v_object != NULL) {
2688 				VM_OBJECT_WLOCK(vp->v_object);
2689 				vm_object_page_clean(vp->v_object, 0, 0, 0);
2690 				VM_OBJECT_WUNLOCK(vp->v_object);
2691 			}
2692 			error = VOP_FSYNC(vp, MNT_WAIT, td);
2693 			if (error != 0) {
2694 				VOP_UNLOCK(vp, 0);
2695 				vdrop(vp);
2696 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2697 				return (error);
2698 			}
2699 			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2700 			VI_LOCK(vp);
2701 
2702 			if ((vp->v_type == VNON ||
2703 			    (error == 0 && vattr.va_nlink > 0)) &&
2704 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2705 				VOP_UNLOCK(vp, 0);
2706 				vdropl(vp);
2707 				continue;
2708 			}
2709 		} else
2710 			VI_LOCK(vp);
2711 		/*
2712 		 * With v_usecount == 0, all we need to do is clear out the
2713 		 * vnode data structures and we are done.
2714 		 *
2715 		 * If FORCECLOSE is set, forcibly close the vnode.
2716 		 */
2717 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2718 			vgonel(vp);
2719 		} else {
2720 			busy++;
2721 #ifdef DIAGNOSTIC
2722 			if (busyprt)
2723 				vprint("vflush: busy vnode", vp);
2724 #endif
2725 		}
2726 		VOP_UNLOCK(vp, 0);
2727 		vdropl(vp);
2728 	}
2729 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2730 		/*
2731 		 * If just the root vnode is busy, and if its refcount
2732 		 * is equal to `rootrefs', then go ahead and kill it.
2733 		 */
2734 		VI_LOCK(rootvp);
2735 		KASSERT(busy > 0, ("vflush: not busy"));
2736 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2737 		    ("vflush: usecount %d < rootrefs %d",
2738 		     rootvp->v_usecount, rootrefs));
2739 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2740 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2741 			vgone(rootvp);
2742 			VOP_UNLOCK(rootvp, 0);
2743 			busy = 0;
2744 		} else
2745 			VI_UNLOCK(rootvp);
2746 	}
2747 	if (busy) {
2748 		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2749 		    busy);
2750 		return (EBUSY);
2751 	}
2752 	for (; rootrefs > 0; rootrefs--)
2753 		vrele(rootvp);
2754 	return (0);
2755 }
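
/*
 * A minimal unmount-path sketch, assuming a hypothetical filesystem with
 * no special (VV_SYSTEM) vnodes and no extra root vnode references; the
 * block is compiled out and the helper name is illustrative only.
 */
#if 0
static int
example_unmount_flush(struct mount *mp, int mntflags, struct thread *td)
{
	int flags;

	flags = (mntflags & MNT_FORCE) != 0 ? FORCECLOSE : 0;
	/* Without FORCECLOSE this fails with EBUSY if any vnode is busy. */
	return (vflush(mp, 0, flags, td));
}
#endif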
2756 
2757 /*
2758  * Recycle an unused vnode to the front of the free list.
2759  */
2760 int
2761 vrecycle(struct vnode *vp)
2762 {
2763 	int recycled;
2764 
2765 	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2766 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2767 	recycled = 0;
2768 	VI_LOCK(vp);
2769 	if (vp->v_usecount == 0) {
2770 		recycled = 1;
2771 		vgonel(vp);
2772 	}
2773 	VI_UNLOCK(vp);
2774 	return (recycled);
2775 }
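
/*
 * A minimal sketch of how a filesystem's inactive routine might use
 * vrecycle(); "example_file_is_unlinked" is a hypothetical predicate and
 * the block is compiled out.
 */
#if 0
static void
example_inactive_hook(struct vnode *vp)
{

	/* VOP_INACTIVE runs with the vnode exclusively locked. */
	if (example_file_is_unlinked(vp))
		(void)vrecycle(vp);
}
#endif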
2776 
2777 /*
2778  * Eliminate all activity associated with a vnode
2779  * in preparation for reuse.
2780  */
2781 void
2782 vgone(struct vnode *vp)
2783 {
2784 	VI_LOCK(vp);
2785 	vgonel(vp);
2786 	VI_UNLOCK(vp);
2787 }
2788 
2789 static void
2790 notify_lowervp_vfs_dummy(struct mount *mp __unused,
2791     struct vnode *lowervp __unused)
2792 {
2793 }
2794 
2795 /*
2796  * Notify upper mounts about reclaimed or unlinked vnode.
2797  */
2798 void
2799 vfs_notify_upper(struct vnode *vp, int event)
2800 {
2801 	static struct vfsops vgonel_vfsops = {
2802 		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
2803 		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
2804 	};
2805 	struct mount *mp, *ump, *mmp;
2806 
2807 	mp = vp->v_mount;
2808 	if (mp == NULL)
2809 		return;
2810 
2811 	MNT_ILOCK(mp);
2812 	if (TAILQ_EMPTY(&mp->mnt_uppers))
2813 		goto unlock;
2814 	MNT_IUNLOCK(mp);
2815 	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2816 	mmp->mnt_op = &vgonel_vfsops;
2817 	mmp->mnt_kern_flag |= MNTK_MARKER;
2818 	MNT_ILOCK(mp);
2819 	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
2820 	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
2821 		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
2822 			ump = TAILQ_NEXT(ump, mnt_upper_link);
2823 			continue;
2824 		}
2825 		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
2826 		MNT_IUNLOCK(mp);
2827 		switch (event) {
2828 		case VFS_NOTIFY_UPPER_RECLAIM:
2829 			VFS_RECLAIM_LOWERVP(ump, vp);
2830 			break;
2831 		case VFS_NOTIFY_UPPER_UNLINK:
2832 			VFS_UNLINK_LOWERVP(ump, vp);
2833 			break;
2834 		default:
2835 			KASSERT(0, ("invalid event %d", event));
2836 			break;
2837 		}
2838 		MNT_ILOCK(mp);
2839 		ump = TAILQ_NEXT(mmp, mnt_upper_link);
2840 		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
2841 	}
2842 	free(mmp, M_TEMP);
2843 	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
2844 	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
2845 		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
2846 		wakeup(&mp->mnt_uppers);
2847 	}
2848 unlock:
2849 	MNT_IUNLOCK(mp);
2850 }
2851 
2852 /*
2853  * vgone, with the vp interlock held.
2854  */
2855 static void
2856 vgonel(struct vnode *vp)
2857 {
2858 	struct thread *td;
2859 	int oweinact;
2860 	int active;
2861 	struct mount *mp;
2862 
2863 	ASSERT_VOP_ELOCKED(vp, "vgonel");
2864 	ASSERT_VI_LOCKED(vp, "vgonel");
2865 	VNASSERT(vp->v_holdcnt, vp,
2866 	    ("vgonel: vp %p has no reference.", vp));
2867 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2868 	td = curthread;
2869 
2870 	/*
2871 	 * Don't vgonel if we're already doomed.
2872 	 */
2873 	if (vp->v_iflag & VI_DOOMED)
2874 		return;
2875 	vp->v_iflag |= VI_DOOMED;
2876 
2877 	/*
2878 	 * Check to see if the vnode is in use.  If so, we have to call
2879 	 * VOP_CLOSE() and VOP_INACTIVE().
2880 	 */
2881 	active = vp->v_usecount;
2882 	oweinact = (vp->v_iflag & VI_OWEINACT);
2883 	VI_UNLOCK(vp);
2884 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
2885 
2886 	/*
2887 	 * If purging an active vnode, it must be closed and
2888 	 * deactivated before being reclaimed.
2889 	 */
2890 	if (active)
2891 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2892 	if (oweinact || active) {
2893 		VI_LOCK(vp);
2894 		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2895 			vinactive(vp, td);
2896 		VI_UNLOCK(vp);
2897 	}
2898 	if (vp->v_type == VSOCK)
2899 		vfs_unp_reclaim(vp);
2900 
2901 	/*
2902 	 * Clean out any buffers associated with the vnode.
2903 	 * If the flush fails, just toss the buffers.
2904 	 */
2905 	mp = NULL;
2906 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2907 		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2908 	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
2909 		while (vinvalbuf(vp, 0, 0, 0) != 0)
2910 			;
2911 	}
2912 
2913 	BO_LOCK(&vp->v_bufobj);
2914 	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
2915 	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
2916 	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
2917 	    vp->v_bufobj.bo_clean.bv_cnt == 0,
2918 	    ("vp %p bufobj not invalidated", vp));
2919 	vp->v_bufobj.bo_flag |= BO_DEAD;
2920 	BO_UNLOCK(&vp->v_bufobj);
2921 
2922 	/*
2923 	 * Reclaim the vnode.
2924 	 */
2925 	if (VOP_RECLAIM(vp, td))
2926 		panic("vgone: cannot reclaim");
2927 	if (mp != NULL)
2928 		vn_finished_secondary_write(mp);
2929 	VNASSERT(vp->v_object == NULL, vp,
2930 	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2931 	/*
2932 	 * Clear the advisory locks and wake up waiting threads.
2933 	 */
2934 	(void)VOP_ADVLOCKPURGE(vp);
2935 	/*
2936 	 * Delete from old mount point vnode list.
2937 	 */
2938 	delmntque(vp);
2939 	cache_purge(vp);
2940 	/*
2941 	 * Done with purge, reset to the standard lock and invalidate
2942 	 * the vnode.
2943 	 */
2944 	VI_LOCK(vp);
2945 	vp->v_vnlock = &vp->v_lock;
2946 	vp->v_op = &dead_vnodeops;
2947 	vp->v_tag = "none";
2948 	vp->v_type = VBAD;
2949 }
2950 
2951 /*
2952  * Calculate the total number of references to a special device.
2953  */
2954 int
2955 vcount(struct vnode *vp)
2956 {
2957 	int count;
2958 
2959 	dev_lock();
2960 	count = vp->v_rdev->si_usecount;
2961 	dev_unlock();
2962 	return (count);
2963 }
2964 
2965 /*
2966  * Same as above, but using the struct cdev * as the argument.
2967  */
2968 int
2969 count_dev(struct cdev *dev)
2970 {
2971 	int count;
2972 
2973 	dev_lock();
2974 	count = dev->si_usecount;
2975 	dev_unlock();
2976 	return(count);
2977 }
2978 
2979 /*
2980  * Print out a description of a vnode.
2981  */
2982 static char *typename[] =
2983 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2984  "VMARKER"};
2985 
2986 void
2987 vn_printf(struct vnode *vp, const char *fmt, ...)
2988 {
2989 	va_list ap;
2990 	char buf[256], buf2[16];
2991 	u_long flags;
2992 
2993 	va_start(ap, fmt);
2994 	vprintf(fmt, ap);
2995 	va_end(ap);
2996 	printf("%p: ", (void *)vp);
2997 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2998 	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2999 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
3000 	buf[0] = '\0';
3001 	buf[1] = '\0';
3002 	if (vp->v_vflag & VV_ROOT)
3003 		strlcat(buf, "|VV_ROOT", sizeof(buf));
3004 	if (vp->v_vflag & VV_ISTTY)
3005 		strlcat(buf, "|VV_ISTTY", sizeof(buf));
3006 	if (vp->v_vflag & VV_NOSYNC)
3007 		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
3008 	if (vp->v_vflag & VV_ETERNALDEV)
3009 		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
3010 	if (vp->v_vflag & VV_CACHEDLABEL)
3011 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
3012 	if (vp->v_vflag & VV_TEXT)
3013 		strlcat(buf, "|VV_TEXT", sizeof(buf));
3014 	if (vp->v_vflag & VV_COPYONWRITE)
3015 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
3016 	if (vp->v_vflag & VV_SYSTEM)
3017 		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
3018 	if (vp->v_vflag & VV_PROCDEP)
3019 		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
3020 	if (vp->v_vflag & VV_NOKNOTE)
3021 		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
3022 	if (vp->v_vflag & VV_DELETED)
3023 		strlcat(buf, "|VV_DELETED", sizeof(buf));
3024 	if (vp->v_vflag & VV_MD)
3025 		strlcat(buf, "|VV_MD", sizeof(buf));
3026 	if (vp->v_vflag & VV_FORCEINSMQ)
3027 		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
3028 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
3029 	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
3030 	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
3031 	if (flags != 0) {
3032 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
3033 		strlcat(buf, buf2, sizeof(buf));
3034 	}
3035 	if (vp->v_iflag & VI_MOUNT)
3036 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
3037 	if (vp->v_iflag & VI_AGE)
3038 		strlcat(buf, "|VI_AGE", sizeof(buf));
3039 	if (vp->v_iflag & VI_DOOMED)
3040 		strlcat(buf, "|VI_DOOMED", sizeof(buf));
3041 	if (vp->v_iflag & VI_FREE)
3042 		strlcat(buf, "|VI_FREE", sizeof(buf));
3043 	if (vp->v_iflag & VI_ACTIVE)
3044 		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
3045 	if (vp->v_iflag & VI_DOINGINACT)
3046 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
3047 	if (vp->v_iflag & VI_OWEINACT)
3048 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
3049 	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
3050 	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
3051 	if (flags != 0) {
3052 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
3053 		strlcat(buf, buf2, sizeof(buf));
3054 	}
3055 	printf("    flags (%s)\n", buf + 1);
3056 	if (mtx_owned(VI_MTX(vp)))
3057 		printf(" VI_LOCKed");
3058 	if (vp->v_object != NULL)
3059 		printf("    v_object %p ref %d pages %d "
3060 		    "cleanbuf %d dirtybuf %d\n",
3061 		    vp->v_object, vp->v_object->ref_count,
3062 		    vp->v_object->resident_page_count,
3063 		    vp->v_bufobj.bo_clean.bv_cnt,
3064 		    vp->v_bufobj.bo_dirty.bv_cnt);
3065 	printf("    ");
3066 	lockmgr_printinfo(vp->v_vnlock);
3067 	if (vp->v_data != NULL)
3068 		VOP_PRINT(vp);
3069 }
3070 
3071 #ifdef DDB
3072 /*
3073  * List all of the locked vnodes in the system.
3074  * Called when debugging the kernel.
3075  */
3076 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
3077 {
3078 	struct mount *mp;
3079 	struct vnode *vp;
3080 
3081 	/*
3082 	 * Note: because this is DDB, we can't obey the locking semantics
3083 	 * for these structures, which means we could catch an inconsistent
3084 	 * state and dereference a nasty pointer.  Not much to be done
3085 	 * about that.
3086 	 */
3087 	db_printf("Locked vnodes\n");
3088 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3089 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3090 			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
3091 				vprint("", vp);
3092 		}
3093 	}
3094 }
3095 
3096 /*
3097  * Show details about the given vnode.
3098  */
3099 DB_SHOW_COMMAND(vnode, db_show_vnode)
3100 {
3101 	struct vnode *vp;
3102 
3103 	if (!have_addr)
3104 		return;
3105 	vp = (struct vnode *)addr;
3106 	vn_printf(vp, "vnode ");
3107 }
3108 
3109 /*
3110  * Show details about the given mount point.
3111  */
3112 DB_SHOW_COMMAND(mount, db_show_mount)
3113 {
3114 	struct mount *mp;
3115 	struct vfsopt *opt;
3116 	struct statfs *sp;
3117 	struct vnode *vp;
3118 	char buf[512];
3119 	uint64_t mflags;
3120 	u_int flags;
3121 
3122 	if (!have_addr) {
3123 		/* No address given, print short info about all mount points. */
3124 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3125 			db_printf("%p %s on %s (%s)\n", mp,
3126 			    mp->mnt_stat.f_mntfromname,
3127 			    mp->mnt_stat.f_mntonname,
3128 			    mp->mnt_stat.f_fstypename);
3129 			if (db_pager_quit)
3130 				break;
3131 		}
3132 		db_printf("\nMore info: show mount <addr>\n");
3133 		return;
3134 	}
3135 
3136 	mp = (struct mount *)addr;
3137 	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3138 	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3139 
3140 	buf[0] = '\0';
3141 	mflags = mp->mnt_flag;
3142 #define	MNT_FLAG(flag)	do {						\
3143 	if (mflags & (flag)) {						\
3144 		if (buf[0] != '\0')					\
3145 			strlcat(buf, ", ", sizeof(buf));		\
3146 		strlcat(buf, (#flag) + 4, sizeof(buf));			\
3147 		mflags &= ~(flag);					\
3148 	}								\
3149 } while (0)
3150 	MNT_FLAG(MNT_RDONLY);
3151 	MNT_FLAG(MNT_SYNCHRONOUS);
3152 	MNT_FLAG(MNT_NOEXEC);
3153 	MNT_FLAG(MNT_NOSUID);
3154 	MNT_FLAG(MNT_NFS4ACLS);
3155 	MNT_FLAG(MNT_UNION);
3156 	MNT_FLAG(MNT_ASYNC);
3157 	MNT_FLAG(MNT_SUIDDIR);
3158 	MNT_FLAG(MNT_SOFTDEP);
3159 	MNT_FLAG(MNT_NOSYMFOLLOW);
3160 	MNT_FLAG(MNT_GJOURNAL);
3161 	MNT_FLAG(MNT_MULTILABEL);
3162 	MNT_FLAG(MNT_ACLS);
3163 	MNT_FLAG(MNT_NOATIME);
3164 	MNT_FLAG(MNT_NOCLUSTERR);
3165 	MNT_FLAG(MNT_NOCLUSTERW);
3166 	MNT_FLAG(MNT_SUJ);
3167 	MNT_FLAG(MNT_EXRDONLY);
3168 	MNT_FLAG(MNT_EXPORTED);
3169 	MNT_FLAG(MNT_DEFEXPORTED);
3170 	MNT_FLAG(MNT_EXPORTANON);
3171 	MNT_FLAG(MNT_EXKERB);
3172 	MNT_FLAG(MNT_EXPUBLIC);
3173 	MNT_FLAG(MNT_LOCAL);
3174 	MNT_FLAG(MNT_QUOTA);
3175 	MNT_FLAG(MNT_ROOTFS);
3176 	MNT_FLAG(MNT_USER);
3177 	MNT_FLAG(MNT_IGNORE);
3178 	MNT_FLAG(MNT_UPDATE);
3179 	MNT_FLAG(MNT_DELEXPORT);
3180 	MNT_FLAG(MNT_RELOAD);
3181 	MNT_FLAG(MNT_FORCE);
3182 	MNT_FLAG(MNT_SNAPSHOT);
3183 	MNT_FLAG(MNT_BYFSID);
3184 #undef MNT_FLAG
3185 	if (mflags != 0) {
3186 		if (buf[0] != '\0')
3187 			strlcat(buf, ", ", sizeof(buf));
3188 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3189 		    "0x%016jx", mflags);
3190 	}
3191 	db_printf("    mnt_flag = %s\n", buf);
3192 
3193 	buf[0] = '\0';
3194 	flags = mp->mnt_kern_flag;
3195 #define	MNT_KERN_FLAG(flag)	do {					\
3196 	if (flags & (flag)) {						\
3197 		if (buf[0] != '\0')					\
3198 			strlcat(buf, ", ", sizeof(buf));		\
3199 		strlcat(buf, (#flag) + 5, sizeof(buf));			\
3200 		flags &= ~(flag);					\
3201 	}								\
3202 } while (0)
3203 	MNT_KERN_FLAG(MNTK_UNMOUNTF);
3204 	MNT_KERN_FLAG(MNTK_ASYNC);
3205 	MNT_KERN_FLAG(MNTK_SOFTDEP);
3206 	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3207 	MNT_KERN_FLAG(MNTK_DRAINING);
3208 	MNT_KERN_FLAG(MNTK_REFEXPIRE);
3209 	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3210 	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3211 	MNT_KERN_FLAG(MNTK_NO_IOPF);
3212 	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3213 	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3214 	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3215 	MNT_KERN_FLAG(MNTK_MARKER);
3216 	MNT_KERN_FLAG(MNTK_USES_BCACHE);
3217 	MNT_KERN_FLAG(MNTK_NOASYNC);
3218 	MNT_KERN_FLAG(MNTK_UNMOUNT);
3219 	MNT_KERN_FLAG(MNTK_MWAIT);
3220 	MNT_KERN_FLAG(MNTK_SUSPEND);
3221 	MNT_KERN_FLAG(MNTK_SUSPEND2);
3222 	MNT_KERN_FLAG(MNTK_SUSPENDED);
3223 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3224 	MNT_KERN_FLAG(MNTK_NOKNOTE);
3225 #undef MNT_KERN_FLAG
3226 	if (flags != 0) {
3227 		if (buf[0] != '\0')
3228 			strlcat(buf, ", ", sizeof(buf));
3229 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3230 		    "0x%08x", flags);
3231 	}
3232 	db_printf("    mnt_kern_flag = %s\n", buf);
3233 
3234 	db_printf("    mnt_opt = ");
3235 	opt = TAILQ_FIRST(mp->mnt_opt);
3236 	if (opt != NULL) {
3237 		db_printf("%s", opt->name);
3238 		opt = TAILQ_NEXT(opt, link);
3239 		while (opt != NULL) {
3240 			db_printf(", %s", opt->name);
3241 			opt = TAILQ_NEXT(opt, link);
3242 		}
3243 	}
3244 	db_printf("\n");
3245 
3246 	sp = &mp->mnt_stat;
3247 	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3248 	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3249 	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3250 	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3251 	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3252 	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3253 	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3254 	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3255 	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3256 	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3257 	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3258 	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3259 
3260 	db_printf("    mnt_cred = { uid=%u ruid=%u",
3261 	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3262 	if (jailed(mp->mnt_cred))
3263 		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3264 	db_printf(" }\n");
3265 	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3266 	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3267 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3268 	db_printf("    mnt_activevnodelistsize = %d\n",
3269 	    mp->mnt_activevnodelistsize);
3270 	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3271 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3272 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3273 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3274 	db_printf("    mnt_lockref = %d\n", mp->mnt_lockref);
3275 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3276 	db_printf("    mnt_secondary_accwrites = %d\n",
3277 	    mp->mnt_secondary_accwrites);
3278 	db_printf("    mnt_gjprovider = %s\n",
3279 	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3280 
3281 	db_printf("\n\nList of active vnodes\n");
3282 	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3283 		if (vp->v_type != VMARKER) {
3284 			vn_printf(vp, "vnode ");
3285 			if (db_pager_quit)
3286 				break;
3287 		}
3288 	}
3289 	db_printf("\n\nList of inactive vnodes\n");
3290 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3291 		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3292 			vn_printf(vp, "vnode ");
3293 			if (db_pager_quit)
3294 				break;
3295 		}
3296 	}
3297 }
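
/*
 * Typical invocations of the commands above from the DDB prompt (the
 * addresses are illustrative only):
 *
 *	db> show lockedvnods
 *	db> show vnode 0xfffff80012345678
 *	db> show mount 0xfffff80012345678
 */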
3298 #endif	/* DDB */
3299 
3300 /*
3301  * Fill in a struct xvfsconf based on a struct vfsconf.
3302  */
3303 static int
3304 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3305 {
3306 	struct xvfsconf xvfsp;
3307 
3308 	bzero(&xvfsp, sizeof(xvfsp));
3309 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3310 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3311 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3312 	xvfsp.vfc_flags = vfsp->vfc_flags;
3313 	/*
3314 	 * These are unused in userland; we keep them
3315 	 * so as not to break binary compatibility.
3316 	 */
3317 	xvfsp.vfc_vfsops = NULL;
3318 	xvfsp.vfc_next = NULL;
3319 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3320 }
3321 
3322 #ifdef COMPAT_FREEBSD32
3323 struct xvfsconf32 {
3324 	uint32_t	vfc_vfsops;
3325 	char		vfc_name[MFSNAMELEN];
3326 	int32_t		vfc_typenum;
3327 	int32_t		vfc_refcount;
3328 	int32_t		vfc_flags;
3329 	uint32_t	vfc_next;
3330 };
3331 
3332 static int
3333 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
3334 {
3335 	struct xvfsconf32 xvfsp;
3336 
3337 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3338 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3339 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3340 	xvfsp.vfc_flags = vfsp->vfc_flags;
3341 	xvfsp.vfc_vfsops = 0;
3342 	xvfsp.vfc_next = 0;
3343 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3344 }
3345 #endif
3346 
3347 /*
3348  * Top level filesystem related information gathering.
3349  */
3350 static int
3351 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3352 {
3353 	struct vfsconf *vfsp;
3354 	int error;
3355 
3356 	error = 0;
3357 	vfsconf_slock();
3358 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3359 #ifdef COMPAT_FREEBSD32
3360 		if (req->flags & SCTL_MASK32)
3361 			error = vfsconf2x32(req, vfsp);
3362 		else
3363 #endif
3364 			error = vfsconf2x(req, vfsp);
3365 		if (error)
3366 			break;
3367 	}
3368 	vfsconf_sunlock();
3369 	return (error);
3370 }
3371 
3372 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
3373     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
3374     "S,xvfsconf", "List of all configured filesystems");
3375 
3376 #ifndef BURN_BRIDGES
3377 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3378 
3379 static int
3380 vfs_sysctl(SYSCTL_HANDLER_ARGS)
3381 {
3382 	int *name = (int *)arg1 - 1;	/* XXX */
3383 	u_int namelen = arg2 + 1;	/* XXX */
3384 	struct vfsconf *vfsp;
3385 
3386 	log(LOG_WARNING, "userland calling deprecated sysctl, "
3387 	    "please rebuild world\n");
3388 
3389 #if 1 || defined(COMPAT_PRELITE2)
3390 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3391 	if (namelen == 1)
3392 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3393 #endif
3394 
3395 	switch (name[1]) {
3396 	case VFS_MAXTYPENUM:
3397 		if (namelen != 2)
3398 			return (ENOTDIR);
3399 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3400 	case VFS_CONF:
3401 		if (namelen != 3)
3402 			return (ENOTDIR);	/* overloaded */
3403 		vfsconf_slock();
3404 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3405 			if (vfsp->vfc_typenum == name[2])
3406 				break;
3407 		}
3408 		vfsconf_sunlock();
3409 		if (vfsp == NULL)
3410 			return (EOPNOTSUPP);
3411 #ifdef COMPAT_FREEBSD32
3412 		if (req->flags & SCTL_MASK32)
3413 			return (vfsconf2x32(req, vfsp));
3414 		else
3415 #endif
3416 			return (vfsconf2x(req, vfsp));
3417 	}
3418 	return (EOPNOTSUPP);
3419 }
3420 
3421 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
3422     CTLFLAG_MPSAFE, vfs_sysctl,
3423     "Generic filesystem");
3424 
3425 #if 1 || defined(COMPAT_PRELITE2)
3426 
3427 static int
3428 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3429 {
3430 	int error;
3431 	struct vfsconf *vfsp;
3432 	struct ovfsconf ovfs;
3433 
3434 	vfsconf_slock();
3435 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3436 		bzero(&ovfs, sizeof(ovfs));
3437 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3438 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3439 		ovfs.vfc_index = vfsp->vfc_typenum;
3440 		ovfs.vfc_refcount = vfsp->vfc_refcount;
3441 		ovfs.vfc_flags = vfsp->vfc_flags;
3442 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3443 		if (error != 0) {
3444 			vfsconf_sunlock();
3445 			return (error);
3446 		}
3447 	}
3448 	vfsconf_sunlock();
3449 	return (0);
3450 }
3451 
3452 #endif /* 1 || COMPAT_PRELITE2 */
3453 #endif /* !BURN_BRIDGES */
3454 
3455 #define KINFO_VNODESLOP		10
3456 #ifdef notyet
3457 /*
3458  * Dump vnode list (via sysctl).
3459  */
3460 /* ARGSUSED */
3461 static int
3462 sysctl_vnode(SYSCTL_HANDLER_ARGS)
3463 {
3464 	struct xvnode *xvn;
3465 	struct mount *mp;
3466 	struct vnode *vp;
3467 	int error, len, n;
3468 
3469 	/*
3470 	 * Stale numvnodes access is not fatal here.
3471 	 */
3472 	req->lock = 0;
3473 	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3474 	if (!req->oldptr)
3475 		/* Make an estimate */
3476 		return (SYSCTL_OUT(req, 0, len));
3477 
3478 	error = sysctl_wire_old_buffer(req, 0);
3479 	if (error != 0)
3480 		return (error);
3481 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3482 	n = 0;
3483 	mtx_lock(&mountlist_mtx);
3484 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3485 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3486 			continue;
3487 		MNT_ILOCK(mp);
3488 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3489 			if (n == len)
3490 				break;
3491 			vref(vp);
3492 			xvn[n].xv_size = sizeof *xvn;
3493 			xvn[n].xv_vnode = vp;
3494 			xvn[n].xv_id = 0;	/* XXX compat */
3495 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3496 			XV_COPY(usecount);
3497 			XV_COPY(writecount);
3498 			XV_COPY(holdcnt);
3499 			XV_COPY(mount);
3500 			XV_COPY(numoutput);
3501 			XV_COPY(type);
3502 #undef XV_COPY
3503 			xvn[n].xv_flag = vp->v_vflag;
3504 
3505 			switch (vp->v_type) {
3506 			case VREG:
3507 			case VDIR:
3508 			case VLNK:
3509 				break;
3510 			case VBLK:
3511 			case VCHR:
3512 				if (vp->v_rdev == NULL) {
3513 					vrele(vp);
3514 					continue;
3515 				}
3516 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3517 				break;
3518 			case VSOCK:
3519 				xvn[n].xv_socket = vp->v_socket;
3520 				break;
3521 			case VFIFO:
3522 				xvn[n].xv_fifo = vp->v_fifoinfo;
3523 				break;
3524 			case VNON:
3525 			case VBAD:
3526 			default:
3527 				/* shouldn't happen? */
3528 				vrele(vp);
3529 				continue;
3530 			}
3531 			vrele(vp);
3532 			++n;
3533 		}
3534 		MNT_IUNLOCK(mp);
3535 		mtx_lock(&mountlist_mtx);
3536 		vfs_unbusy(mp);
3537 		if (n == len)
3538 			break;
3539 	}
3540 	mtx_unlock(&mountlist_mtx);
3541 
3542 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3543 	free(xvn, M_TEMP);
3544 	return (error);
3545 }
3546 
3547 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
3548     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
3549     "");
3550 #endif
3551 
3552 static void
3553 unmount_or_warn(struct mount *mp)
3554 {
3555 	int error;
3556 
3557 	error = dounmount(mp, MNT_FORCE, curthread);
3558 	if (error != 0) {
3559 		printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
3560 		if (error == EBUSY)
3561 			printf("BUSY)\n");
3562 		else
3563 			printf("%d)\n", error);
3564 	}
3565 }
3566 
3567 /*
3568  * Unmount all filesystems. The list is traversed in reverse order
3569  * of mounting to avoid dependencies.
3570  */
3571 void
3572 vfs_unmountall(void)
3573 {
3574 	struct mount *mp, *tmp;
3575 
3576 	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3577 
3578 	/*
3579 	 * Since this only runs when rebooting, it is not interlocked.
3580 	 */
3581 	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
3582 		vfs_ref(mp);
3583 
3584 		/*
3585 		 * Forcibly unmounting "/dev" before "/" would prevent clean
3586 		 * unmount of the latter.
3587 		 */
3588 		if (mp == rootdevmp)
3589 			continue;
3590 
3591 		unmount_or_warn(mp);
3592 	}
3593 
3594 	if (rootdevmp != NULL)
3595 		unmount_or_warn(rootdevmp);
3596 }
3597 
3598 /*
3599  * Perform msync on all vnodes under a mount point.
3600  * The mount point must be locked.
3601  */
3602 void
3603 vfs_msync(struct mount *mp, int flags)
3604 {
3605 	struct vnode *vp, *mvp;
3606 	struct vm_object *obj;
3607 
3608 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3609 	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3610 		obj = vp->v_object;
3611 		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3612 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3613 			if (!vget(vp,
3614 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3615 			    curthread)) {
3616 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3617 					vput(vp);
3618 					continue;
3619 				}
3620 
3621 				obj = vp->v_object;
3622 				if (obj != NULL) {
3623 					VM_OBJECT_WLOCK(obj);
3624 					vm_object_page_clean(obj, 0, 0,
3625 					    flags == MNT_WAIT ?
3626 					    OBJPC_SYNC : OBJPC_NOSYNC);
3627 					VM_OBJECT_WUNLOCK(obj);
3628 				}
3629 				vput(vp);
3630 			}
3631 		} else
3632 			VI_UNLOCK(vp);
3633 	}
3634 }
3635 
3636 static void
3637 destroy_vpollinfo_free(struct vpollinfo *vi)
3638 {
3639 
3640 	knlist_destroy(&vi->vpi_selinfo.si_note);
3641 	mtx_destroy(&vi->vpi_lock);
3642 	uma_zfree(vnodepoll_zone, vi);
3643 }
3644 
3645 static void
3646 destroy_vpollinfo(struct vpollinfo *vi)
3647 {
3648 
3649 	knlist_clear(&vi->vpi_selinfo.si_note, 1);
3650 	seldrain(&vi->vpi_selinfo);
3651 	destroy_vpollinfo_free(vi);
3652 }
3653 
3654 /*
3655  * Initialize per-vnode helper structure to hold poll-related state.
3656  */
3657 void
3658 v_addpollinfo(struct vnode *vp)
3659 {
3660 	struct vpollinfo *vi;
3661 
3662 	if (vp->v_pollinfo != NULL)
3663 		return;
3664 	vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
3665 	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3666 	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3667 	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
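	/*
	 * Another thread may have installed v_pollinfo while the M_WAITOK
	 * allocation above slept; recheck under the vnode interlock and
	 * discard our copy if we lost the race.
	 */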
3668 	VI_LOCK(vp);
3669 	if (vp->v_pollinfo != NULL) {
3670 		VI_UNLOCK(vp);
3671 		destroy_vpollinfo_free(vi);
3672 		return;
3673 	}
3674 	vp->v_pollinfo = vi;
3675 	VI_UNLOCK(vp);
3676 }
3677 
3678 /*
3679  * Record a process's interest in events which might happen to
3680  * a vnode.  Because poll uses the historic select-style interface
3681  * internally, this routine serves as both the ``check for any
3682  * pending events'' and the ``record my interest in future events''
3683  * functions.  (These are done together, while the lock is held,
3684  * to avoid race conditions.)
3685  */
3686 int
3687 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3688 {
3689 
3690 	v_addpollinfo(vp);
3691 	mtx_lock(&vp->v_pollinfo->vpi_lock);
3692 	if (vp->v_pollinfo->vpi_revents & events) {
3693 		/*
3694 		 * This leaves events we are not interested
3695 		 * in available for the other process which
3696 		 * presumably had requested them
3697 		 * (otherwise they would never have been
3698 		 * recorded).
3699 		 */
3700 		events &= vp->v_pollinfo->vpi_revents;
3701 		vp->v_pollinfo->vpi_revents &= ~events;
3702 
3703 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3704 		return (events);
3705 	}
3706 	vp->v_pollinfo->vpi_events |= events;
3707 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3708 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3709 	return (0);
3710 }
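
/*
 * Example (illustrative sketch, not part of this file): a filesystem's
 * VOP_POLL implementation would typically hand any non-trivial event set
 * to vn_pollrecord(), roughly as the stock vop_stdpoll() does.  The
 * function name below is hypothetical.
 */
#if 0
static int
example_fs_poll(struct vop_poll_args *ap)
{

	/* Record interest in (and collect) anything beyond the basics. */
	if (ap->a_events & ~POLLSTANDARD)
		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
	/* Regular vnodes are always readable and writable. */
	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
#endif /* example */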
3711 
3712 /*
3713  * Routine to create and manage a filesystem syncer vnode.
3714  */
3715 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
3716 static int	sync_fsync(struct  vop_fsync_args *);
3717 static int	sync_inactive(struct  vop_inactive_args *);
3718 static int	sync_reclaim(struct  vop_reclaim_args *);
3719 
3720 static struct vop_vector sync_vnodeops = {
3721 	.vop_bypass =	VOP_EOPNOTSUPP,
3722 	.vop_close =	sync_close,		/* close */
3723 	.vop_fsync =	sync_fsync,		/* fsync */
3724 	.vop_inactive =	sync_inactive,	/* inactive */
3725 	.vop_reclaim =	sync_reclaim,	/* reclaim */
3726 	.vop_lock1 =	vop_stdlock,	/* lock */
3727 	.vop_unlock =	vop_stdunlock,	/* unlock */
3728 	.vop_islocked =	vop_stdislocked,	/* islocked */
3729 };
3730 
3731 /*
3732  * Create a new filesystem syncer vnode for the specified mount point.
3733  */
3734 void
3735 vfs_allocate_syncvnode(struct mount *mp)
3736 {
3737 	struct vnode *vp;
3738 	struct bufobj *bo;
3739 	static long start, incr, next;
3740 	int error;
3741 
3742 	/* Allocate a new vnode */
3743 	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3744 	if (error != 0)
3745 		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3746 	vp->v_type = VNON;
3747 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3748 	vp->v_vflag |= VV_FORCEINSMQ;
3749 	error = insmntque(vp, mp);
3750 	if (error != 0)
3751 		panic("vfs_allocate_syncvnode: insmntque() failed");
3752 	vp->v_vflag &= ~VV_FORCEINSMQ;
3753 	VOP_UNLOCK(vp, 0);
3754 	/*
3755 	 * Place the vnode onto the syncer worklist. We attempt to
3756 	 * scatter them about on the list so that they will go off
3757 	 * at evenly distributed times even if all the filesystems
3758 	 * are mounted at once.
3759 	 */
3760 	next += incr;
3761 	if (next == 0 || next > syncer_maxdelay) {
3762 		start /= 2;
3763 		incr /= 2;
3764 		if (start == 0) {
3765 			start = syncer_maxdelay / 2;
3766 			incr = syncer_maxdelay;
3767 		}
3768 		next = start;
3769 	}
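	/*
	 * For example, with syncer_maxdelay at its usual default of 32,
	 * successive calls yield next = 16, 8, 24, 4, 12, 20, 28, 2, ...,
	 * i.e. a progressively finer binary subdivision of the delay range.
	 */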
3770 	bo = &vp->v_bufobj;
3771 	BO_LOCK(bo);
3772 	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3773 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3774 	mtx_lock(&sync_mtx);
3775 	sync_vnode_count++;
3776 	if (mp->mnt_syncer == NULL) {
3777 		mp->mnt_syncer = vp;
3778 		vp = NULL;
3779 	}
3780 	mtx_unlock(&sync_mtx);
3781 	BO_UNLOCK(bo);
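	/*
	 * If another thread installed a syncer vnode for this mount while
	 * we were setting ours up, discard the one just created.
	 */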
3782 	if (vp != NULL) {
3783 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3784 		vgone(vp);
3785 		vput(vp);
3786 	}
3787 }
3788 
3789 void
3790 vfs_deallocate_syncvnode(struct mount *mp)
3791 {
3792 	struct vnode *vp;
3793 
3794 	mtx_lock(&sync_mtx);
3795 	vp = mp->mnt_syncer;
3796 	if (vp != NULL)
3797 		mp->mnt_syncer = NULL;
3798 	mtx_unlock(&sync_mtx);
3799 	if (vp != NULL)
3800 		vrele(vp);
3801 }
3802 
3803 /*
3804  * Do a lazy sync of the filesystem.
3805  */
3806 static int
3807 sync_fsync(struct vop_fsync_args *ap)
3808 {
3809 	struct vnode *syncvp = ap->a_vp;
3810 	struct mount *mp = syncvp->v_mount;
3811 	int error, save;
3812 	struct bufobj *bo;
3813 
3814 	/*
3815 	 * We only need to do something if this is a lazy evaluation.
3816 	 */
3817 	if (ap->a_waitfor != MNT_LAZY)
3818 		return (0);
3819 
3820 	/*
3821 	 * Move ourselves to the back of the sync list.
3822 	 */
3823 	bo = &syncvp->v_bufobj;
3824 	BO_LOCK(bo);
3825 	vn_syncer_add_to_worklist(bo, syncdelay);
3826 	BO_UNLOCK(bo);
3827 
3828 	/*
3829 	 * Walk the list of vnodes pushing all that are dirty and
3830 	 * not already on the sync list.
3831 	 */
3832 	if (vfs_busy(mp, MBF_NOWAIT) != 0)
3833 		return (0);
3834 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3835 		vfs_unbusy(mp);
3836 		return (0);
3837 	}
3838 	save = curthread_pflags_set(TDP_SYNCIO);
3839 	vfs_msync(mp, MNT_NOWAIT);
3840 	error = VFS_SYNC(mp, MNT_LAZY);
3841 	curthread_pflags_restore(save);
3842 	vn_finished_write(mp);
3843 	vfs_unbusy(mp);
3844 	return (error);
3845 }
3846 
3847 /*
3848  * The syncer vnode is no longer referenced.
3849  */
3850 static int
3851 sync_inactive(struct vop_inactive_args *ap)
3852 {
3853 
3854 	vgone(ap->a_vp);
3855 	return (0);
3856 }
3857 
3858 /*
3859  * The syncer vnode is no longer needed and is being decommissioned.
3860  *
3861  * Modifications to the worklist must be protected by sync_mtx.
3862  */
3863 static int
3864 sync_reclaim(struct vop_reclaim_args *ap)
3865 {
3866 	struct vnode *vp = ap->a_vp;
3867 	struct bufobj *bo;
3868 
3869 	bo = &vp->v_bufobj;
3870 	BO_LOCK(bo);
3871 	mtx_lock(&sync_mtx);
3872 	if (vp->v_mount->mnt_syncer == vp)
3873 		vp->v_mount->mnt_syncer = NULL;
3874 	if (bo->bo_flag & BO_ONWORKLST) {
3875 		LIST_REMOVE(bo, bo_synclist);
3876 		syncer_worklist_len--;
3877 		sync_vnode_count--;
3878 		bo->bo_flag &= ~BO_ONWORKLST;
3879 	}
3880 	mtx_unlock(&sync_mtx);
3881 	BO_UNLOCK(bo);
3882 
3883 	return (0);
3884 }
3885 
3886 /*
3887  * Check if vnode represents a disk device
3888  */
3889 int
3890 vn_isdisk(struct vnode *vp, int *errp)
3891 {
3892 	int error;
3893 
3894 	if (vp->v_type != VCHR) {
3895 		error = ENOTBLK;
3896 		goto out;
3897 	}
3898 	error = 0;
3899 	dev_lock();
3900 	if (vp->v_rdev == NULL)
3901 		error = ENXIO;
3902 	else if (vp->v_rdev->si_devsw == NULL)
3903 		error = ENXIO;
3904 	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3905 		error = ENOTBLK;
3906 	dev_unlock();
3907 out:
3908 	if (errp != NULL)
3909 		*errp = error;
3910 	return (error == 0);
3911 }
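
/*
 * Example (illustrative sketch, not part of this file): a disk-backed
 * filesystem's mount routine typically validates its device vnode with
 * vn_isdisk() before going further.  "devvp" is assumed to be a locked
 * vnode for the candidate device that the caller already looked up.
 */
#if 0
	int error;

	if (!vn_isdisk(devvp, &error)) {
		/* error is ENOTBLK or ENXIO, as set by vn_isdisk(). */
		vput(devvp);
		return (error);
	}
#endif /* example */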
3912 
3913 /*
3914  * Common filesystem object access control check routine.  Accepts a
3915  * vnode's type, "mode", uid and gid, requested access mode, credentials,
3916  * and optional call-by-reference privused argument allowing vaccess()
3917  * to indicate to the caller whether privilege was used to satisfy the
3918  * request (obsoleted).  Returns 0 on success, or an errno on failure.
3919  */
3920 int
3921 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3922     accmode_t accmode, struct ucred *cred, int *privused)
3923 {
3924 	accmode_t dac_granted;
3925 	accmode_t priv_granted;
3926 
3927 	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3928 	    ("invalid bit in accmode"));
3929 	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3930 	    ("VAPPEND without VWRITE"));
3931 
3932 	/*
3933 	 * Look for a normal, non-privileged way to access the file/directory
3934 	 * as requested.  If it exists, go with that.
3935 	 */
3936 
3937 	if (privused != NULL)
3938 		*privused = 0;
3939 
3940 	dac_granted = 0;
3941 
3942 	/* Check the owner. */
3943 	if (cred->cr_uid == file_uid) {
3944 		dac_granted |= VADMIN;
3945 		if (file_mode & S_IXUSR)
3946 			dac_granted |= VEXEC;
3947 		if (file_mode & S_IRUSR)
3948 			dac_granted |= VREAD;
3949 		if (file_mode & S_IWUSR)
3950 			dac_granted |= (VWRITE | VAPPEND);
3951 
3952 		if ((accmode & dac_granted) == accmode)
3953 			return (0);
3954 
3955 		goto privcheck;
3956 	}
3957 
3958 	/* Otherwise, check the groups (first match) */
3959 	if (groupmember(file_gid, cred)) {
3960 		if (file_mode & S_IXGRP)
3961 			dac_granted |= VEXEC;
3962 		if (file_mode & S_IRGRP)
3963 			dac_granted |= VREAD;
3964 		if (file_mode & S_IWGRP)
3965 			dac_granted |= (VWRITE | VAPPEND);
3966 
3967 		if ((accmode & dac_granted) == accmode)
3968 			return (0);
3969 
3970 		goto privcheck;
3971 	}
3972 
3973 	/* Otherwise, check everyone else. */
3974 	if (file_mode & S_IXOTH)
3975 		dac_granted |= VEXEC;
3976 	if (file_mode & S_IROTH)
3977 		dac_granted |= VREAD;
3978 	if (file_mode & S_IWOTH)
3979 		dac_granted |= (VWRITE | VAPPEND);
3980 	if ((accmode & dac_granted) == accmode)
3981 		return (0);
3982 
3983 privcheck:
3984 	/*
3985 	 * Build a privilege mask to determine if the set of privileges
3986 	 * satisfies the requirements when combined with the granted mask
3987 	 * from above.  For each privilege, if the privilege is required,
3988 	 * bitwise or the request type onto the priv_granted mask.
3989 	 */
3990 	priv_granted = 0;
3991 
3992 	if (type == VDIR) {
3993 		/*
3994 		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3995 		 * requests, instead of PRIV_VFS_EXEC.
3996 		 */
3997 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3998 		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3999 			priv_granted |= VEXEC;
4000 	} else {
4001 		/*
4002 		 * Ensure that at least one execute bit is on. Otherwise,
4003 		 * a privileged user will always succeed, and we don't want
4004 		 * this to happen unless the file really is executable.
4005 		 */
4006 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
4007 		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
4008 		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
4009 			priv_granted |= VEXEC;
4010 	}
4011 
4012 	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
4013 	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
4014 		priv_granted |= VREAD;
4015 
4016 	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
4017 	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
4018 		priv_granted |= (VWRITE | VAPPEND);
4019 
4020 	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
4021 	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
4022 		priv_granted |= VADMIN;
4023 
4024 	if ((accmode & (priv_granted | dac_granted)) == accmode) {
4025 		/* XXX audit: privilege used */
4026 		if (privused != NULL)
4027 			*privused = 1;
4028 		return (0);
4029 	}
4030 
4031 	return ((accmode & VADMIN) ? EPERM : EACCES);
4032 }
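
/*
 * Example (illustrative sketch, not part of this file): a filesystem's
 * VOP_ACCESS implementation usually ends by handing its on-disk
 * attributes to vaccess().  The VTONODE() macro, the example_node
 * structure and its fields are hypothetical stand-ins for the
 * filesystem's own inode data.
 */
#if 0
static int
example_fs_access(struct vop_access_args *ap)
{
	struct example_node *node = VTONODE(ap->a_vp);

	/* MAC and ACL checks, if any, would normally come first. */
	return (vaccess(ap->a_vp->v_type, node->n_mode, node->n_uid,
	    node->n_gid, ap->a_accmode, ap->a_cred, NULL));
}
#endif /* example */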
4033 
4034 /*
4035  * Credential check based on process requesting service, and per-attribute
4036  * permissions.
4037  */
4038 int
4039 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
4040     struct thread *td, accmode_t accmode)
4041 {
4042 
4043 	/*
4044 	 * Kernel-invoked requests always succeed.
4045 	 */
4046 	if (cred == NOCRED)
4047 		return (0);
4048 
4049 	/*
4050 	 * Do not allow privileged processes in jail to directly manipulate
4051 	 * system attributes.
4052 	 */
4053 	switch (attrnamespace) {
4054 	case EXTATTR_NAMESPACE_SYSTEM:
4055 		/* Potentially should be: return (EPERM); */
4056 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
4057 	case EXTATTR_NAMESPACE_USER:
4058 		return (VOP_ACCESS(vp, accmode, cred, td));
4059 	default:
4060 		return (EPERM);
4061 	}
4062 }
4063 
4064 #ifdef DEBUG_VFS_LOCKS
4065 /*
4066  * This only exists to suppress warnings from unlocked specfs accesses.  It is
4067  * no longer ok to have an unlocked VFS.
4068  */
4069 #define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
4070 	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
4071 
4072 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
4073 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
4074     "Drop into debugger on lock violation");
4075 
4076 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
4077 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
4078     0, "Check for interlock across VOPs");
4079 
4080 int vfs_badlock_print = 1;	/* Print lock violations. */
4081 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
4082     0, "Print lock violations");
4083 
4084 #ifdef KDB
4085 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
4086 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
4087     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
4088 #endif
4089 
4090 static void
4091 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
4092 {
4093 
4094 #ifdef KDB
4095 	if (vfs_badlock_backtrace)
4096 		kdb_backtrace();
4097 #endif
4098 	if (vfs_badlock_print)
4099 		printf("%s: %p %s\n", str, (void *)vp, msg);
4100 	if (vfs_badlock_ddb)
4101 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4102 }
4103 
4104 void
4105 assert_vi_locked(struct vnode *vp, const char *str)
4106 {
4107 
4108 	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
4109 		vfs_badlock("interlock is not locked but should be", str, vp);
4110 }
4111 
4112 void
4113 assert_vi_unlocked(struct vnode *vp, const char *str)
4114 {
4115 
4116 	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
4117 		vfs_badlock("interlock is locked but should not be", str, vp);
4118 }
4119 
4120 void
4121 assert_vop_locked(struct vnode *vp, const char *str)
4122 {
4123 	int locked;
4124 
4125 	if (!IGNORE_LOCK(vp)) {
4126 		locked = VOP_ISLOCKED(vp);
4127 		if (locked == 0 || locked == LK_EXCLOTHER)
4128 			vfs_badlock("is not locked but should be", str, vp);
4129 	}
4130 }
4131 
4132 void
4133 assert_vop_unlocked(struct vnode *vp, const char *str)
4134 {
4135 
4136 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4137 		vfs_badlock("is locked but should not be", str, vp);
4138 }
4139 
4140 void
4141 assert_vop_elocked(struct vnode *vp, const char *str)
4142 {
4143 
4144 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4145 		vfs_badlock("is not exclusive locked but should be", str, vp);
4146 }
4147 
4148 #if 0
4149 void
4150 assert_vop_elocked_other(struct vnode *vp, const char *str)
4151 {
4152 
4153 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
4154 		vfs_badlock("is not exclusive locked by another thread",
4155 		    str, vp);
4156 }
4157 
4158 void
4159 assert_vop_slocked(struct vnode *vp, const char *str)
4160 {
4161 
4162 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
4163 		vfs_badlock("is not locked shared but should be", str, vp);
4164 }
4165 #endif /* 0 */
4166 #endif /* DEBUG_VFS_LOCKS */
4167 
4168 void
4169 vop_rename_fail(struct vop_rename_args *ap)
4170 {
4171 
4172 	if (ap->a_tvp != NULL)
4173 		vput(ap->a_tvp);
4174 	if (ap->a_tdvp == ap->a_tvp)
4175 		vrele(ap->a_tdvp);
4176 	else
4177 		vput(ap->a_tdvp);
4178 	vrele(ap->a_fdvp);
4179 	vrele(ap->a_fvp);
4180 }
4181 
4182 void
4183 vop_rename_pre(void *ap)
4184 {
4185 	struct vop_rename_args *a = ap;
4186 
4187 #ifdef DEBUG_VFS_LOCKS
4188 	if (a->a_tvp)
4189 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4190 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4191 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4192 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4193 
4194 	/* Check the source (from). */
4195 	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4196 	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4197 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4198 	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4199 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4200 
4201 	/* Check the target. */
4202 	if (a->a_tvp)
4203 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4204 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4205 #endif
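	/*
	 * Hold all of the vnodes involved; the hold counts are dropped in
	 * vop_rename_post() once the knotes have been delivered.
	 */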
4206 	if (a->a_tdvp != a->a_fdvp)
4207 		vhold(a->a_fdvp);
4208 	if (a->a_tvp != a->a_fvp)
4209 		vhold(a->a_fvp);
4210 	vhold(a->a_tdvp);
4211 	if (a->a_tvp)
4212 		vhold(a->a_tvp);
4213 }
4214 
4215 void
4216 vop_strategy_pre(void *ap)
4217 {
4218 #ifdef DEBUG_VFS_LOCKS
4219 	struct vop_strategy_args *a;
4220 	struct buf *bp;
4221 
4222 	a = ap;
4223 	bp = a->a_bp;
4224 
4225 	/*
4226 	 * Cluster ops lock their component buffers but not the IO container.
4227 	 */
4228 	if ((bp->b_flags & B_CLUSTER) != 0)
4229 		return;
4230 
4231 	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
4232 		if (vfs_badlock_print)
4233 			printf(
4234 			    "VOP_STRATEGY: bp is not locked but should be\n");
4235 		if (vfs_badlock_ddb)
4236 			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4237 	}
4238 #endif
4239 }
4240 
4241 void
4242 vop_lock_pre(void *ap)
4243 {
4244 #ifdef DEBUG_VFS_LOCKS
4245 	struct vop_lock1_args *a = ap;
4246 
4247 	if ((a->a_flags & LK_INTERLOCK) == 0)
4248 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4249 	else
4250 		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
4251 #endif
4252 }
4253 
4254 void
4255 vop_lock_post(void *ap, int rc)
4256 {
4257 #ifdef DEBUG_VFS_LOCKS
4258 	struct vop_lock1_args *a = ap;
4259 
4260 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4261 	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
4262 		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
4263 #endif
4264 }
4265 
4266 void
4267 vop_unlock_pre(void *ap)
4268 {
4269 #ifdef DEBUG_VFS_LOCKS
4270 	struct vop_unlock_args *a = ap;
4271 
4272 	if (a->a_flags & LK_INTERLOCK)
4273 		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
4274 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
4275 #endif
4276 }
4277 
4278 void
4279 vop_unlock_post(void *ap, int rc)
4280 {
4281 #ifdef DEBUG_VFS_LOCKS
4282 	struct vop_unlock_args *a = ap;
4283 
4284 	if (a->a_flags & LK_INTERLOCK)
4285 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
4286 #endif
4287 }
4288 
4289 void
4290 vop_create_post(void *ap, int rc)
4291 {
4292 	struct vop_create_args *a = ap;
4293 
4294 	if (!rc)
4295 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4296 }
4297 
4298 void
4299 vop_deleteextattr_post(void *ap, int rc)
4300 {
4301 	struct vop_deleteextattr_args *a = ap;
4302 
4303 	if (!rc)
4304 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4305 }
4306 
4307 void
4308 vop_link_post(void *ap, int rc)
4309 {
4310 	struct vop_link_args *a = ap;
4311 
4312 	if (!rc) {
4313 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4314 		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4315 	}
4316 }
4317 
4318 void
4319 vop_mkdir_post(void *ap, int rc)
4320 {
4321 	struct vop_mkdir_args *a = ap;
4322 
4323 	if (!rc)
4324 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4325 }
4326 
4327 void
4328 vop_mknod_post(void *ap, int rc)
4329 {
4330 	struct vop_mknod_args *a = ap;
4331 
4332 	if (!rc)
4333 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4334 }
4335 
4336 void
4337 vop_reclaim_post(void *ap, int rc)
4338 {
4339 	struct vop_reclaim_args *a = ap;
4340 
4341 	if (!rc)
4342 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
4343 }
4344 
4345 void
4346 vop_remove_post(void *ap, int rc)
4347 {
4348 	struct vop_remove_args *a = ap;
4349 
4350 	if (!rc) {
4351 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4352 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4353 	}
4354 }
4355 
4356 void
4357 vop_rename_post(void *ap, int rc)
4358 {
4359 	struct vop_rename_args *a = ap;
4360 
4361 	if (!rc) {
4362 		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4363 		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4364 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4365 		if (a->a_tvp)
4366 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4367 	}
4368 	if (a->a_tdvp != a->a_fdvp)
4369 		vdrop(a->a_fdvp);
4370 	if (a->a_tvp != a->a_fvp)
4371 		vdrop(a->a_fvp);
4372 	vdrop(a->a_tdvp);
4373 	if (a->a_tvp)
4374 		vdrop(a->a_tvp);
4375 }
4376 
4377 void
4378 vop_rmdir_post(void *ap, int rc)
4379 {
4380 	struct vop_rmdir_args *a = ap;
4381 
4382 	if (!rc) {
4383 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4384 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4385 	}
4386 }
4387 
4388 void
4389 vop_setattr_post(void *ap, int rc)
4390 {
4391 	struct vop_setattr_args *a = ap;
4392 
4393 	if (!rc)
4394 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4395 }
4396 
4397 void
4398 vop_setextattr_post(void *ap, int rc)
4399 {
4400 	struct vop_setextattr_args *a = ap;
4401 
4402 	if (!rc)
4403 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4404 }
4405 
4406 void
4407 vop_symlink_post(void *ap, int rc)
4408 {
4409 	struct vop_symlink_args *a = ap;
4410 
4411 	if (!rc)
4412 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4413 }
4414 
4415 static struct knlist fs_knlist;
4416 
4417 static void
4418 vfs_event_init(void *arg)
4419 {
4420 	knlist_init_mtx(&fs_knlist, NULL);
4421 }
4422 /* XXX - correct order? */
4423 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4424 
4425 void
4426 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4427 {
4428 
4429 	KNOTE_UNLOCKED(&fs_knlist, event);
4430 }
4431 
4432 static int	filt_fsattach(struct knote *kn);
4433 static void	filt_fsdetach(struct knote *kn);
4434 static int	filt_fsevent(struct knote *kn, long hint);
4435 
4436 struct filterops fs_filtops = {
4437 	.f_isfd = 0,
4438 	.f_attach = filt_fsattach,
4439 	.f_detach = filt_fsdetach,
4440 	.f_event = filt_fsevent
4441 };
4442 
4443 static int
4444 filt_fsattach(struct knote *kn)
4445 {
4446 
4447 	kn->kn_flags |= EV_CLEAR;
4448 	knlist_add(&fs_knlist, kn, 0);
4449 	return (0);
4450 }
4451 
4452 static void
4453 filt_fsdetach(struct knote *kn)
4454 {
4455 
4456 	knlist_remove(&fs_knlist, kn, 0);
4457 }
4458 
4459 static int
4460 filt_fsevent(struct knote *kn, long hint)
4461 {
4462 
4463 	kn->kn_fflags |= hint;
4464 	return (kn->kn_fflags != 0);
4465 }
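
/*
 * Example (illustrative userland sketch, not part of this file): the
 * fs_knlist above backs the EVFILT_FS kevent filter, so a process can
 * watch for mount-related activity roughly as follows.  The fflags
 * delivered are whatever vfs_event_signal() callers pass in (e.g.
 * VQ_MOUNT and VQ_UNMOUNT from sys/mount.h).  Function name and the
 * minimal error handling are assumptions of the sketch.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <sys/mount.h>

static int
watch_mount_events(void)
{
	struct kevent kev;
	int kq;

	if ((kq = kqueue()) == -1)
		return (-1);
	EV_SET(&kev, 0, EVFILT_FS, EV_ADD | EV_CLEAR, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)	/* register */
		return (-1);
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)	/* wait */
		return (-1);
	/* kev.fflags now holds the accumulated VQ_* bits. */
	return (kev.fflags);
}
#endif /* example */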
4466 
4467 static int
4468 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4469 {
4470 	struct vfsidctl vc;
4471 	int error;
4472 	struct mount *mp;
4473 
4474 	error = SYSCTL_IN(req, &vc, sizeof(vc));
4475 	if (error)
4476 		return (error);
4477 	if (vc.vc_vers != VFS_CTL_VERS1)
4478 		return (EINVAL);
4479 	mp = vfs_getvfs(&vc.vc_fsid);
4480 	if (mp == NULL)
4481 		return (ENOENT);
4482 	/* ensure that a specific sysctl goes to the right filesystem. */
4483 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4484 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4485 		vfs_rel(mp);
4486 		return (EINVAL);
4487 	}
4488 	VCTLTOREQ(&vc, req);
4489 	error = VFS_SYSCTL(mp, vc.vc_op, req);
4490 	vfs_rel(mp);
4491 	return (error);
4492 }
4493 
4494 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4495     NULL, 0, sysctl_vfs_ctl, "",
4496     "Sysctl by fsid");
4497 
4498 /*
4499  * Function to initialize a va_filerev field sensibly.
4500  * XXX: Wouldn't a random number make a lot more sense ??
4501  */
4502 u_quad_t
4503 init_va_filerev(void)
4504 {
4505 	struct bintime bt;
4506 
4507 	getbinuptime(&bt);
4508 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4509 }
4510 
4511 static int	filt_vfsread(struct knote *kn, long hint);
4512 static int	filt_vfswrite(struct knote *kn, long hint);
4513 static int	filt_vfsvnode(struct knote *kn, long hint);
4514 static void	filt_vfsdetach(struct knote *kn);
4515 static struct filterops vfsread_filtops = {
4516 	.f_isfd = 1,
4517 	.f_detach = filt_vfsdetach,
4518 	.f_event = filt_vfsread
4519 };
4520 static struct filterops vfswrite_filtops = {
4521 	.f_isfd = 1,
4522 	.f_detach = filt_vfsdetach,
4523 	.f_event = filt_vfswrite
4524 };
4525 static struct filterops vfsvnode_filtops = {
4526 	.f_isfd = 1,
4527 	.f_detach = filt_vfsdetach,
4528 	.f_event = filt_vfsvnode
4529 };
4530 
4531 static void
4532 vfs_knllock(void *arg)
4533 {
4534 	struct vnode *vp = arg;
4535 
4536 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4537 }
4538 
4539 static void
4540 vfs_knlunlock(void *arg)
4541 {
4542 	struct vnode *vp = arg;
4543 
4544 	VOP_UNLOCK(vp, 0);
4545 }
4546 
4547 static void
4548 vfs_knl_assert_locked(void *arg)
4549 {
4550 #ifdef DEBUG_VFS_LOCKS
4551 	struct vnode *vp = arg;
4552 
4553 	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4554 #endif
4555 }
4556 
4557 static void
4558 vfs_knl_assert_unlocked(void *arg)
4559 {
4560 #ifdef DEBUG_VFS_LOCKS
4561 	struct vnode *vp = arg;
4562 
4563 	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4564 #endif
4565 }
4566 
4567 int
4568 vfs_kqfilter(struct vop_kqfilter_args *ap)
4569 {
4570 	struct vnode *vp = ap->a_vp;
4571 	struct knote *kn = ap->a_kn;
4572 	struct knlist *knl;
4573 
4574 	switch (kn->kn_filter) {
4575 	case EVFILT_READ:
4576 		kn->kn_fop = &vfsread_filtops;
4577 		break;
4578 	case EVFILT_WRITE:
4579 		kn->kn_fop = &vfswrite_filtops;
4580 		break;
4581 	case EVFILT_VNODE:
4582 		kn->kn_fop = &vfsvnode_filtops;
4583 		break;
4584 	default:
4585 		return (EINVAL);
4586 	}
4587 
4588 	kn->kn_hook = (caddr_t)vp;
4589 
4590 	v_addpollinfo(vp);
4591 	if (vp->v_pollinfo == NULL)
4592 		return (ENOMEM);
4593 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4594 	vhold(vp);
4595 	knlist_add(knl, kn, 0);
4596 
4597 	return (0);
4598 }
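
/*
 * Example (illustrative userland sketch, not part of this file): the
 * knote registered by vfs_kqfilter() is what ultimately services an
 * EVFILT_VNODE kevent such as the one below, which watches an open
 * file for deletion, writes and attribute changes.  The function name
 * is hypothetical and error handling is omitted for brevity.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>

static void
watch_file(const char *path)
{
	struct kevent kev;
	int fd, kq;

	fd = open(path, O_RDONLY);
	kq = kqueue();
	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	    NOTE_DELETE | NOTE_WRITE | NOTE_ATTRIB, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);	/* register */
	(void)kevent(kq, NULL, 0, &kev, 1, NULL);	/* block until an event */
	/* kev.fflags now holds the NOTE_* bits that fired. */
}
#endif /* example */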
4599 
4600 /*
4601  * Detach knote from vnode
4602  */
4603 static void
4604 filt_vfsdetach(struct knote *kn)
4605 {
4606 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4607 
4608 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4609 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4610 	vdrop(vp);
4611 }
4612 
4613 /*ARGSUSED*/
4614 static int
4615 filt_vfsread(struct knote *kn, long hint)
4616 {
4617 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4618 	struct vattr va;
4619 	int res;
4620 
4621 	/*
4622 	 * filesystem is gone, so set the EOF flag and schedule
4623 	 * the knote for deletion.
4624 	 */
4625 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
4626 		VI_LOCK(vp);
4627 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4628 		VI_UNLOCK(vp);
4629 		return (1);
4630 	}
4631 
4632 	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4633 		return (0);
4634 
4635 	VI_LOCK(vp);
4636 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4637 	res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
4638 	VI_UNLOCK(vp);
4639 	return (res);
4640 }
4641 
4642 /*ARGSUSED*/
4643 static int
4644 filt_vfswrite(struct knote *kn, long hint)
4645 {
4646 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4647 
4648 	VI_LOCK(vp);
4649 
4650 	/*
4651 	 * filesystem is gone, so set the EOF flag and schedule
4652 	 * the knote for deletion.
4653 	 */
4654 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
4655 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4656 
4657 	kn->kn_data = 0;
4658 	VI_UNLOCK(vp);
4659 	return (1);
4660 }
4661 
4662 static int
4663 filt_vfsvnode(struct knote *kn, long hint)
4664 {
4665 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4666 	int res;
4667 
4668 	VI_LOCK(vp);
4669 	if (kn->kn_sfflags & hint)
4670 		kn->kn_fflags |= hint;
4671 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
4672 		kn->kn_flags |= EV_EOF;
4673 		VI_UNLOCK(vp);
4674 		return (1);
4675 	}
4676 	res = (kn->kn_fflags != 0);
4677 	VI_UNLOCK(vp);
4678 	return (res);
4679 }
4680 
4681 int
4682 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4683 {
4684 	int error;
4685 
4686 	if (dp->d_reclen > ap->a_uio->uio_resid)
4687 		return (ENAMETOOLONG);
4688 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4689 	if (error) {
4690 		if (ap->a_ncookies != NULL) {
4691 			if (ap->a_cookies != NULL)
4692 				free(ap->a_cookies, M_TEMP);
4693 			ap->a_cookies = NULL;
4694 			*ap->a_ncookies = 0;
4695 		}
4696 		return (error);
4697 	}
4698 	if (ap->a_ncookies == NULL)
4699 		return (0);
4700 
4701 	KASSERT(ap->a_cookies,
4702 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4703 
4704 	*ap->a_cookies = realloc(*ap->a_cookies,
4705 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4706 	(*ap->a_cookies)[*ap->a_ncookies] = off;
4707 	return (0);
4708 }
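
/*
 * Example (illustrative sketch, not part of this file): a VOP_READDIR
 * loop typically fills in a struct dirent per entry and lets
 * vfs_read_dirent() copy it out and record the seek cookie.  The
 * "node" fields and "next_offset" below are hypothetical.
 */
#if 0
	struct dirent de;

	bzero(&de, sizeof(de));
	de.d_fileno = node->n_ino;
	de.d_type = DT_REG;
	de.d_namlen = strlen(node->n_name);
	strlcpy(de.d_name, node->n_name, sizeof(de.d_name));
	de.d_reclen = GENERIC_DIRSIZ(&de);
	/* A non-zero return means the uio buffer is full; stop the loop. */
	if (vfs_read_dirent(ap, &de, next_offset) != 0)
		break;
#endif /* example */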
4709 
4710 /*
4711  * Mark for update the access time of the file if the filesystem
4712  * supports VOP_MARKATIME.  This functionality is used by execve and
4713  * mmap, so we want to avoid the I/O implied by directly setting
4714  * va_atime for the sake of efficiency.
4715  */
4716 void
4717 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4718 {
4719 	struct mount *mp;
4720 
4721 	mp = vp->v_mount;
4722 	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4723 	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4724 		(void)VOP_MARKATIME(vp);
4725 }
4726 
4727 /*
4728  * The purpose of this routine is to remove granularity from accmode_t,
4729  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4730  * VADMIN and VAPPEND.
4731  *
4732  * If it returns 0, the caller is supposed to continue with the usual
4733  * access checks using 'accmode' as modified by this routine.  If it
4734  * returns nonzero value, the caller is supposed to return that value
4735  * as errno.
4736  *
4737  * Note that after this routine runs, accmode may be zero.
4738  */
4739 int
4740 vfs_unixify_accmode(accmode_t *accmode)
4741 {
4742 	/*
4743 	 * There is no way to specify explicit "deny" rule using
4744 	 * file mode or POSIX.1e ACLs.
4745 	 */
4746 	if (*accmode & VEXPLICIT_DENY) {
4747 		*accmode = 0;
4748 		return (0);
4749 	}
4750 
4751 	/*
4752 	 * None of these can be translated into usual access bits.
4753 	 * Also, the common case for NFSv4 ACLs is to not contain
4754 	 * either of these bits. Caller should check for VWRITE
4755 	 * on the containing directory instead.
4756 	 */
4757 	if (*accmode & (VDELETE_CHILD | VDELETE))
4758 		return (EPERM);
4759 
4760 	if (*accmode & VADMIN_PERMS) {
4761 		*accmode &= ~VADMIN_PERMS;
4762 		*accmode |= VADMIN;
4763 	}
4764 
4765 	/*
4766 	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4767 	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4768 	 */
4769 	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4770 
4771 	return (0);
4772 }
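
/*
 * Example (illustrative sketch, not part of this file): an accessx-style
 * caller without native NFSv4 ACL support might reduce the request
 * before falling back to vaccess(), along these lines.  The node_mode,
 * node_uid and node_gid variables are hypothetical per-filesystem data.
 */
#if 0
	accmode_t accmode = ap->a_accmode;
	int error;

	error = vfs_unixify_accmode(&accmode);
	if (error != 0)
		return (error);
	if (accmode == 0)
		return (0);
	/* Continue with the usual owner/group/other checks. */
	return (vaccess(vp->v_type, node_mode, node_uid, node_gid,
	    accmode, ap->a_cred, NULL));
#endif /* example */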
4773 
4774 /*
4775  * These are helper functions for filesystems to traverse all
4776  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4777  *
4778  * This interface replaces MNT_VNODE_FOREACH.
4779  */
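
/*
 * Example (illustrative sketch, not part of this file): a filesystem
 * walks its vnodes with the macro roughly as follows, where "mp" is the
 * mount point being traversed.  Each vnode is returned with its
 * interlock held, so uninteresting ones must be VI_UNLOCK()ed before
 * continuing.
 */
#if 0
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		if (vp->v_type != VREG) {
			VI_UNLOCK(vp);
			continue;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) != 0)
			continue;
		/* ... operate on the locked, referenced vnode ... */
		vput(vp);
	}
#endif /* example */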
4780 
4781 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4782 
4783 struct vnode *
4784 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4785 {
4786 	struct vnode *vp;
4787 
4788 	if (should_yield())
4789 		kern_yield(PRI_USER);
4790 	MNT_ILOCK(mp);
4791 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4792 	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4793 	while (vp != NULL && (vp->v_type == VMARKER ||
4794 	    (vp->v_iflag & VI_DOOMED) != 0))
4795 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4796 
4797 	/* Check if we are done */
4798 	if (vp == NULL) {
4799 		__mnt_vnode_markerfree_all(mvp, mp);
4800 		/* MNT_IUNLOCK(mp); -- done in above function */
4801 		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4802 		return (NULL);
4803 	}
4804 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4805 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4806 	VI_LOCK(vp);
4807 	MNT_IUNLOCK(mp);
4808 	return (vp);
4809 }
4810 
4811 struct vnode *
4812 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4813 {
4814 	struct vnode *vp;
4815 
4816 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4817 	MNT_ILOCK(mp);
4818 	MNT_REF(mp);
4819 	(*mvp)->v_type = VMARKER;
4820 
4821 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4822 	while (vp != NULL && (vp->v_type == VMARKER ||
4823 	    (vp->v_iflag & VI_DOOMED) != 0))
4824 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4825 
4826 	/* Check if we are done */
4827 	if (vp == NULL) {
4828 		MNT_REL(mp);
4829 		MNT_IUNLOCK(mp);
4830 		free(*mvp, M_VNODE_MARKER);
4831 		*mvp = NULL;
4832 		return (NULL);
4833 	}
4834 	(*mvp)->v_mount = mp;
4835 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4836 	VI_LOCK(vp);
4837 	MNT_IUNLOCK(mp);
4838 	return (vp);
4839 }
4840 
4841 
4842 void
4843 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4844 {
4845 
4846 	if (*mvp == NULL) {
4847 		MNT_IUNLOCK(mp);
4848 		return;
4849 	}
4850 
4851 	mtx_assert(MNT_MTX(mp), MA_OWNED);
4852 
4853 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4854 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4855 	MNT_REL(mp);
4856 	MNT_IUNLOCK(mp);
4857 	free(*mvp, M_VNODE_MARKER);
4858 	*mvp = NULL;
4859 }
4860 
4861 /*
4862  * These are helper functions for filesystems to traverse their
4863  * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
4864  */
4865 static void
4866 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4867 {
4868 
4869 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4870 
4871 	MNT_ILOCK(mp);
4872 	MNT_REL(mp);
4873 	MNT_IUNLOCK(mp);
4874 	free(*mvp, M_VNODE_MARKER);
4875 	*mvp = NULL;
4876 }
4877 
4878 static struct vnode *
4879 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4880 {
4881 	struct vnode *vp, *nvp;
4882 
4883 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
4884 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4885 restart:
4886 	vp = TAILQ_NEXT(*mvp, v_actfreelist);
4887 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4888 	while (vp != NULL) {
4889 		if (vp->v_type == VMARKER) {
4890 			vp = TAILQ_NEXT(vp, v_actfreelist);
4891 			continue;
4892 		}
4893 		if (!VI_TRYLOCK(vp)) {
4894 			if (mp_ncpus == 1 || should_yield()) {
4895 				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4896 				mtx_unlock(&vnode_free_list_mtx);
4897 				pause("vnacti", 1);
4898 				mtx_lock(&vnode_free_list_mtx);
4899 				goto restart;
4900 			}
4901 			continue;
4902 		}
4903 		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
4904 		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
4905 		    ("alien vnode on the active list %p %p", vp, mp));
4906 		if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
4907 			break;
4908 		nvp = TAILQ_NEXT(vp, v_actfreelist);
4909 		VI_UNLOCK(vp);
4910 		vp = nvp;
4911 	}
4912 
4913 	/* Check if we are done */
4914 	if (vp == NULL) {
4915 		mtx_unlock(&vnode_free_list_mtx);
4916 		mnt_vnode_markerfree_active(mvp, mp);
4917 		return (NULL);
4918 	}
4919 	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
4920 	mtx_unlock(&vnode_free_list_mtx);
4921 	ASSERT_VI_LOCKED(vp, "active iter");
4922 	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
4923 	return (vp);
4924 }
4925 
4926 struct vnode *
4927 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4928 {
4929 
4930 	if (should_yield())
4931 		kern_yield(PRI_USER);
4932 	mtx_lock(&vnode_free_list_mtx);
4933 	return (mnt_vnode_next_active(mvp, mp));
4934 }
4935 
4936 struct vnode *
4937 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
4938 {
4939 	struct vnode *vp;
4940 
4941 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4942 	MNT_ILOCK(mp);
4943 	MNT_REF(mp);
4944 	MNT_IUNLOCK(mp);
4945 	(*mvp)->v_type = VMARKER;
4946 	(*mvp)->v_mount = mp;
4947 
4948 	mtx_lock(&vnode_free_list_mtx);
4949 	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
4950 	if (vp == NULL) {
4951 		mtx_unlock(&vnode_free_list_mtx);
4952 		mnt_vnode_markerfree_active(mvp, mp);
4953 		return (NULL);
4954 	}
4955 	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4956 	return (mnt_vnode_next_active(mvp, mp));
4957 }
4958 
4959 void
4960 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4961 {
4962 
4963 	if (*mvp == NULL)
4964 		return;
4965 
4966 	mtx_lock(&vnode_free_list_mtx);
4967 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4968 	mtx_unlock(&vnode_free_list_mtx);
4969 	mnt_vnode_markerfree_active(mvp, mp);
4970 }
4971