xref: /freebsd/sys/kern/vfs_subr.c (revision 39ee7a7a6bdd1557b1c3532abf60d139798ac88b)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35  */
36 
37 /*
38  * External virtual filesystem routines
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_compat.h"
45 #include "opt_ddb.h"
46 #include "opt_watchdog.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/bio.h>
51 #include <sys/buf.h>
52 #include <sys/condvar.h>
53 #include <sys/conf.h>
54 #include <sys/dirent.h>
55 #include <sys/event.h>
56 #include <sys/eventhandler.h>
57 #include <sys/extattr.h>
58 #include <sys/file.h>
59 #include <sys/fcntl.h>
60 #include <sys/jail.h>
61 #include <sys/kdb.h>
62 #include <sys/kernel.h>
63 #include <sys/kthread.h>
64 #include <sys/lockf.h>
65 #include <sys/malloc.h>
66 #include <sys/mount.h>
67 #include <sys/namei.h>
68 #include <sys/pctrie.h>
69 #include <sys/priv.h>
70 #include <sys/reboot.h>
71 #include <sys/refcount.h>
72 #include <sys/rwlock.h>
73 #include <sys/sched.h>
74 #include <sys/sleepqueue.h>
75 #include <sys/smp.h>
76 #include <sys/stat.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/vmmeter.h>
80 #include <sys/vnode.h>
81 #include <sys/watchdog.h>
82 
83 #include <machine/stdarg.h>
84 
85 #include <security/mac/mac_framework.h>
86 
87 #include <vm/vm.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_extern.h>
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_kern.h>
94 #include <vm/uma.h>
95 
96 #ifdef DDB
97 #include <ddb/ddb.h>
98 #endif
99 
100 static void	delmntque(struct vnode *vp);
101 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
102 		    int slpflag, int slptimeo);
103 static void	syncer_shutdown(void *arg, int howto);
104 static int	vtryrecycle(struct vnode *vp);
105 static void	v_init_counters(struct vnode *);
106 static void	v_incr_usecount(struct vnode *);
107 static void	v_incr_devcount(struct vnode *);
108 static void	v_decr_devcount(struct vnode *);
109 static void	vnlru_free(int);
110 static void	vgonel(struct vnode *);
111 static void	vfs_knllock(void *arg);
112 static void	vfs_knlunlock(void *arg);
113 static void	vfs_knl_assert_locked(void *arg);
114 static void	vfs_knl_assert_unlocked(void *arg);
115 static void	destroy_vpollinfo(struct vpollinfo *vi);
116 
117 /*
118  * Number of vnodes in existence.  Increased whenever getnewvnode()
119  * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
120  */
121 static unsigned long	numvnodes;
122 
123 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
124     "Number of vnodes in existence");
125 
126 static u_long vnodes_created;
127 SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
128     0, "Number of vnodes created by getnewvnode");
129 
130 /*
131  * Conversion tables for conversion from vnode types to inode formats
132  * and back.
133  */
134 enum vtype iftovt_tab[16] = {
135 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
136 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
137 };
138 int vttoif_tab[10] = {
139 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
140 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
141 };
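/*
 * A worked example of how these tables are consumed (illustrative only):
 * the IFTOVT() and VTTOIF() macros in sys/vnode.h index them, so a
 * directory mode maps to VDIR because (S_IFDIR & S_IFMT) >> 12 == 4 and
 * iftovt_tab[4] == VDIR, while VTTOIF(VDIR) == vttoif_tab[2] == S_IFDIR,
 * making the conversion round-trip for the ordinary file types.
 */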
142 
143 /*
144  * List of vnodes that are ready for recycling.
145  */
146 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
147 
148 /*
149  * Free vnode target.  Free vnodes may simply be files which have been stat'd
150  * but not read.  This is somewhat common, and a small cache of such files
151  * should be kept to avoid recreation costs.
152  */
153 static u_long wantfreevnodes;
154 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
155 /* Number of vnodes in the free list. */
156 static u_long freevnodes;
157 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
158     "Number of vnodes in the free list");
159 
160 static int vlru_allow_cache_src;
161 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
162     &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
163 
164 static u_long recycles_count;
165 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
166     "Number of vnodes recycled to avoid exceeding kern.maxvnodes");
167 
168 /*
169  * Various variables used for debugging the new implementation of
170  * reassignbuf().
171  * XXX these are probably of (very) limited utility now.
172  */
173 static int reassignbufcalls;
174 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
175     "Number of calls to reassignbuf");
176 
177 static u_long free_owe_inact;
178 SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0,
179     "Number of times free vnodes kept on active list due to VFS "
180     "owing inactivation");
181 
182 /*
183  * Cache for the mount type id assigned to NFS.  This is used for
184  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
185  */
186 int	nfs_mount_type = -1;
187 
188 /* To keep more than one thread at a time from running vfs_getnewfsid */
189 static struct mtx mntid_mtx;
190 
191 /*
192  * Lock for any access to the following:
193  *	vnode_free_list
194  *	numvnodes
195  *	freevnodes
196  */
197 static struct mtx vnode_free_list_mtx;
198 
199 /* Publicly exported FS */
200 struct nfs_public nfs_pub;
201 
202 static uma_zone_t buf_trie_zone;
203 
204 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
205 static uma_zone_t vnode_zone;
206 static uma_zone_t vnodepoll_zone;
207 
208 /*
209  * The workitem queue.
210  *
211  * It is useful to delay writes of file data and filesystem metadata
212  * for tens of seconds so that quickly created and deleted files need
213  * not waste disk bandwidth being created and removed. To realize this,
214  * we append vnodes to a "workitem" queue. When running with a soft
215  * updates implementation, most pending metadata dependencies should
216  * not wait for more than a few seconds. Thus, metadata updates are
217  * delayed only about half the time that file data is delayed.
218  * Similarly, directory updates are more critical, so they are delayed
219  * only about a third of the time that file data is delayed. Thus, there are
220  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
221  * one each second (driven off the filesystem syncer process). The
222  * syncer_delayno variable indicates the next queue that is to be processed.
223  * Items that need to be processed soon are placed in this queue:
224  *
225  *	syncer_workitem_pending[syncer_delayno]
226  *
227  * A delay of fifteen seconds is done by placing the request fifteen
228  * entries later in the queue:
229  *
230  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
231  *
232  */
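/*
 * A concrete instance of the arithmetic above (a sketch assuming the
 * default SYNCER_MAXDELAY of 32, for which vntblinit() derives a
 * syncer_mask of 31): with syncer_delayno at 20, a fifteen second delay
 * selects slot (20 + 15) & 31 == 3, the queue the syncer's one-second
 * loop will reach fifteen iterations from now.
 */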
233 static int syncer_delayno;
234 static long syncer_mask;
235 LIST_HEAD(synclist, bufobj);
236 static struct synclist *syncer_workitem_pending;
237 /*
238  * The sync_mtx protects:
239  *	bo->bo_synclist
240  *	sync_vnode_count
241  *	syncer_delayno
242  *	syncer_state
243  *	syncer_workitem_pending
244  *	syncer_worklist_len
245  *	rushjob
246  */
247 static struct mtx sync_mtx;
248 static struct cv sync_wakeup;
249 
250 #define SYNCER_MAXDELAY		32
251 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
252 static int syncdelay = 30;		/* max time to delay syncing data */
253 static int filedelay = 30;		/* time to delay syncing files */
254 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
255     "Time to delay syncing files (in seconds)");
256 static int dirdelay = 29;		/* time to delay syncing directories */
257 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
258     "Time to delay syncing directories (in seconds)");
259 static int metadelay = 28;		/* time to delay syncing metadata */
260 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
261     "Time to delay syncing metadata (in seconds)");
262 static int rushjob;		/* number of slots to run ASAP */
263 static int stat_rush_requests;	/* number of times I/O speeded up */
264 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
265     "Number of times I/O speeded up (rush requests)");
266 
267 /*
268  * When shutting down the syncer, run it at four times normal speed.
269  */
270 #define SYNCER_SHUTDOWN_SPEEDUP		4
271 static int sync_vnode_count;
272 static int syncer_worklist_len;
273 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
274     syncer_state;
275 
276 /*
277  * Number of vnodes we want to exist at any one time.  This is mostly used
278  * to size hash tables in vnode-related code.  It is normally not used in
279  * getnewvnode(), as wantfreevnodes is normally nonzero.
280  *
281  * XXX desiredvnodes is historical cruft and should not exist.
282  */
283 int desiredvnodes;
284 
285 static int
286 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
287 {
288 	int error, old_desiredvnodes;
289 
290 	old_desiredvnodes = desiredvnodes;
291 	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
292 		return (error);
293 	if (old_desiredvnodes != desiredvnodes) {
294 		vfs_hash_changesize(desiredvnodes);
295 		cache_changesize(desiredvnodes);
296 	}
297 	return (0);
298 }
299 
300 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
301     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
302     sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
303 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
304     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
305 static int vnlru_nowhere;
306 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
307     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
308 
309 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
310 static int vnsz2log;
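/*
 * Worked example (illustrative; the real value depends on the kernel
 * configuration): if sizeof(struct vnode) falls anywhere in the
 * 256..511 byte range, the loop at the end of vntblinit() leaves
 * vnsz2log at 8, and getnewvnode() sets v_hash = (uintptr_t)vp >> 8,
 * so vnodes sitting one object apart in a UMA slab hash to distinct
 * values.
 */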
311 
312 /*
313  * Support for the bufobj clean & dirty pctrie.
314  */
315 static void *
316 buf_trie_alloc(struct pctrie *ptree)
317 {
318 
319 	return uma_zalloc(buf_trie_zone, M_NOWAIT);
320 }
321 
322 static void
323 buf_trie_free(struct pctrie *ptree, void *node)
324 {
325 
326 	uma_zfree(buf_trie_zone, node);
327 }
328 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
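/*
 * PCTRIE_DEFINE() generates the BUF_PCTRIE_INSERT(), BUF_PCTRIE_REMOVE(),
 * BUF_PCTRIE_LOOKUP() and BUF_PCTRIE_LOOKUP_LE() helpers, keyed on
 * b_lblkno, that buf_vlist_add(), buf_vlist_remove() and gbincore()
 * rely on further down in this file.
 */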
329 
330 /*
331  * Initialize the vnode management data structures.
332  *
333  * Reevaluate the following cap on the number of vnodes after the physical
334  * memory size exceeds 512GB.  In the limit, as the physical memory size
335  * grows, the ratio of physical pages to vnodes approaches sixteen to one.
336  */
337 #ifndef	MAXVNODES_MAX
338 #define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
339 #endif
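/*
 * Sanity check of the arithmetic above (assuming 4KB pages, which is not
 * true of every platform): 1GB / 4096 / 16 == 16384, so the default cap
 * is 512 * 16384 == 8388608 vnodes -- one vnode per sixteen pages of a
 * 512GB machine, matching the sixteen-to-one ratio described above.
 */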
340 static void
341 vntblinit(void *dummy __unused)
342 {
343 	u_int i;
344 	int physvnodes, virtvnodes;
345 
346 	/*
347 	 * Desiredvnodes is a function of the physical memory size and the
348 	 * kernel's heap size.  Generally speaking, it scales with the
349 	 * physical memory size.  The ratio of desiredvnodes to physical pages
350 	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
351 	 * marginal ratio of desiredvnodes to physical pages is one to
352 	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
353 	 * size.  The memory required by desiredvnodes vnodes and vm objects
354 	 * may not exceed one seventh of the kernel's heap size.
355 	 */
356 	physvnodes = maxproc + vm_cnt.v_page_count / 16 + 3 * min(98304 * 4,
357 	    vm_cnt.v_page_count) / 16;
358 	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
359 	    sizeof(struct vnode)));
360 	desiredvnodes = min(physvnodes, virtvnodes);
361 	if (desiredvnodes > MAXVNODES_MAX) {
362 		if (bootverbose)
363 			printf("Reducing kern.maxvnodes %d -> %d\n",
364 			    desiredvnodes, MAXVNODES_MAX);
365 		desiredvnodes = MAXVNODES_MAX;
366 	}
367 	wantfreevnodes = desiredvnodes / 4;
368 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
369 	TAILQ_INIT(&vnode_free_list);
370 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
371 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
372 	    NULL, NULL, UMA_ALIGN_PTR, 0);
373 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
374 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
375 	/*
376 	 * Preallocate enough nodes to support one per buf so that
377 	 * we cannot fail an insert.  reassignbuf() callers cannot
378 	 * tolerate insertion failure.
379 	 */
380 	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
381 	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
382 	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
383 	uma_prealloc(buf_trie_zone, nbuf);
384 	/*
385 	 * Initialize the filesystem syncer.
386 	 */
387 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
388 	    &syncer_mask);
389 	syncer_maxdelay = syncer_mask + 1;
390 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
391 	cv_init(&sync_wakeup, "syncer");
392 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
393 		vnsz2log++;
394 	vnsz2log--;
395 }
396 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
397 
398 
399 /*
400  * Mark a mount point as busy. Used to synchronize access and to delay
401  * unmounting. Note that mountlist_mtx is not released on failure.
402  *
403  * vfs_busy() is a custom lock; it can block the caller.
404  * vfs_busy() only sleeps if an unmount is active on the mount point.
405  * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
406  * lock of any vnode belonging to mp.
407  *
408  * Lookup uses vfs_busy() to traverse mount points.
409  * root fs			var fs
410  * / vnode lock		A	/ vnode lock (/var)		D
411  * /var vnode lock	B	/log vnode lock(/var/log)	E
412  * vfs_busy lock	C	vfs_busy lock			F
413  *
414  * Within each file system, the lock order is C->A->B and F->D->E.
415  *
416  * When traversing across mounts, the system follows that lock order:
417  *
418  *        C->A->B
419  *              |
420  *              +->F->D->E
421  *
422  * The lookup() process for namei("/var") illustrates the order:
423  *  VOP_LOOKUP() obtains B while A is held
424  *  vfs_busy() obtains a shared lock on F while A and B are held
425  *  vput() releases lock on B
426  *  vput() releases lock on A
427  *  VFS_ROOT() obtains lock on D while shared lock on F is held
428  *  vfs_unbusy() releases shared lock on F
429  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
430  *    Attempt to lock A (instead of vp_crossmp) while D is held would
431  *    violate the global order, causing deadlocks.
432  *
433  * dounmount() locks B while F is drained.
434  */
435 int
436 vfs_busy(struct mount *mp, int flags)
437 {
438 
439 	MPASS((flags & ~MBF_MASK) == 0);
440 	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
441 
442 	MNT_ILOCK(mp);
443 	MNT_REF(mp);
444 	/*
445 	 * If the mount point is currently being unmounted, sleep until the
446 	 * mount point's fate is decided.  If the thread doing the unmounting fails,
447 	 * it will clear the MNTK_UNMOUNT flag before waking us up, indicating
448 	 * that this mount point has survived the unmount attempt and vfs_busy
449 	 * should retry.  Otherwise the unmounter thread will set the MNTK_REFEXPIRE
450 	 * flag in addition to MNTK_UNMOUNT, indicating that the mount point is
451 	 * about to be really destroyed.  vfs_busy needs to release its
452 	 * reference on the mount point in this case and return with ENOENT,
453 	 * telling the caller that the mount point it tried to busy is no longer
454 	 * valid.
455 	 */
456 	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
457 		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
458 			MNT_REL(mp);
459 			MNT_IUNLOCK(mp);
460 			CTR1(KTR_VFS, "%s: failed busying before sleeping",
461 			    __func__);
462 			return (ENOENT);
463 		}
464 		if (flags & MBF_MNTLSTLOCK)
465 			mtx_unlock(&mountlist_mtx);
466 		mp->mnt_kern_flag |= MNTK_MWAIT;
467 		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
468 		if (flags & MBF_MNTLSTLOCK)
469 			mtx_lock(&mountlist_mtx);
470 		MNT_ILOCK(mp);
471 	}
472 	if (flags & MBF_MNTLSTLOCK)
473 		mtx_unlock(&mountlist_mtx);
474 	mp->mnt_lockref++;
475 	MNT_IUNLOCK(mp);
476 	return (0);
477 }
478 
479 /*
480  * Free a busy filesystem.
481  */
482 void
483 vfs_unbusy(struct mount *mp)
484 {
485 
486 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
487 	MNT_ILOCK(mp);
488 	MNT_REL(mp);
489 	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
490 	mp->mnt_lockref--;
491 	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
492 		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
493 		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
494 		mp->mnt_kern_flag &= ~MNTK_DRAINING;
495 		wakeup(&mp->mnt_lockref);
496 	}
497 	MNT_IUNLOCK(mp);
498 }
499 
500 /*
501  * Lookup a mount point by filesystem identifier.
502  */
503 struct mount *
504 vfs_getvfs(fsid_t *fsid)
505 {
506 	struct mount *mp;
507 
508 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
509 	mtx_lock(&mountlist_mtx);
510 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
511 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
512 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
513 			vfs_ref(mp);
514 			mtx_unlock(&mountlist_mtx);
515 			return (mp);
516 		}
517 	}
518 	mtx_unlock(&mountlist_mtx);
519 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
520 	return ((struct mount *) 0);
521 }
522 
523 /*
524  * Lookup a mount point by filesystem identifier, busying it before
525  * returning.
526  *
527  * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
528  * cache for popular filesystem identifiers.  The cache is lockless, using
529  * the fact that struct mounts are never freed.  In the worst case we may
530  * get a pointer to an unmounted or even a different filesystem, so we have
531  * to check what we got, and take the slow path if so.
532  */
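/*
 * For example (purely illustrative values): an fsid with val[0] ==
 * 0x12345678 and val[1] == 0x5 xors to 0x1234567d, and
 * ((0x1234567d >> 16) ^ 0x1234567d) & (FSID_CACHE_SIZE - 1) selects
 * cache slot 0x49, folding both halves of the identifier into the index.
 */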
533 struct mount *
534 vfs_busyfs(fsid_t *fsid)
535 {
536 #define	FSID_CACHE_SIZE	256
537 	typedef struct mount * volatile vmp_t;
538 	static vmp_t cache[FSID_CACHE_SIZE];
539 	struct mount *mp;
540 	int error;
541 	uint32_t hash;
542 
543 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
544 	hash = fsid->val[0] ^ fsid->val[1];
545 	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
546 	mp = cache[hash];
547 	if (mp == NULL ||
548 	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
549 	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
550 		goto slow;
551 	if (vfs_busy(mp, 0) != 0) {
552 		cache[hash] = NULL;
553 		goto slow;
554 	}
555 	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
556 	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
557 		return (mp);
558 	else
559 	    vfs_unbusy(mp);
560 
561 slow:
562 	mtx_lock(&mountlist_mtx);
563 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
564 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
565 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
566 			error = vfs_busy(mp, MBF_MNTLSTLOCK);
567 			if (error) {
568 				cache[hash] = NULL;
569 				mtx_unlock(&mountlist_mtx);
570 				return (NULL);
571 			}
572 			cache[hash] = mp;
573 			return (mp);
574 		}
575 	}
576 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
577 	mtx_unlock(&mountlist_mtx);
578 	return ((struct mount *) 0);
579 }
580 
581 /*
582  * Check if a user can access privileged mount options.
583  */
584 int
585 vfs_suser(struct mount *mp, struct thread *td)
586 {
587 	int error;
588 
589 	/*
590 	 * If the thread is jailed, but this is not a jail-friendly file
591 	 * system, deny immediately.
592 	 */
593 	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
594 		return (EPERM);
595 
596 	/*
597 	 * If the file system was mounted outside the jail of the calling
598 	 * thread, deny immediately.
599 	 */
600 	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
601 		return (EPERM);
602 
603 	/*
604 	 * If the file system supports delegated administration, we don't check
605 	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
606 	 * by the file system itself.
607 	 * If this is not the user that did the original mount, we check for
608 	 * the PRIV_VFS_MOUNT_OWNER privilege.
609 	 */
610 	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
611 	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
612 		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
613 			return (error);
614 	}
615 	return (0);
616 }
617 
618 /*
619  * Get a new unique fsid.  Try to make its val[0] unique, since this value
620  * will be used to create fake device numbers for stat().  Also try (but
621  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
622  * support 16-bit device numbers.  We end up with unique val[0]'s for the
623  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
624  *
625  * Keep in mind that several mounts may be running in parallel.  Starting
626  * the search one past where the previous search terminated is both a
627  * micro-optimization and a defense against returning the same fsid to
628  * different mounts.
629  */
630 void
631 vfs_getnewfsid(struct mount *mp)
632 {
633 	static uint16_t mntid_base;
634 	struct mount *nmp;
635 	fsid_t tfsid;
636 	int mtype;
637 
638 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
639 	mtx_lock(&mntid_mtx);
640 	mtype = mp->mnt_vfc->vfc_typenum;
641 	tfsid.val[1] = mtype;
642 	mtype = (mtype & 0xFF) << 24;
643 	for (;;) {
644 		tfsid.val[0] = makedev(255,
645 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
646 		mntid_base++;
647 		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
648 			break;
649 		vfs_rel(nmp);
650 	}
651 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
652 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
653 	mtx_unlock(&mntid_mtx);
654 }
655 
656 /*
657  * Knob to control the precision of file timestamps:
658  *
659  *   0 = seconds only; nanoseconds zeroed.
660  *   1 = seconds and nanoseconds, accurate within 1/HZ.
661  *   2 = seconds and nanoseconds, truncated to microseconds.
662  * >=3 = seconds and nanoseconds, maximum precision.
663  */
664 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
665 
666 static int timestamp_precision = TSP_USEC;
667 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
668     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
669     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
670     "3+: sec + ns (max. precision))");
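/*
 * The knob is an ordinary read-write sysctl, so, for example, running
 * "sysctl vfs.timestamp_precision=3" (or setting the equivalent line in
 * /etc/sysctl.conf) switches file timestamps to full nanotime()
 * precision at runtime.
 */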
671 
672 /*
673  * Get a current timestamp.
674  */
675 void
676 vfs_timestamp(struct timespec *tsp)
677 {
678 	struct timeval tv;
679 
680 	switch (timestamp_precision) {
681 	case TSP_SEC:
682 		tsp->tv_sec = time_second;
683 		tsp->tv_nsec = 0;
684 		break;
685 	case TSP_HZ:
686 		getnanotime(tsp);
687 		break;
688 	case TSP_USEC:
689 		microtime(&tv);
690 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
691 		break;
692 	case TSP_NSEC:
693 	default:
694 		nanotime(tsp);
695 		break;
696 	}
697 }
698 
699 /*
700  * Set vnode attributes to VNOVAL
701  */
702 void
703 vattr_null(struct vattr *vap)
704 {
705 
706 	vap->va_type = VNON;
707 	vap->va_size = VNOVAL;
708 	vap->va_bytes = VNOVAL;
709 	vap->va_mode = VNOVAL;
710 	vap->va_nlink = VNOVAL;
711 	vap->va_uid = VNOVAL;
712 	vap->va_gid = VNOVAL;
713 	vap->va_fsid = VNOVAL;
714 	vap->va_fileid = VNOVAL;
715 	vap->va_blocksize = VNOVAL;
716 	vap->va_rdev = VNOVAL;
717 	vap->va_atime.tv_sec = VNOVAL;
718 	vap->va_atime.tv_nsec = VNOVAL;
719 	vap->va_mtime.tv_sec = VNOVAL;
720 	vap->va_mtime.tv_nsec = VNOVAL;
721 	vap->va_ctime.tv_sec = VNOVAL;
722 	vap->va_ctime.tv_nsec = VNOVAL;
723 	vap->va_birthtime.tv_sec = VNOVAL;
724 	vap->va_birthtime.tv_nsec = VNOVAL;
725 	vap->va_flags = VNOVAL;
726 	vap->va_gen = VNOVAL;
727 	vap->va_vaflags = 0;
728 }
729 
730 /*
731  * This routine is called when we have too many vnodes.  It attempts
732  * to recycle vnodes from the mount point's vnode list and will
733  * potentially free vnodes that still have VM backing store (VM backing
734  * store is typically the cause of a vnode blowout, so we want to do
735  * this).  Therefore, this operation is not considered cheap.
736  *
737  * A number of conditions may prevent a vnode from being reclaimed:
738  * the buffer cache may have references on the vnode, a directory
739  * vnode may still have references due to the namei cache representing
740  * underlying files, or the vnode may be in active use.   It is not
741  * desirable to reuse such vnodes.  These conditions may cause the
742  * number of vnodes to reach some minimum value regardless of what
743  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
744  */
745 static int
746 vlrureclaim(struct mount *mp)
747 {
748 	struct vnode *vp;
749 	int done;
750 	int trigger;
751 	int usevnodes;
752 	int count;
753 
754 	/*
755 	 * Calculate the trigger point, don't allow user
756 	 * screwups to blow us up.   This prevents us from
757 	 * recycling vnodes with lots of resident pages.  We
758 	 * aren't trying to free memory, we are trying to
759 	 * free vnodes.
760 	 */
761 	usevnodes = desiredvnodes;
762 	if (usevnodes <= 0)
763 		usevnodes = 1;
764 	trigger = vm_cnt.v_page_count * 2 / usevnodes;
765 	done = 0;
766 	vn_start_write(NULL, &mp, V_WAIT);
767 	MNT_ILOCK(mp);
768 	count = mp->mnt_nvnodelistsize / 10 + 1;
769 	while (count != 0) {
770 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
771 		while (vp != NULL && vp->v_type == VMARKER)
772 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
773 		if (vp == NULL)
774 			break;
775 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
776 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
777 		--count;
778 		if (!VI_TRYLOCK(vp))
779 			goto next_iter;
780 		/*
781 		 * If it has been deconstructed already, is still
782 		 * referenced, or exceeds the trigger, skip it.
783 		 */
784 		if (vp->v_usecount ||
785 		    (!vlru_allow_cache_src &&
786 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
787 		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
788 		    vp->v_object->resident_page_count > trigger)) {
789 			VI_UNLOCK(vp);
790 			goto next_iter;
791 		}
792 		MNT_IUNLOCK(mp);
793 		vholdl(vp);
794 		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
795 			vdrop(vp);
796 			goto next_iter_mntunlocked;
797 		}
798 		VI_LOCK(vp);
799 		/*
800 		 * v_usecount may have been bumped after VOP_LOCK() dropped
801 		 * the vnode interlock and before it was locked again.
802 		 *
803 		 * It is not necessary to recheck VI_DOOMED because it can
804 		 * only be set by another thread that holds both the vnode
805 		 * lock and vnode interlock.  If another thread has the
806 		 * vnode lock before we get to VOP_LOCK() and obtains the
807 		 * vnode interlock after VOP_LOCK() drops the vnode
808 		 * interlock, the other thread will be unable to drop the
809 		 * vnode lock before our VOP_LOCK() call fails.
810 		 */
811 		if (vp->v_usecount ||
812 		    (!vlru_allow_cache_src &&
813 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
814 		    (vp->v_object != NULL &&
815 		    vp->v_object->resident_page_count > trigger)) {
816 			VOP_UNLOCK(vp, LK_INTERLOCK);
817 			vdrop(vp);
818 			goto next_iter_mntunlocked;
819 		}
820 		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
821 		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
822 		atomic_add_long(&recycles_count, 1);
823 		vgonel(vp);
824 		VOP_UNLOCK(vp, 0);
825 		vdropl(vp);
826 		done++;
827 next_iter_mntunlocked:
828 		if (!should_yield())
829 			goto relock_mnt;
830 		goto yield;
831 next_iter:
832 		if (!should_yield())
833 			continue;
834 		MNT_IUNLOCK(mp);
835 yield:
836 		kern_yield(PRI_USER);
837 relock_mnt:
838 		MNT_ILOCK(mp);
839 	}
840 	MNT_IUNLOCK(mp);
841 	vn_finished_write(mp);
842 	return done;
843 }
844 
845 /*
846  * Attempt to keep the free list at wantfreevnodes length.
847  */
848 static void
849 vnlru_free(int count)
850 {
851 	struct vnode *vp;
852 
853 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
854 	for (; count > 0; count--) {
855 		vp = TAILQ_FIRST(&vnode_free_list);
856 		/*
857 		 * The list can be modified while the free_list_mtx
858 		 * has been dropped and vp could be NULL here.
859 		 */
860 		if (!vp)
861 			break;
862 		VNASSERT(vp->v_op != NULL, vp,
863 		    ("vnlru_free: vnode already reclaimed."));
864 		KASSERT((vp->v_iflag & VI_FREE) != 0,
865 		    ("Removing vnode not on freelist"));
866 		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
867 		    ("Mangling active vnode"));
868 		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
869 		/*
870 		 * Don't recycle if we can't get the interlock.
871 		 */
872 		if (!VI_TRYLOCK(vp)) {
873 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
874 			continue;
875 		}
876 		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
877 		    vp, ("vp inconsistent on freelist"));
878 
879 		/*
880 		 * The clear of VI_FREE prevents activation of the
881 		 * vnode.  There is no sense in putting the vnode on
882 		 * the mount point active list, only to remove it
883 		 * later during recycling.  Inline the relevant part
884 		 * of vholdl(), to avoid triggering assertions or
885 		 * activating.
886 		 */
887 		freevnodes--;
888 		vp->v_iflag &= ~VI_FREE;
889 		refcount_acquire(&vp->v_holdcnt);
890 
891 		mtx_unlock(&vnode_free_list_mtx);
892 		VI_UNLOCK(vp);
893 		vtryrecycle(vp);
894 		/*
895 		 * If the recycle succeeded, this vdrop will actually free
896 		 * the vnode.  If not, it will simply place it back on
897 		 * the free list.
898 		 */
899 		vdrop(vp);
900 		mtx_lock(&vnode_free_list_mtx);
901 	}
902 }
903 /*
904  * Attempt to recycle vnodes in a context that is always safe to block.
905  * Calling vlrureclaim() from the bowels of filesystem code has some
906  * interesting deadlock problems.
907  */
908 static struct proc *vnlruproc;
909 static int vnlruproc_sig;
910 
911 static void
912 vnlru_proc(void)
913 {
914 	struct mount *mp, *nmp;
915 	int done;
916 	struct proc *p = vnlruproc;
917 
918 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
919 	    SHUTDOWN_PRI_FIRST);
920 
921 	for (;;) {
922 		kproc_suspend_check(p);
923 		mtx_lock(&vnode_free_list_mtx);
924 		if (freevnodes > wantfreevnodes)
925 			vnlru_free(freevnodes - wantfreevnodes);
926 		if (numvnodes <= desiredvnodes * 9 / 10) {
927 			vnlruproc_sig = 0;
928 			wakeup(&vnlruproc_sig);
929 			msleep(vnlruproc, &vnode_free_list_mtx,
930 			    PVFS|PDROP, "vlruwt", hz);
931 			continue;
932 		}
933 		mtx_unlock(&vnode_free_list_mtx);
934 		done = 0;
935 		mtx_lock(&mountlist_mtx);
936 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
937 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
938 				nmp = TAILQ_NEXT(mp, mnt_list);
939 				continue;
940 			}
941 			done += vlrureclaim(mp);
942 			mtx_lock(&mountlist_mtx);
943 			nmp = TAILQ_NEXT(mp, mnt_list);
944 			vfs_unbusy(mp);
945 		}
946 		mtx_unlock(&mountlist_mtx);
947 		if (done == 0) {
948 #if 0
949 			/* These messages are temporary debugging aids */
950 			if (vnlru_nowhere < 5)
951 				printf("vnlru process getting nowhere..\n");
952 			else if (vnlru_nowhere == 5)
953 				printf("vnlru process messages stopped.\n");
954 #endif
955 			vnlru_nowhere++;
956 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
957 		} else
958 			kern_yield(PRI_USER);
959 	}
960 }
961 
962 static struct kproc_desc vnlru_kp = {
963 	"vnlru",
964 	vnlru_proc,
965 	&vnlruproc
966 };
967 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
968     &vnlru_kp);
969 
970 /*
971  * Routines having to do with the management of the vnode table.
972  */
973 
974 /*
975  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
976  * before we actually vgone().  This function must be called with the vnode
977  * held to prevent the vnode from being returned to the free list midway
978  * through vgone().
979  */
980 static int
981 vtryrecycle(struct vnode *vp)
982 {
983 	struct mount *vnmp;
984 
985 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
986 	VNASSERT(vp->v_holdcnt, vp,
987 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
988 	/*
989 	 * This vnode may be found and locked via some other list; if so, we
990 	 * can't recycle it yet.
991 	 */
992 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
993 		CTR2(KTR_VFS,
994 		    "%s: impossible to recycle, vp %p lock is already held",
995 		    __func__, vp);
996 		return (EWOULDBLOCK);
997 	}
998 	/*
999 	 * Don't recycle if its filesystem is being suspended.
1000 	 */
1001 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
1002 		VOP_UNLOCK(vp, 0);
1003 		CTR2(KTR_VFS,
1004 		    "%s: impossible to recycle, cannot start the write for %p",
1005 		    __func__, vp);
1006 		return (EBUSY);
1007 	}
1008 	/*
1009 	 * If we got this far, we need to acquire the interlock and see if
1010 	 * anyone picked up this vnode from another list.  If not, we will
1011 	 * mark it with DOOMED via vgonel() so that anyone who does find it
1012 	 * will skip over it.
1013 	 */
1014 	VI_LOCK(vp);
1015 	if (vp->v_usecount) {
1016 		VOP_UNLOCK(vp, LK_INTERLOCK);
1017 		vn_finished_write(vnmp);
1018 		CTR2(KTR_VFS,
1019 		    "%s: impossible to recycle, %p is already referenced",
1020 		    __func__, vp);
1021 		return (EBUSY);
1022 	}
1023 	if ((vp->v_iflag & VI_DOOMED) == 0) {
1024 		atomic_add_long(&recycles_count, 1);
1025 		vgonel(vp);
1026 	}
1027 	VOP_UNLOCK(vp, LK_INTERLOCK);
1028 	vn_finished_write(vnmp);
1029 	return (0);
1030 }
1031 
1032 /*
1033  * Wait for available vnodes.
1034  */
1035 static int
1036 getnewvnode_wait(int suspended)
1037 {
1038 
1039 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
1040 	if (numvnodes > desiredvnodes) {
1041 		if (suspended) {
1042 			/*
1043 			 * The file system is being suspended; we cannot risk a
1044 			 * deadlock here, so allocate the new vnode anyway.
1045 			 */
1046 			if (freevnodes > wantfreevnodes)
1047 				vnlru_free(freevnodes - wantfreevnodes);
1048 			return (0);
1049 		}
1050 		if (vnlruproc_sig == 0) {
1051 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
1052 			wakeup(vnlruproc);
1053 		}
1054 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
1055 		    "vlruwk", hz);
1056 	}
1057 	return (numvnodes > desiredvnodes ? ENFILE : 0);
1058 }
1059 
1060 void
1061 getnewvnode_reserve(u_int count)
1062 {
1063 	struct thread *td;
1064 
1065 	td = curthread;
1066 	/* First try to be quick and racy. */
1067 	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
1068 		td->td_vp_reserv += count;
1069 		return;
1070 	} else
1071 		atomic_subtract_long(&numvnodes, count);
1072 
1073 	mtx_lock(&vnode_free_list_mtx);
1074 	while (count > 0) {
1075 		if (getnewvnode_wait(0) == 0) {
1076 			count--;
1077 			td->td_vp_reserv++;
1078 			atomic_add_long(&numvnodes, 1);
1079 		}
1080 	}
1081 	mtx_unlock(&vnode_free_list_mtx);
1082 }
1083 
1084 void
1085 getnewvnode_drop_reserve(void)
1086 {
1087 	struct thread *td;
1088 
1089 	td = curthread;
1090 	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
1091 	td->td_vp_reserv = 0;
1092 }
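/*
 * A sketch of how the reservation pair above is intended to be used (the
 * filesystem tag and vop vector below are placeholders, not code from
 * this tree): a caller that must allocate a vnode while holding locks
 * under which getnewvnode_wait() must not sleep charges numvnodes up
 * front:
 *
 *	getnewvnode_reserve(1);
 *	... acquire locks ...
 *	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
 *	... release locks ...
 *	getnewvnode_drop_reserve();
 *
 * The later getnewvnode() call is then satisfied from td->td_vp_reserv
 * instead of blocking in getnewvnode_wait().
 */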
1093 
1094 /*
1095  * Return the next vnode from the free list.
1096  */
1097 int
1098 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
1099     struct vnode **vpp)
1100 {
1101 	struct vnode *vp;
1102 	struct bufobj *bo;
1103 	struct thread *td;
1104 	int error;
1105 
1106 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1107 	vp = NULL;
1108 	td = curthread;
1109 	if (td->td_vp_reserv > 0) {
1110 		td->td_vp_reserv -= 1;
1111 		goto alloc;
1112 	}
1113 	mtx_lock(&vnode_free_list_mtx);
1114 	/*
1115 	 * Lend our context to reclaim vnodes if they've exceeded the max.
1116 	 */
1117 	if (freevnodes > wantfreevnodes)
1118 		vnlru_free(1);
1119 	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1120 	    MNTK_SUSPEND));
1121 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1122 	if (error != 0) {
1123 		mtx_unlock(&vnode_free_list_mtx);
1124 		return (error);
1125 	}
1126 #endif
1127 	atomic_add_long(&numvnodes, 1);
1128 	mtx_unlock(&vnode_free_list_mtx);
1129 alloc:
1130 	atomic_add_long(&vnodes_created, 1);
1131 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
1132 	/*
1133 	 * Setup locks.
1134 	 */
1135 	vp->v_vnlock = &vp->v_lock;
1136 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
1137 	/*
1138 	 * By default, don't allow shared locks unless filesystems
1139 	 * opt-in.
1140 	 */
1141 	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
1142 	/*
1143 	 * Initialize bufobj.
1144 	 */
1145 	bo = &vp->v_bufobj;
1146 	bo->__bo_vnode = vp;
1147 	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
1148 	bo->bo_ops = &buf_ops_bio;
1149 	bo->bo_private = vp;
1150 	TAILQ_INIT(&bo->bo_clean.bv_hd);
1151 	TAILQ_INIT(&bo->bo_dirty.bv_hd);
1152 	/*
1153 	 * Initialize namecache.
1154 	 */
1155 	LIST_INIT(&vp->v_cache_src);
1156 	TAILQ_INIT(&vp->v_cache_dst);
1157 	/*
1158 	 * Finalize various vnode identity bits.
1159 	 */
1160 	vp->v_type = VNON;
1161 	vp->v_tag = tag;
1162 	vp->v_op = vops;
1163 	v_init_counters(vp);
1164 	vp->v_data = NULL;
1165 #ifdef MAC
1166 	mac_vnode_init(vp);
1167 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1168 		mac_vnode_associate_singlelabel(mp, vp);
1169 	else if (mp == NULL && vops != &dead_vnodeops)
1170 		printf("NULL mp in getnewvnode()\n");
1171 #endif
1172 	if (mp != NULL) {
1173 		bo->bo_bsize = mp->mnt_stat.f_iosize;
1174 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1175 			vp->v_vflag |= VV_NOKNOTE;
1176 	}
1177 	rangelock_init(&vp->v_rl);
1178 
1179 	/*
1180 	 * For the filesystems which do not use vfs_hash_insert(),
1181 	 * still initialize v_hash so that vfs_hash_index() remains useful.
1182 	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1183 	 * its own hashing.
1184 	 */
1185 	vp->v_hash = (uintptr_t)vp >> vnsz2log;
1186 
1187 	*vpp = vp;
1188 	return (0);
1189 }
1190 
1191 /*
1192  * Delete from old mount point vnode list, if on one.
1193  */
1194 static void
1195 delmntque(struct vnode *vp)
1196 {
1197 	struct mount *mp;
1198 	int active;
1199 
1200 	mp = vp->v_mount;
1201 	if (mp == NULL)
1202 		return;
1203 	MNT_ILOCK(mp);
1204 	VI_LOCK(vp);
1205 	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1206 	    ("Active vnode list size %d > Vnode list size %d",
1207 	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1208 	active = vp->v_iflag & VI_ACTIVE;
1209 	vp->v_iflag &= ~VI_ACTIVE;
1210 	if (active) {
1211 		mtx_lock(&vnode_free_list_mtx);
1212 		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1213 		mp->mnt_activevnodelistsize--;
1214 		mtx_unlock(&vnode_free_list_mtx);
1215 	}
1216 	vp->v_mount = NULL;
1217 	VI_UNLOCK(vp);
1218 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1219 		("bad mount point vnode list size"));
1220 	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1221 	mp->mnt_nvnodelistsize--;
1222 	MNT_REL(mp);
1223 	MNT_IUNLOCK(mp);
1224 }
1225 
1226 static void
1227 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1228 {
1229 
1230 	vp->v_data = NULL;
1231 	vp->v_op = &dead_vnodeops;
1232 	vgone(vp);
1233 	vput(vp);
1234 }
1235 
1236 /*
1237  * Insert into list of vnodes for the new mount point, if available.
1238  */
1239 int
1240 insmntque1(struct vnode *vp, struct mount *mp,
1241 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1242 {
1243 
1244 	KASSERT(vp->v_mount == NULL,
1245 		("insmntque: vnode already on per mount vnode list"));
1246 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1247 	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1248 
1249 	/*
1250 	 * We acquire the vnode interlock early to ensure that the
1251 	 * vnode cannot be recycled by another process releasing a
1252 	 * holdcnt on it before we get it on both the vnode list
1253 	 * and the active vnode list. The mount mutex protects only
1254 	 * manipulation of the vnode list and the vnode freelist
1255 	 * mutex protects only manipulation of the active vnode list.
1256 	 * Hence the need to hold the vnode interlock throughout.
1257 	 */
1258 	MNT_ILOCK(mp);
1259 	VI_LOCK(vp);
1260 	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1261 	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1262 	    mp->mnt_nvnodelistsize == 0)) &&
1263 	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1264 		VI_UNLOCK(vp);
1265 		MNT_IUNLOCK(mp);
1266 		if (dtr != NULL)
1267 			dtr(vp, dtr_arg);
1268 		return (EBUSY);
1269 	}
1270 	vp->v_mount = mp;
1271 	MNT_REF(mp);
1272 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1273 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1274 		("neg mount point vnode list size"));
1275 	mp->mnt_nvnodelistsize++;
1276 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1277 	    ("Activating already active vnode"));
1278 	vp->v_iflag |= VI_ACTIVE;
1279 	mtx_lock(&vnode_free_list_mtx);
1280 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1281 	mp->mnt_activevnodelistsize++;
1282 	mtx_unlock(&vnode_free_list_mtx);
1283 	VI_UNLOCK(vp);
1284 	MNT_IUNLOCK(mp);
1285 	return (0);
1286 }
1287 
1288 int
1289 insmntque(struct vnode *vp, struct mount *mp)
1290 {
1291 
1292 	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1293 }
1294 
1295 /*
1296  * Flush out and invalidate all buffers associated with a bufobj
1297  * Called with the underlying object locked.
1298  */
1299 int
1300 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1301 {
1302 	int error;
1303 
1304 	BO_LOCK(bo);
1305 	if (flags & V_SAVE) {
1306 		error = bufobj_wwait(bo, slpflag, slptimeo);
1307 		if (error) {
1308 			BO_UNLOCK(bo);
1309 			return (error);
1310 		}
1311 		if (bo->bo_dirty.bv_cnt > 0) {
1312 			BO_UNLOCK(bo);
1313 			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1314 				return (error);
1315 			/*
1316 			 * XXX We could save a lock/unlock if this was only
1317 			 * enabled under INVARIANTS
1318 			 */
1319 			BO_LOCK(bo);
1320 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1321 				panic("vinvalbuf: dirty bufs");
1322 		}
1323 	}
1324 	/*
1325 	 * If you alter this loop please notice that interlock is dropped and
1326 	 * reacquired in flushbuflist.  Special care is needed to ensure that
1327 	 * no race conditions occur from this.
1328 	 */
1329 	do {
1330 		error = flushbuflist(&bo->bo_clean,
1331 		    flags, bo, slpflag, slptimeo);
1332 		if (error == 0 && !(flags & V_CLEANONLY))
1333 			error = flushbuflist(&bo->bo_dirty,
1334 			    flags, bo, slpflag, slptimeo);
1335 		if (error != 0 && error != EAGAIN) {
1336 			BO_UNLOCK(bo);
1337 			return (error);
1338 		}
1339 	} while (error != 0);
1340 
1341 	/*
1342 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1343 	 * have write I/O in-progress but if there is a VM object then the
1344 	 * VM object can also have read-I/O in-progress.
1345 	 */
1346 	do {
1347 		bufobj_wwait(bo, 0, 0);
1348 		BO_UNLOCK(bo);
1349 		if (bo->bo_object != NULL) {
1350 			VM_OBJECT_WLOCK(bo->bo_object);
1351 			vm_object_pip_wait(bo->bo_object, "bovlbx");
1352 			VM_OBJECT_WUNLOCK(bo->bo_object);
1353 		}
1354 		BO_LOCK(bo);
1355 	} while (bo->bo_numoutput > 0);
1356 	BO_UNLOCK(bo);
1357 
1358 	/*
1359 	 * Destroy the copy in the VM cache, too.
1360 	 */
1361 	if (bo->bo_object != NULL &&
1362 	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
1363 		VM_OBJECT_WLOCK(bo->bo_object);
1364 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1365 		    OBJPR_CLEANONLY : 0);
1366 		VM_OBJECT_WUNLOCK(bo->bo_object);
1367 	}
1368 
1369 #ifdef INVARIANTS
1370 	BO_LOCK(bo);
1371 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
1372 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1373 		panic("vinvalbuf: flush failed");
1374 	BO_UNLOCK(bo);
1375 #endif
1376 	return (0);
1377 }
1378 
1379 /*
1380  * Flush out and invalidate all buffers associated with a vnode.
1381  * Called with the underlying object locked.
1382  */
1383 int
1384 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1385 {
1386 
1387 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1388 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1389 	if (vp->v_object != NULL && vp->v_object->handle != vp)
1390 		return (0);
1391 	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1392 }
1393 
1394 /*
1395  * Flush out buffers on the specified list.
1396  *
1397  */
1398 static int
1399 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1400     int slptimeo)
1401 {
1402 	struct buf *bp, *nbp;
1403 	int retval, error;
1404 	daddr_t lblkno;
1405 	b_xflags_t xflags;
1406 
1407 	ASSERT_BO_WLOCKED(bo);
1408 
1409 	retval = 0;
1410 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1411 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1412 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1413 			continue;
1414 		}
1415 		lblkno = 0;
1416 		xflags = 0;
1417 		if (nbp != NULL) {
1418 			lblkno = nbp->b_lblkno;
1419 			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
1420 		}
1421 		retval = EAGAIN;
1422 		error = BUF_TIMELOCK(bp,
1423 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
1424 		    "flushbuf", slpflag, slptimeo);
1425 		if (error) {
1426 			BO_LOCK(bo);
1427 			return (error != ENOLCK ? error : EAGAIN);
1428 		}
1429 		KASSERT(bp->b_bufobj == bo,
1430 		    ("bp %p wrong b_bufobj %p should be %p",
1431 		    bp, bp->b_bufobj, bo));
1432 		/*
1433 		 * XXX Since there are no node locks for NFS, I
1434 		 * believe there is a slight chance that a delayed
1435 		 * write will occur while sleeping just above, so
1436 		 * check for it.
1437 		 */
1438 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1439 		    (flags & V_SAVE)) {
1440 			bremfree(bp);
1441 			bp->b_flags |= B_ASYNC;
1442 			bwrite(bp);
1443 			BO_LOCK(bo);
1444 			return (EAGAIN);	/* XXX: why not loop ? */
1445 		}
1446 		bremfree(bp);
1447 		bp->b_flags |= (B_INVAL | B_RELBUF);
1448 		bp->b_flags &= ~B_ASYNC;
1449 		brelse(bp);
1450 		BO_LOCK(bo);
1451 		if (nbp != NULL &&
1452 		    (nbp->b_bufobj != bo ||
1453 		     nbp->b_lblkno != lblkno ||
1454 		     (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1455 			break;			/* nbp invalid */
1456 	}
1457 	return (retval);
1458 }
1459 
1460 /*
1461  * Truncate a file's buffer and pages to a specified length.  This
1462  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1463  * sync activity.
1464  */
1465 int
1466 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1467 {
1468 	struct buf *bp, *nbp;
1469 	int anyfreed;
1470 	int trunclbn;
1471 	struct bufobj *bo;
1472 
1473 	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1474 	    vp, cred, blksize, (uintmax_t)length);
1475 
1476 	/*
1477 	 * Round up to the *next* lbn.
1478 	 */
1479 	trunclbn = (length + blksize - 1) / blksize;
1480 
1481 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1482 restart:
1483 	bo = &vp->v_bufobj;
1484 	BO_LOCK(bo);
1485 	anyfreed = 1;
1486 	for (;anyfreed;) {
1487 		anyfreed = 0;
1488 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1489 			if (bp->b_lblkno < trunclbn)
1490 				continue;
1491 			if (BUF_LOCK(bp,
1492 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1493 			    BO_LOCKPTR(bo)) == ENOLCK)
1494 				goto restart;
1495 
1496 			bremfree(bp);
1497 			bp->b_flags |= (B_INVAL | B_RELBUF);
1498 			bp->b_flags &= ~B_ASYNC;
1499 			brelse(bp);
1500 			anyfreed = 1;
1501 
1502 			BO_LOCK(bo);
1503 			if (nbp != NULL &&
1504 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1505 			    (nbp->b_vp != vp) ||
1506 			    (nbp->b_flags & B_DELWRI))) {
1507 				BO_UNLOCK(bo);
1508 				goto restart;
1509 			}
1510 		}
1511 
1512 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1513 			if (bp->b_lblkno < trunclbn)
1514 				continue;
1515 			if (BUF_LOCK(bp,
1516 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1517 			    BO_LOCKPTR(bo)) == ENOLCK)
1518 				goto restart;
1519 			bremfree(bp);
1520 			bp->b_flags |= (B_INVAL | B_RELBUF);
1521 			bp->b_flags &= ~B_ASYNC;
1522 			brelse(bp);
1523 			anyfreed = 1;
1524 
1525 			BO_LOCK(bo);
1526 			if (nbp != NULL &&
1527 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1528 			    (nbp->b_vp != vp) ||
1529 			    (nbp->b_flags & B_DELWRI) == 0)) {
1530 				BO_UNLOCK(bo);
1531 				goto restart;
1532 			}
1533 		}
1534 	}
1535 
1536 	if (length > 0) {
1537 restartsync:
1538 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1539 			if (bp->b_lblkno > 0)
1540 				continue;
1541 			/*
1542 			 * Since we hold the vnode lock this should only
1543 			 * fail if we're racing with the buf daemon.
1544 			 */
1545 			if (BUF_LOCK(bp,
1546 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1547 			    BO_LOCKPTR(bo)) == ENOLCK) {
1548 				goto restart;
1549 			}
1550 			VNASSERT((bp->b_flags & B_DELWRI), vp,
1551 			    ("buf(%p) on dirty queue without DELWRI", bp));
1552 
1553 			bremfree(bp);
1554 			bawrite(bp);
1555 			BO_LOCK(bo);
1556 			goto restartsync;
1557 		}
1558 	}
1559 
1560 	bufobj_wwait(bo, 0, 0);
1561 	BO_UNLOCK(bo);
1562 	vnode_pager_setsize(vp, length);
1563 
1564 	return (0);
1565 }
1566 
1567 static void
1568 buf_vlist_remove(struct buf *bp)
1569 {
1570 	struct bufv *bv;
1571 
1572 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1573 	ASSERT_BO_WLOCKED(bp->b_bufobj);
1574 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1575 	    (BX_VNDIRTY|BX_VNCLEAN),
1576 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1577 	if (bp->b_xflags & BX_VNDIRTY)
1578 		bv = &bp->b_bufobj->bo_dirty;
1579 	else
1580 		bv = &bp->b_bufobj->bo_clean;
1581 	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
1582 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1583 	bv->bv_cnt--;
1584 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1585 }
1586 
1587 /*
1588  * Add the buffer to the sorted clean or dirty block list.
1589  *
1590  * NOTE: xflags is passed as a constant, optimizing this inline function!
1591  */
1592 static void
1593 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1594 {
1595 	struct bufv *bv;
1596 	struct buf *n;
1597 	int error;
1598 
1599 	ASSERT_BO_WLOCKED(bo);
1600 	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
1601 	    ("dead bo %p", bo));
1602 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1603 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1604 	bp->b_xflags |= xflags;
1605 	if (xflags & BX_VNDIRTY)
1606 		bv = &bo->bo_dirty;
1607 	else
1608 		bv = &bo->bo_clean;
1609 
1610 	/*
1611 	 * Keep the list ordered.  Optimize empty list insertion.  Assume
1612 	 * we tend to grow at the tail so lookup_le should usually be cheaper
1613 	 * than _ge.
1614 	 */
1615 	if (bv->bv_cnt == 0 ||
1616 	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
1617 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1618 	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
1619 		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
1620 	else
1621 		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
1622 	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
1623 	if (error)
1624 		panic("buf_vlist_add:  Preallocated nodes insufficient.");
1625 	bv->bv_cnt++;
1626 }
1627 
1628 /*
1629  * Look up a buffer using the buffer tries.
1630  */
1631 struct buf *
1632 gbincore(struct bufobj *bo, daddr_t lblkno)
1633 {
1634 	struct buf *bp;
1635 
1636 	ASSERT_BO_LOCKED(bo);
1637 	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
1638 	if (bp != NULL)
1639 		return (bp);
1640 	return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
1641 }
1642 
1643 /*
1644  * Associate a buffer with a vnode.
1645  */
1646 void
1647 bgetvp(struct vnode *vp, struct buf *bp)
1648 {
1649 	struct bufobj *bo;
1650 
1651 	bo = &vp->v_bufobj;
1652 	ASSERT_BO_WLOCKED(bo);
1653 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1654 
1655 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1656 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1657 	    ("bgetvp: bp already attached! %p", bp));
1658 
1659 	vhold(vp);
1660 	bp->b_vp = vp;
1661 	bp->b_bufobj = bo;
1662 	/*
1663 	 * Insert onto list for new vnode.
1664 	 */
1665 	buf_vlist_add(bp, bo, BX_VNCLEAN);
1666 }
1667 
1668 /*
1669  * Disassociate a buffer from a vnode.
1670  */
1671 void
1672 brelvp(struct buf *bp)
1673 {
1674 	struct bufobj *bo;
1675 	struct vnode *vp;
1676 
1677 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1678 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1679 
1680 	/*
1681 	 * Delete from old vnode list, if on one.
1682 	 */
1683 	vp = bp->b_vp;		/* XXX */
1684 	bo = bp->b_bufobj;
1685 	BO_LOCK(bo);
1686 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1687 		buf_vlist_remove(bp);
1688 	else
1689 		panic("brelvp: Buffer %p not on queue.", bp);
1690 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1691 		bo->bo_flag &= ~BO_ONWORKLST;
1692 		mtx_lock(&sync_mtx);
1693 		LIST_REMOVE(bo, bo_synclist);
1694 		syncer_worklist_len--;
1695 		mtx_unlock(&sync_mtx);
1696 	}
1697 	bp->b_vp = NULL;
1698 	bp->b_bufobj = NULL;
1699 	BO_UNLOCK(bo);
1700 	vdrop(vp);
1701 }
1702 
1703 /*
1704  * Add an item to the syncer work queue.
1705  */
1706 static void
1707 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1708 {
1709 	int slot;
1710 
1711 	ASSERT_BO_WLOCKED(bo);
1712 
1713 	mtx_lock(&sync_mtx);
1714 	if (bo->bo_flag & BO_ONWORKLST)
1715 		LIST_REMOVE(bo, bo_synclist);
1716 	else {
1717 		bo->bo_flag |= BO_ONWORKLST;
1718 		syncer_worklist_len++;
1719 	}
1720 
1721 	if (delay > syncer_maxdelay - 2)
1722 		delay = syncer_maxdelay - 2;
1723 	slot = (syncer_delayno + delay) & syncer_mask;
1724 
1725 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1726 	mtx_unlock(&sync_mtx);
1727 }
1728 
1729 static int
1730 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1731 {
1732 	int error, len;
1733 
1734 	mtx_lock(&sync_mtx);
1735 	len = syncer_worklist_len - sync_vnode_count;
1736 	mtx_unlock(&sync_mtx);
1737 	error = SYSCTL_OUT(req, &len, sizeof(len));
1738 	return (error);
1739 }
1740 
1741 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1742     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1743 
1744 static struct proc *updateproc;
1745 static void sched_sync(void);
1746 static struct kproc_desc up_kp = {
1747 	"syncer",
1748 	sched_sync,
1749 	&updateproc
1750 };
1751 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1752 
1753 static int
1754 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1755 {
1756 	struct vnode *vp;
1757 	struct mount *mp;
1758 
1759 	*bo = LIST_FIRST(slp);
1760 	if (*bo == NULL)
1761 		return (0);
1762 	vp = (*bo)->__bo_vnode;	/* XXX */
1763 	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1764 		return (1);
1765 	/*
1766 	 * We use vhold in case the vnode does not
1767 	 * successfully sync.  vhold prevents the vnode from
1768 	 * going away when we unlock the sync_mtx so that
1769 	 * we can acquire the vnode interlock.
1770 	 */
1771 	vholdl(vp);
1772 	mtx_unlock(&sync_mtx);
1773 	VI_UNLOCK(vp);
1774 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1775 		vdrop(vp);
1776 		mtx_lock(&sync_mtx);
1777 		return (*bo == LIST_FIRST(slp));
1778 	}
1779 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1780 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1781 	VOP_UNLOCK(vp, 0);
1782 	vn_finished_write(mp);
1783 	BO_LOCK(*bo);
1784 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1785 		/*
1786 		 * Put us back on the worklist.  The worklist
1787 		 * routine will remove us from our current
1788 		 * position and then add us back in at a later
1789 		 * position.
1790 		 */
1791 		vn_syncer_add_to_worklist(*bo, syncdelay);
1792 	}
1793 	BO_UNLOCK(*bo);
1794 	vdrop(vp);
1795 	mtx_lock(&sync_mtx);
1796 	return (0);
1797 }
1798 
1799 static int first_printf = 1;
1800 
1801 /*
1802  * System filesystem synchronizer daemon.
1803  */
1804 static void
1805 sched_sync(void)
1806 {
1807 	struct synclist *next, *slp;
1808 	struct bufobj *bo;
1809 	long starttime;
1810 	struct thread *td = curthread;
1811 	int last_work_seen;
1812 	int net_worklist_len;
1813 	int syncer_final_iter;
1814 	int error;
1815 
1816 	last_work_seen = 0;
1817 	syncer_final_iter = 0;
1818 	syncer_state = SYNCER_RUNNING;
1819 	starttime = time_uptime;
1820 	td->td_pflags |= TDP_NORUNNINGBUF;
1821 
1822 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1823 	    SHUTDOWN_PRI_LAST);
1824 
1825 	mtx_lock(&sync_mtx);
1826 	for (;;) {
1827 		if (syncer_state == SYNCER_FINAL_DELAY &&
1828 		    syncer_final_iter == 0) {
1829 			mtx_unlock(&sync_mtx);
1830 			kproc_suspend_check(td->td_proc);
1831 			mtx_lock(&sync_mtx);
1832 		}
1833 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1834 		if (syncer_state != SYNCER_RUNNING &&
1835 		    starttime != time_uptime) {
1836 			if (first_printf) {
1837 				printf("\nSyncing disks, vnodes remaining...");
1838 				first_printf = 0;
1839 			}
1840 			printf("%d ", net_worklist_len);
1841 		}
1842 		starttime = time_uptime;
1843 
1844 		/*
1845 		 * Push files whose dirty time has expired.  Be careful
1846 		 * of interrupt race on slp queue.
1847 		 *
1848 		 * Skip over empty worklist slots when shutting down.
1849 		 */
1850 		do {
1851 			slp = &syncer_workitem_pending[syncer_delayno];
1852 			syncer_delayno += 1;
1853 			if (syncer_delayno == syncer_maxdelay)
1854 				syncer_delayno = 0;
1855 			next = &syncer_workitem_pending[syncer_delayno];
1856 			/*
1857			 * If the worklist has wrapped since
1858 			 * it was emptied of all but syncer vnodes,
1859 			 * switch to the FINAL_DELAY state and run
1860 			 * for one more second.
1861 			 */
1862 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1863 			    net_worklist_len == 0 &&
1864 			    last_work_seen == syncer_delayno) {
1865 				syncer_state = SYNCER_FINAL_DELAY;
1866 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1867 			}
1868 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1869 		    syncer_worklist_len > 0);
1870 
1871 		/*
1872 		 * Keep track of the last time there was anything
1873 		 * on the worklist other than syncer vnodes.
1874 		 * Return to the SHUTTING_DOWN state if any
1875 		 * new work appears.
1876 		 */
1877 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1878 			last_work_seen = syncer_delayno;
1879 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1880 			syncer_state = SYNCER_SHUTTING_DOWN;
1881 		while (!LIST_EMPTY(slp)) {
1882 			error = sync_vnode(slp, &bo, td);
1883 			if (error == 1) {
1884 				LIST_REMOVE(bo, bo_synclist);
1885 				LIST_INSERT_HEAD(next, bo, bo_synclist);
1886 				continue;
1887 			}
1888 
1889 			if (first_printf == 0) {
1890 				/*
1891 				 * Drop the sync mutex, because some watchdog
1892				 * drivers need to sleep while patting the watchdog.
1893 				 */
1894 				mtx_unlock(&sync_mtx);
1895 				wdog_kern_pat(WD_LASTVAL);
1896 				mtx_lock(&sync_mtx);
1897 			}
1898 
1899 		}
1900 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1901 			syncer_final_iter--;
1902 		/*
1903 		 * The variable rushjob allows the kernel to speed up the
1904 		 * processing of the filesystem syncer process. A rushjob
1905 		 * value of N tells the filesystem syncer to process the next
1906 		 * N seconds worth of work on its queue ASAP. Currently rushjob
1907 		 * is used by the soft update code to speed up the filesystem
1908 		 * syncer process when the incore state is getting so far
1909 		 * ahead of the disk that the kernel memory pool is being
1910 		 * threatened with exhaustion.
1911 		 */
1912 		if (rushjob > 0) {
1913 			rushjob -= 1;
1914 			continue;
1915 		}
1916 		/*
1917 		 * Just sleep for a short period of time between
1918 		 * iterations when shutting down to allow some I/O
1919 		 * to happen.
1920 		 *
1921 		 * If it has taken us less than a second to process the
1922 		 * current work, then wait. Otherwise start right over
1923 		 * again. We can still lose time if any single round
1924 		 * takes more than two seconds, but it does not really
1925 		 * matter as we are just trying to generally pace the
1926 		 * filesystem activity.
1927 		 */
1928 		if (syncer_state != SYNCER_RUNNING ||
1929 		    time_uptime == starttime) {
1930 			thread_lock(td);
1931 			sched_prio(td, PPAUSE);
1932 			thread_unlock(td);
1933 		}
1934 		if (syncer_state != SYNCER_RUNNING)
1935 			cv_timedwait(&sync_wakeup, &sync_mtx,
1936 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1937 		else if (time_uptime == starttime)
1938 			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1939 	}
1940 }
1941 
1942 /*
1943  * Request the syncer daemon to speed up its work.
1944  * We never push it to speed up more than half of its
1945  * normal turn time, otherwise it could take over the cpu.
1946  */
1947 int
1948 speedup_syncer(void)
1949 {
1950 	int ret = 0;
1951 
1952 	mtx_lock(&sync_mtx);
1953 	if (rushjob < syncdelay / 2) {
1954 		rushjob += 1;
1955 		stat_rush_requests += 1;
1956 		ret = 1;
1957 	}
1958 	mtx_unlock(&sync_mtx);
1959 	cv_broadcast(&sync_wakeup);
1960 	return (ret);
1961 }
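
/*
 * A minimal usage sketch, assuming a caller that has detected dirty
 * buffer pressure (the soft updates code does something along these
 * lines):
 *
 *	if (speedup_syncer())
 *		... the syncer was asked to work one second ahead ...
 *	else
 *		... it was already running at the rushjob cap ...
 */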
1962 
1963 /*
1964  * Tell the syncer to speed up its work and run through its work
1965  * list several times, then tell it to shut down.
1966  */
1967 static void
1968 syncer_shutdown(void *arg, int howto)
1969 {
1970 
1971 	if (howto & RB_NOSYNC)
1972 		return;
1973 	mtx_lock(&sync_mtx);
1974 	syncer_state = SYNCER_SHUTTING_DOWN;
1975 	rushjob = 0;
1976 	mtx_unlock(&sync_mtx);
1977 	cv_broadcast(&sync_wakeup);
1978 	kproc_shutdown(arg, howto);
1979 }
1980 
1981 void
1982 syncer_suspend(void)
1983 {
1984 
1985 	syncer_shutdown(updateproc, 0);
1986 }
1987 
1988 void
1989 syncer_resume(void)
1990 {
1991 
1992 	mtx_lock(&sync_mtx);
1993 	first_printf = 1;
1994 	syncer_state = SYNCER_RUNNING;
1995 	mtx_unlock(&sync_mtx);
1996 	cv_broadcast(&sync_wakeup);
1997 	kproc_resume(updateproc);
1998 }
1999 
2000 /*
2001  * Reassign a buffer from one vnode to another.
2002  * Used to assign file specific control information
2003  * (indirect blocks) to the vnode to which they belong.
2004  */
2005 void
2006 reassignbuf(struct buf *bp)
2007 {
2008 	struct vnode *vp;
2009 	struct bufobj *bo;
2010 	int delay;
2011 #ifdef INVARIANTS
2012 	struct bufv *bv;
2013 #endif
2014 
2015 	vp = bp->b_vp;
2016 	bo = bp->b_bufobj;
2017 	++reassignbufcalls;
2018 
2019 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2020 	    bp, bp->b_vp, bp->b_flags);
2021 	/*
2022 	 * B_PAGING flagged buffers cannot be reassigned because their vp
2023 	 * is not fully linked in.
2024 	 */
2025 	if (bp->b_flags & B_PAGING)
2026 		panic("cannot reassign paging buffer");
2027 
2028 	/*
2029 	 * Delete from old vnode list, if on one.
2030 	 */
2031 	BO_LOCK(bo);
2032 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2033 		buf_vlist_remove(bp);
2034 	else
2035 		panic("reassignbuf: Buffer %p not on queue.", bp);
2036 	/*
2037 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
2038 	 * of clean buffers.
2039 	 */
2040 	if (bp->b_flags & B_DELWRI) {
2041 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2042 			switch (vp->v_type) {
2043 			case VDIR:
2044 				delay = dirdelay;
2045 				break;
2046 			case VCHR:
2047 				delay = metadelay;
2048 				break;
2049 			default:
2050 				delay = filedelay;
2051 			}
2052 			vn_syncer_add_to_worklist(bo, delay);
2053 		}
2054 		buf_vlist_add(bp, bo, BX_VNDIRTY);
2055 	} else {
2056 		buf_vlist_add(bp, bo, BX_VNCLEAN);
2057 
2058 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2059 			mtx_lock(&sync_mtx);
2060 			LIST_REMOVE(bo, bo_synclist);
2061 			syncer_worklist_len--;
2062 			mtx_unlock(&sync_mtx);
2063 			bo->bo_flag &= ~BO_ONWORKLST;
2064 		}
2065 	}
2066 #ifdef INVARIANTS
2067 	bv = &bo->bo_clean;
2068 	bp = TAILQ_FIRST(&bv->bv_hd);
2069 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2070 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2071 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2072 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2073 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2074 	bv = &bo->bo_dirty;
2075 	bp = TAILQ_FIRST(&bv->bv_hd);
2076 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2077 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2078 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2079 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2080 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2081 #endif
2082 	BO_UNLOCK(bo);
2083 }
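
/*
 * A minimal sketch of the usual trigger, assuming a caller that has just
 * marked a buffer for delayed write (bdirty() in vfs_bio.c does roughly
 * this):
 *
 *	bp->b_flags |= B_DELWRI;
 *	reassignbuf(bp);
 *
 * which moves bp to the vnode's dirty list and, if necessary, puts the
 * bufobj on the syncer worklist.
 */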
2084 
2085 /*
2086  * A temporary hack until refcount_* APIs are sorted out.
2087  */
2088 static __inline int
2089 vfs_refcount_acquire_if_not_zero(volatile u_int *count)
2090 {
2091 	u_int old;
2092 
2093 	for (;;) {
2094 		old = *count;
2095 		if (old == 0)
2096 			return (0);
2097 		if (atomic_cmpset_int(count, old, old + 1))
2098 			return (1);
2099 	}
2100 }
2101 
2102 static __inline int
2103 vfs_refcount_release_if_not_last(volatile u_int *count)
2104 {
2105 	u_int old;
2106 
2107 	for (;;) {
2108 		old = *count;
2109 		if (old == 1)
2110 			return (0);
2111 		if (atomic_cmpset_int(count, old, old - 1))
2112 			return (1);
2113 	}
2114 }
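
/*
 * A minimal sketch of how these helpers are meant to be used (see
 * _vhold() and _vdrop() below for the real thing): take the lockless
 * path only when the count will not cross the 0/1 boundary, and fall
 * back to the vnode interlock otherwise.
 *
 *	if (!vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
 *		VI_LOCK(vp);
 *		refcount_acquire(&vp->v_holdcnt);
 *		... 0->1 bookkeeping, e.g. leaving the free list ...
 *		VI_UNLOCK(vp);
 *	}
 */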
2115 
2116 static void
2117 v_init_counters(struct vnode *vp)
2118 {
2119 
2120 	VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
2121 	    vp, ("%s called for an initialized vnode", __FUNCTION__));
2122 	ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
2123 
2124 	refcount_init(&vp->v_holdcnt, 1);
2125 	refcount_init(&vp->v_usecount, 1);
2126 }
2127 
2128 /*
2129  * Increment the use and hold counts on the vnode, taking care to reference
2130  * the driver's usecount if this is a chardev.  The _vhold() will remove
2131  * the vnode from the free list if it is presently free.
2132  */
2133 static void
2134 v_incr_usecount(struct vnode *vp)
2135 {
2136 
2137 	ASSERT_VI_UNLOCKED(vp, __func__);
2138 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2139 
2140 	if (vp->v_type == VCHR) {
2141 		VI_LOCK(vp);
2142 		_vhold(vp, true);
2143 		if (vp->v_iflag & VI_OWEINACT) {
2144 			VNASSERT(vp->v_usecount == 0, vp,
2145 			    ("vnode with usecount and VI_OWEINACT set"));
2146 			vp->v_iflag &= ~VI_OWEINACT;
2147 		}
2148 		refcount_acquire(&vp->v_usecount);
2149 		v_incr_devcount(vp);
2150 		VI_UNLOCK(vp);
2151 		return;
2152 	}
2153 
2154 	_vhold(vp, false);
2155 	if (vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
2156 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2157 		    ("vnode with usecount and VI_OWEINACT set"));
2158 	} else {
2159 		VI_LOCK(vp);
2160 		if (vp->v_iflag & VI_OWEINACT)
2161 			vp->v_iflag &= ~VI_OWEINACT;
2162 		refcount_acquire(&vp->v_usecount);
2163 		VI_UNLOCK(vp);
2164 	}
2165 }
2166 
2167 /*
2168  * Increment si_usecount of the associated device, if any.
2169  */
2170 static void
2171 v_incr_devcount(struct vnode *vp)
2172 {
2173 
2174 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2175 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2176 		dev_lock();
2177 		vp->v_rdev->si_usecount++;
2178 		dev_unlock();
2179 	}
2180 }
2181 
2182 /*
2183  * Decrement si_usecount of the associated device, if any.
2184  */
2185 static void
2186 v_decr_devcount(struct vnode *vp)
2187 {
2188 
2189 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2190 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2191 		dev_lock();
2192 		vp->v_rdev->si_usecount--;
2193 		dev_unlock();
2194 	}
2195 }
2196 
2197 /*
2198  * Grab a particular vnode from the free list, increment its
2199  * reference count and lock it.  VI_DOOMED is set if the vnode
2200  * is being destroyed.  Only callers who specify LK_RETRY will
2201  * see doomed vnodes.  If inactive processing was delayed in
2202  * vput try to do it here.
2203  * vput, try to do it here.
2204  *
2205  * Notes on lockless counter manipulation:
2206  * _vhold, vputx and other routines make various decisions based
2207  * on either holdcnt or usecount being 0. As long as either counter
2208  * is not transitioning 0->1 or 1->0, the manipulation can be done
2209  */
2210 int
2211 vget(struct vnode *vp, int flags, struct thread *td)
2212 {
2213 	int error, oweinact;
2214 
2215 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2216 	    ("vget: invalid lock operation"));
2217 
2218 	if ((flags & LK_INTERLOCK) != 0)
2219 		ASSERT_VI_LOCKED(vp, __func__);
2220 	else
2221 		ASSERT_VI_UNLOCKED(vp, __func__);
2222 	if ((flags & LK_VNHELD) != 0)
2223 		VNASSERT((vp->v_holdcnt > 0), vp,
2224 		    ("vget: LK_VNHELD passed but vnode not held"));
2225 
2226 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2227 
2228 	if ((flags & LK_VNHELD) == 0)
2229 		_vhold(vp, (flags & LK_INTERLOCK) != 0);
2230 
2231 	if ((error = vn_lock(vp, flags)) != 0) {
2232 		vdrop(vp);
2233 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2234 		    vp);
2235 		return (error);
2236 	}
2237 	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2238 		panic("vget: vn_lock failed to return ENOENT\n");
2239 	/*
2240 	 * We don't guarantee that any particular close will
2241 	 * trigger inactive processing so just make a best effort
2242 	 * here at preventing a reference to a removed file.  If
2243 	 * we don't succeed no harm is done.
2244 	 *
2245 	 * Upgrade our holdcnt to a usecount.
2246 	 */
2247 	if (vp->v_type != VCHR &&
2248 	    vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
2249 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2250 		    ("vnode with usecount and VI_OWEINACT set"));
2251 	} else {
2252 		VI_LOCK(vp);
2253 		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2254 			oweinact = 0;
2255 		} else {
2256 			oweinact = 1;
2257 			vp->v_iflag &= ~VI_OWEINACT;
2258 		}
2259 		refcount_acquire(&vp->v_usecount);
2260 		v_incr_devcount(vp);
2261 		if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2262 		    (flags & LK_NOWAIT) == 0)
2263 			vinactive(vp, td);
2264 		VI_UNLOCK(vp);
2265 	}
2266 	return (0);
2267 }
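
/*
 * A minimal usage sketch, assuming a caller that found vp while holding
 * the vnode interlock (e.g. during a per-mount vnode walk) and wants a
 * locked, referenced vnode:
 *
 *	error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread);
 *	if (error != 0)
 *		return (error);
 *	... use vp ...
 *	vput(vp);
 *
 * Without LK_RETRY, a doomed (VI_DOOMED) vnode makes vget() fail rather
 * than be returned.
 */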
2268 
2269 /*
2270  * Increase the reference count of a vnode.
2271  */
2272 void
2273 vref(struct vnode *vp)
2274 {
2275 
2276 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2277 	v_incr_usecount(vp);
2278 }
2279 
2280 /*
2281  * Return reference count of a vnode.
2282  *
2283  * The results of this call are only guaranteed when some mechanism is used to
2284  * stop other processes from gaining references to the vnode.  This may be the
2285  * case if the caller holds the only reference.  This is also useful when stale
2286  * data is acceptable as race conditions may be accounted for by some other
2287  * means.
2288  */
2289 int
2290 vrefcnt(struct vnode *vp)
2291 {
2292 
2293 	return (vp->v_usecount);
2294 }
2295 
2296 #define	VPUTX_VRELE	1
2297 #define	VPUTX_VPUT	2
2298 #define	VPUTX_VUNREF	3
2299 
2300 /*
2301  * Decrement the use and hold counts for a vnode.
2302  *
2303  * See the explanation near vget() as to why the atomic operations are safe.
2304  */
2305 static void
2306 vputx(struct vnode *vp, int func)
2307 {
2308 	int error;
2309 
2310 	KASSERT(vp != NULL, ("vputx: null vp"));
2311 	if (func == VPUTX_VUNREF)
2312 		ASSERT_VOP_LOCKED(vp, "vunref");
2313 	else if (func == VPUTX_VPUT)
2314 		ASSERT_VOP_LOCKED(vp, "vput");
2315 	else
2316 		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2317 	ASSERT_VI_UNLOCKED(vp, __func__);
2318 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2319 
2320 	if (vp->v_type != VCHR &&
2321 	    vfs_refcount_release_if_not_last(&vp->v_usecount)) {
2322 		if (func == VPUTX_VPUT)
2323 			VOP_UNLOCK(vp, 0);
2324 		vdrop(vp);
2325 		return;
2326 	}
2327 
2328 	VI_LOCK(vp);
2329 
2330 	/*
2331 	 * We want to hold the vnode until the inactive finishes to
2332 	 * prevent vgone() races.  We drop the use count here and the
2333 	 * hold count below when we're done.
2334 	 */
2335 	if (!refcount_release(&vp->v_usecount) ||
2336 	    (vp->v_iflag & VI_DOINGINACT)) {
2337 		if (func == VPUTX_VPUT)
2338 			VOP_UNLOCK(vp, 0);
2339 		v_decr_devcount(vp);
2340 		vdropl(vp);
2341 		return;
2342 	}
2343 
2344 	v_decr_devcount(vp);
2345 
2346 	error = 0;
2347 
2348 	if (vp->v_usecount != 0) {
2349 		vprint("vputx: usecount not zero", vp);
2350 		panic("vputx: usecount not zero");
2351 	}
2352 
2353 	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2354 
2355 	/*
2356 	 * We must call VOP_INACTIVE with the node locked. Mark
2357 	 * as VI_DOINGINACT to avoid recursion.
2358 	 */
2359 	vp->v_iflag |= VI_OWEINACT;
2360 	switch (func) {
2361 	case VPUTX_VRELE:
2362 		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2363 		VI_LOCK(vp);
2364 		break;
2365 	case VPUTX_VPUT:
2366 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2367 			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2368 			    LK_NOWAIT);
2369 			VI_LOCK(vp);
2370 		}
2371 		break;
2372 	case VPUTX_VUNREF:
2373 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2374 			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
2375 			VI_LOCK(vp);
2376 		}
2377 		break;
2378 	}
2379 	VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
2380 	    ("vnode with usecount and VI_OWEINACT set"));
2381 	if (error == 0) {
2382 		if (vp->v_iflag & VI_OWEINACT)
2383 			vinactive(vp, curthread);
2384 		if (func != VPUTX_VUNREF)
2385 			VOP_UNLOCK(vp, 0);
2386 	}
2387 	vdropl(vp);
2388 }
2389 
2390 /*
2391  * Vnode put/release.
2392  * If count drops to zero, call inactive routine and return to freelist.
2393  */
2394 void
2395 vrele(struct vnode *vp)
2396 {
2397 
2398 	vputx(vp, VPUTX_VRELE);
2399 }
2400 
2401 /*
2402  * Release an already locked vnode.  This gives the same effect as
2403  * unlock+vrele(), but takes less time and avoids releasing and
2404  * re-acquiring the lock (as vrele() acquires the lock internally).
2405  */
2406 void
2407 vput(struct vnode *vp)
2408 {
2409 
2410 	vputx(vp, VPUTX_VPUT);
2411 }
2412 
2413 /*
2414  * Release an exclusively locked vnode. Do not unlock the vnode lock.
2415  */
2416 void
2417 vunref(struct vnode *vp)
2418 {
2419 
2420 	vputx(vp, VPUTX_VUNREF);
2421 }
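
/*
 * A quick contrast of the three release paths above, as a usage sketch:
 *
 *	vref(vp);  ...  vrele(vp);			-- lock never held
 *	vget(vp, LK_EXCLUSIVE, td);  ...  vput(vp);	-- drops lock and ref
 *	vget(vp, LK_EXCLUSIVE, td);  ...  vunref(vp);	-- keeps the lock
 *
 * vput() and vunref() require the vnode lock; vrele() acquires it itself
 * when inactive processing is needed.
 */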
2422 
2423 /*
2424  * Increase the hold count and activate if this is the first reference.
2425  */
2426 void
2427 _vhold(struct vnode *vp, bool locked)
2428 {
2429 	struct mount *mp;
2430 
2431 	if (locked)
2432 		ASSERT_VI_LOCKED(vp, __func__);
2433 	else
2434 		ASSERT_VI_UNLOCKED(vp, __func__);
2435 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2436 	if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
2437 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2438 		    ("_vhold: vnode with holdcnt is free"));
2439 		return;
2440 	}
2441 
2442 	if (!locked)
2443 		VI_LOCK(vp);
2444 	if ((vp->v_iflag & VI_FREE) == 0) {
2445 		refcount_acquire(&vp->v_holdcnt);
2446 		if (!locked)
2447 			VI_UNLOCK(vp);
2448 		return;
2449 	}
2450 	VNASSERT(vp->v_holdcnt == 0, vp,
2451 	    ("%s: wrong hold count", __func__));
2452 	VNASSERT(vp->v_op != NULL, vp,
2453 	    ("%s: vnode already reclaimed.", __func__));
2454 	/*
2455 	 * Remove a vnode from the free list, mark it as in use,
2456 	 * and put it on the active list.
2457 	 */
2458 	mtx_lock(&vnode_free_list_mtx);
2459 	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2460 	freevnodes--;
2461 	vp->v_iflag &= ~(VI_FREE|VI_AGE);
2462 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2463 	    ("Activating already active vnode"));
2464 	vp->v_iflag |= VI_ACTIVE;
2465 	mp = vp->v_mount;
2466 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2467 	mp->mnt_activevnodelistsize++;
2468 	mtx_unlock(&vnode_free_list_mtx);
2469 	refcount_acquire(&vp->v_holdcnt);
2470 	if (!locked)
2471 		VI_UNLOCK(vp);
2472 }
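
/*
 * A minimal sketch of the hold/drop pattern (sync_vnode() above uses it):
 * take a hold so the vnode cannot be freed while other locks are dropped,
 * then release the hold when done.
 *
 *	vholdl(vp);			-- interlock held by the caller
 *	VI_UNLOCK(vp);
 *	... sleep or work without the interlock ...
 *	vdrop(vp);
 */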
2473 
2474 /*
2475  * Drop the hold count of the vnode.  If this is the last reference to
2476  * the vnode, we place it on the free list unless it has been vgone'd
2477  * (marked VI_DOOMED), in which case we will free it.
2478  */
2479 void
2480 _vdrop(struct vnode *vp, bool locked)
2481 {
2482 	struct bufobj *bo;
2483 	struct mount *mp;
2484 	int active;
2485 
2486 	if (locked)
2487 		ASSERT_VI_LOCKED(vp, __func__);
2488 	else
2489 		ASSERT_VI_UNLOCKED(vp, __func__);
2490 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2491 	if ((int)vp->v_holdcnt <= 0)
2492 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2493 	if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) {
2494 		if (locked)
2495 			VI_UNLOCK(vp);
2496 		return;
2497 	}
2498 
2499 	if (!locked)
2500 		VI_LOCK(vp);
2501 	if (refcount_release(&vp->v_holdcnt) == 0) {
2502 		VI_UNLOCK(vp);
2503 		return;
2504 	}
2505 	if ((vp->v_iflag & VI_DOOMED) == 0) {
2506 		/*
2507 		 * Mark a vnode as free: remove it from its active list
2508 		 * and put it up for recycling on the freelist.
2509 		 */
2510 		VNASSERT(vp->v_op != NULL, vp,
2511 		    ("vdropl: vnode already reclaimed."));
2512 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2513 		    ("vnode already free"));
2514 		VNASSERT(vp->v_holdcnt == 0, vp,
2515 		    ("vdropl: freeing when we shouldn't"));
2516 		active = vp->v_iflag & VI_ACTIVE;
2517 		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2518 			vp->v_iflag &= ~VI_ACTIVE;
2519 			mp = vp->v_mount;
2520 			mtx_lock(&vnode_free_list_mtx);
2521 			if (active) {
2522 				TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2523 				    v_actfreelist);
2524 				mp->mnt_activevnodelistsize--;
2525 			}
2526 			if (vp->v_iflag & VI_AGE) {
2527 				TAILQ_INSERT_HEAD(&vnode_free_list, vp,
2528 				    v_actfreelist);
2529 			} else {
2530 				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
2531 				    v_actfreelist);
2532 			}
2533 			freevnodes++;
2534 			vp->v_iflag &= ~VI_AGE;
2535 			vp->v_iflag |= VI_FREE;
2536 			mtx_unlock(&vnode_free_list_mtx);
2537 		} else {
2538 			atomic_add_long(&free_owe_inact, 1);
2539 		}
2540 		VI_UNLOCK(vp);
2541 		return;
2542 	}
2543 	/*
2544 	 * The vnode has been marked for destruction, so free it.
2545 	 */
2546 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2547 	atomic_subtract_long(&numvnodes, 1);
2548 	bo = &vp->v_bufobj;
2549 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2550 	    ("cleaned vnode still on the free list."));
2551 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2552 	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2553 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2554 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2555 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2556 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2557 	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
2558 	    ("clean blk trie not empty"));
2559 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2560 	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
2561 	    ("dirty blk trie not empty"));
2562 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2563 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2564 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2565 	VI_UNLOCK(vp);
2566 #ifdef MAC
2567 	mac_vnode_destroy(vp);
2568 #endif
2569 	if (vp->v_pollinfo != NULL)
2570 		destroy_vpollinfo(vp->v_pollinfo);
2571 #ifdef INVARIANTS
2572 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2573 	vp->v_op = NULL;
2574 #endif
2575 	rangelock_destroy(&vp->v_rl);
2576 	lockdestroy(vp->v_vnlock);
2577 	mtx_destroy(&vp->v_interlock);
2578 	rw_destroy(BO_LOCKPTR(bo));
2579 	uma_zfree(vnode_zone, vp);
2580 }
2581 
2582 /*
2583  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2584  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2585  * OWEINACT tracks whether a vnode missed a call to inactive due to a
2586  * failed lock upgrade.
2587  */
2588 void
2589 vinactive(struct vnode *vp, struct thread *td)
2590 {
2591 	struct vm_object *obj;
2592 
2593 	ASSERT_VOP_ELOCKED(vp, "vinactive");
2594 	ASSERT_VI_LOCKED(vp, "vinactive");
2595 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2596 	    ("vinactive: recursed on VI_DOINGINACT"));
2597 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2598 	vp->v_iflag |= VI_DOINGINACT;
2599 	vp->v_iflag &= ~VI_OWEINACT;
2600 	VI_UNLOCK(vp);
2601 	/*
2602 	 * Before moving off the active list, we must be sure that any
2603 	 * modified pages are on the vnode's dirty list since these will
2604 	 * no longer be checked once the vnode is on the inactive list.
2605 	 * Because the vnode vm object keeps a hold reference on the vnode
2606 	 * if there is at least one resident non-cached page, the vnode
2607 	 * cannot leave the active list without the page cleanup done.
2608 	 */
2609 	obj = vp->v_object;
2610 	if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2611 		VM_OBJECT_WLOCK(obj);
2612 		vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2613 		VM_OBJECT_WUNLOCK(obj);
2614 	}
2615 	VOP_INACTIVE(vp, td);
2616 	VI_LOCK(vp);
2617 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2618 	    ("vinactive: lost VI_DOINGINACT"));
2619 	vp->v_iflag &= ~VI_DOINGINACT;
2620 }
2621 
2622 /*
2623  * Remove any vnodes in the vnode table belonging to mount point mp.
2624  *
2625  * If FORCECLOSE is not specified, there should not be any active ones,
2626  * return error if any are found (nb: this is a user error, not a
2627  * system error). If FORCECLOSE is specified, detach any active vnodes
2628  * that are found.
2629  *
2630  * If WRITECLOSE is set, only flush out regular file vnodes open for
2631  * writing.
2632  *
2633  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2634  *
2635  * `rootrefs' specifies the base reference count for the root vnode
2636  * of this filesystem. The root vnode is considered busy if its
2637  * v_usecount exceeds this value. On a successful return, vflush()
2638  * will call vrele() on the root vnode exactly rootrefs times.
2639  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2640  * be zero.
2641  */
2642 #ifdef DIAGNOSTIC
2643 static int busyprt = 0;		/* print out busy vnodes */
2644 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2645 #endif
2646 
2647 int
2648 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2649 {
2650 	struct vnode *vp, *mvp, *rootvp = NULL;
2651 	struct vattr vattr;
2652 	int busy = 0, error;
2653 
2654 	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2655 	    rootrefs, flags);
2656 	if (rootrefs > 0) {
2657 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2658 		    ("vflush: bad args"));
2659 		/*
2660 		 * Get the filesystem root vnode. We can vput() it
2661 		 * immediately, since with rootrefs > 0, it won't go away.
2662 		 */
2663 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2664 			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2665 			    __func__, error);
2666 			return (error);
2667 		}
2668 		vput(rootvp);
2669 	}
2670 loop:
2671 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2672 		vholdl(vp);
2673 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2674 		if (error) {
2675 			vdrop(vp);
2676 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2677 			goto loop;
2678 		}
2679 		/*
2680 		 * Skip over vnodes marked VV_SYSTEM.
2681 		 */
2682 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2683 			VOP_UNLOCK(vp, 0);
2684 			vdrop(vp);
2685 			continue;
2686 		}
2687 		/*
2688 		 * If WRITECLOSE is set, flush out unlinked but still open
2689 		 * files (even if open only for reading) and regular file
2690 		 * vnodes open for writing.
2691 		 */
2692 		if (flags & WRITECLOSE) {
2693 			if (vp->v_object != NULL) {
2694 				VM_OBJECT_WLOCK(vp->v_object);
2695 				vm_object_page_clean(vp->v_object, 0, 0, 0);
2696 				VM_OBJECT_WUNLOCK(vp->v_object);
2697 			}
2698 			error = VOP_FSYNC(vp, MNT_WAIT, td);
2699 			if (error != 0) {
2700 				VOP_UNLOCK(vp, 0);
2701 				vdrop(vp);
2702 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2703 				return (error);
2704 			}
2705 			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2706 			VI_LOCK(vp);
2707 
2708 			if ((vp->v_type == VNON ||
2709 			    (error == 0 && vattr.va_nlink > 0)) &&
2710 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2711 				VOP_UNLOCK(vp, 0);
2712 				vdropl(vp);
2713 				continue;
2714 			}
2715 		} else
2716 			VI_LOCK(vp);
2717 		/*
2718 		 * With v_usecount == 0, all we need to do is clear out the
2719 		 * vnode data structures and we are done.
2720 		 *
2721 		 * If FORCECLOSE is set, forcibly close the vnode.
2722 		 */
2723 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2724 			vgonel(vp);
2725 		} else {
2726 			busy++;
2727 #ifdef DIAGNOSTIC
2728 			if (busyprt)
2729 				vprint("vflush: busy vnode", vp);
2730 #endif
2731 		}
2732 		VOP_UNLOCK(vp, 0);
2733 		vdropl(vp);
2734 	}
2735 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2736 		/*
2737 		 * If just the root vnode is busy, and if its refcount
2738 		 * is equal to `rootrefs', then go ahead and kill it.
2739 		 */
2740 		VI_LOCK(rootvp);
2741 		KASSERT(busy > 0, ("vflush: not busy"));
2742 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2743 		    ("vflush: usecount %d < rootrefs %d",
2744 		     rootvp->v_usecount, rootrefs));
2745 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2746 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2747 			vgone(rootvp);
2748 			VOP_UNLOCK(rootvp, 0);
2749 			busy = 0;
2750 		} else
2751 			VI_UNLOCK(rootvp);
2752 	}
2753 	if (busy) {
2754 		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2755 		    busy);
2756 		return (EBUSY);
2757 	}
2758 	for (; rootrefs > 0; rootrefs--)
2759 		vrele(rootvp);
2760 	return (0);
2761 }
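
/*
 * A minimal usage sketch, assuming a hypothetical filesystem unmount
 * routine that keeps no extra references on its root vnode (the flags
 * come from the unmount request):
 *
 *	error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, td);
 *	if (error != 0)
 *		return (error);
 */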
2762 
2763 /*
2764  * Recycle an unused vnode to the front of the free list.
2765  */
2766 int
2767 vrecycle(struct vnode *vp)
2768 {
2769 	int recycled;
2770 
2771 	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2772 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2773 	recycled = 0;
2774 	VI_LOCK(vp);
2775 	if (vp->v_usecount == 0) {
2776 		recycled = 1;
2777 		vgonel(vp);
2778 	}
2779 	VI_UNLOCK(vp);
2780 	return (recycled);
2781 }
2782 
2783 /*
2784  * Eliminate all activity associated with a vnode
2785  * in preparation for reuse.
2786  */
2787 void
2788 vgone(struct vnode *vp)
2789 {
2790 	VI_LOCK(vp);
2791 	vgonel(vp);
2792 	VI_UNLOCK(vp);
2793 }
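
/*
 * A minimal sketch of revoking a vnode from an unrelated context,
 * mirroring what vflush() does for a busy root vnode: the caller must
 * hold a reference and the exclusive vnode lock across the call.
 *
 *	vhold(vp);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	vgone(vp);
 *	VOP_UNLOCK(vp, 0);
 *	vdrop(vp);
 */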
2794 
2795 static void
2796 notify_lowervp_vfs_dummy(struct mount *mp __unused,
2797     struct vnode *lowervp __unused)
2798 {
2799 }
2800 
2801 /*
2802  * Notify upper mounts about a reclaimed or unlinked vnode.
2803  */
2804 void
2805 vfs_notify_upper(struct vnode *vp, int event)
2806 {
2807 	static struct vfsops vgonel_vfsops = {
2808 		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
2809 		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
2810 	};
2811 	struct mount *mp, *ump, *mmp;
2812 
2813 	mp = vp->v_mount;
2814 	if (mp == NULL)
2815 		return;
2816 
2817 	MNT_ILOCK(mp);
2818 	if (TAILQ_EMPTY(&mp->mnt_uppers))
2819 		goto unlock;
2820 	MNT_IUNLOCK(mp);
2821 	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2822 	mmp->mnt_op = &vgonel_vfsops;
2823 	mmp->mnt_kern_flag |= MNTK_MARKER;
2824 	MNT_ILOCK(mp);
2825 	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
2826 	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
2827 		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
2828 			ump = TAILQ_NEXT(ump, mnt_upper_link);
2829 			continue;
2830 		}
2831 		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
2832 		MNT_IUNLOCK(mp);
2833 		switch (event) {
2834 		case VFS_NOTIFY_UPPER_RECLAIM:
2835 			VFS_RECLAIM_LOWERVP(ump, vp);
2836 			break;
2837 		case VFS_NOTIFY_UPPER_UNLINK:
2838 			VFS_UNLINK_LOWERVP(ump, vp);
2839 			break;
2840 		default:
2841 			KASSERT(0, ("invalid event %d", event));
2842 			break;
2843 		}
2844 		MNT_ILOCK(mp);
2845 		ump = TAILQ_NEXT(mmp, mnt_upper_link);
2846 		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
2847 	}
2848 	free(mmp, M_TEMP);
2849 	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
2850 	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
2851 		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
2852 		wakeup(&mp->mnt_uppers);
2853 	}
2854 unlock:
2855 	MNT_IUNLOCK(mp);
2856 }
2857 
2858 /*
2859  * vgone, with the vp interlock held.
2860  */
2861 static void
2862 vgonel(struct vnode *vp)
2863 {
2864 	struct thread *td;
2865 	int oweinact;
2866 	int active;
2867 	struct mount *mp;
2868 
2869 	ASSERT_VOP_ELOCKED(vp, "vgonel");
2870 	ASSERT_VI_LOCKED(vp, "vgonel");
2871 	VNASSERT(vp->v_holdcnt, vp,
2872 	    ("vgonel: vp %p has no reference.", vp));
2873 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2874 	td = curthread;
2875 
2876 	/*
2877 	 * Don't vgonel if we're already doomed.
2878 	 */
2879 	if (vp->v_iflag & VI_DOOMED)
2880 		return;
2881 	vp->v_iflag |= VI_DOOMED;
2882 
2883 	/*
2884 	 * Check to see if the vnode is in use.  If so, we have to call
2885 	 * VOP_CLOSE() and VOP_INACTIVE().
2886 	 */
2887 	active = vp->v_usecount;
2888 	oweinact = (vp->v_iflag & VI_OWEINACT);
2889 	VI_UNLOCK(vp);
2890 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
2891 
2892 	/*
2893 	 * If purging an active vnode, it must be closed and
2894 	 * deactivated before being reclaimed.
2895 	 */
2896 	if (active)
2897 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2898 	if (oweinact || active) {
2899 		VI_LOCK(vp);
2900 		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2901 			vinactive(vp, td);
2902 		VI_UNLOCK(vp);
2903 	}
2904 	if (vp->v_type == VSOCK)
2905 		vfs_unp_reclaim(vp);
2906 
2907 	/*
2908 	 * Clean out any buffers associated with the vnode.
2909 	 * If the flush fails, just toss the buffers.
2910 	 */
2911 	mp = NULL;
2912 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2913 		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2914 	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
2915 		while (vinvalbuf(vp, 0, 0, 0) != 0)
2916 			;
2917 	}
2918 
2919 	BO_LOCK(&vp->v_bufobj);
2920 	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
2921 	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
2922 	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
2923 	    vp->v_bufobj.bo_clean.bv_cnt == 0,
2924 	    ("vp %p bufobj not invalidated", vp));
2925 	vp->v_bufobj.bo_flag |= BO_DEAD;
2926 	BO_UNLOCK(&vp->v_bufobj);
2927 
2928 	/*
2929 	 * Reclaim the vnode.
2930 	 */
2931 	if (VOP_RECLAIM(vp, td))
2932 		panic("vgone: cannot reclaim");
2933 	if (mp != NULL)
2934 		vn_finished_secondary_write(mp);
2935 	VNASSERT(vp->v_object == NULL, vp,
2936 	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2937 	/*
2938 	 * Clear the advisory locks and wake up waiting threads.
2939 	 */
2940 	(void)VOP_ADVLOCKPURGE(vp);
2941 	/*
2942 	 * Delete from old mount point vnode list.
2943 	 */
2944 	delmntque(vp);
2945 	cache_purge(vp);
2946 	/*
2947 	 * Done with the purge; reset to the standard lock and invalidate
2948 	 * the vnode.
2949 	 */
2950 	VI_LOCK(vp);
2951 	vp->v_vnlock = &vp->v_lock;
2952 	vp->v_op = &dead_vnodeops;
2953 	vp->v_tag = "none";
2954 	vp->v_type = VBAD;
2955 }
2956 
2957 /*
2958  * Calculate the total number of references to a special device.
2959  */
2960 int
2961 vcount(struct vnode *vp)
2962 {
2963 	int count;
2964 
2965 	dev_lock();
2966 	count = vp->v_rdev->si_usecount;
2967 	dev_unlock();
2968 	return (count);
2969 }
2970 
2971 /*
2972  * Same as above, but using the struct cdev * as the argument.
2973  */
2974 int
2975 count_dev(struct cdev *dev)
2976 {
2977 	int count;
2978 
2979 	dev_lock();
2980 	count = dev->si_usecount;
2981 	dev_unlock();
2982 	return(count);
2983 	return (count);
2984 
2985 /*
2986  * Print out a description of a vnode.
2987  */
2988 static char *typename[] =
2989 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2990  "VMARKER"};
2991 
2992 void
2993 vn_printf(struct vnode *vp, const char *fmt, ...)
2994 {
2995 	va_list ap;
2996 	char buf[256], buf2[16];
2997 	u_long flags;
2998 
2999 	va_start(ap, fmt);
3000 	vprintf(fmt, ap);
3001 	va_end(ap);
3002 	printf("%p: ", (void *)vp);
3003 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
3004 	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
3005 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
3006 	buf[0] = '\0';
3007 	buf[1] = '\0';
3008 	if (vp->v_vflag & VV_ROOT)
3009 		strlcat(buf, "|VV_ROOT", sizeof(buf));
3010 	if (vp->v_vflag & VV_ISTTY)
3011 		strlcat(buf, "|VV_ISTTY", sizeof(buf));
3012 	if (vp->v_vflag & VV_NOSYNC)
3013 		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
3014 	if (vp->v_vflag & VV_ETERNALDEV)
3015 		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
3016 	if (vp->v_vflag & VV_CACHEDLABEL)
3017 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
3018 	if (vp->v_vflag & VV_TEXT)
3019 		strlcat(buf, "|VV_TEXT", sizeof(buf));
3020 	if (vp->v_vflag & VV_COPYONWRITE)
3021 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
3022 	if (vp->v_vflag & VV_SYSTEM)
3023 		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
3024 	if (vp->v_vflag & VV_PROCDEP)
3025 		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
3026 	if (vp->v_vflag & VV_NOKNOTE)
3027 		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
3028 	if (vp->v_vflag & VV_DELETED)
3029 		strlcat(buf, "|VV_DELETED", sizeof(buf));
3030 	if (vp->v_vflag & VV_MD)
3031 		strlcat(buf, "|VV_MD", sizeof(buf));
3032 	if (vp->v_vflag & VV_FORCEINSMQ)
3033 		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
3034 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
3035 	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
3036 	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
3037 	if (flags != 0) {
3038 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
3039 		strlcat(buf, buf2, sizeof(buf));
3040 	}
3041 	if (vp->v_iflag & VI_MOUNT)
3042 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
3043 	if (vp->v_iflag & VI_AGE)
3044 		strlcat(buf, "|VI_AGE", sizeof(buf));
3045 	if (vp->v_iflag & VI_DOOMED)
3046 		strlcat(buf, "|VI_DOOMED", sizeof(buf));
3047 	if (vp->v_iflag & VI_FREE)
3048 		strlcat(buf, "|VI_FREE", sizeof(buf));
3049 	if (vp->v_iflag & VI_ACTIVE)
3050 		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
3051 	if (vp->v_iflag & VI_DOINGINACT)
3052 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
3053 	if (vp->v_iflag & VI_OWEINACT)
3054 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
3055 	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
3056 	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
3057 	if (flags != 0) {
3058 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
3059 		strlcat(buf, buf2, sizeof(buf));
3060 	}
3061 	printf("    flags (%s)\n", buf + 1);
3062 	if (mtx_owned(VI_MTX(vp)))
3063 		printf(" VI_LOCKed");
3064 	if (vp->v_object != NULL)
3065 		printf("    v_object %p ref %d pages %d "
3066 		    "cleanbuf %d dirtybuf %d\n",
3067 		    vp->v_object, vp->v_object->ref_count,
3068 		    vp->v_object->resident_page_count,
3069 		    vp->v_bufobj.bo_clean.bv_cnt,
3070 		    vp->v_bufobj.bo_dirty.bv_cnt);
3071 	printf("    ");
3072 	lockmgr_printinfo(vp->v_vnlock);
3073 	if (vp->v_data != NULL)
3074 		VOP_PRINT(vp);
3075 }
3076 
3077 #ifdef DDB
3078 /*
3079  * List all of the locked vnodes in the system.
3080  * Called when debugging the kernel.
3081  */
3082 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
3083 {
3084 	struct mount *mp;
3085 	struct vnode *vp;
3086 
3087 	/*
3088 	 * Note: because this is DDB, we can't obey the locking semantics
3089 	 * for these structures, which means we could catch an inconsistent
3090 	 * state and dereference a nasty pointer.  Not much to be done
3091 	 * about that.
3092 	 */
3093 	db_printf("Locked vnodes\n");
3094 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3095 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3096 			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
3097 				vprint("", vp);
3098 		}
3099 	}
3100 }
3101 
3102 /*
3103  * Show details about the given vnode.
3104  */
3105 DB_SHOW_COMMAND(vnode, db_show_vnode)
3106 {
3107 	struct vnode *vp;
3108 
3109 	if (!have_addr)
3110 		return;
3111 	vp = (struct vnode *)addr;
3112 	vn_printf(vp, "vnode ");
3113 }
3114 
3115 /*
3116  * Show details about the given mount point.
3117  */
3118 DB_SHOW_COMMAND(mount, db_show_mount)
3119 {
3120 	struct mount *mp;
3121 	struct vfsopt *opt;
3122 	struct statfs *sp;
3123 	struct vnode *vp;
3124 	char buf[512];
3125 	uint64_t mflags;
3126 	u_int flags;
3127 
3128 	if (!have_addr) {
3129 		/* No address given, print short info about all mount points. */
3130 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3131 			db_printf("%p %s on %s (%s)\n", mp,
3132 			    mp->mnt_stat.f_mntfromname,
3133 			    mp->mnt_stat.f_mntonname,
3134 			    mp->mnt_stat.f_fstypename);
3135 			if (db_pager_quit)
3136 				break;
3137 		}
3138 		db_printf("\nMore info: show mount <addr>\n");
3139 		return;
3140 	}
3141 
3142 	mp = (struct mount *)addr;
3143 	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3144 	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3145 
3146 	buf[0] = '\0';
3147 	mflags = mp->mnt_flag;
3148 #define	MNT_FLAG(flag)	do {						\
3149 	if (mflags & (flag)) {						\
3150 		if (buf[0] != '\0')					\
3151 			strlcat(buf, ", ", sizeof(buf));		\
3152 		strlcat(buf, (#flag) + 4, sizeof(buf));			\
3153 		mflags &= ~(flag);					\
3154 	}								\
3155 } while (0)
3156 	MNT_FLAG(MNT_RDONLY);
3157 	MNT_FLAG(MNT_SYNCHRONOUS);
3158 	MNT_FLAG(MNT_NOEXEC);
3159 	MNT_FLAG(MNT_NOSUID);
3160 	MNT_FLAG(MNT_NFS4ACLS);
3161 	MNT_FLAG(MNT_UNION);
3162 	MNT_FLAG(MNT_ASYNC);
3163 	MNT_FLAG(MNT_SUIDDIR);
3164 	MNT_FLAG(MNT_SOFTDEP);
3165 	MNT_FLAG(MNT_NOSYMFOLLOW);
3166 	MNT_FLAG(MNT_GJOURNAL);
3167 	MNT_FLAG(MNT_MULTILABEL);
3168 	MNT_FLAG(MNT_ACLS);
3169 	MNT_FLAG(MNT_NOATIME);
3170 	MNT_FLAG(MNT_NOCLUSTERR);
3171 	MNT_FLAG(MNT_NOCLUSTERW);
3172 	MNT_FLAG(MNT_SUJ);
3173 	MNT_FLAG(MNT_EXRDONLY);
3174 	MNT_FLAG(MNT_EXPORTED);
3175 	MNT_FLAG(MNT_DEFEXPORTED);
3176 	MNT_FLAG(MNT_EXPORTANON);
3177 	MNT_FLAG(MNT_EXKERB);
3178 	MNT_FLAG(MNT_EXPUBLIC);
3179 	MNT_FLAG(MNT_LOCAL);
3180 	MNT_FLAG(MNT_QUOTA);
3181 	MNT_FLAG(MNT_ROOTFS);
3182 	MNT_FLAG(MNT_USER);
3183 	MNT_FLAG(MNT_IGNORE);
3184 	MNT_FLAG(MNT_UPDATE);
3185 	MNT_FLAG(MNT_DELEXPORT);
3186 	MNT_FLAG(MNT_RELOAD);
3187 	MNT_FLAG(MNT_FORCE);
3188 	MNT_FLAG(MNT_SNAPSHOT);
3189 	MNT_FLAG(MNT_BYFSID);
3190 #undef MNT_FLAG
3191 	if (mflags != 0) {
3192 		if (buf[0] != '\0')
3193 			strlcat(buf, ", ", sizeof(buf));
3194 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3195 		    "0x%016jx", mflags);
3196 	}
3197 	db_printf("    mnt_flag = %s\n", buf);
3198 
3199 	buf[0] = '\0';
3200 	flags = mp->mnt_kern_flag;
3201 #define	MNT_KERN_FLAG(flag)	do {					\
3202 	if (flags & (flag)) {						\
3203 		if (buf[0] != '\0')					\
3204 			strlcat(buf, ", ", sizeof(buf));		\
3205 		strlcat(buf, (#flag) + 5, sizeof(buf));			\
3206 		flags &= ~(flag);					\
3207 	}								\
3208 } while (0)
3209 	MNT_KERN_FLAG(MNTK_UNMOUNTF);
3210 	MNT_KERN_FLAG(MNTK_ASYNC);
3211 	MNT_KERN_FLAG(MNTK_SOFTDEP);
3212 	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3213 	MNT_KERN_FLAG(MNTK_DRAINING);
3214 	MNT_KERN_FLAG(MNTK_REFEXPIRE);
3215 	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3216 	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3217 	MNT_KERN_FLAG(MNTK_NO_IOPF);
3218 	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3219 	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3220 	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3221 	MNT_KERN_FLAG(MNTK_MARKER);
3222 	MNT_KERN_FLAG(MNTK_USES_BCACHE);
3223 	MNT_KERN_FLAG(MNTK_NOASYNC);
3224 	MNT_KERN_FLAG(MNTK_UNMOUNT);
3225 	MNT_KERN_FLAG(MNTK_MWAIT);
3226 	MNT_KERN_FLAG(MNTK_SUSPEND);
3227 	MNT_KERN_FLAG(MNTK_SUSPEND2);
3228 	MNT_KERN_FLAG(MNTK_SUSPENDED);
3229 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3230 	MNT_KERN_FLAG(MNTK_NOKNOTE);
3231 #undef MNT_KERN_FLAG
3232 	if (flags != 0) {
3233 		if (buf[0] != '\0')
3234 			strlcat(buf, ", ", sizeof(buf));
3235 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3236 		    "0x%08x", flags);
3237 	}
3238 	db_printf("    mnt_kern_flag = %s\n", buf);
3239 
3240 	db_printf("    mnt_opt = ");
3241 	opt = TAILQ_FIRST(mp->mnt_opt);
3242 	if (opt != NULL) {
3243 		db_printf("%s", opt->name);
3244 		opt = TAILQ_NEXT(opt, link);
3245 		while (opt != NULL) {
3246 			db_printf(", %s", opt->name);
3247 			opt = TAILQ_NEXT(opt, link);
3248 		}
3249 	}
3250 	db_printf("\n");
3251 
3252 	sp = &mp->mnt_stat;
3253 	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3254 	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3255 	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3256 	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3257 	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3258 	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3259 	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3260 	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3261 	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3262 	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3263 	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3264 	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3265 
3266 	db_printf("    mnt_cred = { uid=%u ruid=%u",
3267 	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3268 	if (jailed(mp->mnt_cred))
3269 		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3270 	db_printf(" }\n");
3271 	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3272 	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3273 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3274 	db_printf("    mnt_activevnodelistsize = %d\n",
3275 	    mp->mnt_activevnodelistsize);
3276 	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3277 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3278 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3279 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3280 	db_printf("    mnt_lockref = %d\n", mp->mnt_lockref);
3281 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3282 	db_printf("    mnt_secondary_accwrites = %d\n",
3283 	    mp->mnt_secondary_accwrites);
3284 	db_printf("    mnt_gjprovider = %s\n",
3285 	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3286 
3287 	db_printf("\n\nList of active vnodes\n");
3288 	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3289 		if (vp->v_type != VMARKER) {
3290 			vn_printf(vp, "vnode ");
3291 			if (db_pager_quit)
3292 				break;
3293 		}
3294 	}
3295 	db_printf("\n\nList of inactive vnodes\n");
3296 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3297 		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3298 			vn_printf(vp, "vnode ");
3299 			if (db_pager_quit)
3300 				break;
3301 		}
3302 	}
3303 }
3304 #endif	/* DDB */
3305 
3306 /*
3307  * Fill in a struct xvfsconf based on a struct vfsconf.
3308  */
3309 static int
3310 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3311 {
3312 	struct xvfsconf xvfsp;
3313 
3314 	bzero(&xvfsp, sizeof(xvfsp));
3315 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3316 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3317 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3318 	xvfsp.vfc_flags = vfsp->vfc_flags;
3319 	/*
3320	 * These are unused in userland; we keep them
3321	 * to preserve binary compatibility.
3322 	 */
3323 	xvfsp.vfc_vfsops = NULL;
3324 	xvfsp.vfc_next = NULL;
3325 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3326 }
3327 
3328 #ifdef COMPAT_FREEBSD32
3329 struct xvfsconf32 {
3330 	uint32_t	vfc_vfsops;
3331 	char		vfc_name[MFSNAMELEN];
3332 	int32_t		vfc_typenum;
3333 	int32_t		vfc_refcount;
3334 	int32_t		vfc_flags;
3335 	uint32_t	vfc_next;
3336 };
3337 
3338 static int
3339 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
3340 {
3341 	struct xvfsconf32 xvfsp;
3342 
3343 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3344 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3345 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3346 	xvfsp.vfc_flags = vfsp->vfc_flags;
3347 	xvfsp.vfc_vfsops = 0;
3348 	xvfsp.vfc_next = 0;
3349 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3350 }
3351 #endif
3352 
3353 /*
3354  * Top level filesystem related information gathering.
3355  */
3356 static int
3357 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3358 {
3359 	struct vfsconf *vfsp;
3360 	int error;
3361 
3362 	error = 0;
3363 	vfsconf_slock();
3364 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3365 #ifdef COMPAT_FREEBSD32
3366 		if (req->flags & SCTL_MASK32)
3367 			error = vfsconf2x32(req, vfsp);
3368 		else
3369 #endif
3370 			error = vfsconf2x(req, vfsp);
3371 		if (error)
3372 			break;
3373 	}
3374 	vfsconf_sunlock();
3375 	return (error);
3376 }
3377 
3378 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
3379     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
3380     "S,xvfsconf", "List of all configured filesystems");
3381 
3382 #ifndef BURN_BRIDGES
3383 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3384 
3385 static int
3386 vfs_sysctl(SYSCTL_HANDLER_ARGS)
3387 {
3388 	int *name = (int *)arg1 - 1;	/* XXX */
3389 	u_int namelen = arg2 + 1;	/* XXX */
3390 	struct vfsconf *vfsp;
3391 
3392 	log(LOG_WARNING, "userland calling deprecated sysctl, "
3393 	    "please rebuild world\n");
3394 
3395 #if 1 || defined(COMPAT_PRELITE2)
3396 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3397 	if (namelen == 1)
3398 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3399 #endif
3400 
3401 	switch (name[1]) {
3402 	case VFS_MAXTYPENUM:
3403 		if (namelen != 2)
3404 			return (ENOTDIR);
3405 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3406 	case VFS_CONF:
3407 		if (namelen != 3)
3408 			return (ENOTDIR);	/* overloaded */
3409 		vfsconf_slock();
3410 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3411 			if (vfsp->vfc_typenum == name[2])
3412 				break;
3413 		}
3414 		vfsconf_sunlock();
3415 		if (vfsp == NULL)
3416 			return (EOPNOTSUPP);
3417 #ifdef COMPAT_FREEBSD32
3418 		if (req->flags & SCTL_MASK32)
3419 			return (vfsconf2x32(req, vfsp));
3420 		else
3421 #endif
3422 			return (vfsconf2x(req, vfsp));
3423 	}
3424 	return (EOPNOTSUPP);
3425 }
3426 
3427 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
3428     CTLFLAG_MPSAFE, vfs_sysctl,
3429     "Generic filesystem");
3430 
3431 #if 1 || defined(COMPAT_PRELITE2)
3432 
3433 static int
3434 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3435 {
3436 	int error;
3437 	struct vfsconf *vfsp;
3438 	struct ovfsconf ovfs;
3439 
3440 	vfsconf_slock();
3441 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3442 		bzero(&ovfs, sizeof(ovfs));
3443 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3444 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3445 		ovfs.vfc_index = vfsp->vfc_typenum;
3446 		ovfs.vfc_refcount = vfsp->vfc_refcount;
3447 		ovfs.vfc_flags = vfsp->vfc_flags;
3448 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3449 		if (error != 0) {
3450 			vfsconf_sunlock();
3451 			return (error);
3452 		}
3453 	}
3454 	vfsconf_sunlock();
3455 	return (0);
3456 }
3457 
3458 #endif /* 1 || COMPAT_PRELITE2 */
3459 #endif /* !BURN_BRIDGES */
3460 
3461 #define KINFO_VNODESLOP		10
3462 #ifdef notyet
3463 /*
3464  * Dump vnode list (via sysctl).
3465  */
3466 /* ARGSUSED */
3467 static int
3468 sysctl_vnode(SYSCTL_HANDLER_ARGS)
3469 {
3470 	struct xvnode *xvn;
3471 	struct mount *mp;
3472 	struct vnode *vp;
3473 	int error, len, n;
3474 
3475 	/*
3476 	 * Stale numvnodes access is not fatal here.
3477 	 */
3478 	req->lock = 0;
3479 	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3480 	if (!req->oldptr)
3481 		/* Make an estimate */
3482 		return (SYSCTL_OUT(req, 0, len));
3483 
3484 	error = sysctl_wire_old_buffer(req, 0);
3485 	if (error != 0)
3486 		return (error);
3487 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3488 	n = 0;
3489 	mtx_lock(&mountlist_mtx);
3490 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3491 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3492 			continue;
3493 		MNT_ILOCK(mp);
3494 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3495 			if (n == len)
3496 				break;
3497 			vref(vp);
3498 			xvn[n].xv_size = sizeof *xvn;
3499 			xvn[n].xv_vnode = vp;
3500 			xvn[n].xv_id = 0;	/* XXX compat */
3501 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3502 			XV_COPY(usecount);
3503 			XV_COPY(writecount);
3504 			XV_COPY(holdcnt);
3505 			XV_COPY(mount);
3506 			XV_COPY(numoutput);
3507 			XV_COPY(type);
3508 #undef XV_COPY
3509 			xvn[n].xv_flag = vp->v_vflag;
3510 
3511 			switch (vp->v_type) {
3512 			case VREG:
3513 			case VDIR:
3514 			case VLNK:
3515 				break;
3516 			case VBLK:
3517 			case VCHR:
3518 				if (vp->v_rdev == NULL) {
3519 					vrele(vp);
3520 					continue;
3521 				}
3522 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3523 				break;
3524 			case VSOCK:
3525 				xvn[n].xv_socket = vp->v_socket;
3526 				break;
3527 			case VFIFO:
3528 				xvn[n].xv_fifo = vp->v_fifoinfo;
3529 				break;
3530 			case VNON:
3531 			case VBAD:
3532 			default:
3533 				/* shouldn't happen? */
3534 				vrele(vp);
3535 				continue;
3536 			}
3537 			vrele(vp);
3538 			++n;
3539 		}
3540 		MNT_IUNLOCK(mp);
3541 		mtx_lock(&mountlist_mtx);
3542 		vfs_unbusy(mp);
3543 		if (n == len)
3544 			break;
3545 	}
3546 	mtx_unlock(&mountlist_mtx);
3547 
3548 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3549 	free(xvn, M_TEMP);
3550 	return (error);
3551 }
3552 
3553 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
3554     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
3555     "");
3556 #endif
3557 
3558 static void
3559 unmount_or_warn(struct mount *mp)
3560 {
3561 	int error;
3562 
3563 	error = dounmount(mp, MNT_FORCE, curthread);
3564 	if (error != 0) {
3565 		printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
3566 		if (error == EBUSY)
3567 			printf("BUSY)\n");
3568 		else
3569 			printf("%d)\n", error);
3570 	}
3571 }
3572 
3573 /*
3574  * Unmount all filesystems. The list is traversed in reverse order
3575  * of mounting to avoid dependencies.
3576  */
3577 void
3578 vfs_unmountall(void)
3579 {
3580 	struct mount *mp, *tmp;
3581 
3582 	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3583 
3584 	/*
3585 	 * Since this only runs when rebooting, it is not interlocked.
3586 	 */
3587 	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
3588 		vfs_ref(mp);
3589 
3590 		/*
3591 		 * Forcibly unmounting "/dev" before "/" would prevent clean
3592 		 * unmount of the latter.
3593 		 */
3594 		if (mp == rootdevmp)
3595 			continue;
3596 
3597 		unmount_or_warn(mp);
3598 	}
3599 
3600 	if (rootdevmp != NULL)
3601 		unmount_or_warn(rootdevmp);
3602 }
3603 
3604 /*
3605  * Perform msync on all vnodes under a mount point.
3606  * The mount point must be locked.
3607  */
3608 void
3609 vfs_msync(struct mount *mp, int flags)
3610 {
3611 	struct vnode *vp, *mvp;
3612 	struct vm_object *obj;
3613 
3614 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3615 	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3616 		obj = vp->v_object;
3617 		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3618 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3619 			if (!vget(vp,
3620 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3621 			    curthread)) {
3622 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3623 					vput(vp);
3624 					continue;
3625 				}
3626 
3627 				obj = vp->v_object;
3628 				if (obj != NULL) {
3629 					VM_OBJECT_WLOCK(obj);
3630 					vm_object_page_clean(obj, 0, 0,
3631 					    flags == MNT_WAIT ?
3632 					    OBJPC_SYNC : OBJPC_NOSYNC);
3633 					VM_OBJECT_WUNLOCK(obj);
3634 				}
3635 				vput(vp);
3636 			}
3637 		} else
3638 			VI_UNLOCK(vp);
3639 	}
3640 }
3641 
3642 static void
3643 destroy_vpollinfo_free(struct vpollinfo *vi)
3644 {
3645 
3646 	knlist_destroy(&vi->vpi_selinfo.si_note);
3647 	mtx_destroy(&vi->vpi_lock);
3648 	uma_zfree(vnodepoll_zone, vi);
3649 }
3650 
3651 static void
3652 destroy_vpollinfo(struct vpollinfo *vi)
3653 {
3654 
3655 	knlist_clear(&vi->vpi_selinfo.si_note, 1);
3656 	seldrain(&vi->vpi_selinfo);
3657 	destroy_vpollinfo_free(vi);
3658 }
3659 
3660 /*
3661  * Initialize per-vnode helper structure to hold poll-related state.
3662  */
3663 void
3664 v_addpollinfo(struct vnode *vp)
3665 {
3666 	struct vpollinfo *vi;
3667 
3668 	if (vp->v_pollinfo != NULL)
3669 		return;
3670 	vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
3671 	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3672 	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3673 	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3674 	VI_LOCK(vp);
3675 	if (vp->v_pollinfo != NULL) {
3676 		VI_UNLOCK(vp);
3677 		destroy_vpollinfo_free(vi);
3678 		return;
3679 	}
3680 	vp->v_pollinfo = vi;
3681 	VI_UNLOCK(vp);
3682 }
3683 
3684 /*
3685  * Record a process's interest in events which might happen to
3686  * a vnode.  Because poll uses the historic select-style interface
3687  * internally, this routine serves as both the ``check for any
3688  * pending events'' and the ``record my interest in future events''
3689  * functions.  (These are done together, while the lock is held,
3690  * to avoid race conditions.)
3691  */
3692 int
3693 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3694 {
3695 
3696 	v_addpollinfo(vp);
3697 	mtx_lock(&vp->v_pollinfo->vpi_lock);
3698 	if (vp->v_pollinfo->vpi_revents & events) {
3699 		/*
3700 		 * This leaves events we are not interested
3701 		 * in available for the other process which
3702 			 * presumably had requested them
3703 		 * (otherwise they would never have been
3704 		 * recorded).
3705 		 */
3706 		events &= vp->v_pollinfo->vpi_revents;
3707 		vp->v_pollinfo->vpi_revents &= ~events;
3708 
3709 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3710 		return (events);
3711 	}
3712 	vp->v_pollinfo->vpi_events |= events;
3713 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3714 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3715 	return (0);
3716 }
3717 
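#if 0
/*
 * Illustrative sketch (editorial addition, not built): for events it cannot
 * answer immediately, a filesystem's VOP_POLL implementation hands the
 * request to vn_pollrecord(), which both reports anything already pending
 * and records the caller's interest under vpi_lock so a wakeup cannot be
 * lost in between.  The xxx_poll name is an assumption for the example.
 */
static int
xxx_poll(struct vop_poll_args *ap)
{

	return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
}
#endif
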
3718 /*
3719  * Routine to create and manage a filesystem syncer vnode.
3720  */
3721 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
3722 static int	sync_fsync(struct  vop_fsync_args *);
3723 static int	sync_inactive(struct  vop_inactive_args *);
3724 static int	sync_reclaim(struct  vop_reclaim_args *);
3725 
3726 static struct vop_vector sync_vnodeops = {
3727 	.vop_bypass =	VOP_EOPNOTSUPP,
3728 	.vop_close =	sync_close,		/* close */
3729 	.vop_fsync =	sync_fsync,		/* fsync */
3730 	.vop_inactive =	sync_inactive,	/* inactive */
3731 	.vop_reclaim =	sync_reclaim,	/* reclaim */
3732 	.vop_lock1 =	vop_stdlock,	/* lock */
3733 	.vop_unlock =	vop_stdunlock,	/* unlock */
3734 	.vop_islocked =	vop_stdislocked,	/* islocked */
3735 };
3736 
3737 /*
3738  * Create a new filesystem syncer vnode for the specified mount point.
3739  */
3740 void
3741 vfs_allocate_syncvnode(struct mount *mp)
3742 {
3743 	struct vnode *vp;
3744 	struct bufobj *bo;
3745 	static long start, incr, next;
3746 	int error;
3747 
3748 	/* Allocate a new vnode */
3749 	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3750 	if (error != 0)
3751 		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3752 	vp->v_type = VNON;
3753 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3754 	vp->v_vflag |= VV_FORCEINSMQ;
3755 	error = insmntque(vp, mp);
3756 	if (error != 0)
3757 		panic("vfs_allocate_syncvnode: insmntque() failed");
3758 	vp->v_vflag &= ~VV_FORCEINSMQ;
3759 	VOP_UNLOCK(vp, 0);
3760 	/*
3761 	 * Place the vnode onto the syncer worklist. We attempt to
3762 	 * scatter them about on the list so that they will go off
3763 	 * at evenly distributed times even if all the filesystems
3764 	 * are mounted at once.
3765 	 */
3766 	next += incr;
3767 	if (next == 0 || next > syncer_maxdelay) {
3768 		start /= 2;
3769 		incr /= 2;
3770 		if (start == 0) {
3771 			start = syncer_maxdelay / 2;
3772 			incr = syncer_maxdelay;
3773 		}
3774 		next = start;
3775 	}
3776 	bo = &vp->v_bufobj;
3777 	BO_LOCK(bo);
3778 	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3779 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3780 	mtx_lock(&sync_mtx);
3781 	sync_vnode_count++;
3782 	if (mp->mnt_syncer == NULL) {
3783 		mp->mnt_syncer = vp;
3784 		vp = NULL;
3785 	}
3786 	mtx_unlock(&sync_mtx);
3787 	BO_UNLOCK(bo);
3788 	if (vp != NULL) {
3789 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3790 		vgone(vp);
3791 		vput(vp);
3792 	}
3793 }
3794 
3795 void
3796 vfs_deallocate_syncvnode(struct mount *mp)
3797 {
3798 	struct vnode *vp;
3799 
3800 	mtx_lock(&sync_mtx);
3801 	vp = mp->mnt_syncer;
3802 	if (vp != NULL)
3803 		mp->mnt_syncer = NULL;
3804 	mtx_unlock(&sync_mtx);
3805 	if (vp != NULL)
3806 		vrele(vp);
3807 }
3808 
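#if 0
/*
 * Illustrative sketch (editorial addition, not built): a filesystem
 * normally creates its syncer vnode once a read-write mount has succeeded
 * and calls vfs_deallocate_syncvnode() on unmount or when downgrading to
 * read-only.  The xxx_mount_rw name is an assumption for the example.
 */
static void
xxx_mount_rw(struct mount *mp)
{

	if ((mp->mnt_flag & MNT_RDONLY) == 0)
		vfs_allocate_syncvnode(mp);
}
#endif
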
3809 /*
3810  * Do a lazy sync of the filesystem.
3811  */
3812 static int
3813 sync_fsync(struct vop_fsync_args *ap)
3814 {
3815 	struct vnode *syncvp = ap->a_vp;
3816 	struct mount *mp = syncvp->v_mount;
3817 	int error, save;
3818 	struct bufobj *bo;
3819 
3820 	/*
3821 	 * We only need to do something if this is a lazy evaluation.
3822 	 */
3823 	if (ap->a_waitfor != MNT_LAZY)
3824 		return (0);
3825 
3826 	/*
3827 	 * Move ourselves to the back of the sync list.
3828 	 */
3829 	bo = &syncvp->v_bufobj;
3830 	BO_LOCK(bo);
3831 	vn_syncer_add_to_worklist(bo, syncdelay);
3832 	BO_UNLOCK(bo);
3833 
3834 	/*
3835 	 * Walk the list of vnodes pushing all that are dirty and
3836 	 * not already on the sync list.
3837 	 */
3838 	if (vfs_busy(mp, MBF_NOWAIT) != 0)
3839 		return (0);
3840 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3841 		vfs_unbusy(mp);
3842 		return (0);
3843 	}
3844 	save = curthread_pflags_set(TDP_SYNCIO);
3845 	vfs_msync(mp, MNT_NOWAIT);
3846 	error = VFS_SYNC(mp, MNT_LAZY);
3847 	curthread_pflags_restore(save);
3848 	vn_finished_write(mp);
3849 	vfs_unbusy(mp);
3850 	return (error);
3851 }
3852 
3853 /*
3854  * The syncer vnode is no longer referenced.
3855  */
3856 static int
3857 sync_inactive(struct vop_inactive_args *ap)
3858 {
3859 
3860 	vgone(ap->a_vp);
3861 	return (0);
3862 }
3863 
3864 /*
3865  * The syncer vnode is no longer needed and is being decommissioned.
3866  *
3867  * Modifications to the worklist must be protected by sync_mtx.
3868  */
3869 static int
3870 sync_reclaim(struct vop_reclaim_args *ap)
3871 {
3872 	struct vnode *vp = ap->a_vp;
3873 	struct bufobj *bo;
3874 
3875 	bo = &vp->v_bufobj;
3876 	BO_LOCK(bo);
3877 	mtx_lock(&sync_mtx);
3878 	if (vp->v_mount->mnt_syncer == vp)
3879 		vp->v_mount->mnt_syncer = NULL;
3880 	if (bo->bo_flag & BO_ONWORKLST) {
3881 		LIST_REMOVE(bo, bo_synclist);
3882 		syncer_worklist_len--;
3883 		sync_vnode_count--;
3884 		bo->bo_flag &= ~BO_ONWORKLST;
3885 	}
3886 	mtx_unlock(&sync_mtx);
3887 	BO_UNLOCK(bo);
3888 
3889 	return (0);
3890 }
3891 
3892 /*
3893  * Check if vnode represents a disk device
3894  */
3895 int
3896 vn_isdisk(struct vnode *vp, int *errp)
3897 {
3898 	int error;
3899 
3900 	if (vp->v_type != VCHR) {
3901 		error = ENOTBLK;
3902 		goto out;
3903 	}
3904 	error = 0;
3905 	dev_lock();
3906 	if (vp->v_rdev == NULL)
3907 		error = ENXIO;
3908 	else if (vp->v_rdev->si_devsw == NULL)
3909 		error = ENXIO;
3910 	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3911 		error = ENOTBLK;
3912 	dev_unlock();
3913 out:
3914 	if (errp != NULL)
3915 		*errp = error;
3916 	return (error == 0);
3917 }
3918 
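#if 0
/*
 * Illustrative sketch (editorial addition, not built): callers that require
 * a disk device, such as swap or dump configuration, typically use
 * vn_isdisk() to translate "not a disk" into the proper errno.  The
 * example_require_disk name is an assumption for the example.
 */
static int
example_require_disk(struct vnode *vp)
{
	int error;

	if (!vn_isdisk(vp, &error))
		return (error);	/* ENOTBLK or ENXIO from vn_isdisk(). */
	return (0);
}
#endif
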
3919 /*
3920  * Common filesystem object access control check routine.  Accepts a
3921  * vnode's type, "mode", uid and gid, requested access mode, credentials,
3922  * and optional call-by-reference privused argument allowing vaccess()
3923  * to indicate to the caller whether privilege was used to satisfy the
3924  * request (obsoleted).  Returns 0 on success, or an errno on failure.
3925  */
3926 int
3927 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3928     accmode_t accmode, struct ucred *cred, int *privused)
3929 {
3930 	accmode_t dac_granted;
3931 	accmode_t priv_granted;
3932 
3933 	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3934 	    ("invalid bit in accmode"));
3935 	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3936 	    ("VAPPEND without VWRITE"));
3937 
3938 	/*
3939 	 * Look for a normal, non-privileged way to access the file/directory
3940 	 * as requested.  If it exists, go with that.
3941 	 */
3942 
3943 	if (privused != NULL)
3944 		*privused = 0;
3945 
3946 	dac_granted = 0;
3947 
3948 	/* Check the owner. */
3949 	if (cred->cr_uid == file_uid) {
3950 		dac_granted |= VADMIN;
3951 		if (file_mode & S_IXUSR)
3952 			dac_granted |= VEXEC;
3953 		if (file_mode & S_IRUSR)
3954 			dac_granted |= VREAD;
3955 		if (file_mode & S_IWUSR)
3956 			dac_granted |= (VWRITE | VAPPEND);
3957 
3958 		if ((accmode & dac_granted) == accmode)
3959 			return (0);
3960 
3961 		goto privcheck;
3962 	}
3963 
3964 	/* Otherwise, check the groups (first match) */
3965 	if (groupmember(file_gid, cred)) {
3966 		if (file_mode & S_IXGRP)
3967 			dac_granted |= VEXEC;
3968 		if (file_mode & S_IRGRP)
3969 			dac_granted |= VREAD;
3970 		if (file_mode & S_IWGRP)
3971 			dac_granted |= (VWRITE | VAPPEND);
3972 
3973 		if ((accmode & dac_granted) == accmode)
3974 			return (0);
3975 
3976 		goto privcheck;
3977 	}
3978 
3979 	/* Otherwise, check everyone else. */
3980 	if (file_mode & S_IXOTH)
3981 		dac_granted |= VEXEC;
3982 	if (file_mode & S_IROTH)
3983 		dac_granted |= VREAD;
3984 	if (file_mode & S_IWOTH)
3985 		dac_granted |= (VWRITE | VAPPEND);
3986 	if ((accmode & dac_granted) == accmode)
3987 		return (0);
3988 
3989 privcheck:
3990 	/*
3991 	 * Build a privilege mask to determine if the set of privileges
3992 	 * satisfies the requirements when combined with the granted mask
3993 	 * from above.  For each privilege, if the privilege is required,
3994 	 * bitwise or the request type onto the priv_granted mask.
3995 	 */
3996 	priv_granted = 0;
3997 
3998 	if (type == VDIR) {
3999 		/*
4000 		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
4001 		 * requests, instead of PRIV_VFS_EXEC.
4002 		 */
4003 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
4004 		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
4005 			priv_granted |= VEXEC;
4006 	} else {
4007 		/*
4008 		 * Ensure that at least one execute bit is on. Otherwise,
4009 		 * a privileged user will always succeed, and we don't want
4010 		 * this to happen unless the file really is executable.
4011 		 */
4012 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
4013 		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
4014 		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
4015 			priv_granted |= VEXEC;
4016 	}
4017 
4018 	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
4019 	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
4020 		priv_granted |= VREAD;
4021 
4022 	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
4023 	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
4024 		priv_granted |= (VWRITE | VAPPEND);
4025 
4026 	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
4027 	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
4028 		priv_granted |= VADMIN;
4029 
4030 	if ((accmode & (priv_granted | dac_granted)) == accmode) {
4031 		/* XXX audit: privilege used */
4032 		if (privused != NULL)
4033 			*privused = 1;
4034 		return (0);
4035 	}
4036 
4037 	return ((accmode & VADMIN) ? EPERM : EACCES);
4038 }
4039 
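#if 0
/*
 * Illustrative sketch (editorial addition, not built): a typical VOP_ACCESS
 * implementation fetches the node's ownership and mode and lets vaccess()
 * make the decision.  The xxx_node structure and its field names are
 * assumptions made for the example only.
 */
static int
xxx_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct xxx_node *xp = vp->v_data;

	return (vaccess(vp->v_type, xp->xn_mode, xp->xn_uid, xp->xn_gid,
	    ap->a_accmode, ap->a_cred, NULL));
}
#endif
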
4040 /*
4041  * Credential check based on process requesting service, and per-attribute
4042  * permissions.
4043  */
4044 int
4045 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
4046     struct thread *td, accmode_t accmode)
4047 {
4048 
4049 	/*
4050 	 * Kernel-invoked operations always succeed.
4051 	 */
4052 	if (cred == NOCRED)
4053 		return (0);
4054 
4055 	/*
4056 	 * Do not allow privileged processes in jail to directly manipulate
4057 	 * system attributes.
4058 	 */
4059 	switch (attrnamespace) {
4060 	case EXTATTR_NAMESPACE_SYSTEM:
4061 		/* Potentially should be: return (EPERM); */
4062 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
4063 	case EXTATTR_NAMESPACE_USER:
4064 		return (VOP_ACCESS(vp, accmode, cred, td));
4065 	default:
4066 		return (EPERM);
4067 	}
4068 }
4069 
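#if 0
/*
 * Illustrative sketch (editorial addition, not built): a filesystem's
 * VOP_GETEXTATTR implementation would normally begin with this check,
 * passing VREAD here and VWRITE for set/delete operations.  The
 * xxx_getextattr name is an assumption for the example.
 */
static int
xxx_getextattr(struct vop_getextattr_args *ap)
{
	int error;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);
	/* ... look up and copy out the named attribute ... */
	return (0);
}
#endif
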
4070 #ifdef DEBUG_VFS_LOCKS
4071 /*
4072  * This only exists to suppress warnings from unlocked specfs accesses.  It is
4073  * no longer ok to have an unlocked VFS.
4074  */
4075 #define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
4076 	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
4077 
4078 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
4079 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
4080     "Drop into debugger on lock violation");
4081 
4082 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
4083 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
4084     0, "Check for interlock across VOPs");
4085 
4086 int vfs_badlock_print = 1;	/* Print lock violations. */
4087 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
4088     0, "Print lock violations");
4089 
4090 #ifdef KDB
4091 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
4092 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
4093     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
4094 #endif
4095 
4096 static void
4097 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
4098 {
4099 
4100 #ifdef KDB
4101 	if (vfs_badlock_backtrace)
4102 		kdb_backtrace();
4103 #endif
4104 	if (vfs_badlock_print)
4105 		printf("%s: %p %s\n", str, (void *)vp, msg);
4106 	if (vfs_badlock_ddb)
4107 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4108 }
4109 
4110 void
4111 assert_vi_locked(struct vnode *vp, const char *str)
4112 {
4113 
4114 	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
4115 		vfs_badlock("interlock is not locked but should be", str, vp);
4116 }
4117 
4118 void
4119 assert_vi_unlocked(struct vnode *vp, const char *str)
4120 {
4121 
4122 	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
4123 		vfs_badlock("interlock is locked but should not be", str, vp);
4124 }
4125 
4126 void
4127 assert_vop_locked(struct vnode *vp, const char *str)
4128 {
4129 	int locked;
4130 
4131 	if (!IGNORE_LOCK(vp)) {
4132 		locked = VOP_ISLOCKED(vp);
4133 		if (locked == 0 || locked == LK_EXCLOTHER)
4134 			vfs_badlock("is not locked but should be", str, vp);
4135 	}
4136 }
4137 
4138 void
4139 assert_vop_unlocked(struct vnode *vp, const char *str)
4140 {
4141 
4142 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4143 		vfs_badlock("is locked but should not be", str, vp);
4144 }
4145 
4146 void
4147 assert_vop_elocked(struct vnode *vp, const char *str)
4148 {
4149 
4150 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4151 		vfs_badlock("is not exclusive locked but should be", str, vp);
4152 }
4153 
4154 #if 0
4155 void
4156 assert_vop_elocked_other(struct vnode *vp, const char *str)
4157 {
4158 
4159 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
4160 		vfs_badlock("is not exclusive locked by another thread",
4161 		    str, vp);
4162 }
4163 
4164 void
4165 assert_vop_slocked(struct vnode *vp, const char *str)
4166 {
4167 
4168 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
4169 		vfs_badlock("is not locked shared but should be", str, vp);
4170 }
4171 #endif /* 0 */
4172 #endif /* DEBUG_VFS_LOCKS */
4173 
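#if 0
/*
 * Illustrative sketch (editorial addition, not built): filesystems normally
 * exercise the checks above through the ASSERT_VOP_* and ASSERT_VI_* macros
 * at the top of their VOP implementations; the xxx_setattr name is an
 * assumption for the example.
 */
static int
xxx_setattr(struct vop_setattr_args *ap)
{

	ASSERT_VOP_ELOCKED(ap->a_vp, "xxx_setattr");
	/* ... apply the requested attribute changes ... */
	return (0);
}
#endif
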
4174 void
4175 vop_rename_fail(struct vop_rename_args *ap)
4176 {
4177 
4178 	if (ap->a_tvp != NULL)
4179 		vput(ap->a_tvp);
4180 	if (ap->a_tdvp == ap->a_tvp)
4181 		vrele(ap->a_tdvp);
4182 	else
4183 		vput(ap->a_tdvp);
4184 	vrele(ap->a_fdvp);
4185 	vrele(ap->a_fvp);
4186 }
4187 
4188 void
4189 vop_rename_pre(void *ap)
4190 {
4191 	struct vop_rename_args *a = ap;
4192 
4193 #ifdef DEBUG_VFS_LOCKS
4194 	if (a->a_tvp)
4195 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4196 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4197 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4198 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4199 
4200 	/* Check the source (from). */
4201 	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4202 	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4203 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4204 	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4205 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4206 
4207 	/* Check the target. */
4208 	if (a->a_tvp)
4209 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4210 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4211 #endif
4212 	if (a->a_tdvp != a->a_fdvp)
4213 		vhold(a->a_fdvp);
4214 	if (a->a_tvp != a->a_fvp)
4215 		vhold(a->a_fvp);
4216 	vhold(a->a_tdvp);
4217 	if (a->a_tvp)
4218 		vhold(a->a_tvp);
4219 }
4220 
4221 void
4222 vop_strategy_pre(void *ap)
4223 {
4224 #ifdef DEBUG_VFS_LOCKS
4225 	struct vop_strategy_args *a;
4226 	struct buf *bp;
4227 
4228 	a = ap;
4229 	bp = a->a_bp;
4230 
4231 	/*
4232 	 * Cluster ops lock their component buffers but not the IO container.
4233 	 */
4234 	if ((bp->b_flags & B_CLUSTER) != 0)
4235 		return;
4236 
4237 	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
4238 		if (vfs_badlock_print)
4239 			printf(
4240 			    "VOP_STRATEGY: bp is not locked but should be\n");
4241 		if (vfs_badlock_ddb)
4242 			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4243 	}
4244 #endif
4245 }
4246 
4247 void
4248 vop_lock_pre(void *ap)
4249 {
4250 #ifdef DEBUG_VFS_LOCKS
4251 	struct vop_lock1_args *a = ap;
4252 
4253 	if ((a->a_flags & LK_INTERLOCK) == 0)
4254 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4255 	else
4256 		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
4257 #endif
4258 }
4259 
4260 void
4261 vop_lock_post(void *ap, int rc)
4262 {
4263 #ifdef DEBUG_VFS_LOCKS
4264 	struct vop_lock1_args *a = ap;
4265 
4266 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4267 	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
4268 		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
4269 #endif
4270 }
4271 
4272 void
4273 vop_unlock_pre(void *ap)
4274 {
4275 #ifdef DEBUG_VFS_LOCKS
4276 	struct vop_unlock_args *a = ap;
4277 
4278 	if (a->a_flags & LK_INTERLOCK)
4279 		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
4280 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
4281 #endif
4282 }
4283 
4284 void
4285 vop_unlock_post(void *ap, int rc)
4286 {
4287 #ifdef DEBUG_VFS_LOCKS
4288 	struct vop_unlock_args *a = ap;
4289 
4290 	if (a->a_flags & LK_INTERLOCK)
4291 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
4292 #endif
4293 }
4294 
4295 void
4296 vop_create_post(void *ap, int rc)
4297 {
4298 	struct vop_create_args *a = ap;
4299 
4300 	if (!rc)
4301 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4302 }
4303 
4304 void
4305 vop_deleteextattr_post(void *ap, int rc)
4306 {
4307 	struct vop_deleteextattr_args *a = ap;
4308 
4309 	if (!rc)
4310 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4311 }
4312 
4313 void
4314 vop_link_post(void *ap, int rc)
4315 {
4316 	struct vop_link_args *a = ap;
4317 
4318 	if (!rc) {
4319 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4320 		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4321 	}
4322 }
4323 
4324 void
4325 vop_mkdir_post(void *ap, int rc)
4326 {
4327 	struct vop_mkdir_args *a = ap;
4328 
4329 	if (!rc)
4330 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4331 }
4332 
4333 void
4334 vop_mknod_post(void *ap, int rc)
4335 {
4336 	struct vop_mknod_args *a = ap;
4337 
4338 	if (!rc)
4339 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4340 }
4341 
4342 void
4343 vop_reclaim_post(void *ap, int rc)
4344 {
4345 	struct vop_reclaim_args *a = ap;
4346 
4347 	if (!rc)
4348 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
4349 }
4350 
4351 void
4352 vop_remove_post(void *ap, int rc)
4353 {
4354 	struct vop_remove_args *a = ap;
4355 
4356 	if (!rc) {
4357 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4358 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4359 	}
4360 }
4361 
4362 void
4363 vop_rename_post(void *ap, int rc)
4364 {
4365 	struct vop_rename_args *a = ap;
4366 
4367 	if (!rc) {
4368 		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4369 		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4370 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4371 		if (a->a_tvp)
4372 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4373 	}
4374 	if (a->a_tdvp != a->a_fdvp)
4375 		vdrop(a->a_fdvp);
4376 	if (a->a_tvp != a->a_fvp)
4377 		vdrop(a->a_fvp);
4378 	vdrop(a->a_tdvp);
4379 	if (a->a_tvp)
4380 		vdrop(a->a_tvp);
4381 }
4382 
4383 void
4384 vop_rmdir_post(void *ap, int rc)
4385 {
4386 	struct vop_rmdir_args *a = ap;
4387 
4388 	if (!rc) {
4389 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4390 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4391 	}
4392 }
4393 
4394 void
4395 vop_setattr_post(void *ap, int rc)
4396 {
4397 	struct vop_setattr_args *a = ap;
4398 
4399 	if (!rc)
4400 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4401 }
4402 
4403 void
4404 vop_setextattr_post(void *ap, int rc)
4405 {
4406 	struct vop_setextattr_args *a = ap;
4407 
4408 	if (!rc)
4409 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4410 }
4411 
4412 void
4413 vop_symlink_post(void *ap, int rc)
4414 {
4415 	struct vop_symlink_args *a = ap;
4416 
4417 	if (!rc)
4418 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4419 }
4420 
4421 static struct knlist fs_knlist;
4422 
4423 static void
4424 vfs_event_init(void *arg)
4425 {
4426 	knlist_init_mtx(&fs_knlist, NULL);
4427 }
4428 /* XXX - correct order? */
4429 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4430 
4431 void
4432 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4433 {
4434 
4435 	KNOTE_UNLOCKED(&fs_knlist, event);
4436 }
4437 
4438 static int	filt_fsattach(struct knote *kn);
4439 static void	filt_fsdetach(struct knote *kn);
4440 static int	filt_fsevent(struct knote *kn, long hint);
4441 
4442 struct filterops fs_filtops = {
4443 	.f_isfd = 0,
4444 	.f_attach = filt_fsattach,
4445 	.f_detach = filt_fsdetach,
4446 	.f_event = filt_fsevent
4447 };
4448 
4449 static int
4450 filt_fsattach(struct knote *kn)
4451 {
4452 
4453 	kn->kn_flags |= EV_CLEAR;
4454 	knlist_add(&fs_knlist, kn, 0);
4455 	return (0);
4456 }
4457 
4458 static void
4459 filt_fsdetach(struct knote *kn)
4460 {
4461 
4462 	knlist_remove(&fs_knlist, kn, 0);
4463 }
4464 
4465 static int
4466 filt_fsevent(struct knote *kn, long hint)
4467 {
4468 
4469 	kn->kn_fflags |= hint;
4470 	return (kn->kn_fflags != 0);
4471 }
4472 
4473 static int
4474 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4475 {
4476 	struct vfsidctl vc;
4477 	int error;
4478 	struct mount *mp;
4479 
4480 	error = SYSCTL_IN(req, &vc, sizeof(vc));
4481 	if (error)
4482 		return (error);
4483 	if (vc.vc_vers != VFS_CTL_VERS1)
4484 		return (EINVAL);
4485 	mp = vfs_getvfs(&vc.vc_fsid);
4486 	if (mp == NULL)
4487 		return (ENOENT);
4488 	/* ensure that a specific sysctl goes to the right filesystem. */
4489 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4490 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4491 		vfs_rel(mp);
4492 		return (EINVAL);
4493 	}
4494 	VCTLTOREQ(&vc, req);
4495 	error = VFS_SYSCTL(mp, vc.vc_op, req);
4496 	vfs_rel(mp);
4497 	return (error);
4498 }
4499 
4500 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4501     NULL, 0, sysctl_vfs_ctl, "",
4502     "Sysctl by fsid");
4503 
4504 /*
4505  * Function to initialize a va_filerev field sensibly.
4506  * XXX: Wouldn't a random number make a lot more sense ??
4507  */
4508 u_quad_t
4509 init_va_filerev(void)
4510 {
4511 	struct bintime bt;
4512 
4513 	getbinuptime(&bt);
4514 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4515 }
4516 
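#if 0
/*
 * Illustrative sketch (editorial addition, not built): a filesystem seeds
 * its per-node change counter once at node creation and bumps it on every
 * modification, so VOP_GETATTR can report it in va_filerev.  The xxx_node
 * structure and field name are assumptions for the example.
 */
static void
xxx_node_init_filerev(struct xxx_node *xp)
{

	xp->xn_filerev = init_va_filerev();
}
#endif
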
4517 static int	filt_vfsread(struct knote *kn, long hint);
4518 static int	filt_vfswrite(struct knote *kn, long hint);
4519 static int	filt_vfsvnode(struct knote *kn, long hint);
4520 static void	filt_vfsdetach(struct knote *kn);
4521 static struct filterops vfsread_filtops = {
4522 	.f_isfd = 1,
4523 	.f_detach = filt_vfsdetach,
4524 	.f_event = filt_vfsread
4525 };
4526 static struct filterops vfswrite_filtops = {
4527 	.f_isfd = 1,
4528 	.f_detach = filt_vfsdetach,
4529 	.f_event = filt_vfswrite
4530 };
4531 static struct filterops vfsvnode_filtops = {
4532 	.f_isfd = 1,
4533 	.f_detach = filt_vfsdetach,
4534 	.f_event = filt_vfsvnode
4535 };
4536 
4537 static void
4538 vfs_knllock(void *arg)
4539 {
4540 	struct vnode *vp = arg;
4541 
4542 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4543 }
4544 
4545 static void
4546 vfs_knlunlock(void *arg)
4547 {
4548 	struct vnode *vp = arg;
4549 
4550 	VOP_UNLOCK(vp, 0);
4551 }
4552 
4553 static void
4554 vfs_knl_assert_locked(void *arg)
4555 {
4556 #ifdef DEBUG_VFS_LOCKS
4557 	struct vnode *vp = arg;
4558 
4559 	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4560 #endif
4561 }
4562 
4563 static void
4564 vfs_knl_assert_unlocked(void *arg)
4565 {
4566 #ifdef DEBUG_VFS_LOCKS
4567 	struct vnode *vp = arg;
4568 
4569 	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4570 #endif
4571 }
4572 
4573 int
4574 vfs_kqfilter(struct vop_kqfilter_args *ap)
4575 {
4576 	struct vnode *vp = ap->a_vp;
4577 	struct knote *kn = ap->a_kn;
4578 	struct knlist *knl;
4579 
4580 	switch (kn->kn_filter) {
4581 	case EVFILT_READ:
4582 		kn->kn_fop = &vfsread_filtops;
4583 		break;
4584 	case EVFILT_WRITE:
4585 		kn->kn_fop = &vfswrite_filtops;
4586 		break;
4587 	case EVFILT_VNODE:
4588 		kn->kn_fop = &vfsvnode_filtops;
4589 		break;
4590 	default:
4591 		return (EINVAL);
4592 	}
4593 
4594 	kn->kn_hook = (caddr_t)vp;
4595 
4596 	v_addpollinfo(vp);
4597 	if (vp->v_pollinfo == NULL)
4598 		return (ENOMEM);
4599 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4600 	vhold(vp);
4601 	knlist_add(knl, kn, 0);
4602 
4603 	return (0);
4604 }
4605 
4606 /*
4607  * Detach knote from vnode
4608  */
4609 static void
4610 filt_vfsdetach(struct knote *kn)
4611 {
4612 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4613 
4614 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4615 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4616 	vdrop(vp);
4617 }
4618 
4619 /*ARGSUSED*/
4620 static int
4621 filt_vfsread(struct knote *kn, long hint)
4622 {
4623 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4624 	struct vattr va;
4625 	int res;
4626 
4627 	/*
4628 	 * filesystem is gone, so set the EOF flag and schedule
4629 	 * the knote for deletion.
4630 	 */
4631 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
4632 		VI_LOCK(vp);
4633 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4634 		VI_UNLOCK(vp);
4635 		return (1);
4636 	}
4637 
4638 	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4639 		return (0);
4640 
4641 	VI_LOCK(vp);
4642 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4643 	res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
4644 	VI_UNLOCK(vp);
4645 	return (res);
4646 }
4647 
4648 /*ARGSUSED*/
4649 static int
4650 filt_vfswrite(struct knote *kn, long hint)
4651 {
4652 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4653 
4654 	VI_LOCK(vp);
4655 
4656 	/*
4657 	 * filesystem is gone, so set the EOF flag and schedule
4658 	 * the knote for deletion.
4659 	 */
4660 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
4661 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4662 
4663 	kn->kn_data = 0;
4664 	VI_UNLOCK(vp);
4665 	return (1);
4666 }
4667 
4668 static int
4669 filt_vfsvnode(struct knote *kn, long hint)
4670 {
4671 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4672 	int res;
4673 
4674 	VI_LOCK(vp);
4675 	if (kn->kn_sfflags & hint)
4676 		kn->kn_fflags |= hint;
4677 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
4678 		kn->kn_flags |= EV_EOF;
4679 		VI_UNLOCK(vp);
4680 		return (1);
4681 	}
4682 	res = (kn->kn_fflags != 0);
4683 	VI_UNLOCK(vp);
4684 	return (res);
4685 }
4686 
4687 int
4688 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4689 {
4690 	int error;
4691 
4692 	if (dp->d_reclen > ap->a_uio->uio_resid)
4693 		return (ENAMETOOLONG);
4694 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4695 	if (error) {
4696 		if (ap->a_ncookies != NULL) {
4697 			if (ap->a_cookies != NULL)
4698 				free(ap->a_cookies, M_TEMP);
4699 			ap->a_cookies = NULL;
4700 			*ap->a_ncookies = 0;
4701 		}
4702 		return (error);
4703 	}
4704 	if (ap->a_ncookies == NULL)
4705 		return (0);
4706 
4707 	KASSERT(ap->a_cookies,
4708 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4709 
4710 	*ap->a_cookies = realloc(*ap->a_cookies,
4711 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4712 	(*ap->a_cookies)[*ap->a_ncookies] = off;
4713 	return (0);
4714 }
4715 
4716 /*
4717  * Mark the access time of the file for update if the filesystem
4718  * supports VOP_MARKATIME.  This functionality is used by execve and
4719  * mmap, so we want to avoid the I/O implied by directly setting
4720  * va_atime for the sake of efficiency.
4721  */
4722 void
4723 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4724 {
4725 	struct mount *mp;
4726 
4727 	mp = vp->v_mount;
4728 	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4729 	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4730 		(void)VOP_MARKATIME(vp);
4731 }
4732 
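#if 0
/*
 * Illustrative sketch (editorial addition, not built): callers such as the
 * execve and mmap paths invoke this with the vnode locked, as asserted
 * above; the example function name is an assumption.
 */
static void
example_mark_exec_atime(struct vnode *vp, struct ucred *cred)
{

	vn_lock(vp, LK_SHARED | LK_RETRY);
	vfs_mark_atime(vp, cred);
	VOP_UNLOCK(vp, 0);
}
#endif
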
4733 /*
4734  * The purpose of this routine is to remove granularity from accmode_t,
4735  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4736  * VADMIN and VAPPEND.
4737  *
4738  * If it returns 0, the caller is supposed to continue with the usual
4739  * access checks using 'accmode' as modified by this routine.  If it
4740  * returns a nonzero value, the caller is supposed to return that value
4741  * as errno.
4742  *
4743  * Note that after this routine runs, accmode may be zero.
4744  */
4745 int
4746 vfs_unixify_accmode(accmode_t *accmode)
4747 {
4748 	/*
4749 	 * There is no way to specify explicit "deny" rule using
4750 	 * file mode or POSIX.1e ACLs.
4751 	 */
4752 	if (*accmode & VEXPLICIT_DENY) {
4753 		*accmode = 0;
4754 		return (0);
4755 	}
4756 
4757 	/*
4758 	 * None of these can be translated into usual access bits.
4759 	 * Also, the common case for NFSv4 ACLs is to not contain
4760 	 * either of these bits. Caller should check for VWRITE
4761 	 * on the containing directory instead.
4762 	 */
4763 	if (*accmode & (VDELETE_CHILD | VDELETE))
4764 		return (EPERM);
4765 
4766 	if (*accmode & VADMIN_PERMS) {
4767 		*accmode &= ~VADMIN_PERMS;
4768 		*accmode |= VADMIN;
4769 	}
4770 
4771 	/*
4772 	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4773 	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4774 	 */
4775 	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4776 
4777 	return (0);
4778 }
4779 
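#if 0
/*
 * Illustrative sketch (editorial addition, not built): the usual calling
 * pattern, e.g. in a VOP_ACCESSX-style path, where fine-grained NFSv4 bits
 * are first folded into the classic VEXEC/VREAD/VWRITE/VADMIN set before
 * the normal access check runs.  The example_accessx name is an assumption
 * for the example.
 */
static int
example_accessx(struct vnode *vp, accmode_t accmode, struct ucred *cred,
    struct thread *td)
{
	int error;

	error = vfs_unixify_accmode(&accmode);
	if (error != 0)
		return (error);
	if (accmode == 0)
		return (0);
	return (VOP_ACCESS(vp, accmode, cred, td));
}
#endif
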
4780 /*
4781  * These are helper functions for filesystems to traverse all
4782  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4783  *
4784  * This interface replaces MNT_VNODE_FOREACH.
4785  */
4786 
4787 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4788 
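#if 0
/*
 * Illustrative sketch (editorial addition, not built): the canonical
 * traversal built on these helpers.  Each vnode is returned with its
 * interlock held, which is handed to vget() here via LK_INTERLOCK.  The
 * example_visit_vnodes name is an assumption for the example.
 */
static void
example_visit_vnodes(struct mount *mp)
{
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) != 0)
			continue;
		/* ... inspect or flush the vnode here ... */
		vput(vp);
	}
}
#endif
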
4789 struct vnode *
4790 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4791 {
4792 	struct vnode *vp;
4793 
4794 	if (should_yield())
4795 		kern_yield(PRI_USER);
4796 	MNT_ILOCK(mp);
4797 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4798 	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4799 	while (vp != NULL && (vp->v_type == VMARKER ||
4800 	    (vp->v_iflag & VI_DOOMED) != 0))
4801 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4802 
4803 	/* Check if we are done */
4804 	if (vp == NULL) {
4805 		__mnt_vnode_markerfree_all(mvp, mp);
4806 		/* MNT_IUNLOCK(mp); -- done in above function */
4807 		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4808 		return (NULL);
4809 	}
4810 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4811 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4812 	VI_LOCK(vp);
4813 	MNT_IUNLOCK(mp);
4814 	return (vp);
4815 }
4816 
4817 struct vnode *
4818 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4819 {
4820 	struct vnode *vp;
4821 
4822 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4823 	MNT_ILOCK(mp);
4824 	MNT_REF(mp);
4825 	(*mvp)->v_type = VMARKER;
4826 
4827 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4828 	while (vp != NULL && (vp->v_type == VMARKER ||
4829 	    (vp->v_iflag & VI_DOOMED) != 0))
4830 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4831 
4832 	/* Check if we are done */
4833 	if (vp == NULL) {
4834 		MNT_REL(mp);
4835 		MNT_IUNLOCK(mp);
4836 		free(*mvp, M_VNODE_MARKER);
4837 		*mvp = NULL;
4838 		return (NULL);
4839 	}
4840 	(*mvp)->v_mount = mp;
4841 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4842 	VI_LOCK(vp);
4843 	MNT_IUNLOCK(mp);
4844 	return (vp);
4845 }
4846 
4847 
4848 void
4849 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4850 {
4851 
4852 	if (*mvp == NULL) {
4853 		MNT_IUNLOCK(mp);
4854 		return;
4855 	}
4856 
4857 	mtx_assert(MNT_MTX(mp), MA_OWNED);
4858 
4859 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4860 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4861 	MNT_REL(mp);
4862 	MNT_IUNLOCK(mp);
4863 	free(*mvp, M_VNODE_MARKER);
4864 	*mvp = NULL;
4865 }
4866 
4867 /*
4868  * These are helper functions for filesystems to traverse their
4869  * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
4870  */
4871 static void
4872 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4873 {
4874 
4875 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4876 
4877 	MNT_ILOCK(mp);
4878 	MNT_REL(mp);
4879 	MNT_IUNLOCK(mp);
4880 	free(*mvp, M_VNODE_MARKER);
4881 	*mvp = NULL;
4882 }
4883 
4884 static struct vnode *
4885 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4886 {
4887 	struct vnode *vp, *nvp;
4888 
4889 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
4890 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4891 restart:
4892 	vp = TAILQ_NEXT(*mvp, v_actfreelist);
4893 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4894 	while (vp != NULL) {
4895 		if (vp->v_type == VMARKER) {
4896 			vp = TAILQ_NEXT(vp, v_actfreelist);
4897 			continue;
4898 		}
4899 		if (!VI_TRYLOCK(vp)) {
4900 			if (mp_ncpus == 1 || should_yield()) {
4901 				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4902 				mtx_unlock(&vnode_free_list_mtx);
4903 				pause("vnacti", 1);
4904 				mtx_lock(&vnode_free_list_mtx);
4905 				goto restart;
4906 			}
4907 			continue;
4908 		}
4909 		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
4910 		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
4911 		    ("alien vnode on the active list %p %p", vp, mp));
4912 		if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
4913 			break;
4914 		nvp = TAILQ_NEXT(vp, v_actfreelist);
4915 		VI_UNLOCK(vp);
4916 		vp = nvp;
4917 	}
4918 
4919 	/* Check if we are done */
4920 	if (vp == NULL) {
4921 		mtx_unlock(&vnode_free_list_mtx);
4922 		mnt_vnode_markerfree_active(mvp, mp);
4923 		return (NULL);
4924 	}
4925 	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
4926 	mtx_unlock(&vnode_free_list_mtx);
4927 	ASSERT_VI_LOCKED(vp, "active iter");
4928 	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
4929 	return (vp);
4930 }
4931 
4932 struct vnode *
4933 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4934 {
4935 
4936 	if (should_yield())
4937 		kern_yield(PRI_USER);
4938 	mtx_lock(&vnode_free_list_mtx);
4939 	return (mnt_vnode_next_active(mvp, mp));
4940 }
4941 
4942 struct vnode *
4943 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
4944 {
4945 	struct vnode *vp;
4946 
4947 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4948 	MNT_ILOCK(mp);
4949 	MNT_REF(mp);
4950 	MNT_IUNLOCK(mp);
4951 	(*mvp)->v_type = VMARKER;
4952 	(*mvp)->v_mount = mp;
4953 
4954 	mtx_lock(&vnode_free_list_mtx);
4955 	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
4956 	if (vp == NULL) {
4957 		mtx_unlock(&vnode_free_list_mtx);
4958 		mnt_vnode_markerfree_active(mvp, mp);
4959 		return (NULL);
4960 	}
4961 	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4962 	return (mnt_vnode_next_active(mvp, mp));
4963 }
4964 
4965 void
4966 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4967 {
4968 
4969 	if (*mvp == NULL)
4970 		return;
4971 
4972 	mtx_lock(&vnode_free_list_mtx);
4973 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4974 	mtx_unlock(&vnode_free_list_mtx);
4975 	mnt_vnode_markerfree_active(mvp, mp);
4976 }
4977