xref: /freebsd/sys/kern/vfs_subr.c (revision 891b8ed4672a213bbe6f3f10522eeadb34d01b76)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35  */
36 
37 /*
38  * External virtual filesystem routines
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_ddb.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/bio.h>
49 #include <sys/buf.h>
50 #include <sys/condvar.h>
51 #include <sys/conf.h>
52 #include <sys/dirent.h>
53 #include <sys/event.h>
54 #include <sys/eventhandler.h>
55 #include <sys/extattr.h>
56 #include <sys/file.h>
57 #include <sys/fcntl.h>
58 #include <sys/jail.h>
59 #include <sys/kdb.h>
60 #include <sys/kernel.h>
61 #include <sys/kthread.h>
62 #include <sys/lockf.h>
63 #include <sys/malloc.h>
64 #include <sys/mount.h>
65 #include <sys/namei.h>
66 #include <sys/priv.h>
67 #include <sys/reboot.h>
68 #include <sys/sched.h>
69 #include <sys/sleepqueue.h>
70 #include <sys/stat.h>
71 #include <sys/sysctl.h>
72 #include <sys/syslog.h>
73 #include <sys/vmmeter.h>
74 #include <sys/vnode.h>
75 
76 #include <machine/stdarg.h>
77 
78 #include <security/mac/mac_framework.h>
79 
80 #include <vm/vm.h>
81 #include <vm/vm_object.h>
82 #include <vm/vm_extern.h>
83 #include <vm/pmap.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_kern.h>
87 #include <vm/uma.h>
88 
89 #ifdef DDB
90 #include <ddb/ddb.h>
91 #endif
92 
93 #define	WI_MPSAFEQ	0
94 #define	WI_GIANTQ	1
95 
96 static void	delmntque(struct vnode *vp);
97 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
98 		    int slpflag, int slptimeo);
99 static void	syncer_shutdown(void *arg, int howto);
100 static int	vtryrecycle(struct vnode *vp);
101 static void	vbusy(struct vnode *vp);
102 static void	vinactive(struct vnode *, struct thread *);
103 static void	v_incr_usecount(struct vnode *);
104 static void	v_decr_usecount(struct vnode *);
105 static void	v_decr_useonly(struct vnode *);
106 static void	v_upgrade_usecount(struct vnode *);
107 static void	vfree(struct vnode *);
108 static void	vnlru_free(int);
109 static void	vgonel(struct vnode *);
110 static void	vfs_knllock(void *arg);
111 static void	vfs_knlunlock(void *arg);
112 static void	vfs_knl_assert_locked(void *arg);
113 static void	vfs_knl_assert_unlocked(void *arg);
114 static void	destroy_vpollinfo(struct vpollinfo *vi);
115 
116 /*
117  * Number of vnodes in existence.  Increased whenever getnewvnode()
118  * allocates a new vnode, decreased on vdestroy() called on VI_DOOMed
119  * vnode.
120  */
121 static unsigned long	numvnodes;
122 
123 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
124     "Number of vnodes in existence");
125 
126 /*
127  * Conversion tables for conversion from vnode types to inode formats
128  * and back.
129  */
130 enum vtype iftovt_tab[16] = {
131 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
132 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
133 };
134 int vttoif_tab[10] = {
135 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
136 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
137 };
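/*
 * Worked example (illustrative): the IFTOVT() and VTTOIF() macros in
 * sys/vnode.h index these tables.  A directory inode carries S_IFDIR
 * (0040000) in its mode, so (S_IFDIR & S_IFMT) >> 12 == 4 and
 * iftovt_tab[4] == VDIR; conversely, since VDIR == 2,
 * vttoif_tab[VDIR] == S_IFDIR.
 */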
138 
139 /*
140  * List of vnodes that are ready for recycling.
141  */
142 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
143 
144 /*
145  * Free vnode target.  Free vnodes may simply be files which have been stat'd
146  * but not read.  This is somewhat common, and a small cache of such files
147  * should be kept to avoid recreation costs.
148  */
149 static u_long wantfreevnodes;
150 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
151 /* Number of vnodes in the free list. */
152 static u_long freevnodes;
153 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
154     "Number of vnodes in the free list");
155 
156 static int vlru_allow_cache_src;
157 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
158     &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
159 
160 /*
161  * Various variables used for debugging the new implementation of
162  * reassignbuf().
163  * XXX these are probably of (very) limited utility now.
164  */
165 static int reassignbufcalls;
166 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
167     "Number of calls to reassignbuf");
168 
169 /*
170  * Cache for the mount type id assigned to NFS.  This is used for
171  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
172  */
173 int	nfs_mount_type = -1;
174 
175 /* To keep more than one thread at a time from running vfs_getnewfsid */
176 static struct mtx mntid_mtx;
177 
178 /*
179  * Lock for any access to the following:
180  *	vnode_free_list
181  *	numvnodes
182  *	freevnodes
183  */
184 static struct mtx vnode_free_list_mtx;
185 
186 /* Publicly exported FS */
187 struct nfs_public nfs_pub;
188 
189 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
190 static uma_zone_t vnode_zone;
191 static uma_zone_t vnodepoll_zone;
192 
193 /*
194  * The workitem queue.
195  *
196  * It is useful to delay writes of file data and filesystem metadata
197  * for tens of seconds so that quickly created and deleted files need
198  * not waste disk bandwidth being created and removed. To realize this,
199  * we append vnodes to a "workitem" queue. When running with a soft
200  * updates implementation, most pending metadata dependencies should
201  * not wait for more than a few seconds. Thus, metadata updates on block
202  * devices are delayed only about half the time that file data is delayed.
203  * Similarly, directory updates are more critical, so they are delayed
204  * only about a third of the time that file data is delayed. Thus, there are
205  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
206  * one each second (driven off the filesystem syncer process). The
207  * syncer_delayno variable indicates the next queue that is to be processed.
208  * Items that need to be processed soon are placed in this queue:
209  *
210  *	syncer_workitem_pending[syncer_delayno]
211  *
212  * A delay of fifteen seconds is done by placing the request fifteen
213  * entries later in the queue:
214  *
215  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
216  *
217  */
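/*
 * Worked example (illustrative): hashinit(SYNCER_MAXDELAY, ...) below
 * yields a 32-slot table with syncer_mask == 31.  A request queued with
 * a delay of 10 while syncer_delayno == 25 therefore lands in slot
 *
 *	(25 + 10) & 31 == 3
 *
 * and, since the syncer advances one slot per second, is processed
 * roughly 10 seconds later, wrapping around the end of the array.
 */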
218 static int syncer_delayno;
219 static long syncer_mask;
220 LIST_HEAD(synclist, bufobj);
221 static struct synclist *syncer_workitem_pending[2];
222 /*
223  * The sync_mtx protects:
224  *	bo->bo_synclist
225  *	sync_vnode_count
226  *	syncer_delayno
227  *	syncer_state
228  *	syncer_workitem_pending
229  *	syncer_worklist_len
230  *	rushjob
231  */
232 static struct mtx sync_mtx;
233 static struct cv sync_wakeup;
234 
235 #define SYNCER_MAXDELAY		32
236 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
237 static int syncdelay = 30;		/* max time to delay syncing data */
238 static int filedelay = 30;		/* time to delay syncing files */
239 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
240     "Time to delay syncing files (in seconds)");
241 static int dirdelay = 29;		/* time to delay syncing directories */
242 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
243     "Time to delay syncing directories (in seconds)");
244 static int metadelay = 28;		/* time to delay syncing metadata */
245 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
246     "Time to delay syncing metadata (in seconds)");
247 static int rushjob;		/* number of slots to run ASAP */
248 static int stat_rush_requests;	/* number of times I/O speeded up */
249 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
250     "Number of times I/O speeded up (rush requests)");
251 
252 /*
253  * When shutting down the syncer, run it at four times normal speed.
254  */
255 #define SYNCER_SHUTDOWN_SPEEDUP		4
256 static int sync_vnode_count;
257 static int syncer_worklist_len;
258 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
259     syncer_state;
260 
261 /*
262  * Number of vnodes we want to exist at any one time.  This is mostly used
263  * to size hash tables in vnode-related code.  It is normally not used in
264  * getnewvnode(), as wantfreevnodes is normally nonzero.
265  *
266  * XXX desiredvnodes is historical cruft and should not exist.
267  */
268 int desiredvnodes;
269 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
270     &desiredvnodes, 0, "Maximum number of vnodes");
271 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
272     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
273 static int vnlru_nowhere;
274 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
275     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
276 
277 /*
278  * Macros to control when a vnode is freed and recycled.  All require
279  * the vnode interlock.
280  */
281 #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
282 #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
283 #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
284 
285 
286 /*
287  * Initialize the vnode management data structures.
288  *
289  * Reevaluate the following cap on the number of vnodes after the physical
290  * memory size exceeds 512GB.  In the limit, as the physical memory size
291  * grows, the ratio of physical pages to vnodes approaches sixteen to one.
292  */
293 #ifndef	MAXVNODES_MAX
294 #define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
295 #endif
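/*
 * Worked example (illustrative, assuming 4 KB pages): MAXVNODES_MAX is
 * then 512 * (1024 * 1024 * 1024 / 4096 / 16) == 512 * 16384 == 8388608,
 * i.e. one vnode for every sixteen pages of a 512 GB machine.
 */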
296 static void
297 vntblinit(void *dummy __unused)
298 {
299 	int physvnodes, virtvnodes;
300 
301 	/*
302 	 * Desiredvnodes is a function of the physical memory size and the
303 	 * kernel's heap size.  Generally speaking, it scales with the
304 	 * physical memory size.  The ratio of desiredvnodes to physical pages
305 	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
306 	 * marginal ratio of desiredvnodes to physical pages is one to
307 	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
308 	 * size.  The memory required by desiredvnodes vnodes and vm objects
309 	 * may not exceed one seventh of the kernel's heap size.
310 	 */
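	/*
	 * Worked example (illustrative): assuming 4 KB pages and 4 GB of
	 * RAM, cnt.v_page_count == 1048576 and physvnodes evaluates to
	 * maxproc + 1048576 / 16 + 3 * min(393216, 1048576) / 16 ==
	 * maxproc + 65536 + 73728, i.e. roughly 140,000 vnodes, unless
	 * the heap-derived virtvnodes limit below is smaller.
	 */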
311 	physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
312 	    cnt.v_page_count) / 16;
313 	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
314 	    sizeof(struct vnode)));
315 	desiredvnodes = min(physvnodes, virtvnodes);
316 	if (desiredvnodes > MAXVNODES_MAX) {
317 		if (bootverbose)
318 			printf("Reducing kern.maxvnodes %d -> %d\n",
319 			    desiredvnodes, MAXVNODES_MAX);
320 		desiredvnodes = MAXVNODES_MAX;
321 	}
322 	wantfreevnodes = desiredvnodes / 4;
323 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
324 	TAILQ_INIT(&vnode_free_list);
325 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
326 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
327 	    NULL, NULL, UMA_ALIGN_PTR, 0);
328 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
329 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
330 	/*
331 	 * Initialize the filesystem syncer.
332 	 */
333 	syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE,
334 	    &syncer_mask);
335 	syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE,
336 	    &syncer_mask);
337 	syncer_maxdelay = syncer_mask + 1;
338 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
339 	cv_init(&sync_wakeup, "syncer");
340 }
341 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
342 
343 
344 /*
345  * Mark a mount point as busy. Used to synchronize access and to delay
346  * unmounting. Note that mountlist_mtx is not released on failure.
347  */
348 int
349 vfs_busy(struct mount *mp, int flags)
350 {
351 
352 	MPASS((flags & ~MBF_MASK) == 0);
353 	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
354 
355 	MNT_ILOCK(mp);
356 	MNT_REF(mp);
357 	/*
358 	 * If the mount point is currently being unmounted, sleep until the
359 	 * mount point's fate is decided.  If the thread doing the unmounting
360 	 * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
361 	 * indicating that this mount point has survived the unmount attempt
362 	 * and vfs_busy should retry.  Otherwise the unmounting thread will
363 	 * set the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating
364 	 * that the mount point is about to be really destroyed.  vfs_busy
365 	 * needs to release its reference on the mount point in this case and
366 	 * return with ENOENT, telling the caller that the mount point it
367 	 * tried to busy is no longer valid.
368 	 */
369 	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
370 		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
371 			MNT_REL(mp);
372 			MNT_IUNLOCK(mp);
373 			CTR1(KTR_VFS, "%s: failed busying before sleeping",
374 			    __func__);
375 			return (ENOENT);
376 		}
377 		if (flags & MBF_MNTLSTLOCK)
378 			mtx_unlock(&mountlist_mtx);
379 		mp->mnt_kern_flag |= MNTK_MWAIT;
380 		msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0);
381 		if (flags & MBF_MNTLSTLOCK)
382 			mtx_lock(&mountlist_mtx);
383 	}
384 	if (flags & MBF_MNTLSTLOCK)
385 		mtx_unlock(&mountlist_mtx);
386 	mp->mnt_lockref++;
387 	MNT_IUNLOCK(mp);
388 	return (0);
389 }
390 
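/*
 * A sketch of a typical caller (modelled on vnlru_proc() later in this
 * file; illustrative only) pairing vfs_busy() with vfs_unbusy() while
 * walking the mount list:
 *
 *	mtx_lock(&mountlist_mtx);
 *	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 *		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 *			nmp = TAILQ_NEXT(mp, mnt_list);
 *			continue;
 *		}
 *		(work on the busied mount; mountlist_mtx was dropped)
 *		mtx_lock(&mountlist_mtx);
 *		nmp = TAILQ_NEXT(mp, mnt_list);
 *		vfs_unbusy(mp);
 *	}
 *	mtx_unlock(&mountlist_mtx);
 */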
391 /*
392  * Free a busy filesystem.
393  */
394 void
395 vfs_unbusy(struct mount *mp)
396 {
397 
398 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
399 	MNT_ILOCK(mp);
400 	MNT_REL(mp);
401 	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
402 	mp->mnt_lockref--;
403 	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
404 		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
405 		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
406 		mp->mnt_kern_flag &= ~MNTK_DRAINING;
407 		wakeup(&mp->mnt_lockref);
408 	}
409 	MNT_IUNLOCK(mp);
410 }
411 
412 /*
413  * Lookup a mount point by filesystem identifier.
414  */
415 struct mount *
416 vfs_getvfs(fsid_t *fsid)
417 {
418 	struct mount *mp;
419 
420 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
421 	mtx_lock(&mountlist_mtx);
422 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
423 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
424 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
425 			vfs_ref(mp);
426 			mtx_unlock(&mountlist_mtx);
427 			return (mp);
428 		}
429 	}
430 	mtx_unlock(&mountlist_mtx);
431 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
432 	return ((struct mount *) 0);
433 }
434 
435 /*
436  * Lookup a mount point by filesystem identifier, busying it before
437  * returning.
438  */
439 struct mount *
440 vfs_busyfs(fsid_t *fsid)
441 {
442 	struct mount *mp;
443 	int error;
444 
445 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
446 	mtx_lock(&mountlist_mtx);
447 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
448 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
449 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
450 			error = vfs_busy(mp, MBF_MNTLSTLOCK);
451 			if (error) {
452 				mtx_unlock(&mountlist_mtx);
453 				return (NULL);
454 			}
455 			return (mp);
456 		}
457 	}
458 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
459 	mtx_unlock(&mountlist_mtx);
460 	return ((struct mount *) 0);
461 }
462 
463 /*
464  * Check if a user can access privileged mount options.
465  */
466 int
467 vfs_suser(struct mount *mp, struct thread *td)
468 {
469 	int error;
470 
471 	/*
472 	 * If the thread is jailed, but this is not a jail-friendly file
473 	 * system, deny immediately.
474 	 */
475 	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
476 		return (EPERM);
477 
478 	/*
479 	 * If the file system was mounted outside the jail of the calling
480 	 * thread, deny immediately.
481 	 */
482 	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
483 		return (EPERM);
484 
485 	/*
486 	 * If file system supports delegated administration, we don't check
487 	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
488 	 * by the file system itself.
489 	 * If this is not the user that did original mount, we check for
490 	 * the PRIV_VFS_MOUNT_OWNER privilege.
491 	 */
492 	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
493 	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
494 		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
495 			return (error);
496 	}
497 	return (0);
498 }
499 
500 /*
501  * Get a new unique fsid.  Try to make its val[0] unique, since this value
502  * will be used to create fake device numbers for stat().  Also try (but
503  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
504  * support 16-bit device numbers.  We end up with unique val[0]'s for the
505  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
506  *
507  * Keep in mind that several mounts may be running in parallel.  Starting
508  * the search one past where the previous search terminated is both a
509  * micro-optimization and a defense against returning the same fsid to
510  * different mounts.
511  */
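/*
 * Worked example (illustrative, assuming the traditional dev_t encoding
 * in which makedev() places the major number in bits 8..15): with
 * vfc_typenum == 0x35 and mntid_base == 0x1234 the loop below produces
 *
 *	val[0] == (0x35 << 24) | (0x12 << 16) | (255 << 8) | 0x34
 *	       == 0x3512ff34
 *
 * so the full 16-bit mntid is recoverable from val[0], while val[0]
 * mod 2^16 preserves only its low byte, which is what limits the
 * mod 2^16 uniqueness to the first 2^8 calls.
 */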
512 void
513 vfs_getnewfsid(struct mount *mp)
514 {
515 	static uint16_t mntid_base;
516 	struct mount *nmp;
517 	fsid_t tfsid;
518 	int mtype;
519 
520 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
521 	mtx_lock(&mntid_mtx);
522 	mtype = mp->mnt_vfc->vfc_typenum;
523 	tfsid.val[1] = mtype;
524 	mtype = (mtype & 0xFF) << 24;
525 	for (;;) {
526 		tfsid.val[0] = makedev(255,
527 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
528 		mntid_base++;
529 		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
530 			break;
531 		vfs_rel(nmp);
532 	}
533 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
534 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
535 	mtx_unlock(&mntid_mtx);
536 }
537 
538 /*
539  * Knob to control the precision of file timestamps:
540  *
541  *   0 = seconds only; nanoseconds zeroed.
542  *   1 = seconds and nanoseconds, accurate within 1/HZ.
543  *   2 = seconds and nanoseconds, truncated to microseconds.
544  * >=3 = seconds and nanoseconds, maximum precision.
545  */
546 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
547 
548 static int timestamp_precision = TSP_SEC;
549 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
550     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
551     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
552     "3+: sec + ns (max. precision))");
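/*
 * The knob is an ordinary read-write sysctl, so an administrator who
 * wants full-precision timestamps could, for example, issue
 *
 *	sysctl vfs.timestamp_precision=3
 *
 * at runtime (illustrative usage; the compiled-in default is TSP_SEC).
 */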
553 
554 /*
555  * Get a current timestamp.
556  */
557 void
558 vfs_timestamp(struct timespec *tsp)
559 {
560 	struct timeval tv;
561 
562 	switch (timestamp_precision) {
563 	case TSP_SEC:
564 		tsp->tv_sec = time_second;
565 		tsp->tv_nsec = 0;
566 		break;
567 	case TSP_HZ:
568 		getnanotime(tsp);
569 		break;
570 	case TSP_USEC:
571 		microtime(&tv);
572 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
573 		break;
574 	case TSP_NSEC:
575 	default:
576 		nanotime(tsp);
577 		break;
578 	}
579 }
580 
581 /*
582  * Set vnode attributes to VNOVAL
583  */
584 void
585 vattr_null(struct vattr *vap)
586 {
587 
588 	vap->va_type = VNON;
589 	vap->va_size = VNOVAL;
590 	vap->va_bytes = VNOVAL;
591 	vap->va_mode = VNOVAL;
592 	vap->va_nlink = VNOVAL;
593 	vap->va_uid = VNOVAL;
594 	vap->va_gid = VNOVAL;
595 	vap->va_fsid = VNOVAL;
596 	vap->va_fileid = VNOVAL;
597 	vap->va_blocksize = VNOVAL;
598 	vap->va_rdev = VNOVAL;
599 	vap->va_atime.tv_sec = VNOVAL;
600 	vap->va_atime.tv_nsec = VNOVAL;
601 	vap->va_mtime.tv_sec = VNOVAL;
602 	vap->va_mtime.tv_nsec = VNOVAL;
603 	vap->va_ctime.tv_sec = VNOVAL;
604 	vap->va_ctime.tv_nsec = VNOVAL;
605 	vap->va_birthtime.tv_sec = VNOVAL;
606 	vap->va_birthtime.tv_nsec = VNOVAL;
607 	vap->va_flags = VNOVAL;
608 	vap->va_gen = VNOVAL;
609 	vap->va_vaflags = 0;
610 }
611 
612 /*
613  * This routine is called when we have too many vnodes.  It attempts
614  * to recycle a batch of vnodes and will potentially free vnodes that still
615  * have VM backing store (VM backing store is typically the cause
616  * of a vnode blowout so we want to do this).  Therefore, this operation
617  * is not considered cheap.
618  *
619  * A number of conditions may prevent a vnode from being reclaimed.
620  * The buffer cache may have references on the vnode, a directory
621  * vnode may still have references due to the namei cache representing
622  * underlying files, or the vnode may be in active use.   It is not
623  * desirable to reuse such vnodes.  These conditions may cause the
624  * number of vnodes to reach some minimum value regardless of what
625  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
626  */
627 static int
628 vlrureclaim(struct mount *mp)
629 {
630 	struct vnode *vp;
631 	int done;
632 	int trigger;
633 	int usevnodes;
634 	int count;
635 
636 	/*
637 	 * Calculate the trigger point; don't allow user
638 	 * screwups to blow us up.   This prevents us from
639 	 * recycling vnodes with lots of resident pages.  We
640 	 * aren't trying to free memory, we are trying to
641 	 * free vnodes.
642 	 */
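	/*
	 * Worked example (illustrative): with cnt.v_page_count == 1048576
	 * (4 GB of 4 KB pages) and desiredvnodes == 100000, trigger is
	 * 1048576 * 2 / 100000 == 20, so vnodes whose VM object caches
	 * more than 20 resident pages are passed over below.
	 */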
643 	usevnodes = desiredvnodes;
644 	if (usevnodes <= 0)
645 		usevnodes = 1;
646 	trigger = cnt.v_page_count * 2 / usevnodes;
647 	done = 0;
648 	vn_start_write(NULL, &mp, V_WAIT);
649 	MNT_ILOCK(mp);
650 	count = mp->mnt_nvnodelistsize / 10 + 1;
651 	while (count != 0) {
652 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
653 		while (vp != NULL && vp->v_type == VMARKER)
654 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
655 		if (vp == NULL)
656 			break;
657 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
658 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
659 		--count;
660 		if (!VI_TRYLOCK(vp))
661 			goto next_iter;
662 		/*
663 		 * If it's been deconstructed already, it's still
664 		 * referenced, or it exceeds the trigger, skip it.
665 		 */
666 		if (vp->v_usecount ||
667 		    (!vlru_allow_cache_src &&
668 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
669 		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
670 		    vp->v_object->resident_page_count > trigger)) {
671 			VI_UNLOCK(vp);
672 			goto next_iter;
673 		}
674 		MNT_IUNLOCK(mp);
675 		vholdl(vp);
676 		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
677 			vdrop(vp);
678 			goto next_iter_mntunlocked;
679 		}
680 		VI_LOCK(vp);
681 		/*
682 		 * v_usecount may have been bumped after VOP_LOCK() dropped
683 		 * the vnode interlock and before it was locked again.
684 		 *
685 		 * It is not necessary to recheck VI_DOOMED because it can
686 		 * only be set by another thread that holds both the vnode
687 		 * lock and vnode interlock.  If another thread has the
688 		 * vnode lock before we get to VOP_LOCK() and obtains the
689 		 * vnode interlock after VOP_LOCK() drops the vnode
690 		 * interlock, the other thread will be unable to drop the
691 		 * vnode lock before our VOP_LOCK() call fails.
692 		 */
693 		if (vp->v_usecount ||
694 		    (!vlru_allow_cache_src &&
695 			!LIST_EMPTY(&(vp)->v_cache_src)) ||
696 		    (vp->v_object != NULL &&
697 		    vp->v_object->resident_page_count > trigger)) {
698 			VOP_UNLOCK(vp, LK_INTERLOCK);
699 			goto next_iter_mntunlocked;
700 		}
701 		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
702 		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
703 		vgonel(vp);
704 		VOP_UNLOCK(vp, 0);
705 		vdropl(vp);
706 		done++;
707 next_iter_mntunlocked:
708 		if (!should_yield())
709 			goto relock_mnt;
710 		goto yield;
711 next_iter:
712 		if (!should_yield())
713 			continue;
714 		MNT_IUNLOCK(mp);
715 yield:
716 		kern_yield(-1);
717 relock_mnt:
718 		MNT_ILOCK(mp);
719 	}
720 	MNT_IUNLOCK(mp);
721 	vn_finished_write(mp);
722 	return done;
723 }
724 
725 /*
726  * Attempt to keep the free list at wantfreevnodes length.
727  */
728 static void
729 vnlru_free(int count)
730 {
731 	struct vnode *vp;
732 	int vfslocked;
733 
734 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
735 	for (; count > 0; count--) {
736 		vp = TAILQ_FIRST(&vnode_free_list);
737 		/*
738 		 * The list can be modified while vnode_free_list_mtx
739 		 * is dropped, so vp could be NULL here.
740 		 */
741 		if (!vp)
742 			break;
743 		VNASSERT(vp->v_op != NULL, vp,
744 		    ("vnlru_free: vnode already reclaimed."));
745 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
746 		/*
747 		 * Don't recycle if we can't get the interlock.
748 		 */
749 		if (!VI_TRYLOCK(vp)) {
750 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
751 			continue;
752 		}
753 		VNASSERT(VCANRECYCLE(vp), vp,
754 		    ("vp inconsistent on freelist"));
755 		freevnodes--;
756 		vp->v_iflag &= ~VI_FREE;
757 		vholdl(vp);
758 		mtx_unlock(&vnode_free_list_mtx);
759 		VI_UNLOCK(vp);
760 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
761 		vtryrecycle(vp);
762 		VFS_UNLOCK_GIANT(vfslocked);
763 		/*
764 		 * If the recycle succeeded, this vdrop will actually free
765 		 * the vnode.  If not, it will simply place it back on
766 		 * the free list.
767 		 */
768 		vdrop(vp);
769 		mtx_lock(&vnode_free_list_mtx);
770 	}
771 }
772 /*
773  * Attempt to recycle vnodes in a context that is always safe to block.
774  * Calling vlrureclaim() from the bowels of filesystem code has some
775  * interesting deadlock problems.
776  */
777 static struct proc *vnlruproc;
778 static int vnlruproc_sig;
779 
780 static void
781 vnlru_proc(void)
782 {
783 	struct mount *mp, *nmp;
784 	int done, vfslocked;
785 	struct proc *p = vnlruproc;
786 
787 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
788 	    SHUTDOWN_PRI_FIRST);
789 
790 	for (;;) {
791 		kproc_suspend_check(p);
792 		mtx_lock(&vnode_free_list_mtx);
793 		if (freevnodes > wantfreevnodes)
794 			vnlru_free(freevnodes - wantfreevnodes);
795 		if (numvnodes <= desiredvnodes * 9 / 10) {
796 			vnlruproc_sig = 0;
797 			wakeup(&vnlruproc_sig);
798 			msleep(vnlruproc, &vnode_free_list_mtx,
799 			    PVFS|PDROP, "vlruwt", hz);
800 			continue;
801 		}
802 		mtx_unlock(&vnode_free_list_mtx);
803 		done = 0;
804 		mtx_lock(&mountlist_mtx);
805 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
806 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
807 				nmp = TAILQ_NEXT(mp, mnt_list);
808 				continue;
809 			}
810 			vfslocked = VFS_LOCK_GIANT(mp);
811 			done += vlrureclaim(mp);
812 			VFS_UNLOCK_GIANT(vfslocked);
813 			mtx_lock(&mountlist_mtx);
814 			nmp = TAILQ_NEXT(mp, mnt_list);
815 			vfs_unbusy(mp);
816 		}
817 		mtx_unlock(&mountlist_mtx);
818 		if (done == 0) {
819 #if 0
820 			/* These messages are temporary debugging aids */
821 			if (vnlru_nowhere < 5)
822 				printf("vnlru process getting nowhere..\n");
823 			else if (vnlru_nowhere == 5)
824 				printf("vnlru process messages stopped.\n");
825 #endif
826 			vnlru_nowhere++;
827 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
828 		} else
829 			kern_yield(-1);
830 	}
831 }
832 
833 static struct kproc_desc vnlru_kp = {
834 	"vnlru",
835 	vnlru_proc,
836 	&vnlruproc
837 };
838 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
839     &vnlru_kp);
840 
841 /*
842  * Routines having to do with the management of the vnode table.
843  */
844 
845 void
846 vdestroy(struct vnode *vp)
847 {
848 	struct bufobj *bo;
849 
850 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
851 	mtx_lock(&vnode_free_list_mtx);
852 	numvnodes--;
853 	mtx_unlock(&vnode_free_list_mtx);
854 	bo = &vp->v_bufobj;
855 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
856 	    ("cleaned vnode still on the free list."));
857 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
858 	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
859 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
860 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
861 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
862 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
863 	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
864 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
865 	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
866 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
867 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
868 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
869 	VI_UNLOCK(vp);
870 #ifdef MAC
871 	mac_vnode_destroy(vp);
872 #endif
873 	if (vp->v_pollinfo != NULL)
874 		destroy_vpollinfo(vp->v_pollinfo);
875 #ifdef INVARIANTS
876 	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
877 	vp->v_op = NULL;
878 #endif
879 	lockdestroy(vp->v_vnlock);
880 	mtx_destroy(&vp->v_interlock);
881 	mtx_destroy(BO_MTX(bo));
882 	uma_zfree(vnode_zone, vp);
883 }
884 
885 /*
886  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
887  * before we actually vgone().  This function must be called with the vnode
888  * held to prevent the vnode from being returned to the free list midway
889  * through vgone().
890  */
891 static int
892 vtryrecycle(struct vnode *vp)
893 {
894 	struct mount *vnmp;
895 
896 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
897 	VNASSERT(vp->v_holdcnt, vp,
898 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
899 	/*
900 	 * This vnode may be found and locked via some other list; if so we
901 	 * can't recycle it yet.
902 	 */
903 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
904 		CTR2(KTR_VFS,
905 		    "%s: impossible to recycle, vp %p lock is already held",
906 		    __func__, vp);
907 		return (EWOULDBLOCK);
908 	}
909 	/*
910 	 * Don't recycle if its filesystem is being suspended.
911 	 */
912 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
913 		VOP_UNLOCK(vp, 0);
914 		CTR2(KTR_VFS,
915 		    "%s: impossible to recycle, cannot start the write for %p",
916 		    __func__, vp);
917 		return (EBUSY);
918 	}
919 	/*
920 	 * If we got this far, we need to acquire the interlock and see if
921 	 * anyone picked up this vnode from another list.  If not, we will
922 	 * mark it with DOOMED via vgonel() so that anyone who does find it
923 	 * will skip over it.
924 	 */
925 	VI_LOCK(vp);
926 	if (vp->v_usecount) {
927 		VOP_UNLOCK(vp, LK_INTERLOCK);
928 		vn_finished_write(vnmp);
929 		CTR2(KTR_VFS,
930 		    "%s: impossible to recycle, %p is already referenced",
931 		    __func__, vp);
932 		return (EBUSY);
933 	}
934 	if ((vp->v_iflag & VI_DOOMED) == 0)
935 		vgonel(vp);
936 	VOP_UNLOCK(vp, LK_INTERLOCK);
937 	vn_finished_write(vnmp);
938 	return (0);
939 }
940 
941 /*
942  * Return the next vnode from the free list.
943  */
944 int
945 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
946     struct vnode **vpp)
947 {
948 	struct vnode *vp = NULL;
949 	struct bufobj *bo;
950 
951 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
952 	mtx_lock(&vnode_free_list_mtx);
953 	/*
954 	 * Lend our context to reclaim vnodes if they've exceeded the max.
955 	 */
956 	if (freevnodes > wantfreevnodes)
957 		vnlru_free(1);
958 	/*
959 	 * Wait for available vnodes.
960 	 */
961 	if (numvnodes > desiredvnodes) {
962 		if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
963 			/*
964 			 * The file system is being suspended; we cannot risk a
965 			 * deadlock here, so allocate a new vnode anyway.
966 			 */
967 			if (freevnodes > wantfreevnodes)
968 				vnlru_free(freevnodes - wantfreevnodes);
969 			goto alloc;
970 		}
971 		if (vnlruproc_sig == 0) {
972 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
973 			wakeup(vnlruproc);
974 		}
975 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
976 		    "vlruwk", hz);
977 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
978 		if (numvnodes > desiredvnodes) {
979 			mtx_unlock(&vnode_free_list_mtx);
980 			return (ENFILE);
981 		}
982 #endif
983 	}
984 alloc:
985 	numvnodes++;
986 	mtx_unlock(&vnode_free_list_mtx);
987 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
988 	/*
989 	 * Setup locks.
990 	 */
991 	vp->v_vnlock = &vp->v_lock;
992 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
993 	/*
994 	 * By default, don't allow shared locks unless filesystems
995 	 * opt-in.
996 	 */
997 	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
998 	/*
999 	 * Initialize bufobj.
1000 	 */
1001 	bo = &vp->v_bufobj;
1002 	bo->__bo_vnode = vp;
1003 	mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
1004 	bo->bo_ops = &buf_ops_bio;
1005 	bo->bo_private = vp;
1006 	TAILQ_INIT(&bo->bo_clean.bv_hd);
1007 	TAILQ_INIT(&bo->bo_dirty.bv_hd);
1008 	/*
1009 	 * Initialize namecache.
1010 	 */
1011 	LIST_INIT(&vp->v_cache_src);
1012 	TAILQ_INIT(&vp->v_cache_dst);
1013 	/*
1014 	 * Finalize various vnode identity bits.
1015 	 */
1016 	vp->v_type = VNON;
1017 	vp->v_tag = tag;
1018 	vp->v_op = vops;
1019 	v_incr_usecount(vp);
1020 	vp->v_data = 0;
1021 #ifdef MAC
1022 	mac_vnode_init(vp);
1023 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1024 		mac_vnode_associate_singlelabel(mp, vp);
1025 	else if (mp == NULL && vops != &dead_vnodeops)
1026 		printf("NULL mp in getnewvnode()\n");
1027 #endif
1028 	if (mp != NULL) {
1029 		bo->bo_bsize = mp->mnt_stat.f_iosize;
1030 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1031 			vp->v_vflag |= VV_NOKNOTE;
1032 	}
1033 
1034 	*vpp = vp;
1035 	return (0);
1036 }
1037 
1038 /*
1039  * Delete from old mount point vnode list, if on one.
1040  */
1041 static void
1042 delmntque(struct vnode *vp)
1043 {
1044 	struct mount *mp;
1045 
1046 	mp = vp->v_mount;
1047 	if (mp == NULL)
1048 		return;
1049 	MNT_ILOCK(mp);
1050 	vp->v_mount = NULL;
1051 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1052 		("bad mount point vnode list size"));
1053 	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1054 	mp->mnt_nvnodelistsize--;
1055 	MNT_REL(mp);
1056 	MNT_IUNLOCK(mp);
1057 }
1058 
1059 static void
1060 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1061 {
1062 
1063 	vp->v_data = NULL;
1064 	vp->v_op = &dead_vnodeops;
1065 	/* XXX non mp-safe fs may still call insmntque with vnode
1066 	   unlocked */
1067 	if (!VOP_ISLOCKED(vp))
1068 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1069 	vgone(vp);
1070 	vput(vp);
1071 }
1072 
1073 /*
1074  * Insert into list of vnodes for the new mount point, if available.
1075  */
1076 int
1077 insmntque1(struct vnode *vp, struct mount *mp,
1078 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1079 {
1080 	int locked;
1081 
1082 	KASSERT(vp->v_mount == NULL,
1083 		("insmntque: vnode already on per mount vnode list"));
1084 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1085 #ifdef DEBUG_VFS_LOCKS
1086 	if (!VFS_NEEDSGIANT(mp))
1087 		ASSERT_VOP_ELOCKED(vp,
1088 		    "insmntque: mp-safe fs and non-locked vp");
1089 #endif
1090 	MNT_ILOCK(mp);
1091 	if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1092 	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1093 	     mp->mnt_nvnodelistsize == 0)) {
1094 		locked = VOP_ISLOCKED(vp);
1095 		if (!locked || (locked == LK_EXCLUSIVE &&
1096 		     (vp->v_vflag & VV_FORCEINSMQ) == 0)) {
1097 			MNT_IUNLOCK(mp);
1098 			if (dtr != NULL)
1099 				dtr(vp, dtr_arg);
1100 			return (EBUSY);
1101 		}
1102 	}
1103 	vp->v_mount = mp;
1104 	MNT_REF(mp);
1105 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1106 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1107 		("neg mount point vnode list size"));
1108 	mp->mnt_nvnodelistsize++;
1109 	MNT_IUNLOCK(mp);
1110 	return (0);
1111 }
1112 
1113 int
1114 insmntque(struct vnode *vp, struct mount *mp)
1115 {
1116 
1117 	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1118 }
1119 
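/*
 * A rough sketch of how a filesystem's vget-style code typically pairs
 * getnewvnode() with insmntque() (illustrative only; the tag, vop vector
 * and per-fs "ip" data below are placeholders):
 *
 *	error = getnewvnode("somefs", mp, &somefs_vnodeops, &vp);
 *	if (error != 0)
 *		return (error);
 *	vp->v_data = ip;
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = insmntque(vp, mp);
 *	if (error != 0)
 *		return (error);
 *
 * On insmntque() failure the default destructor above has already
 * vgone()'d and vput() the vnode, so the caller must not release it a
 * second time.
 */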
1120 /*
1121  * Flush out and invalidate all buffers associated with a bufobj
1122  * Called with the underlying object locked.
1123  */
1124 int
1125 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1126 {
1127 	int error;
1128 
1129 	BO_LOCK(bo);
1130 	if (flags & V_SAVE) {
1131 		error = bufobj_wwait(bo, slpflag, slptimeo);
1132 		if (error) {
1133 			BO_UNLOCK(bo);
1134 			return (error);
1135 		}
1136 		if (bo->bo_dirty.bv_cnt > 0) {
1137 			BO_UNLOCK(bo);
1138 			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1139 				return (error);
1140 			/*
1141 			 * XXX We could save a lock/unlock if this was only
1142 			 * enabled under INVARIANTS
1143 			 */
1144 			BO_LOCK(bo);
1145 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1146 				panic("vinvalbuf: dirty bufs");
1147 		}
1148 	}
1149 	/*
1150 	 * If you alter this loop please notice that interlock is dropped and
1151 	 * reacquired in flushbuflist.  Special care is needed to ensure that
1152 	 * no race conditions occur from this.
1153 	 */
1154 	do {
1155 		error = flushbuflist(&bo->bo_clean,
1156 		    flags, bo, slpflag, slptimeo);
1157 		if (error == 0)
1158 			error = flushbuflist(&bo->bo_dirty,
1159 			    flags, bo, slpflag, slptimeo);
1160 		if (error != 0 && error != EAGAIN) {
1161 			BO_UNLOCK(bo);
1162 			return (error);
1163 		}
1164 	} while (error != 0);
1165 
1166 	/*
1167 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1168 	 * have write I/O in-progress but if there is a VM object then the
1169 	 * VM object can also have read-I/O in-progress.
1170 	 */
1171 	do {
1172 		bufobj_wwait(bo, 0, 0);
1173 		BO_UNLOCK(bo);
1174 		if (bo->bo_object != NULL) {
1175 			VM_OBJECT_LOCK(bo->bo_object);
1176 			vm_object_pip_wait(bo->bo_object, "bovlbx");
1177 			VM_OBJECT_UNLOCK(bo->bo_object);
1178 		}
1179 		BO_LOCK(bo);
1180 	} while (bo->bo_numoutput > 0);
1181 	BO_UNLOCK(bo);
1182 
1183 	/*
1184 	 * Destroy the copy in the VM cache, too.
1185 	 */
1186 	if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL)) == 0) {
1187 		VM_OBJECT_LOCK(bo->bo_object);
1188 		vm_object_page_remove(bo->bo_object, 0, 0,
1189 			(flags & V_SAVE) ? TRUE : FALSE);
1190 		VM_OBJECT_UNLOCK(bo->bo_object);
1191 	}
1192 
1193 #ifdef INVARIANTS
1194 	BO_LOCK(bo);
1195 	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
1196 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1197 		panic("vinvalbuf: flush failed");
1198 	BO_UNLOCK(bo);
1199 #endif
1200 	return (0);
1201 }
1202 
1203 /*
1204  * Flush out and invalidate all buffers associated with a vnode.
1205  * Called with the underlying object locked.
1206  */
1207 int
1208 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1209 {
1210 
1211 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1212 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1213 	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1214 }
1215 
1216 /*
1217  * Flush out buffers on the specified list.
1218  *
1219  */
1220 static int
1221 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1222     int slptimeo)
1223 {
1224 	struct buf *bp, *nbp;
1225 	int retval, error;
1226 	daddr_t lblkno;
1227 	b_xflags_t xflags;
1228 
1229 	ASSERT_BO_LOCKED(bo);
1230 
1231 	retval = 0;
1232 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1233 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1234 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1235 			continue;
1236 		}
1237 		lblkno = 0;
1238 		xflags = 0;
1239 		if (nbp != NULL) {
1240 			lblkno = nbp->b_lblkno;
1241 			xflags = nbp->b_xflags &
1242 				(BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
1243 		}
1244 		retval = EAGAIN;
1245 		error = BUF_TIMELOCK(bp,
1246 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
1247 		    "flushbuf", slpflag, slptimeo);
1248 		if (error) {
1249 			BO_LOCK(bo);
1250 			return (error != ENOLCK ? error : EAGAIN);
1251 		}
1252 		KASSERT(bp->b_bufobj == bo,
1253 		    ("bp %p wrong b_bufobj %p should be %p",
1254 		    bp, bp->b_bufobj, bo));
1255 		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1256 			BUF_UNLOCK(bp);
1257 			BO_LOCK(bo);
1258 			return (EAGAIN);
1259 		}
1260 		/*
1261 		 * XXX Since there are no node locks for NFS, I
1262 		 * believe there is a slight chance that a delayed
1263 		 * write will occur while sleeping just above, so
1264 		 * check for it.
1265 		 */
1266 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1267 		    (flags & V_SAVE)) {
1268 			BO_LOCK(bo);
1269 			bremfree(bp);
1270 			BO_UNLOCK(bo);
1271 			bp->b_flags |= B_ASYNC;
1272 			bwrite(bp);
1273 			BO_LOCK(bo);
1274 			return (EAGAIN);	/* XXX: why not loop ? */
1275 		}
1276 		BO_LOCK(bo);
1277 		bremfree(bp);
1278 		BO_UNLOCK(bo);
1279 		bp->b_flags |= (B_INVAL | B_RELBUF);
1280 		bp->b_flags &= ~B_ASYNC;
1281 		brelse(bp);
1282 		BO_LOCK(bo);
1283 		if (nbp != NULL &&
1284 		    (nbp->b_bufobj != bo ||
1285 		     nbp->b_lblkno != lblkno ||
1286 		     (nbp->b_xflags &
1287 		      (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1288 			break;			/* nbp invalid */
1289 	}
1290 	return (retval);
1291 }
1292 
1293 /*
1294  * Truncate a file's buffer and pages to a specified length.  This
1295  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1296  * sync activity.
1297  */
1298 int
1299 vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
1300     off_t length, int blksize)
1301 {
1302 	struct buf *bp, *nbp;
1303 	int anyfreed;
1304 	int trunclbn;
1305 	struct bufobj *bo;
1306 
1307 	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1308 	    vp, cred, blksize, (uintmax_t)length);
1309 
1310 	/*
1311 	 * Round up to the *next* lbn.
1312 	 */
1313 	trunclbn = (length + blksize - 1) / blksize;
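	/*
	 * For example (illustrative): truncating to length 10000 with a
	 * 4096-byte block size gives trunclbn == (10000 + 4095) / 4096 == 3,
	 * so buffers at logical block 3 and beyond are discarded while
	 * blocks 0-2, which still hold valid data, are kept.
	 */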
1314 
1315 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1316 restart:
1317 	bo = &vp->v_bufobj;
1318 	BO_LOCK(bo);
1319 	anyfreed = 1;
1320 	for (;anyfreed;) {
1321 		anyfreed = 0;
1322 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1323 			if (bp->b_lblkno < trunclbn)
1324 				continue;
1325 			if (BUF_LOCK(bp,
1326 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1327 			    BO_MTX(bo)) == ENOLCK)
1328 				goto restart;
1329 
1330 			BO_LOCK(bo);
1331 			bremfree(bp);
1332 			BO_UNLOCK(bo);
1333 			bp->b_flags |= (B_INVAL | B_RELBUF);
1334 			bp->b_flags &= ~B_ASYNC;
1335 			brelse(bp);
1336 			anyfreed = 1;
1337 
1338 			BO_LOCK(bo);
1339 			if (nbp != NULL &&
1340 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1341 			    (nbp->b_vp != vp) ||
1342 			    (nbp->b_flags & B_DELWRI))) {
1343 				BO_UNLOCK(bo);
1344 				goto restart;
1345 			}
1346 		}
1347 
1348 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1349 			if (bp->b_lblkno < trunclbn)
1350 				continue;
1351 			if (BUF_LOCK(bp,
1352 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1353 			    BO_MTX(bo)) == ENOLCK)
1354 				goto restart;
1355 			BO_LOCK(bo);
1356 			bremfree(bp);
1357 			BO_UNLOCK(bo);
1358 			bp->b_flags |= (B_INVAL | B_RELBUF);
1359 			bp->b_flags &= ~B_ASYNC;
1360 			brelse(bp);
1361 			anyfreed = 1;
1362 
1363 			BO_LOCK(bo);
1364 			if (nbp != NULL &&
1365 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1366 			    (nbp->b_vp != vp) ||
1367 			    (nbp->b_flags & B_DELWRI) == 0)) {
1368 				BO_UNLOCK(bo);
1369 				goto restart;
1370 			}
1371 		}
1372 	}
1373 
1374 	if (length > 0) {
1375 restartsync:
1376 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1377 			if (bp->b_lblkno > 0)
1378 				continue;
1379 			/*
1380 			 * Since we hold the vnode lock this should only
1381 			 * fail if we're racing with the buf daemon.
1382 			 */
1383 			if (BUF_LOCK(bp,
1384 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1385 			    BO_MTX(bo)) == ENOLCK) {
1386 				goto restart;
1387 			}
1388 			VNASSERT((bp->b_flags & B_DELWRI), vp,
1389 			    ("buf(%p) on dirty queue without DELWRI", bp));
1390 
1391 			BO_LOCK(bo);
1392 			bremfree(bp);
1393 			BO_UNLOCK(bo);
1394 			bawrite(bp);
1395 			BO_LOCK(bo);
1396 			goto restartsync;
1397 		}
1398 	}
1399 
1400 	bufobj_wwait(bo, 0, 0);
1401 	BO_UNLOCK(bo);
1402 	vnode_pager_setsize(vp, length);
1403 
1404 	return (0);
1405 }
1406 
1407 /*
1408  * buf_splay() - splay tree core for the clean/dirty list of buffers in
1409  *		 a vnode.
1410  *
1411  *	NOTE: We have to deal with the special case of a background bitmap
1412  *	buffer, a situation where two buffers will have the same logical
1413  *	block offset.  We want (1) only the foreground buffer to be accessed
1414  *	in a lookup and (2) to differentiate between the foreground and
1415  *	background buffer in the splay tree algorithm because the splay
1416  *	tree cannot normally handle multiple entities with the same 'index'.
1417  *	We accomplish this by adding differentiating flags to the splay tree's
1418  *	numerical domain.
1419  */
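/*
 *	For example (illustrative): if a foreground buffer and the shadow
 *	used for its background write both sit at lblkno 42, only the
 *	shadow carries BX_BKGRDMARKER, so the comparisons below order the
 *	foreground buffer strictly before the shadow and a lookup with
 *	xflags == 0 (as gbincore() performs) finds the foreground buffer
 *	rather than the shadow.
 */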
1420 static
1421 struct buf *
1422 buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1423 {
1424 	struct buf dummy;
1425 	struct buf *lefttreemax, *righttreemin, *y;
1426 
1427 	if (root == NULL)
1428 		return (NULL);
1429 	lefttreemax = righttreemin = &dummy;
1430 	for (;;) {
1431 		if (lblkno < root->b_lblkno ||
1432 		    (lblkno == root->b_lblkno &&
1433 		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1434 			if ((y = root->b_left) == NULL)
1435 				break;
1436 			if (lblkno < y->b_lblkno) {
1437 				/* Rotate right. */
1438 				root->b_left = y->b_right;
1439 				y->b_right = root;
1440 				root = y;
1441 				if ((y = root->b_left) == NULL)
1442 					break;
1443 			}
1444 			/* Link into the new root's right tree. */
1445 			righttreemin->b_left = root;
1446 			righttreemin = root;
1447 		} else if (lblkno > root->b_lblkno ||
1448 		    (lblkno == root->b_lblkno &&
1449 		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1450 			if ((y = root->b_right) == NULL)
1451 				break;
1452 			if (lblkno > y->b_lblkno) {
1453 				/* Rotate left. */
1454 				root->b_right = y->b_left;
1455 				y->b_left = root;
1456 				root = y;
1457 				if ((y = root->b_right) == NULL)
1458 					break;
1459 			}
1460 			/* Link into the new root's left tree. */
1461 			lefttreemax->b_right = root;
1462 			lefttreemax = root;
1463 		} else {
1464 			break;
1465 		}
1466 		root = y;
1467 	}
1468 	/* Assemble the new root. */
1469 	lefttreemax->b_right = root->b_left;
1470 	righttreemin->b_left = root->b_right;
1471 	root->b_left = dummy.b_right;
1472 	root->b_right = dummy.b_left;
1473 	return (root);
1474 }
1475 
1476 static void
1477 buf_vlist_remove(struct buf *bp)
1478 {
1479 	struct buf *root;
1480 	struct bufv *bv;
1481 
1482 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1483 	ASSERT_BO_LOCKED(bp->b_bufobj);
1484 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1485 	    (BX_VNDIRTY|BX_VNCLEAN),
1486 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1487 	if (bp->b_xflags & BX_VNDIRTY)
1488 		bv = &bp->b_bufobj->bo_dirty;
1489 	else
1490 		bv = &bp->b_bufobj->bo_clean;
1491 	if (bp != bv->bv_root) {
1492 		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1493 		KASSERT(root == bp, ("splay lookup failed in remove"));
1494 	}
1495 	if (bp->b_left == NULL) {
1496 		root = bp->b_right;
1497 	} else {
1498 		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1499 		root->b_right = bp->b_right;
1500 	}
1501 	bv->bv_root = root;
1502 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1503 	bv->bv_cnt--;
1504 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1505 }
1506 
1507 /*
1508  * Add the buffer to the sorted clean or dirty block list using a
1509  * splay tree algorithm.
1510  *
1511  * NOTE: xflags is passed as a constant, optimizing this inline function!
1512  */
1513 static void
1514 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1515 {
1516 	struct buf *root;
1517 	struct bufv *bv;
1518 
1519 	ASSERT_BO_LOCKED(bo);
1520 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1521 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1522 	bp->b_xflags |= xflags;
1523 	if (xflags & BX_VNDIRTY)
1524 		bv = &bo->bo_dirty;
1525 	else
1526 		bv = &bo->bo_clean;
1527 
1528 	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1529 	if (root == NULL) {
1530 		bp->b_left = NULL;
1531 		bp->b_right = NULL;
1532 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1533 	} else if (bp->b_lblkno < root->b_lblkno ||
1534 	    (bp->b_lblkno == root->b_lblkno &&
1535 	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1536 		bp->b_left = root->b_left;
1537 		bp->b_right = root;
1538 		root->b_left = NULL;
1539 		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
1540 	} else {
1541 		bp->b_right = root->b_right;
1542 		bp->b_left = root;
1543 		root->b_right = NULL;
1544 		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
1545 	}
1546 	bv->bv_cnt++;
1547 	bv->bv_root = bp;
1548 }
1549 
1550 /*
1551  * Lookup a buffer using the splay tree.  Note that we specifically avoid
1552  * shadow buffers used in background bitmap writes.
1553  *
1554  * This code isn't quite as efficient as it could be because we are maintaining
1555  * two sorted lists and do not know which list the block resides in.
1556  *
1557  * During a "make buildworld" the desired buffer is found at one of
1558  * the roots more than 60% of the time.  Thus, checking both roots
1559  * before performing either splay eliminates unnecessary splays on the
1560  * first tree splayed.
1561  */
1562 struct buf *
1563 gbincore(struct bufobj *bo, daddr_t lblkno)
1564 {
1565 	struct buf *bp;
1566 
1567 	ASSERT_BO_LOCKED(bo);
1568 	if ((bp = bo->bo_clean.bv_root) != NULL &&
1569 	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1570 		return (bp);
1571 	if ((bp = bo->bo_dirty.bv_root) != NULL &&
1572 	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1573 		return (bp);
1574 	if ((bp = bo->bo_clean.bv_root) != NULL) {
1575 		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
1576 		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1577 			return (bp);
1578 	}
1579 	if ((bp = bo->bo_dirty.bv_root) != NULL) {
1580 		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
1581 		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1582 			return (bp);
1583 	}
1584 	return (NULL);
1585 }
1586 
1587 /*
1588  * Associate a buffer with a vnode.
1589  */
1590 void
1591 bgetvp(struct vnode *vp, struct buf *bp)
1592 {
1593 	struct bufobj *bo;
1594 
1595 	bo = &vp->v_bufobj;
1596 	ASSERT_BO_LOCKED(bo);
1597 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1598 
1599 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1600 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1601 	    ("bgetvp: bp already attached! %p", bp));
1602 
1603 	vhold(vp);
1604 	if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT)
1605 		bp->b_flags |= B_NEEDSGIANT;
1606 	bp->b_vp = vp;
1607 	bp->b_bufobj = bo;
1608 	/*
1609 	 * Insert onto list for new vnode.
1610 	 */
1611 	buf_vlist_add(bp, bo, BX_VNCLEAN);
1612 }
1613 
1614 /*
1615  * Disassociate a buffer from a vnode.
1616  */
1617 void
1618 brelvp(struct buf *bp)
1619 {
1620 	struct bufobj *bo;
1621 	struct vnode *vp;
1622 
1623 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1624 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1625 
1626 	/*
1627 	 * Delete from old vnode list, if on one.
1628 	 */
1629 	vp = bp->b_vp;		/* XXX */
1630 	bo = bp->b_bufobj;
1631 	BO_LOCK(bo);
1632 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1633 		buf_vlist_remove(bp);
1634 	else
1635 		panic("brelvp: Buffer %p not on queue.", bp);
1636 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1637 		bo->bo_flag &= ~BO_ONWORKLST;
1638 		mtx_lock(&sync_mtx);
1639 		LIST_REMOVE(bo, bo_synclist);
1640 		syncer_worklist_len--;
1641 		mtx_unlock(&sync_mtx);
1642 	}
1643 	bp->b_flags &= ~B_NEEDSGIANT;
1644 	bp->b_vp = NULL;
1645 	bp->b_bufobj = NULL;
1646 	BO_UNLOCK(bo);
1647 	vdrop(vp);
1648 }
1649 
1650 /*
1651  * Add an item to the syncer work queue.
1652  */
1653 static void
1654 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1655 {
1656 	int queue, slot;
1657 
1658 	ASSERT_BO_LOCKED(bo);
1659 
1660 	mtx_lock(&sync_mtx);
1661 	if (bo->bo_flag & BO_ONWORKLST)
1662 		LIST_REMOVE(bo, bo_synclist);
1663 	else {
1664 		bo->bo_flag |= BO_ONWORKLST;
1665 		syncer_worklist_len++;
1666 	}
1667 
1668 	if (delay > syncer_maxdelay - 2)
1669 		delay = syncer_maxdelay - 2;
1670 	slot = (syncer_delayno + delay) & syncer_mask;
1671 
1672 	queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ :
1673 	    WI_MPSAFEQ;
1674 	LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo,
1675 	    bo_synclist);
1676 	mtx_unlock(&sync_mtx);
1677 }
1678 
1679 static int
1680 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1681 {
1682 	int error, len;
1683 
1684 	mtx_lock(&sync_mtx);
1685 	len = syncer_worklist_len - sync_vnode_count;
1686 	mtx_unlock(&sync_mtx);
1687 	error = SYSCTL_OUT(req, &len, sizeof(len));
1688 	return (error);
1689 }
1690 
1691 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1692     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1693 
1694 static struct proc *updateproc;
1695 static void sched_sync(void);
1696 static struct kproc_desc up_kp = {
1697 	"syncer",
1698 	sched_sync,
1699 	&updateproc
1700 };
1701 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1702 
1703 static int
1704 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1705 {
1706 	struct vnode *vp;
1707 	struct mount *mp;
1708 
1709 	*bo = LIST_FIRST(slp);
1710 	if (*bo == NULL)
1711 		return (0);
1712 	vp = (*bo)->__bo_vnode;	/* XXX */
1713 	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1714 		return (1);
1715 	/*
1716 	 * We use vhold in case the vnode does not
1717 	 * successfully sync.  vhold prevents the vnode from
1718 	 * going away when we unlock the sync_mtx so that
1719 	 * we can acquire the vnode interlock.
1720 	 */
1721 	vholdl(vp);
1722 	mtx_unlock(&sync_mtx);
1723 	VI_UNLOCK(vp);
1724 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1725 		vdrop(vp);
1726 		mtx_lock(&sync_mtx);
1727 		return (*bo == LIST_FIRST(slp));
1728 	}
1729 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1730 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1731 	VOP_UNLOCK(vp, 0);
1732 	vn_finished_write(mp);
1733 	BO_LOCK(*bo);
1734 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1735 		/*
1736 		 * Put us back on the worklist.  The worklist
1737 		 * routine will remove us from our current
1738 		 * position and then add us back in at a later
1739 		 * position.
1740 		 */
1741 		vn_syncer_add_to_worklist(*bo, syncdelay);
1742 	}
1743 	BO_UNLOCK(*bo);
1744 	vdrop(vp);
1745 	mtx_lock(&sync_mtx);
1746 	return (0);
1747 }
1748 
1749 /*
1750  * System filesystem synchronizer daemon.
1751  */
1752 static void
1753 sched_sync(void)
1754 {
1755 	struct synclist *gnext, *next;
1756 	struct synclist *gslp, *slp;
1757 	struct bufobj *bo;
1758 	long starttime;
1759 	struct thread *td = curthread;
1760 	int last_work_seen;
1761 	int net_worklist_len;
1762 	int syncer_final_iter;
1763 	int first_printf;
1764 	int error;
1765 
1766 	last_work_seen = 0;
1767 	syncer_final_iter = 0;
1768 	first_printf = 1;
1769 	syncer_state = SYNCER_RUNNING;
1770 	starttime = time_uptime;
1771 	td->td_pflags |= TDP_NORUNNINGBUF;
1772 
1773 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1774 	    SHUTDOWN_PRI_LAST);
1775 
1776 	mtx_lock(&sync_mtx);
1777 	for (;;) {
1778 		if (syncer_state == SYNCER_FINAL_DELAY &&
1779 		    syncer_final_iter == 0) {
1780 			mtx_unlock(&sync_mtx);
1781 			kproc_suspend_check(td->td_proc);
1782 			mtx_lock(&sync_mtx);
1783 		}
1784 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1785 		if (syncer_state != SYNCER_RUNNING &&
1786 		    starttime != time_uptime) {
1787 			if (first_printf) {
1788 				printf("\nSyncing disks, vnodes remaining...");
1789 				first_printf = 0;
1790 			}
1791 			printf("%d ", net_worklist_len);
1792 		}
1793 		starttime = time_uptime;
1794 
1795 		/*
1796 		 * Push files whose dirty time has expired.  Be careful
1797 		 * of interrupt race on slp queue.
1798 		 *
1799 		 * Skip over empty worklist slots when shutting down.
1800 		 */
1801 		do {
1802 			slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
1803 			gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
1804 			syncer_delayno += 1;
1805 			if (syncer_delayno == syncer_maxdelay)
1806 				syncer_delayno = 0;
1807 			next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
1808 			gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
1809 			/*
1810 			 * If the worklist has wrapped since it
1811 			 * was emptied of all but syncer vnodes,
1812 			 * switch to the FINAL_DELAY state and run
1813 			 * for one more second.
1814 			 */
1815 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1816 			    net_worklist_len == 0 &&
1817 			    last_work_seen == syncer_delayno) {
1818 				syncer_state = SYNCER_FINAL_DELAY;
1819 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1820 			}
1821 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1822 		    LIST_EMPTY(gslp) && syncer_worklist_len > 0);
1823 
1824 		/*
1825 		 * Keep track of the last time there was anything
1826 		 * on the worklist other than syncer vnodes.
1827 		 * Return to the SHUTTING_DOWN state if any
1828 		 * new work appears.
1829 		 */
1830 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1831 			last_work_seen = syncer_delayno;
1832 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1833 			syncer_state = SYNCER_SHUTTING_DOWN;
1834 		while (!LIST_EMPTY(slp)) {
1835 			error = sync_vnode(slp, &bo, td);
1836 			if (error == 1) {
1837 				LIST_REMOVE(bo, bo_synclist);
1838 				LIST_INSERT_HEAD(next, bo, bo_synclist);
1839 				continue;
1840 			}
1841 		}
1842 		if (!LIST_EMPTY(gslp)) {
1843 			mtx_unlock(&sync_mtx);
1844 			mtx_lock(&Giant);
1845 			mtx_lock(&sync_mtx);
1846 			while (!LIST_EMPTY(gslp)) {
1847 				error = sync_vnode(gslp, &bo, td);
1848 				if (error == 1) {
1849 					LIST_REMOVE(bo, bo_synclist);
1850 					LIST_INSERT_HEAD(gnext, bo,
1851 					    bo_synclist);
1852 					continue;
1853 				}
1854 			}
1855 			mtx_unlock(&Giant);
1856 		}
1857 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1858 			syncer_final_iter--;
1859 		/*
1860 		 * The variable rushjob allows the kernel to speed up the
1861 		 * processing of the filesystem syncer process. A rushjob
1862 		 * value of N tells the filesystem syncer to process the next
1863 		 * N seconds worth of work on its queue ASAP. Currently rushjob
1864 		 * is used by the soft update code to speed up the filesystem
1865 		 * syncer process when the incore state is getting so far
1866 		 * ahead of the disk that the kernel memory pool is being
1867 		 * threatened with exhaustion.
1868 		 */
1869 		if (rushjob > 0) {
1870 			rushjob -= 1;
1871 			continue;
1872 		}
1873 		/*
1874 		 * Just sleep for a short period of time between
1875 		 * iterations when shutting down to allow some I/O
1876 		 * to happen.
1877 		 *
1878 		 * If it has taken us less than a second to process the
1879 		 * current work, then wait. Otherwise start right over
1880 		 * again. We can still lose time if any single round
1881 		 * takes more than two seconds, but it does not really
1882 		 * matter as we are just trying to generally pace the
1883 		 * filesystem activity.
1884 		 */
1885 		if (syncer_state != SYNCER_RUNNING ||
1886 		    time_uptime == starttime) {
1887 			thread_lock(td);
1888 			sched_prio(td, PPAUSE);
1889 			thread_unlock(td);
1890 		}
1891 		if (syncer_state != SYNCER_RUNNING)
1892 			cv_timedwait(&sync_wakeup, &sync_mtx,
1893 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1894 		else if (time_uptime == starttime)
1895 			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1896 	}
1897 }
1898 
1899 /*
1900  * Request the syncer daemon to speed up its work.
1901  * We never push it to speed up more than half of its
1902  * normal turn time; otherwise it could take over the CPU.
1903  */
1904 int
1905 speedup_syncer(void)
1906 {
1907 	int ret = 0;
1908 
1909 	mtx_lock(&sync_mtx);
1910 	if (rushjob < syncdelay / 2) {
1911 		rushjob += 1;
1912 		stat_rush_requests += 1;
1913 		ret = 1;
1914 	}
1915 	mtx_unlock(&sync_mtx);
1916 	cv_broadcast(&sync_wakeup);
1917 	return (ret);
1918 }
1919 
1920 /*
1921  * Tell the syncer to speed up its work and run through its work
1922  * list several times, then tell it to shut down.
1923  */
1924 static void
1925 syncer_shutdown(void *arg, int howto)
1926 {
1927 
1928 	if (howto & RB_NOSYNC)
1929 		return;
1930 	mtx_lock(&sync_mtx);
1931 	syncer_state = SYNCER_SHUTTING_DOWN;
1932 	rushjob = 0;
1933 	mtx_unlock(&sync_mtx);
1934 	cv_broadcast(&sync_wakeup);
1935 	kproc_shutdown(arg, howto);
1936 }
1937 
1938 /*
1939  * Reassign a buffer from one vnode to another.
1940  * Used to assign file specific control information
1941  * (indirect blocks) to the vnode to which they belong.
1942  */
1943 void
1944 reassignbuf(struct buf *bp)
1945 {
1946 	struct vnode *vp;
1947 	struct bufobj *bo;
1948 	int delay;
1949 #ifdef INVARIANTS
1950 	struct bufv *bv;
1951 #endif
1952 
1953 	vp = bp->b_vp;
1954 	bo = bp->b_bufobj;
1955 	++reassignbufcalls;
1956 
1957 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
1958 	    bp, bp->b_vp, bp->b_flags);
1959 	/*
1960 	 * B_PAGING flagged buffers cannot be reassigned because their vp
1961 	 * is not fully linked in.
1962 	 */
1963 	if (bp->b_flags & B_PAGING)
1964 		panic("cannot reassign paging buffer");
1965 
1966 	/*
1967 	 * Delete from old vnode list, if on one.
1968 	 */
1969 	BO_LOCK(bo);
1970 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1971 		buf_vlist_remove(bp);
1972 	else
1973 		panic("reassignbuf: Buffer %p not on queue.", bp);
1974 	/*
1975 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1976 	 * of clean buffers.
1977 	 */
1978 	if (bp->b_flags & B_DELWRI) {
1979 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
1980 			switch (vp->v_type) {
1981 			case VDIR:
1982 				delay = dirdelay;
1983 				break;
1984 			case VCHR:
1985 				delay = metadelay;
1986 				break;
1987 			default:
1988 				delay = filedelay;
1989 			}
1990 			vn_syncer_add_to_worklist(bo, delay);
1991 		}
1992 		buf_vlist_add(bp, bo, BX_VNDIRTY);
1993 	} else {
1994 		buf_vlist_add(bp, bo, BX_VNCLEAN);
1995 
1996 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1997 			mtx_lock(&sync_mtx);
1998 			LIST_REMOVE(bo, bo_synclist);
1999 			syncer_worklist_len--;
2000 			mtx_unlock(&sync_mtx);
2001 			bo->bo_flag &= ~BO_ONWORKLST;
2002 		}
2003 	}
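	/*
	 * Sanity check: the buffers at both ends of the clean and dirty
	 * queues must still point back at this bufobj.
	 */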
2004 #ifdef INVARIANTS
2005 	bv = &bo->bo_clean;
2006 	bp = TAILQ_FIRST(&bv->bv_hd);
2007 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2008 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2009 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2010 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2011 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2012 	bv = &bo->bo_dirty;
2013 	bp = TAILQ_FIRST(&bv->bv_hd);
2014 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2015 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2016 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2017 	KASSERT(bp == NULL || bp->b_bufobj == bo,
2018 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2019 #endif
2020 	BO_UNLOCK(bo);
2021 }
2022 
2023 /*
2024  * Increment the use and hold counts on the vnode, taking care to reference
2025  * the driver's usecount if this is a chardev.  The vholdl() will remove
2026  * the vnode from the free list if it is presently free.  Requires the
2027  * vnode interlock and returns with it held.
2028  */
2029 static void
2030 v_incr_usecount(struct vnode *vp)
2031 {
2032 
2033 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2034 	vp->v_usecount++;
2035 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2036 		dev_lock();
2037 		vp->v_rdev->si_usecount++;
2038 		dev_unlock();
2039 	}
2040 	vholdl(vp);
2041 }
2042 
2043 /*
2044  * Turn a holdcnt into a use+holdcnt such that only one call to
2045  * v_decr_usecount is needed.
2046  */
2047 static void
2048 v_upgrade_usecount(struct vnode *vp)
2049 {
2050 
2051 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2052 	vp->v_usecount++;
2053 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2054 		dev_lock();
2055 		vp->v_rdev->si_usecount++;
2056 		dev_unlock();
2057 	}
2058 }
2059 
2060 /*
2061  * Decrement the vnode use and hold count along with the driver's usecount
2062  * if this is a chardev.  The vdropl() below releases the vnode interlock
2063  * as it may free the vnode.
2064  */
2065 static void
2066 v_decr_usecount(struct vnode *vp)
2067 {
2068 
2069 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2070 	VNASSERT(vp->v_usecount > 0, vp,
2071 	    ("v_decr_usecount: negative usecount"));
2072 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2073 	vp->v_usecount--;
2074 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2075 		dev_lock();
2076 		vp->v_rdev->si_usecount--;
2077 		dev_unlock();
2078 	}
2079 	vdropl(vp);
2080 }
2081 
2082 /*
2083  * Decrement only the use count and driver use count.  This is intended to
2084  * be paired with a follow on vdropl() to release the remaining hold count.
2085  * In this way we may vgone() a vnode with a 0 usecount without risk of
2086  * having it end up on a free list because the hold count is kept above 0.
2087  */
2088 static void
2089 v_decr_useonly(struct vnode *vp)
2090 {
2091 
2092 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2093 	VNASSERT(vp->v_usecount > 0, vp,
2094 	    ("v_decr_useonly: negative usecount"));
2095 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2096 	vp->v_usecount--;
2097 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2098 		dev_lock();
2099 		vp->v_rdev->si_usecount--;
2100 		dev_unlock();
2101 	}
2102 }
2103 
2104 /*
2105  * Grab a particular vnode from the free list, increment its
2106  * reference count and lock it.  VI_DOOMED is set if the vnode
2107  * is being destroyed.  Only callers who specify LK_RETRY will
2108  * see doomed vnodes.  If inactive processing was delayed in
2109  * vput(), try to do it here.
2110  */
2111 int
2112 vget(struct vnode *vp, int flags, struct thread *td)
2113 {
2114 	int error;
2115 
2116 	error = 0;
2117 	VFS_ASSERT_GIANT(vp->v_mount);
2118 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2119 	    ("vget: invalid lock operation"));
2120 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2121 
2122 	if ((flags & LK_INTERLOCK) == 0)
2123 		VI_LOCK(vp);
2124 	vholdl(vp);
2125 	if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2126 		vdrop(vp);
2127 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2128 		    vp);
2129 		return (error);
2130 	}
2131 	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2132 		panic("vget: vn_lock failed to return ENOENT\n");
2133 	VI_LOCK(vp);
2134 	/* Upgrade our holdcnt to a usecount. */
2135 	v_upgrade_usecount(vp);
2136 	/*
2137 	 * We don't guarantee that any particular close will
2138 	 * trigger inactive processing so just make a best effort
2139 	 * here at preventing a reference to a removed file.  If
2140 	 * we don't succeed no harm is done.
2141 	 */
2142 	if (vp->v_iflag & VI_OWEINACT) {
2143 		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2144 		    (flags & LK_NOWAIT) == 0)
2145 			vinactive(vp, td);
2146 		vp->v_iflag &= ~VI_OWEINACT;
2147 	}
2148 	VI_UNLOCK(vp);
2149 	return (0);
2150 }
2151 
2152 /*
2153  * Increase the reference count of a vnode.
2154  */
2155 void
2156 vref(struct vnode *vp)
2157 {
2158 
2159 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2160 	VI_LOCK(vp);
2161 	v_incr_usecount(vp);
2162 	VI_UNLOCK(vp);
2163 }
2164 
2165 /*
2166  * Return reference count of a vnode.
2167  *
2168  * The results of this call are only guaranteed when some mechanism other
2169  * than the VI lock is used to stop other processes from gaining references
2170  * to the vnode.  This may be the case if the caller holds the only reference.
2171  * This is also useful when stale data is acceptable as race conditions may
2172  * be accounted for by some other means.
2173  */
2174 int
2175 vrefcnt(struct vnode *vp)
2176 {
2177 	int usecnt;
2178 
2179 	VI_LOCK(vp);
2180 	usecnt = vp->v_usecount;
2181 	VI_UNLOCK(vp);
2182 
2183 	return (usecnt);
2184 }
2185 
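/*
 * Flavors of the common vrele()/vput()/vunref() implementation in
 * vputx() below; they differ only in the vnode lock state expected
 * from the caller.
 */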
2186 #define	VPUTX_VRELE	1
2187 #define	VPUTX_VPUT	2
2188 #define	VPUTX_VUNREF	3
2189 
2190 static void
2191 vputx(struct vnode *vp, int func)
2192 {
2193 	int error;
2194 
2195 	KASSERT(vp != NULL, ("vputx: null vp"));
2196 	if (func == VPUTX_VUNREF)
2197 		ASSERT_VOP_LOCKED(vp, "vunref");
2198 	else if (func == VPUTX_VPUT)
2199 		ASSERT_VOP_LOCKED(vp, "vput");
2200 	else
2201 		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2202 	VFS_ASSERT_GIANT(vp->v_mount);
2203 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2204 	VI_LOCK(vp);
2205 
2206 	/* Skip this v_writecount check if we're going to panic below. */
2207 	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2208 	    ("vputx: missed vn_close"));
2209 	error = 0;
2210 
2211 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2212 	    vp->v_usecount == 1)) {
2213 		if (func == VPUTX_VPUT)
2214 			VOP_UNLOCK(vp, 0);
2215 		v_decr_usecount(vp);
2216 		return;
2217 	}
2218 
2219 	if (vp->v_usecount != 1) {
2220 		vprint("vputx: negative ref count", vp);
2221 		panic("vputx: negative ref cnt");
2222 	}
2223 	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2224 	/*
2225 	 * We want to hold the vnode until the inactive finishes to
2226 	 * prevent vgone() races.  We drop the use count here and the
2227 	 * hold count below when we're done.
2228 	 */
2229 	v_decr_useonly(vp);
2230 	/*
2231 	 * We must call VOP_INACTIVE with the node locked. Mark
2232 	 * as VI_DOINGINACT to avoid recursion.
2233 	 */
2234 	vp->v_iflag |= VI_OWEINACT;
2235 	switch (func) {
2236 	case VPUTX_VRELE:
2237 		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2238 		VI_LOCK(vp);
2239 		break;
2240 	case VPUTX_VPUT:
2241 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2242 			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2243 			    LK_NOWAIT);
2244 			VI_LOCK(vp);
2245 		}
2246 		break;
2247 	case VPUTX_VUNREF:
2248 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
2249 			error = EBUSY;
2250 		break;
2251 	}
2252 	if (vp->v_usecount > 0)
2253 		vp->v_iflag &= ~VI_OWEINACT;
2254 	if (error == 0) {
2255 		if (vp->v_iflag & VI_OWEINACT)
2256 			vinactive(vp, curthread);
2257 		if (func != VPUTX_VUNREF)
2258 			VOP_UNLOCK(vp, 0);
2259 	}
2260 	vdropl(vp);
2261 }
2262 
2263 /*
2264  * Vnode put/release.
2265  * If count drops to zero, call inactive routine and return to freelist.
2266  */
2267 void
2268 vrele(struct vnode *vp)
2269 {
2270 
2271 	vputx(vp, VPUTX_VRELE);
2272 }
2273 
2274 /*
2275  * Release an already locked vnode.  This gives the same effects as
2276  * unlock+vrele(), but takes less time and avoids releasing and
2277  * re-acquiring the lock (as vrele() acquires the lock internally).
2278  */
2279 void
2280 vput(struct vnode *vp)
2281 {
2282 
2283 	vputx(vp, VPUTX_VPUT);
2284 }
2285 
2286 /*
2287  * Release an exclusively locked vnode. Do not unlock the vnode lock.
2288  */
2289 void
2290 vunref(struct vnode *vp)
2291 {
2292 
2293 	vputx(vp, VPUTX_VUNREF);
2294 }
2295 
2296 /*
2297  * Somebody doesn't want the vnode recycled.
2298  */
2299 void
2300 vhold(struct vnode *vp)
2301 {
2302 
2303 	VI_LOCK(vp);
2304 	vholdl(vp);
2305 	VI_UNLOCK(vp);
2306 }
2307 
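/*
 * vhold() for a caller that already holds the vnode interlock: bump
 * the hold count and take the vnode off the free list if necessary.
 */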
2308 void
2309 vholdl(struct vnode *vp)
2310 {
2311 
2312 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2313 	vp->v_holdcnt++;
2314 	if (VSHOULDBUSY(vp))
2315 		vbusy(vp);
2316 }
2317 
2318 /*
2319  * Note that there is one less who cares about this vnode.  vdrop() is the
2320  * opposite of vhold().
2321  */
2322 void
2323 vdrop(struct vnode *vp)
2324 {
2325 
2326 	VI_LOCK(vp);
2327 	vdropl(vp);
2328 }
2329 
2330 /*
2331  * Drop the hold count of the vnode.  If this is the last reference to
2332  * the vnode we will free it if it has been vgone'd otherwise it is
2333  * placed on the free list.
2334  */
2335 void
2336 vdropl(struct vnode *vp)
2337 {
2338 
2339 	ASSERT_VI_LOCKED(vp, "vdropl");
2340 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2341 	if (vp->v_holdcnt <= 0)
2342 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2343 	vp->v_holdcnt--;
2344 	if (vp->v_holdcnt == 0) {
2345 		if (vp->v_iflag & VI_DOOMED) {
2346 			CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__,
2347 			    vp);
2348 			vdestroy(vp);
2349 			return;
2350 		} else
2351 			vfree(vp);
2352 	}
2353 	VI_UNLOCK(vp);
2354 }
2355 
2356 /*
2357  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2358  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2359  * OWEINACT tracks whether a vnode missed a call to inactive due to a
2360  * failed lock upgrade.
2361  */
2362 static void
2363 vinactive(struct vnode *vp, struct thread *td)
2364 {
2365 
2366 	ASSERT_VOP_ELOCKED(vp, "vinactive");
2367 	ASSERT_VI_LOCKED(vp, "vinactive");
2368 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2369 	    ("vinactive: recursed on VI_DOINGINACT"));
2370 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2371 	vp->v_iflag |= VI_DOINGINACT;
2372 	vp->v_iflag &= ~VI_OWEINACT;
2373 	VI_UNLOCK(vp);
2374 	VOP_INACTIVE(vp, td);
2375 	VI_LOCK(vp);
2376 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2377 	    ("vinactive: lost VI_DOINGINACT"));
2378 	vp->v_iflag &= ~VI_DOINGINACT;
2379 }
2380 
2381 /*
2382  * Remove any vnodes in the vnode table belonging to mount point mp.
2383  *
2384  * If FORCECLOSE is not specified, there should not be any active ones;
2385  * return an error if any are found (nb: this is a user error, not a
2386  * system error). If FORCECLOSE is specified, detach any active vnodes
2387  * that are found.
2388  *
2389  * If WRITECLOSE is set, only flush out regular file vnodes open for
2390  * writing.
2391  *
2392  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2393  *
2394  * `rootrefs' specifies the base reference count for the root vnode
2395  * of this filesystem. The root vnode is considered busy if its
2396  * v_usecount exceeds this value. On a successful return, vflush(, td)
2397  * will call vrele() on the root vnode exactly rootrefs times.
2398  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2399  * be zero.
2400  */
2401 #ifdef DIAGNOSTIC
2402 static int busyprt = 0;		/* print out busy vnodes */
2403 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2404 #endif
2405 
2406 int
2407 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2408 {
2409 	struct vnode *vp, *mvp, *rootvp = NULL;
2410 	struct vattr vattr;
2411 	int busy = 0, error;
2412 
2413 	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2414 	    rootrefs, flags);
2415 	if (rootrefs > 0) {
2416 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2417 		    ("vflush: bad args"));
2418 		/*
2419 		 * Get the filesystem root vnode. We can vput() it
2420 		 * immediately, since with rootrefs > 0, it won't go away.
2421 		 */
2422 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2423 			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2424 			    __func__, error);
2425 			return (error);
2426 		}
2427 		vput(rootvp);
2428 	}
2429 	MNT_ILOCK(mp);
2430 loop:
2431 	MNT_VNODE_FOREACH(vp, mp, mvp) {
2432 		VI_LOCK(vp);
2433 		vholdl(vp);
2434 		MNT_IUNLOCK(mp);
2435 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2436 		if (error) {
2437 			vdrop(vp);
2438 			MNT_ILOCK(mp);
2439 			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
2440 			goto loop;
2441 		}
2442 		/*
2443 		 * Skip over vnodes marked VV_SYSTEM.
2444 		 */
2445 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2446 			VOP_UNLOCK(vp, 0);
2447 			vdrop(vp);
2448 			MNT_ILOCK(mp);
2449 			continue;
2450 		}
2451 		/*
2452 		 * If WRITECLOSE is set, flush out unlinked but still open
2453 		 * files (even if open only for reading) and regular file
2454 		 * vnodes open for writing.
2455 		 */
2456 		if (flags & WRITECLOSE) {
2457 			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2458 			VI_LOCK(vp);
2459 
2460 			if ((vp->v_type == VNON ||
2461 			    (error == 0 && vattr.va_nlink > 0)) &&
2462 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2463 				VOP_UNLOCK(vp, 0);
2464 				vdropl(vp);
2465 				MNT_ILOCK(mp);
2466 				continue;
2467 			}
2468 		} else
2469 			VI_LOCK(vp);
2470 		/*
2471 		 * With v_usecount == 0, all we need to do is clear out the
2472 		 * vnode data structures and we are done.
2473 		 *
2474 		 * If FORCECLOSE is set, forcibly close the vnode.
2475 		 */
2476 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2477 			VNASSERT(vp->v_usecount == 0 ||
2478 			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2479 			    ("device VNODE %p is FORCECLOSED", vp));
2480 			vgonel(vp);
2481 		} else {
2482 			busy++;
2483 #ifdef DIAGNOSTIC
2484 			if (busyprt)
2485 				vprint("vflush: busy vnode", vp);
2486 #endif
2487 		}
2488 		VOP_UNLOCK(vp, 0);
2489 		vdropl(vp);
2490 		MNT_ILOCK(mp);
2491 	}
2492 	MNT_IUNLOCK(mp);
2493 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2494 		/*
2495 		 * If just the root vnode is busy, and if its refcount
2496 		 * is equal to `rootrefs', then go ahead and kill it.
2497 		 */
2498 		VI_LOCK(rootvp);
2499 		KASSERT(busy > 0, ("vflush: not busy"));
2500 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2501 		    ("vflush: usecount %d < rootrefs %d",
2502 		     rootvp->v_usecount, rootrefs));
2503 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2504 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2505 			vgone(rootvp);
2506 			VOP_UNLOCK(rootvp, 0);
2507 			busy = 0;
2508 		} else
2509 			VI_UNLOCK(rootvp);
2510 	}
2511 	if (busy) {
2512 		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2513 		    busy);
2514 		return (EBUSY);
2515 	}
2516 	for (; rootrefs > 0; rootrefs--)
2517 		vrele(rootvp);
2518 	return (0);
2519 }
2520 
2521 /*
2522  * Recycle an unused vnode to the front of the free list.
2523  */
2524 int
2525 vrecycle(struct vnode *vp, struct thread *td)
2526 {
2527 	int recycled;
2528 
2529 	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2530 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2531 	recycled = 0;
2532 	VI_LOCK(vp);
2533 	if (vp->v_usecount == 0) {
2534 		recycled = 1;
2535 		vgonel(vp);
2536 	}
2537 	VI_UNLOCK(vp);
2538 	return (recycled);
2539 }
2540 
2541 /*
2542  * Eliminate all activity associated with a vnode
2543  * in preparation for reuse.
2544  */
2545 void
2546 vgone(struct vnode *vp)
2547 {
2548 	VI_LOCK(vp);
2549 	vgonel(vp);
2550 	VI_UNLOCK(vp);
2551 }
2552 
2553 /*
2554  * vgone, with the vp interlock held.
2555  */
2556 void
2557 vgonel(struct vnode *vp)
2558 {
2559 	struct thread *td;
2560 	int oweinact;
2561 	int active;
2562 	struct mount *mp;
2563 
2564 	ASSERT_VOP_ELOCKED(vp, "vgonel");
2565 	ASSERT_VI_LOCKED(vp, "vgonel");
2566 	VNASSERT(vp->v_holdcnt, vp,
2567 	    ("vgonel: vp %p has no reference.", vp));
2568 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2569 	td = curthread;
2570 
2571 	/*
2572 	 * Don't vgonel if we're already doomed.
2573 	 */
2574 	if (vp->v_iflag & VI_DOOMED)
2575 		return;
2576 	vp->v_iflag |= VI_DOOMED;
2577 	/*
2578 	 * Check to see if the vnode is in use.  If so, we have to call
2579 	 * VOP_CLOSE() and VOP_INACTIVE().
2580 	 */
2581 	active = vp->v_usecount;
2582 	oweinact = (vp->v_iflag & VI_OWEINACT);
2583 	VI_UNLOCK(vp);
2584 	/*
2585 	 * Clean out any buffers associated with the vnode.
2586 	 * If the flush fails, just toss the buffers.
2587 	 */
2588 	mp = NULL;
2589 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2590 		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2591 	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
2592 		vinvalbuf(vp, 0, 0, 0);
2593 
2594 	/*
2595 	 * If purging an active vnode, it must be closed and
2596 	 * deactivated before being reclaimed.
2597 	 */
2598 	if (active)
2599 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2600 	if (oweinact || active) {
2601 		VI_LOCK(vp);
2602 		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2603 			vinactive(vp, td);
2604 		VI_UNLOCK(vp);
2605 	}
2606 	/*
2607 	 * Reclaim the vnode.
2608 	 */
2609 	if (VOP_RECLAIM(vp, td))
2610 		panic("vgone: cannot reclaim");
2611 	if (mp != NULL)
2612 		vn_finished_secondary_write(mp);
2613 	VNASSERT(vp->v_object == NULL, vp,
2614 	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2615 	/*
2616 	 * Clear the advisory locks and wake up waiting threads.
2617 	 */
2618 	(void)VOP_ADVLOCKPURGE(vp);
2619 	/*
2620 	 * Delete from old mount point vnode list.
2621 	 */
2622 	delmntque(vp);
2623 	cache_purge(vp);
2624 	/*
2625 	 * Done with purge, reset to the standard lock and invalidate
2626 	 * the vnode.
2627 	 */
2628 	VI_LOCK(vp);
2629 	vp->v_vnlock = &vp->v_lock;
2630 	vp->v_op = &dead_vnodeops;
2631 	vp->v_tag = "none";
2632 	vp->v_type = VBAD;
2633 }
2634 
2635 /*
2636  * Calculate the total number of references to a special device.
2637  */
2638 int
2639 vcount(struct vnode *vp)
2640 {
2641 	int count;
2642 
2643 	dev_lock();
2644 	count = vp->v_rdev->si_usecount;
2645 	dev_unlock();
2646 	return (count);
2647 }
2648 
2649 /*
2650  * Same as above, but using the struct cdev * as the argument.
2651  */
2652 int
2653 count_dev(struct cdev *dev)
2654 {
2655 	int count;
2656 
2657 	dev_lock();
2658 	count = dev->si_usecount;
2659 	dev_unlock();
2660 	return(count);
2661 }
2662 
2663 /*
2664  * Print out a description of a vnode.
2665  */
2666 static char *typename[] =
2667 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2668  "VMARKER"};
2669 
2670 void
2671 vn_printf(struct vnode *vp, const char *fmt, ...)
2672 {
2673 	va_list ap;
2674 	char buf[256], buf2[16];
2675 	u_long flags;
2676 
2677 	va_start(ap, fmt);
2678 	vprintf(fmt, ap);
2679 	va_end(ap);
2680 	printf("%p: ", (void *)vp);
2681 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2682 	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2683 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2684 	buf[0] = '\0';
2685 	buf[1] = '\0';
2686 	if (vp->v_vflag & VV_ROOT)
2687 		strlcat(buf, "|VV_ROOT", sizeof(buf));
2688 	if (vp->v_vflag & VV_ISTTY)
2689 		strlcat(buf, "|VV_ISTTY", sizeof(buf));
2690 	if (vp->v_vflag & VV_NOSYNC)
2691 		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2692 	if (vp->v_vflag & VV_CACHEDLABEL)
2693 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2694 	if (vp->v_vflag & VV_TEXT)
2695 		strlcat(buf, "|VV_TEXT", sizeof(buf));
2696 	if (vp->v_vflag & VV_COPYONWRITE)
2697 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2698 	if (vp->v_vflag & VV_SYSTEM)
2699 		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2700 	if (vp->v_vflag & VV_PROCDEP)
2701 		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2702 	if (vp->v_vflag & VV_NOKNOTE)
2703 		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2704 	if (vp->v_vflag & VV_DELETED)
2705 		strlcat(buf, "|VV_DELETED", sizeof(buf));
2706 	if (vp->v_vflag & VV_MD)
2707 		strlcat(buf, "|VV_MD", sizeof(buf));
2708 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC |
2709 	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2710 	    VV_NOKNOTE | VV_DELETED | VV_MD);
2711 	if (flags != 0) {
2712 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2713 		strlcat(buf, buf2, sizeof(buf));
2714 	}
2715 	if (vp->v_iflag & VI_MOUNT)
2716 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
2717 	if (vp->v_iflag & VI_AGE)
2718 		strlcat(buf, "|VI_AGE", sizeof(buf));
2719 	if (vp->v_iflag & VI_DOOMED)
2720 		strlcat(buf, "|VI_DOOMED", sizeof(buf));
2721 	if (vp->v_iflag & VI_FREE)
2722 		strlcat(buf, "|VI_FREE", sizeof(buf));
2723 	if (vp->v_iflag & VI_DOINGINACT)
2724 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2725 	if (vp->v_iflag & VI_OWEINACT)
2726 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2727 	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2728 	    VI_DOINGINACT | VI_OWEINACT);
2729 	if (flags != 0) {
2730 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2731 		strlcat(buf, buf2, sizeof(buf));
2732 	}
2733 	printf("    flags (%s)\n", buf + 1);
2734 	if (mtx_owned(VI_MTX(vp)))
2735 		printf(" VI_LOCKed");
2736 	if (vp->v_object != NULL)
2737 		printf("    v_object %p ref %d pages %d\n",
2738 		    vp->v_object, vp->v_object->ref_count,
2739 		    vp->v_object->resident_page_count);
2740 	printf("    ");
2741 	lockmgr_printinfo(vp->v_vnlock);
2742 	if (vp->v_data != NULL)
2743 		VOP_PRINT(vp);
2744 }
2745 
2746 #ifdef DDB
2747 /*
2748  * List all of the locked vnodes in the system.
2749  * Called when debugging the kernel.
2750  */
2751 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2752 {
2753 	struct mount *mp, *nmp;
2754 	struct vnode *vp;
2755 
2756 	/*
2757 	 * Note: because this is DDB, we can't obey the locking semantics
2758 	 * for these structures, which means we could catch an inconsistent
2759 	 * state and dereference a nasty pointer.  Not much to be done
2760 	 * about that.
2761 	 */
2762 	db_printf("Locked vnodes\n");
2763 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2764 		nmp = TAILQ_NEXT(mp, mnt_list);
2765 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2766 			if (vp->v_type != VMARKER &&
2767 			    VOP_ISLOCKED(vp))
2768 				vprint("", vp);
2769 		}
2770 		nmp = TAILQ_NEXT(mp, mnt_list);
2771 	}
2772 }
2773 
2774 /*
2775  * Show details about the given vnode.
2776  */
2777 DB_SHOW_COMMAND(vnode, db_show_vnode)
2778 {
2779 	struct vnode *vp;
2780 
2781 	if (!have_addr)
2782 		return;
2783 	vp = (struct vnode *)addr;
2784 	vn_printf(vp, "vnode ");
2785 }
2786 
2787 /*
2788  * Show details about the given mount point.
2789  */
2790 DB_SHOW_COMMAND(mount, db_show_mount)
2791 {
2792 	struct mount *mp;
2793 	struct vfsopt *opt;
2794 	struct statfs *sp;
2795 	struct vnode *vp;
2796 	char buf[512];
2797 	u_int flags;
2798 
2799 	if (!have_addr) {
2800 		/* No address given, print short info about all mount points. */
2801 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2802 			db_printf("%p %s on %s (%s)\n", mp,
2803 			    mp->mnt_stat.f_mntfromname,
2804 			    mp->mnt_stat.f_mntonname,
2805 			    mp->mnt_stat.f_fstypename);
2806 			if (db_pager_quit)
2807 				break;
2808 		}
2809 		db_printf("\nMore info: show mount <addr>\n");
2810 		return;
2811 	}
2812 
2813 	mp = (struct mount *)addr;
2814 	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
2815 	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
2816 
2817 	buf[0] = '\0';
2818 	flags = mp->mnt_flag;
2819 #define	MNT_FLAG(flag)	do {						\
2820 	if (flags & (flag)) {						\
2821 		if (buf[0] != '\0')					\
2822 			strlcat(buf, ", ", sizeof(buf));		\
2823 		strlcat(buf, (#flag) + 4, sizeof(buf));			\
2824 		flags &= ~(flag);					\
2825 	}								\
2826 } while (0)
2827 	MNT_FLAG(MNT_RDONLY);
2828 	MNT_FLAG(MNT_SYNCHRONOUS);
2829 	MNT_FLAG(MNT_NOEXEC);
2830 	MNT_FLAG(MNT_NOSUID);
2831 	MNT_FLAG(MNT_UNION);
2832 	MNT_FLAG(MNT_ASYNC);
2833 	MNT_FLAG(MNT_SUIDDIR);
2834 	MNT_FLAG(MNT_SOFTDEP);
2835 	MNT_FLAG(MNT_NOSYMFOLLOW);
2836 	MNT_FLAG(MNT_GJOURNAL);
2837 	MNT_FLAG(MNT_MULTILABEL);
2838 	MNT_FLAG(MNT_ACLS);
2839 	MNT_FLAG(MNT_NOATIME);
2840 	MNT_FLAG(MNT_NOCLUSTERR);
2841 	MNT_FLAG(MNT_NOCLUSTERW);
2842 	MNT_FLAG(MNT_NFS4ACLS);
2843 	MNT_FLAG(MNT_EXRDONLY);
2844 	MNT_FLAG(MNT_EXPORTED);
2845 	MNT_FLAG(MNT_DEFEXPORTED);
2846 	MNT_FLAG(MNT_EXPORTANON);
2847 	MNT_FLAG(MNT_EXKERB);
2848 	MNT_FLAG(MNT_EXPUBLIC);
2849 	MNT_FLAG(MNT_LOCAL);
2850 	MNT_FLAG(MNT_QUOTA);
2851 	MNT_FLAG(MNT_ROOTFS);
2852 	MNT_FLAG(MNT_USER);
2853 	MNT_FLAG(MNT_IGNORE);
2854 	MNT_FLAG(MNT_UPDATE);
2855 	MNT_FLAG(MNT_DELEXPORT);
2856 	MNT_FLAG(MNT_RELOAD);
2857 	MNT_FLAG(MNT_FORCE);
2858 	MNT_FLAG(MNT_SNAPSHOT);
2859 	MNT_FLAG(MNT_BYFSID);
2860 	MNT_FLAG(MNT_SOFTDEP);
2861 #undef MNT_FLAG
2862 	if (flags != 0) {
2863 		if (buf[0] != '\0')
2864 			strlcat(buf, ", ", sizeof(buf));
2865 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
2866 		    "0x%08x", flags);
2867 	}
2868 	db_printf("    mnt_flag = %s\n", buf);
2869 
2870 	buf[0] = '\0';
2871 	flags = mp->mnt_kern_flag;
2872 #define	MNT_KERN_FLAG(flag)	do {					\
2873 	if (flags & (flag)) {						\
2874 		if (buf[0] != '\0')					\
2875 			strlcat(buf, ", ", sizeof(buf));		\
2876 		strlcat(buf, (#flag) + 5, sizeof(buf));			\
2877 		flags &= ~(flag);					\
2878 	}								\
2879 } while (0)
2880 	MNT_KERN_FLAG(MNTK_UNMOUNTF);
2881 	MNT_KERN_FLAG(MNTK_ASYNC);
2882 	MNT_KERN_FLAG(MNTK_SOFTDEP);
2883 	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
2884 	MNT_KERN_FLAG(MNTK_DRAINING);
2885 	MNT_KERN_FLAG(MNTK_REFEXPIRE);
2886 	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
2887 	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
2888 	MNT_KERN_FLAG(MNTK_SUJ);
2889 	MNT_KERN_FLAG(MNTK_UNMOUNT);
2890 	MNT_KERN_FLAG(MNTK_MWAIT);
2891 	MNT_KERN_FLAG(MNTK_SUSPEND);
2892 	MNT_KERN_FLAG(MNTK_SUSPEND2);
2893 	MNT_KERN_FLAG(MNTK_SUSPENDED);
2894 	MNT_KERN_FLAG(MNTK_MPSAFE);
2895 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
2896 	MNT_KERN_FLAG(MNTK_NOKNOTE);
2897 #undef MNT_KERN_FLAG
2898 	if (flags != 0) {
2899 		if (buf[0] != '\0')
2900 			strlcat(buf, ", ", sizeof(buf));
2901 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
2902 		    "0x%08x", flags);
2903 	}
2904 	db_printf("    mnt_kern_flag = %s\n", buf);
2905 
2906 	db_printf("    mnt_opt = ");
2907 	opt = TAILQ_FIRST(mp->mnt_opt);
2908 	if (opt != NULL) {
2909 		db_printf("%s", opt->name);
2910 		opt = TAILQ_NEXT(opt, link);
2911 		while (opt != NULL) {
2912 			db_printf(", %s", opt->name);
2913 			opt = TAILQ_NEXT(opt, link);
2914 		}
2915 	}
2916 	db_printf("\n");
2917 
2918 	sp = &mp->mnt_stat;
2919 	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
2920 	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
2921 	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
2922 	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
2923 	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
2924 	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
2925 	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
2926 	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
2927 	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
2928 	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
2929 	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
2930 	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
2931 
2932 	db_printf("    mnt_cred = { uid=%u ruid=%u",
2933 	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
2934 	if (jailed(mp->mnt_cred))
2935 		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
2936 	db_printf(" }\n");
2937 	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
2938 	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
2939 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
2940 	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
2941 	db_printf("    mnt_noasync = %u\n", mp->mnt_noasync);
2942 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
2943 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
2944 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
2945 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
2946 	db_printf("    mnt_secondary_accwrites = %d\n",
2947 	    mp->mnt_secondary_accwrites);
2948 	db_printf("    mnt_gjprovider = %s\n",
2949 	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
2950 	db_printf("\n");
2951 
2952 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2953 		if (vp->v_type != VMARKER) {
2954 			vn_printf(vp, "vnode ");
2955 			if (db_pager_quit)
2956 				break;
2957 		}
2958 	}
2959 }
2960 #endif	/* DDB */
2961 
2962 /*
2963  * Fill in a struct xvfsconf based on a struct vfsconf.
2964  */
2965 static void
2966 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2967 {
2968 
2969 	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2970 	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2971 	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2972 	xvfsp->vfc_flags = vfsp->vfc_flags;
2973 	/*
2974 	 * These are unused in userland; we keep them
2975 	 * so as not to break binary compatibility.
2976 	 */
2977 	xvfsp->vfc_vfsops = NULL;
2978 	xvfsp->vfc_next = NULL;
2979 }
2980 
2981 /*
2982  * Top level filesystem related information gathering.
2983  */
2984 static int
2985 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2986 {
2987 	struct vfsconf *vfsp;
2988 	struct xvfsconf xvfsp;
2989 	int error;
2990 
2991 	error = 0;
2992 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2993 		bzero(&xvfsp, sizeof(xvfsp));
2994 		vfsconf2x(vfsp, &xvfsp);
2995 		error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
2996 		if (error)
2997 			break;
2998 	}
2999 	return (error);
3000 }
3001 
3002 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD,
3003     NULL, 0, sysctl_vfs_conflist,
3004     "S,xvfsconf", "List of all configured filesystems");
3005 
3006 #ifndef BURN_BRIDGES
3007 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3008 
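/*
 * Handler for the deprecated vfs.generic sysctl node declared below,
 * kept so that old userland binaries continue to work.
 */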
3009 static int
3010 vfs_sysctl(SYSCTL_HANDLER_ARGS)
3011 {
3012 	int *name = (int *)arg1 - 1;	/* XXX */
3013 	u_int namelen = arg2 + 1;	/* XXX */
3014 	struct vfsconf *vfsp;
3015 	struct xvfsconf xvfsp;
3016 
3017 	printf("WARNING: userland calling deprecated sysctl, "
3018 	    "please rebuild world\n");
3019 
3020 #if 1 || defined(COMPAT_PRELITE2)
3021 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3022 	if (namelen == 1)
3023 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3024 #endif
3025 
3026 	switch (name[1]) {
3027 	case VFS_MAXTYPENUM:
3028 		if (namelen != 2)
3029 			return (ENOTDIR);
3030 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3031 	case VFS_CONF:
3032 		if (namelen != 3)
3033 			return (ENOTDIR);	/* overloaded */
3034 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
3035 			if (vfsp->vfc_typenum == name[2])
3036 				break;
3037 		if (vfsp == NULL)
3038 			return (EOPNOTSUPP);
3039 		bzero(&xvfsp, sizeof(xvfsp));
3040 		vfsconf2x(vfsp, &xvfsp);
3041 		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3042 	}
3043 	return (EOPNOTSUPP);
3044 }
3045 
3046 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
3047     vfs_sysctl, "Generic filesystem");
3048 
3049 #if 1 || defined(COMPAT_PRELITE2)
3050 
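/*
 * Dump the vfsconf list in the old ovfsconf format for consumers of
 * the deprecated sysctl handler above.
 */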
3051 static int
3052 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3053 {
3054 	int error;
3055 	struct vfsconf *vfsp;
3056 	struct ovfsconf ovfs;
3057 
3058 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3059 		bzero(&ovfs, sizeof(ovfs));
3060 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3061 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3062 		ovfs.vfc_index = vfsp->vfc_typenum;
3063 		ovfs.vfc_refcount = vfsp->vfc_refcount;
3064 		ovfs.vfc_flags = vfsp->vfc_flags;
3065 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3066 		if (error)
3067 			return error;
3068 	}
3069 	return 0;
3070 }
3071 
3072 #endif /* 1 || COMPAT_PRELITE2 */
3073 #endif /* !BURN_BRIDGES */
3074 
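/* Slack in the vnode count estimate for sysctl_vnode() below. */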
3075 #define KINFO_VNODESLOP		10
3076 #ifdef notyet
3077 /*
3078  * Dump vnode list (via sysctl).
3079  */
3080 /* ARGSUSED */
3081 static int
3082 sysctl_vnode(SYSCTL_HANDLER_ARGS)
3083 {
3084 	struct xvnode *xvn;
3085 	struct mount *mp;
3086 	struct vnode *vp;
3087 	int error, len, n;
3088 
3089 	/*
3090 	 * Stale numvnodes access is not fatal here.
3091 	 */
3092 	req->lock = 0;
3093 	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3094 	if (!req->oldptr)
3095 		/* Make an estimate */
3096 		return (SYSCTL_OUT(req, 0, len));
3097 
3098 	error = sysctl_wire_old_buffer(req, 0);
3099 	if (error != 0)
3100 		return (error);
3101 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3102 	n = 0;
3103 	mtx_lock(&mountlist_mtx);
3104 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3105 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3106 			continue;
3107 		MNT_ILOCK(mp);
3108 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3109 			if (n == len)
3110 				break;
3111 			vref(vp);
3112 			xvn[n].xv_size = sizeof *xvn;
3113 			xvn[n].xv_vnode = vp;
3114 			xvn[n].xv_id = 0;	/* XXX compat */
3115 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3116 			XV_COPY(usecount);
3117 			XV_COPY(writecount);
3118 			XV_COPY(holdcnt);
3119 			XV_COPY(mount);
3120 			XV_COPY(numoutput);
3121 			XV_COPY(type);
3122 #undef XV_COPY
3123 			xvn[n].xv_flag = vp->v_vflag;
3124 
3125 			switch (vp->v_type) {
3126 			case VREG:
3127 			case VDIR:
3128 			case VLNK:
3129 				break;
3130 			case VBLK:
3131 			case VCHR:
3132 				if (vp->v_rdev == NULL) {
3133 					vrele(vp);
3134 					continue;
3135 				}
3136 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3137 				break;
3138 			case VSOCK:
3139 				xvn[n].xv_socket = vp->v_socket;
3140 				break;
3141 			case VFIFO:
3142 				xvn[n].xv_fifo = vp->v_fifoinfo;
3143 				break;
3144 			case VNON:
3145 			case VBAD:
3146 			default:
3147 				/* shouldn't happen? */
3148 				vrele(vp);
3149 				continue;
3150 			}
3151 			vrele(vp);
3152 			++n;
3153 		}
3154 		MNT_IUNLOCK(mp);
3155 		mtx_lock(&mountlist_mtx);
3156 		vfs_unbusy(mp);
3157 		if (n == len)
3158 			break;
3159 	}
3160 	mtx_unlock(&mountlist_mtx);
3161 
3162 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3163 	free(xvn, M_TEMP);
3164 	return (error);
3165 }
3166 
3167 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
3168     0, 0, sysctl_vnode, "S,xvnode", "");
3169 #endif
3170 
3171 /*
3172  * Unmount all filesystems. The list is traversed in reverse order
3173  * of mounting to avoid dependencies.
3174  */
3175 void
3176 vfs_unmountall(void)
3177 {
3178 	struct mount *mp;
3179 	struct thread *td;
3180 	int error;
3181 
3182 	KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
3183 	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3184 	td = curthread;
3185 
3186 	/*
3187 	 * Since this only runs when rebooting, it is not interlocked.
3188 	 */
3189 	while(!TAILQ_EMPTY(&mountlist)) {
3190 		mp = TAILQ_LAST(&mountlist, mntlist);
3191 		error = dounmount(mp, MNT_FORCE, td);
3192 		if (error) {
3193 			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3194 			/*
3195 			 * XXX: Due to the way in which we mount the root
3196 			 * file system off of devfs, devfs will generate a
3197 			 * "busy" warning when we try to unmount it before
3198 			 * the root.  Don't print a warning as a result in
3199 			 * order to avoid false positive errors that may
3200 			 * cause needless upset.
3201 			 */
3202 			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3203 				printf("unmount of %s failed (",
3204 				    mp->mnt_stat.f_mntonname);
3205 				if (error == EBUSY)
3206 					printf("BUSY)\n");
3207 				else
3208 					printf("%d)\n", error);
3209 			}
3210 		} else {
3211 			/* The unmount has removed mp from the mountlist */
3212 		}
3213 	}
3214 }
3215 
3216 /*
3217  * Perform msync on all vnodes under a mount point.
3218  * The mount point must be locked.
3219  */
3220 void
3221 vfs_msync(struct mount *mp, int flags)
3222 {
3223 	struct vnode *vp, *mvp;
3224 	struct vm_object *obj;
3225 
3226 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3227 	MNT_ILOCK(mp);
3228 	MNT_VNODE_FOREACH(vp, mp, mvp) {
3229 		VI_LOCK(vp);
3230 		obj = vp->v_object;
3231 		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3232 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3233 			MNT_IUNLOCK(mp);
3234 			if (!vget(vp,
3235 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3236 			    curthread)) {
3237 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3238 					vput(vp);
3239 					MNT_ILOCK(mp);
3240 					continue;
3241 				}
3242 
3243 				obj = vp->v_object;
3244 				if (obj != NULL) {
3245 					VM_OBJECT_LOCK(obj);
3246 					vm_object_page_clean(obj, 0, 0,
3247 					    flags == MNT_WAIT ?
3248 					    OBJPC_SYNC : OBJPC_NOSYNC);
3249 					VM_OBJECT_UNLOCK(obj);
3250 				}
3251 				vput(vp);
3252 			}
3253 			MNT_ILOCK(mp);
3254 		} else
3255 			VI_UNLOCK(vp);
3256 	}
3257 	MNT_IUNLOCK(mp);
3258 }
3259 
3260 /*
3261  * Mark a vnode as free, putting it up for recycling.
3262  */
3263 static void
3264 vfree(struct vnode *vp)
3265 {
3266 
3267 	ASSERT_VI_LOCKED(vp, "vfree");
3268 	mtx_lock(&vnode_free_list_mtx);
3269 	VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed."));
3270 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
3271 	VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
3272 	VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
3273 	    ("vfree: Freeing doomed vnode"));
3274 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3275 	if (vp->v_iflag & VI_AGE) {
3276 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3277 	} else {
3278 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3279 	}
3280 	freevnodes++;
3281 	vp->v_iflag &= ~VI_AGE;
3282 	vp->v_iflag |= VI_FREE;
3283 	mtx_unlock(&vnode_free_list_mtx);
3284 }
3285 
3286 /*
3287  * Opposite of vfree() - mark a vnode as in use.
3288  */
3289 static void
3290 vbusy(struct vnode *vp)
3291 {
3292 	ASSERT_VI_LOCKED(vp, "vbusy");
3293 	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
3294 	VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed."));
3295 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3296 
3297 	mtx_lock(&vnode_free_list_mtx);
3298 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3299 	freevnodes--;
3300 	vp->v_iflag &= ~(VI_FREE|VI_AGE);
3301 	mtx_unlock(&vnode_free_list_mtx);
3302 }
3303 
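/*
 * Tear down and free a per-vnode poll information structure.
 */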
3304 static void
3305 destroy_vpollinfo(struct vpollinfo *vi)
3306 {
3307 	knlist_destroy(&vi->vpi_selinfo.si_note);
3308 	mtx_destroy(&vi->vpi_lock);
3309 	uma_zfree(vnodepoll_zone, vi);
3310 }
3311 
3312 /*
3313  * Initialize the per-vnode helper structure to hold poll-related state.
3314  */
3315 void
3316 v_addpollinfo(struct vnode *vp)
3317 {
3318 	struct vpollinfo *vi;
3319 
3320 	if (vp->v_pollinfo != NULL)
3321 		return;
3322 	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3323 	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3324 	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3325 	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3326 	VI_LOCK(vp);
3327 	if (vp->v_pollinfo != NULL) {
3328 		VI_UNLOCK(vp);
3329 		destroy_vpollinfo(vi);
3330 		return;
3331 	}
3332 	vp->v_pollinfo = vi;
3333 	VI_UNLOCK(vp);
3334 }
3335 
3336 /*
3337  * Record a process's interest in events which might happen to
3338  * a vnode.  Because poll uses the historic select-style interface
3339  * internally, this routine serves as both the ``check for any
3340  * pending events'' and the ``record my interest in future events''
3341  * functions.  (These are done together, while the lock is held,
3342  * to avoid race conditions.)
3343  */
3344 int
3345 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3346 {
3347 
3348 	v_addpollinfo(vp);
3349 	mtx_lock(&vp->v_pollinfo->vpi_lock);
3350 	if (vp->v_pollinfo->vpi_revents & events) {
3351 		/*
3352 		 * This leaves events we are not interested
3353 		 * in available for the other process which
3354 		 * presumably had requested them
3355 		 * (otherwise they would never have been
3356 		 * recorded).
3357 		 */
3358 		events &= vp->v_pollinfo->vpi_revents;
3359 		vp->v_pollinfo->vpi_revents &= ~events;
3360 
3361 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3362 		return (events);
3363 	}
3364 	vp->v_pollinfo->vpi_events |= events;
3365 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3366 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3367 	return (0);
3368 }
3369 
3370 /*
3371  * Routine to create and manage a filesystem syncer vnode.
3372  */
3373 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
3374 static int	sync_fsync(struct  vop_fsync_args *);
3375 static int	sync_inactive(struct  vop_inactive_args *);
3376 static int	sync_reclaim(struct  vop_reclaim_args *);
3377 
3378 static struct vop_vector sync_vnodeops = {
3379 	.vop_bypass =	VOP_EOPNOTSUPP,
3380 	.vop_close =	sync_close,		/* close */
3381 	.vop_fsync =	sync_fsync,		/* fsync */
3382 	.vop_inactive =	sync_inactive,	/* inactive */
3383 	.vop_reclaim =	sync_reclaim,	/* reclaim */
3384 	.vop_lock1 =	vop_stdlock,	/* lock */
3385 	.vop_unlock =	vop_stdunlock,	/* unlock */
3386 	.vop_islocked =	vop_stdislocked,	/* islocked */
3387 };
3388 
3389 /*
3390  * Create a new filesystem syncer vnode for the specified mount point.
3391  */
3392 void
3393 vfs_allocate_syncvnode(struct mount *mp)
3394 {
3395 	struct vnode *vp;
3396 	struct bufobj *bo;
3397 	static long start, incr, next;
3398 	int error;
3399 
3400 	/* Allocate a new vnode */
3401 	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3402 	if (error != 0)
3403 		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3404 	vp->v_type = VNON;
3405 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3406 	vp->v_vflag |= VV_FORCEINSMQ;
3407 	error = insmntque(vp, mp);
3408 	if (error != 0)
3409 		panic("vfs_allocate_syncvnode: insmntque() failed");
3410 	vp->v_vflag &= ~VV_FORCEINSMQ;
3411 	VOP_UNLOCK(vp, 0);
3412 	/*
3413 	 * Place the vnode onto the syncer worklist. We attempt to
3414 	 * scatter them about on the list so that they will go off
3415 	 * at evenly distributed times even if all the filesystems
3416 	 * are mounted at once.
3417 	 */
3418 	next += incr;
3419 	if (next == 0 || next > syncer_maxdelay) {
3420 		start /= 2;
3421 		incr /= 2;
3422 		if (start == 0) {
3423 			start = syncer_maxdelay / 2;
3424 			incr = syncer_maxdelay;
3425 		}
3426 		next = start;
3427 	}
3428 	bo = &vp->v_bufobj;
3429 	BO_LOCK(bo);
3430 	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3431 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3432 	mtx_lock(&sync_mtx);
3433 	sync_vnode_count++;
3434 	if (mp->mnt_syncer == NULL) {
3435 		mp->mnt_syncer = vp;
3436 		vp = NULL;
3437 	}
3438 	mtx_unlock(&sync_mtx);
3439 	BO_UNLOCK(bo);
3440 	if (vp != NULL) {
3441 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3442 		vgone(vp);
3443 		vput(vp);
3444 	}
3445 }
3446 
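/*
 * Detach the syncer vnode from its mount point and drop the reference
 * that the mount held on it.
 */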
3447 void
3448 vfs_deallocate_syncvnode(struct mount *mp)
3449 {
3450 	struct vnode *vp;
3451 
3452 	mtx_lock(&sync_mtx);
3453 	vp = mp->mnt_syncer;
3454 	if (vp != NULL)
3455 		mp->mnt_syncer = NULL;
3456 	mtx_unlock(&sync_mtx);
3457 	if (vp != NULL)
3458 		vrele(vp);
3459 }
3460 
3461 /*
3462  * Do a lazy sync of the filesystem.
3463  */
3464 static int
3465 sync_fsync(struct vop_fsync_args *ap)
3466 {
3467 	struct vnode *syncvp = ap->a_vp;
3468 	struct mount *mp = syncvp->v_mount;
3469 	int error;
3470 	struct bufobj *bo;
3471 
3472 	/*
3473 	 * We only need to do something if this is a lazy evaluation.
3474 	 */
3475 	if (ap->a_waitfor != MNT_LAZY)
3476 		return (0);
3477 
3478 	/*
3479 	 * Move ourselves to the back of the sync list.
3480 	 */
3481 	bo = &syncvp->v_bufobj;
3482 	BO_LOCK(bo);
3483 	vn_syncer_add_to_worklist(bo, syncdelay);
3484 	BO_UNLOCK(bo);
3485 
3486 	/*
3487 	 * Walk the list of vnodes pushing all that are dirty and
3488 	 * not already on the sync list.
3489 	 */
3490 	mtx_lock(&mountlist_mtx);
3491 	if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
3492 		mtx_unlock(&mountlist_mtx);
3493 		return (0);
3494 	}
3495 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3496 		vfs_unbusy(mp);
3497 		return (0);
3498 	}
3499 	MNT_ILOCK(mp);
3500 	mp->mnt_noasync++;
3501 	mp->mnt_kern_flag &= ~MNTK_ASYNC;
3502 	MNT_IUNLOCK(mp);
3503 	vfs_msync(mp, MNT_NOWAIT);
3504 	error = VFS_SYNC(mp, MNT_LAZY);
3505 	MNT_ILOCK(mp);
3506 	mp->mnt_noasync--;
3507 	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
3508 		mp->mnt_kern_flag |= MNTK_ASYNC;
3509 	MNT_IUNLOCK(mp);
3510 	vn_finished_write(mp);
3511 	vfs_unbusy(mp);
3512 	return (error);
3513 }
3514 
3515 /*
3516  * The syncer vnode is no longer referenced.
3517  */
3518 static int
3519 sync_inactive(struct vop_inactive_args *ap)
3520 {
3521 
3522 	vgone(ap->a_vp);
3523 	return (0);
3524 }
3525 
3526 /*
3527  * The syncer vnode is no longer needed and is being decommissioned.
3528  *
3529  * Modifications to the worklist must be protected by sync_mtx.
3530  */
3531 static int
3532 sync_reclaim(struct vop_reclaim_args *ap)
3533 {
3534 	struct vnode *vp = ap->a_vp;
3535 	struct bufobj *bo;
3536 
3537 	bo = &vp->v_bufobj;
3538 	BO_LOCK(bo);
3539 	mtx_lock(&sync_mtx);
3540 	if (vp->v_mount->mnt_syncer == vp)
3541 		vp->v_mount->mnt_syncer = NULL;
3542 	if (bo->bo_flag & BO_ONWORKLST) {
3543 		LIST_REMOVE(bo, bo_synclist);
3544 		syncer_worklist_len--;
3545 		sync_vnode_count--;
3546 		bo->bo_flag &= ~BO_ONWORKLST;
3547 	}
3548 	mtx_unlock(&sync_mtx);
3549 	BO_UNLOCK(bo);
3550 
3551 	return (0);
3552 }
3553 
3554 /*
3555  * Check whether the vnode represents a disk device.
3556  */
3557 int
3558 vn_isdisk(struct vnode *vp, int *errp)
3559 {
3560 	int error;
3561 
3562 	error = 0;
3563 	dev_lock();
3564 	if (vp->v_type != VCHR)
3565 		error = ENOTBLK;
3566 	else if (vp->v_rdev == NULL)
3567 		error = ENXIO;
3568 	else if (vp->v_rdev->si_devsw == NULL)
3569 		error = ENXIO;
3570 	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3571 		error = ENOTBLK;
3572 	dev_unlock();
3573 	if (errp != NULL)
3574 		*errp = error;
3575 	return (error == 0);
3576 }
3577 
3578 /*
3579  * Common filesystem object access control check routine.  Accepts a
3580  * vnode's type, "mode", uid and gid, requested access mode, credentials,
3581  * and optional call-by-reference privused argument allowing vaccess()
3582  * to indicate to the caller whether privilege was used to satisfy the
3583  * request (obsoleted).  Returns 0 on success, or an errno on failure.
3584  *
3585  * The ifdef'd CAPABILITIES version is here for reference, but is not
3586  * actually used.
3587  */
3588 int
3589 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3590     accmode_t accmode, struct ucred *cred, int *privused)
3591 {
3592 	accmode_t dac_granted;
3593 	accmode_t priv_granted;
3594 
3595 	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3596 	    ("invalid bit in accmode"));
3597 	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3598 	    ("VAPPEND without VWRITE"));
3599 
3600 	/*
3601 	 * Look for a normal, non-privileged way to access the file/directory
3602 	 * as requested.  If it exists, go with that.
3603 	 */
3604 
3605 	if (privused != NULL)
3606 		*privused = 0;
3607 
3608 	dac_granted = 0;
3609 
3610 	/* Check the owner. */
3611 	if (cred->cr_uid == file_uid) {
3612 		dac_granted |= VADMIN;
3613 		if (file_mode & S_IXUSR)
3614 			dac_granted |= VEXEC;
3615 		if (file_mode & S_IRUSR)
3616 			dac_granted |= VREAD;
3617 		if (file_mode & S_IWUSR)
3618 			dac_granted |= (VWRITE | VAPPEND);
3619 
3620 		if ((accmode & dac_granted) == accmode)
3621 			return (0);
3622 
3623 		goto privcheck;
3624 	}
3625 
3626 	/* Otherwise, check the groups (first match) */
3627 	if (groupmember(file_gid, cred)) {
3628 		if (file_mode & S_IXGRP)
3629 			dac_granted |= VEXEC;
3630 		if (file_mode & S_IRGRP)
3631 			dac_granted |= VREAD;
3632 		if (file_mode & S_IWGRP)
3633 			dac_granted |= (VWRITE | VAPPEND);
3634 
3635 		if ((accmode & dac_granted) == accmode)
3636 			return (0);
3637 
3638 		goto privcheck;
3639 	}
3640 
3641 	/* Otherwise, check everyone else. */
3642 	if (file_mode & S_IXOTH)
3643 		dac_granted |= VEXEC;
3644 	if (file_mode & S_IROTH)
3645 		dac_granted |= VREAD;
3646 	if (file_mode & S_IWOTH)
3647 		dac_granted |= (VWRITE | VAPPEND);
3648 	if ((accmode & dac_granted) == accmode)
3649 		return (0);
3650 
3651 privcheck:
3652 	/*
3653 	 * Build a privilege mask to determine if the set of privileges
3654 	 * satisfies the requirements when combined with the granted mask
3655 	 * from above.  For each privilege, if the privilege is required,
3656 	 * bitwise or the request type onto the priv_granted mask.
3657 	 */
3658 	priv_granted = 0;
3659 
3660 	if (type == VDIR) {
3661 		/*
3662 		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3663 		 * requests, instead of PRIV_VFS_EXEC.
3664 		 */
3665 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3666 		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3667 			priv_granted |= VEXEC;
3668 	} else {
3669 		/*
3670 		 * Ensure that at least one execute bit is on. Otherwise,
3671 		 * a privileged user will always succeed, and we don't want
3672 		 * this to happen unless the file really is executable.
3673 		 */
3674 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3675 		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
3676 		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3677 			priv_granted |= VEXEC;
3678 	}
3679 
3680 	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
3681 	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
3682 		priv_granted |= VREAD;
3683 
3684 	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3685 	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3686 		priv_granted |= (VWRITE | VAPPEND);
3687 
3688 	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3689 	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3690 		priv_granted |= VADMIN;
3691 
3692 	if ((accmode & (priv_granted | dac_granted)) == accmode) {
3693 		/* XXX audit: privilege used */
3694 		if (privused != NULL)
3695 			*privused = 1;
3696 		return (0);
3697 	}
3698 
3699 	return ((accmode & VADMIN) ? EPERM : EACCES);
3700 }
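
/*
 * Editorial sketch (not part of the original source): a filesystem's
 * VOP_ACCESS implementation typically finishes by handing its on-disk
 * ownership and permission bits to vaccess().  "struct example_node" and
 * its fields are hypothetical stand-ins for the filesystem's inode.
 */
#if 0
static int
example_access(struct vnode *vp, accmode_t accmode, struct ucred *cred)
{
	struct example_node *ip = vp->v_data;

	return (vaccess(vp->v_type, ip->e_mode, ip->e_uid, ip->e_gid,
	    accmode, cred, NULL));
}
#endif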
3701 
3702 /*
3703  * Credential check based on process requesting service, and per-attribute
3704  * permissions.
3705  */
3706 int
3707 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3708     struct thread *td, accmode_t accmode)
3709 {
3710 
3711 	/*
3712 	 * Kernel-invoked requests always succeed.
3713 	 */
3714 	if (cred == NOCRED)
3715 		return (0);
3716 
3717 	/*
3718 	 * Do not allow privileged processes in jail to directly manipulate
3719 	 * system attributes.
3720 	 */
3721 	switch (attrnamespace) {
3722 	case EXTATTR_NAMESPACE_SYSTEM:
3723 		/* Potentially should be: return (EPERM); */
3724 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3725 	case EXTATTR_NAMESPACE_USER:
3726 		return (VOP_ACCESS(vp, accmode, cred, td));
3727 	default:
3728 		return (EPERM);
3729 	}
3730 }
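
/*
 * Editorial sketch (not part of the original source): an extended-
 * attribute VOP would normally call extattr_check_cred() before touching
 * any attribute data, passing VREAD or VWRITE as appropriate.  The helper
 * name below is hypothetical.
 */
#if 0
static int
example_getextattr_checkperm(struct vop_getextattr_args *ap)
{

	return (extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD));
}
#endif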
3731 
3732 #ifdef DEBUG_VFS_LOCKS
3733 /*
3734  * This only exists to suppress warnings from unlocked specfs accesses.  It is
3735  * no longer OK to have an unlocked VFS.
3736  */
3737 #define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
3738 	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
3739 
3740 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
3741 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
3742     "Drop into debugger on lock violation");
3743 
3744 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
3745 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
3746     0, "Check for interlock across VOPs");
3747 
3748 int vfs_badlock_print = 1;	/* Print lock violations. */
3749 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
3750     0, "Print lock violations");
3751 
3752 #ifdef KDB
3753 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
3754 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
3755     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
3756 #endif
3757 
3758 static void
3759 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3760 {
3761 
3762 #ifdef KDB
3763 	if (vfs_badlock_backtrace)
3764 		kdb_backtrace();
3765 #endif
3766 	if (vfs_badlock_print)
3767 		printf("%s: %p %s\n", str, (void *)vp, msg);
3768 	if (vfs_badlock_ddb)
3769 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3770 }
3771 
3772 void
3773 assert_vi_locked(struct vnode *vp, const char *str)
3774 {
3775 
3776 	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3777 		vfs_badlock("interlock is not locked but should be", str, vp);
3778 }
3779 
3780 void
3781 assert_vi_unlocked(struct vnode *vp, const char *str)
3782 {
3783 
3784 	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3785 		vfs_badlock("interlock is locked but should not be", str, vp);
3786 }
3787 
3788 void
3789 assert_vop_locked(struct vnode *vp, const char *str)
3790 {
3791 
3792 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == 0)
3793 		vfs_badlock("is not locked but should be", str, vp);
3794 }
3795 
3796 void
3797 assert_vop_unlocked(struct vnode *vp, const char *str)
3798 {
3799 
3800 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
3801 		vfs_badlock("is locked but should not be", str, vp);
3802 }
3803 
3804 void
3805 assert_vop_elocked(struct vnode *vp, const char *str)
3806 {
3807 
3808 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
3809 		vfs_badlock("is not exclusive locked but should be", str, vp);
3810 }
3811 
3812 #if 0
3813 void
3814 assert_vop_elocked_other(struct vnode *vp, const char *str)
3815 {
3816 
3817 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
3818 		vfs_badlock("is not exclusive locked by another thread",
3819 		    str, vp);
3820 }
3821 
3822 void
3823 assert_vop_slocked(struct vnode *vp, const char *str)
3824 {
3825 
3826 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
3827 		vfs_badlock("is not locked shared but should be", str, vp);
3828 }
3829 #endif /* 0 */
3830 #endif /* DEBUG_VFS_LOCKS */
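
/*
 * Editorial sketch (not part of the original source): filesystem code
 * normally reaches the checks above through the ASSERT_VOP_*() and
 * ASSERT_VI_*() macros from sys/vnode.h, which are effectively no-ops
 * when DEBUG_VFS_LOCKS is not configured.  "example_modify" is a
 * hypothetical VOP helper.
 */
#if 0
static void
example_modify(struct vnode *vp)
{

	ASSERT_VOP_ELOCKED(vp, "example_modify");
	ASSERT_VI_UNLOCKED(vp, "example_modify");
	/* ... mutate the file while holding the exclusive vnode lock ... */
}
#endif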
3831 
3832 void
3833 vop_rename_fail(struct vop_rename_args *ap)
3834 {
3835 
3836 	if (ap->a_tvp != NULL)
3837 		vput(ap->a_tvp);
3838 	if (ap->a_tdvp == ap->a_tvp)
3839 		vrele(ap->a_tdvp);
3840 	else
3841 		vput(ap->a_tdvp);
3842 	vrele(ap->a_fdvp);
3843 	vrele(ap->a_fvp);
3844 }
3845 
3846 void
3847 vop_rename_pre(void *ap)
3848 {
3849 	struct vop_rename_args *a = ap;
3850 
3851 #ifdef DEBUG_VFS_LOCKS
3852 	if (a->a_tvp)
3853 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3854 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3855 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3856 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3857 
3858 	/* Check the source (from). */
3859 	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
3860 	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
3861 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3862 	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
3863 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3864 
3865 	/* Check the target. */
3866 	if (a->a_tvp)
3867 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3868 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3869 #endif
3870 	if (a->a_tdvp != a->a_fdvp)
3871 		vhold(a->a_fdvp);
3872 	if (a->a_tvp != a->a_fvp)
3873 		vhold(a->a_fvp);
3874 	vhold(a->a_tdvp);
3875 	if (a->a_tvp)
3876 		vhold(a->a_tvp);
3877 }
3878 
3879 void
3880 vop_strategy_pre(void *ap)
3881 {
3882 #ifdef DEBUG_VFS_LOCKS
3883 	struct vop_strategy_args *a;
3884 	struct buf *bp;
3885 
3886 	a = ap;
3887 	bp = a->a_bp;
3888 
3889 	/*
3890 	 * Cluster ops lock their component buffers but not the I/O container.
3891 	 */
3892 	if ((bp->b_flags & B_CLUSTER) != 0)
3893 		return;
3894 
3895 	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
3896 		if (vfs_badlock_print)
3897 			printf(
3898 			    "VOP_STRATEGY: bp is not locked but should be\n");
3899 		if (vfs_badlock_ddb)
3900 			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3901 	}
3902 #endif
3903 }
3904 
3905 void
3906 vop_lookup_pre(void *ap)
3907 {
3908 #ifdef DEBUG_VFS_LOCKS
3909 	struct vop_lookup_args *a;
3910 	struct vnode *dvp;
3911 
3912 	a = ap;
3913 	dvp = a->a_dvp;
3914 	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3915 	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3916 #endif
3917 }
3918 
3919 void
3920 vop_lookup_post(void *ap, int rc)
3921 {
3922 #ifdef DEBUG_VFS_LOCKS
3923 	struct vop_lookup_args *a;
3924 	struct vnode *dvp;
3925 	struct vnode *vp;
3926 
3927 	a = ap;
3928 	dvp = a->a_dvp;
3929 	vp = *(a->a_vpp);
3930 
3931 	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3932 	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3933 
3934 	if (!rc)
3935 		ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
3936 #endif
3937 }
3938 
3939 void
3940 vop_lock_pre(void *ap)
3941 {
3942 #ifdef DEBUG_VFS_LOCKS
3943 	struct vop_lock1_args *a = ap;
3944 
3945 	if ((a->a_flags & LK_INTERLOCK) == 0)
3946 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3947 	else
3948 		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
3949 #endif
3950 }
3951 
3952 void
3953 vop_lock_post(void *ap, int rc)
3954 {
3955 #ifdef DEBUG_VFS_LOCKS
3956 	struct vop_lock1_args *a = ap;
3957 
3958 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3959 	if (rc == 0)
3960 		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
3961 #endif
3962 }
3963 
3964 void
3965 vop_unlock_pre(void *ap)
3966 {
3967 #ifdef DEBUG_VFS_LOCKS
3968 	struct vop_unlock_args *a = ap;
3969 
3970 	if (a->a_flags & LK_INTERLOCK)
3971 		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
3972 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
3973 #endif
3974 }
3975 
3976 void
3977 vop_unlock_post(void *ap, int rc)
3978 {
3979 #ifdef DEBUG_VFS_LOCKS
3980 	struct vop_unlock_args *a = ap;
3981 
3982 	if (a->a_flags & LK_INTERLOCK)
3983 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
3984 #endif
3985 }
3986 
3987 void
3988 vop_create_post(void *ap, int rc)
3989 {
3990 	struct vop_create_args *a = ap;
3991 
3992 	if (!rc)
3993 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3994 }
3995 
3996 void
3997 vop_link_post(void *ap, int rc)
3998 {
3999 	struct vop_link_args *a = ap;
4000 
4001 	if (!rc) {
4002 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4003 		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4004 	}
4005 }
4006 
4007 void
4008 vop_mkdir_post(void *ap, int rc)
4009 {
4010 	struct vop_mkdir_args *a = ap;
4011 
4012 	if (!rc)
4013 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4014 }
4015 
4016 void
4017 vop_mknod_post(void *ap, int rc)
4018 {
4019 	struct vop_mknod_args *a = ap;
4020 
4021 	if (!rc)
4022 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4023 }
4024 
4025 void
4026 vop_remove_post(void *ap, int rc)
4027 {
4028 	struct vop_remove_args *a = ap;
4029 
4030 	if (!rc) {
4031 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4032 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4033 	}
4034 }
4035 
4036 void
4037 vop_rename_post(void *ap, int rc)
4038 {
4039 	struct vop_rename_args *a = ap;
4040 
4041 	if (!rc) {
4042 		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4043 		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4044 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4045 		if (a->a_tvp)
4046 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4047 	}
4048 	if (a->a_tdvp != a->a_fdvp)
4049 		vdrop(a->a_fdvp);
4050 	if (a->a_tvp != a->a_fvp)
4051 		vdrop(a->a_fvp);
4052 	vdrop(a->a_tdvp);
4053 	if (a->a_tvp)
4054 		vdrop(a->a_tvp);
4055 }
4056 
4057 void
4058 vop_rmdir_post(void *ap, int rc)
4059 {
4060 	struct vop_rmdir_args *a = ap;
4061 
4062 	if (!rc) {
4063 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4064 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4065 	}
4066 }
4067 
4068 void
4069 vop_setattr_post(void *ap, int rc)
4070 {
4071 	struct vop_setattr_args *a = ap;
4072 
4073 	if (!rc)
4074 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4075 }
4076 
4077 void
4078 vop_symlink_post(void *ap, int rc)
4079 {
4080 	struct vop_symlink_args *a = ap;
4081 
4082 	if (!rc)
4083 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4084 }
4085 
4086 static struct knlist fs_knlist;
4087 
4088 static void
4089 vfs_event_init(void *arg)
4090 {
4091 	knlist_init_mtx(&fs_knlist, NULL);
4092 }
4093 /* XXX - correct order? */
4094 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4095 
4096 void
4097 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4098 {
4099 
4100 	KNOTE_UNLOCKED(&fs_knlist, event);
4101 }
4102 
4103 static int	filt_fsattach(struct knote *kn);
4104 static void	filt_fsdetach(struct knote *kn);
4105 static int	filt_fsevent(struct knote *kn, long hint);
4106 
4107 struct filterops fs_filtops = {
4108 	.f_isfd = 0,
4109 	.f_attach = filt_fsattach,
4110 	.f_detach = filt_fsdetach,
4111 	.f_event = filt_fsevent
4112 };
4113 
4114 static int
4115 filt_fsattach(struct knote *kn)
4116 {
4117 
4118 	kn->kn_flags |= EV_CLEAR;
4119 	knlist_add(&fs_knlist, kn, 0);
4120 	return (0);
4121 }
4122 
4123 static void
4124 filt_fsdetach(struct knote *kn)
4125 {
4126 
4127 	knlist_remove(&fs_knlist, kn, 0);
4128 }
4129 
4130 static int
4131 filt_fsevent(struct knote *kn, long hint)
4132 {
4133 
4134 	kn->kn_fflags |= hint;
4135 	return (kn->kn_fflags != 0);
4136 }
4137 
4138 static int
4139 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4140 {
4141 	struct vfsidctl vc;
4142 	int error;
4143 	struct mount *mp;
4144 
4145 	error = SYSCTL_IN(req, &vc, sizeof(vc));
4146 	if (error)
4147 		return (error);
4148 	if (vc.vc_vers != VFS_CTL_VERS1)
4149 		return (EINVAL);
4150 	mp = vfs_getvfs(&vc.vc_fsid);
4151 	if (mp == NULL)
4152 		return (ENOENT);
4153 	/* Ensure that a specific sysctl goes to the right filesystem. */
4154 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4155 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4156 		vfs_rel(mp);
4157 		return (EINVAL);
4158 	}
4159 	VCTLTOREQ(&vc, req);
4160 	error = VFS_SYSCTL(mp, vc.vc_op, req);
4161 	vfs_rel(mp);
4162 	return (error);
4163 }
4164 
4165 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4166     NULL, 0, sysctl_vfs_ctl, "",
4167     "Sysctl by fsid");
4168 
4169 /*
4170  * Function to initialize a va_filerev field sensibly.
4171  * XXX: Wouldn't a random number make a lot more sense?
4172  */
4173 u_quad_t
4174 init_va_filerev(void)
4175 {
4176 	struct bintime bt;
4177 
4178 	getbinuptime(&bt);
4179 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4180 }
4181 
4182 static int	filt_vfsread(struct knote *kn, long hint);
4183 static int	filt_vfswrite(struct knote *kn, long hint);
4184 static int	filt_vfsvnode(struct knote *kn, long hint);
4185 static void	filt_vfsdetach(struct knote *kn);
4186 static struct filterops vfsread_filtops = {
4187 	.f_isfd = 1,
4188 	.f_detach = filt_vfsdetach,
4189 	.f_event = filt_vfsread
4190 };
4191 static struct filterops vfswrite_filtops = {
4192 	.f_isfd = 1,
4193 	.f_detach = filt_vfsdetach,
4194 	.f_event = filt_vfswrite
4195 };
4196 static struct filterops vfsvnode_filtops = {
4197 	.f_isfd = 1,
4198 	.f_detach = filt_vfsdetach,
4199 	.f_event = filt_vfsvnode
4200 };
4201 
4202 static void
4203 vfs_knllock(void *arg)
4204 {
4205 	struct vnode *vp = arg;
4206 
4207 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4208 }
4209 
4210 static void
4211 vfs_knlunlock(void *arg)
4212 {
4213 	struct vnode *vp = arg;
4214 
4215 	VOP_UNLOCK(vp, 0);
4216 }
4217 
4218 static void
4219 vfs_knl_assert_locked(void *arg)
4220 {
4221 #ifdef DEBUG_VFS_LOCKS
4222 	struct vnode *vp = arg;
4223 
4224 	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4225 #endif
4226 }
4227 
4228 static void
4229 vfs_knl_assert_unlocked(void *arg)
4230 {
4231 #ifdef DEBUG_VFS_LOCKS
4232 	struct vnode *vp = arg;
4233 
4234 	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4235 #endif
4236 }
4237 
4238 int
4239 vfs_kqfilter(struct vop_kqfilter_args *ap)
4240 {
4241 	struct vnode *vp = ap->a_vp;
4242 	struct knote *kn = ap->a_kn;
4243 	struct knlist *knl;
4244 
4245 	switch (kn->kn_filter) {
4246 	case EVFILT_READ:
4247 		kn->kn_fop = &vfsread_filtops;
4248 		break;
4249 	case EVFILT_WRITE:
4250 		kn->kn_fop = &vfswrite_filtops;
4251 		break;
4252 	case EVFILT_VNODE:
4253 		kn->kn_fop = &vfsvnode_filtops;
4254 		break;
4255 	default:
4256 		return (EINVAL);
4257 	}
4258 
4259 	kn->kn_hook = (caddr_t)vp;
4260 
4261 	v_addpollinfo(vp);
4262 	if (vp->v_pollinfo == NULL)
4263 		return (ENOMEM);
4264 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4265 	knlist_add(knl, kn, 0);
4266 
4267 	return (0);
4268 }
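
/*
 * Editorial sketch (not part of the original source): the filters set up
 * above back the EVFILT_READ, EVFILT_WRITE and EVFILT_VNODE filters that
 * a userland kqueue(2) consumer registers against a file descriptor.  A
 * minimal watcher might look like this (userland code, shown only for
 * illustration; error handling is abbreviated).
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>

static int
example_watch(const char *path)
{
	struct kevent ev;
	int fd, kq;

	kq = kqueue();
	fd = open(path, O_RDONLY);
	if (kq == -1 || fd == -1)
		return (-1);
	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	    NOTE_DELETE | NOTE_WRITE | NOTE_ATTRIB | NOTE_RENAME, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		return (-1);
	/* Subsequent kevent() calls report the matching NOTE_* bits. */
	return (kq);
}
#endif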
4269 
4270 /*
4271  * Detach knote from vnode
4272  */
4273 static void
4274 filt_vfsdetach(struct knote *kn)
4275 {
4276 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4277 
4278 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4279 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4280 }
4281 
4282 /*ARGSUSED*/
4283 static int
4284 filt_vfsread(struct knote *kn, long hint)
4285 {
4286 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4287 	struct vattr va;
4288 	int res;
4289 
4290 	/*
4291 	 * The filesystem is gone, so set the EOF flag and schedule
4292 	 * the knote for deletion.
4293 	 */
4294 	if (hint == NOTE_REVOKE) {
4295 		VI_LOCK(vp);
4296 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4297 		VI_UNLOCK(vp);
4298 		return (1);
4299 	}
4300 
4301 	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4302 		return (0);
4303 
4304 	VI_LOCK(vp);
4305 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4306 	res = (kn->kn_data != 0);
4307 	VI_UNLOCK(vp);
4308 	return (res);
4309 }
4310 
4311 /*ARGSUSED*/
4312 static int
4313 filt_vfswrite(struct knote *kn, long hint)
4314 {
4315 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4316 
4317 	VI_LOCK(vp);
4318 
4319 	/*
4320 	 * The filesystem is gone, so set the EOF flag and schedule
4321 	 * the knote for deletion.
4322 	 */
4323 	if (hint == NOTE_REVOKE)
4324 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4325 
4326 	kn->kn_data = 0;
4327 	VI_UNLOCK(vp);
4328 	return (1);
4329 }
4330 
4331 static int
4332 filt_vfsvnode(struct knote *kn, long hint)
4333 {
4334 	struct vnode *vp = (struct vnode *)kn->kn_hook;
4335 	int res;
4336 
4337 	VI_LOCK(vp);
4338 	if (kn->kn_sfflags & hint)
4339 		kn->kn_fflags |= hint;
4340 	if (hint == NOTE_REVOKE) {
4341 		kn->kn_flags |= EV_EOF;
4342 		VI_UNLOCK(vp);
4343 		return (1);
4344 	}
4345 	res = (kn->kn_fflags != 0);
4346 	VI_UNLOCK(vp);
4347 	return (res);
4348 }
4349 
4350 int
4351 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4352 {
4353 	int error;
4354 
4355 	if (dp->d_reclen > ap->a_uio->uio_resid)
4356 		return (ENAMETOOLONG);
4357 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4358 	if (error) {
4359 		if (ap->a_ncookies != NULL) {
4360 			if (ap->a_cookies != NULL)
4361 				free(ap->a_cookies, M_TEMP);
4362 			ap->a_cookies = NULL;
4363 			*ap->a_ncookies = 0;
4364 		}
4365 		return (error);
4366 	}
4367 	if (ap->a_ncookies == NULL)
4368 		return (0);
4369 
4370 	KASSERT(ap->a_cookies,
4371 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4372 
4373 	*ap->a_cookies = realloc(*ap->a_cookies,
4374 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4375 	(*ap->a_cookies)[*ap->a_ncookies] = off;
4376 	return (0);
4377 }
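
/*
 * Editorial sketch (not part of the original source): a synthetic
 * filesystem's VOP_READDIR loop typically fills in one struct dirent per
 * entry and lets vfs_read_dirent() copy it out and record the cookie.
 * The helper below is hypothetical.
 */
#if 0
static int
example_emit_dirent(struct vop_readdir_args *ap, const char *name,
    ino_t fileno, off_t off)
{
	struct dirent d;

	bzero(&d, sizeof(d));
	d.d_fileno = fileno;
	d.d_type = DT_REG;
	d.d_namlen = strlen(name);
	strlcpy(d.d_name, name, sizeof(d.d_name));
	d.d_reclen = GENERIC_DIRSIZ(&d);
	return (vfs_read_dirent(ap, &d, off));
}
#endif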
4378 
4379 /*
4380  * Mark the file's access time for update if the filesystem
4381  * supports VOP_MARKATIME.  This functionality is used by execve and
4382  * mmap, so we want to avoid the I/O implied by directly setting
4383  * va_atime for the sake of efficiency.
4384  */
4385 void
4386 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4387 {
4388 	struct mount *mp;
4389 
4390 	mp = vp->v_mount;
4391 	VFS_ASSERT_GIANT(mp);
4392 	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4393 	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4394 		(void)VOP_MARKATIME(vp);
4395 }
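
/*
 * Editorial sketch (not part of the original source): the caller must hold
 * the vnode lock, as asserted above.  A hypothetical call site might look
 * like this (Giant handling for non-MPSAFE filesystems is omitted).
 */
#if 0
static void
example_note_access(struct vnode *vp, struct ucred *cred)
{

	vn_lock(vp, LK_SHARED | LK_RETRY);
	vfs_mark_atime(vp, cred);
	VOP_UNLOCK(vp, 0);
}
#endif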
4396 
4397 /*
4398  * The purpose of this routine is to remove granularity from accmode_t,
4399  * reducing it into the standard UNIX access bits: VEXEC, VREAD, VWRITE,
4400  * VADMIN and VAPPEND.
4401  *
4402  * If it returns 0, the caller is supposed to continue with the usual
4403  * access checks using 'accmode' as modified by this routine.  If it
4404  * returns a nonzero value, the caller is supposed to return that value
4405  * as errno.
4406  *
4407  * Note that after this routine runs, accmode may be zero.
4408  */
4409 int
4410 vfs_unixify_accmode(accmode_t *accmode)
4411 {
4412 	/*
4413 	 * There is no way to specify an explicit "deny" rule using the
4414 	 * file mode or POSIX.1e ACLs.
4415 	 */
4416 	if (*accmode & VEXPLICIT_DENY) {
4417 		*accmode = 0;
4418 		return (0);
4419 	}
4420 
4421 	/*
4422 	 * None of these can be translated into the usual access bits.
4423 	 * Also, the common case for NFSv4 ACLs is not to contain
4424 	 * either of these bits.  The caller should check for VWRITE
4425 	 * on the containing directory instead.
4426 	 */
4427 	if (*accmode & (VDELETE_CHILD | VDELETE))
4428 		return (EPERM);
4429 
4430 	if (*accmode & VADMIN_PERMS) {
4431 		*accmode &= ~VADMIN_PERMS;
4432 		*accmode |= VADMIN;
4433 	}
4434 
4435 	/*
4436 	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4437 	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4438 	 */
4439 	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4440 
4441 	return (0);
4442 }
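
/*
 * Editorial sketch (not part of the original source): the calling
 * convention described above, as it might appear in a hypothetical
 * routine that accepts NFSv4-style accmode bits and falls back to a
 * plain VOP_ACCESS() check.
 */
#if 0
static int
example_accessx(struct vnode *vp, accmode_t accmode, struct ucred *cred,
    struct thread *td)
{
	int error;

	error = vfs_unixify_accmode(&accmode);
	if (error != 0)
		return (error);		/* e.g. EPERM for VDELETE. */
	if (accmode == 0)
		return (0);		/* Nothing left to check. */
	return (VOP_ACCESS(vp, accmode, cred, td));
}
#endif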
4443