xref: /freebsd/sys/kern/vfs_subr.c (revision c4f6a2a9e1b1879b618c436ab4f56ff75c73a0f5)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39  * $FreeBSD$
40  */
41 
42 /*
43  * External virtual filesystem routines
44  */
45 #include "opt_ddb.h"
46 #include "opt_mac.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/bio.h>
51 #include <sys/buf.h>
52 #include <sys/conf.h>
53 #include <sys/eventhandler.h>
54 #include <sys/fcntl.h>
55 #include <sys/kernel.h>
56 #include <sys/kthread.h>
57 #include <sys/mac.h>
58 #include <sys/malloc.h>
59 #include <sys/mount.h>
60 #include <sys/namei.h>
61 #include <sys/stat.h>
62 #include <sys/sysctl.h>
63 #include <sys/syslog.h>
64 #include <sys/vmmeter.h>
65 #include <sys/vnode.h>
66 
67 #include <vm/vm.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_extern.h>
70 #include <vm/pmap.h>
71 #include <vm/vm_map.h>
72 #include <vm/vm_page.h>
73 #include <vm/uma.h>
74 
75 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
76 
77 static void	addalias(struct vnode *vp, dev_t nvp_rdev);
78 static void	insmntque(struct vnode *vp, struct mount *mp);
79 static void	vclean(struct vnode *vp, int flags, struct thread *td);
80 static void	vlruvp(struct vnode *vp);
81 static int	flushbuflist(struct buf *blist, int flags, struct vnode *vp,
82 		    int slpflag, int slptimeo, int *errorp);
83 static int	vcanrecycle(struct vnode *vp);
84 
85 
86 /*
87  * Number of vnodes in existence.  Increased whenever getnewvnode()
88  * allocates a new vnode, never decreased.
89  */
90 static unsigned long	numvnodes;
91 
92 SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
93 
94 /*
95  * Conversion tables for conversion from vnode types to inode formats
96  * and back.
97  */
98 enum vtype iftovt_tab[16] = {
99 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
100 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
101 };
102 int vttoif_tab[9] = {
103 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
104 	S_IFSOCK, S_IFIFO, S_IFMT,
105 };
106 
107 /*
108  * List of vnodes that are ready for recycling.
109  */
110 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
111 
112 /*
113  * Minimum number of free vnodes.  If there are fewer free vnodes than this,
114  * getnewvnode() will return a newly allocated vnode.
115  */
116 static u_long wantfreevnodes = 25;
117 SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
118 /* Number of vnodes in the free list. */
119 static u_long freevnodes;
120 SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
121 
122 /*
123  * Various variables used for debugging the new implementation of
124  * reassignbuf().
125  * XXX these are probably of (very) limited utility now.
126  */
127 static int reassignbufcalls;
128 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
129 static int nameileafonly;
130 SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
131 
132 #ifdef ENABLE_VFS_IOOPT
133 /* See NOTES for a description of this setting. */
134 int vfs_ioopt;
135 SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
136 #endif
137 
138 /*
139  * Cache for the mount type id assigned to NFS.  This is used for
140  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
141  */
142 int	nfs_mount_type = -1;
143 
144 /* To keep more than one thread at a time from running vfs_getnewfsid */
145 static struct mtx mntid_mtx;
146 
147 /*
148  * Lock for any access to the following:
149  *	vnode_free_list
150  *	numvnodes
151  *	freevnodes
152  */
153 static struct mtx vnode_free_list_mtx;
154 
155 /*
156  * For any iteration/modification of dev->si_hlist (linked through
157  * v_specnext)
158  */
159 static struct mtx spechash_mtx;
160 
161 /* Publicly exported FS */
162 struct nfs_public nfs_pub;
163 
164 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
165 static uma_zone_t vnode_zone;
166 static uma_zone_t vnodepoll_zone;
167 
168 /* Set to 1 to print out reclaim of active vnodes */
169 int	prtactive;
170 
171 /*
172  * The workitem queue.
173  *
174  * It is useful to delay writes of file data and filesystem metadata
175  * for tens of seconds so that quickly created and deleted files need
176  * not waste disk bandwidth being created and removed. To realize this,
177  * we append vnodes to a "workitem" queue. When running with a soft
178  * updates implementation, most pending metadata dependencies should
179  * not wait for more than a few seconds. Thus, filesystems mounted on
180  * block devices are delayed only about half as long as file data.
181  * Similarly, directory updates are more critical, so they are delayed
182  * only about a third as long as file data. Thus, there are
183  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
184  * one each second (driven off the filesystem syncer process). The
185  * syncer_delayno variable indicates the next queue that is to be processed.
186  * Items that need to be processed soon are placed in this queue:
187  *
188  *	syncer_workitem_pending[syncer_delayno]
189  *
190  * A delay of fifteen seconds is done by placing the request fifteen
191  * entries later in the queue:
192  *
193  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
194  *
195  */
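/*
 * Worked example (the slot numbers below are hypothetical, only
 * SYNCER_MAXDELAY comes from this file): with SYNCER_MAXDELAY of 32,
 * hashinit() yields a syncer_mask of 31.  If syncer_delayno is
 * currently 20, a vnode queued with a delay of 15 seconds lands in
 * slot (20 + 15) & 31 == 3, which the syncer, sweeping one slot per
 * second, reaches 15 seconds later.
 */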
196 static int syncer_delayno;
197 static long syncer_mask;
198 LIST_HEAD(synclist, vnode);
199 static struct synclist *syncer_workitem_pending;
200 
201 #define SYNCER_MAXDELAY		32
202 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
203 static int syncdelay = 30;		/* max time to delay syncing data */
204 static int filedelay = 30;		/* time to delay syncing files */
205 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
206 static int dirdelay = 29;		/* time to delay syncing directories */
207 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
208 static int metadelay = 28;		/* time to delay syncing metadata */
209 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
210 static int rushjob;		/* number of slots to run ASAP */
211 static int stat_rush_requests;	/* number of times I/O speeded up */
212 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
213 
214 /*
215  * Number of vnodes we want to exist at any one time.  This is mostly used
216  * to size hash tables in vnode-related code.  It is normally not used in
217  * getnewvnode(), as wantfreevnodes is normally nonzero.
218  *
219  * XXX desiredvnodes is historical cruft and should not exist.
220  */
221 int desiredvnodes;
222 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
223     &desiredvnodes, 0, "Maximum number of vnodes");
224 static int minvnodes;
225 SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
226     &minvnodes, 0, "Minimum number of vnodes");
227 static int vnlru_nowhere;
228 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
229     "Number of times the vnlru process ran without success");
230 
231 /* Hook for calling soft updates */
232 int (*softdep_process_worklist_hook)(struct mount *);
233 
234 #ifdef DEBUG_VFS_LOCKS
235 /* Print lock violations */
236 int vfs_badlock_print = 1;
237 
238 /* Panic on violation */
239 int vfs_badlock_panic = 1;
240 
241 /* Check for interlock across VOPs */
242 int vfs_badlock_mutex = 0;
243 
244 void
245 vop_rename_pre(void *ap)
246 {
247 	struct vop_rename_args *a = ap;
248 
249 	if (a->a_tvp)
250 		ASSERT_VI_UNLOCKED(a->a_tvp);
251 	ASSERT_VI_UNLOCKED(a->a_tdvp);
252 	ASSERT_VI_UNLOCKED(a->a_fvp);
253 	ASSERT_VI_UNLOCKED(a->a_fdvp);
254 
255 	/* Check the source (from) */
256 	if (a->a_tdvp != a->a_fdvp)
257 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked.\n");
258 	if (a->a_tvp != a->a_fvp)
259 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked.\n");
260 
261 	/* Check the target */
262 	if (a->a_tvp)
263 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked.\n");
264 
265 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked.\n");
266 }
267 
268 void
269 vop_strategy_pre(void *ap)
270 {
271 	struct vop_strategy_args *a = ap;
272 	struct buf *bp;
273 
274 	bp = a->a_bp;
275 
276 	/*
277 	 * Cluster ops lock their component buffers but not the IO container.
278 	 */
279 	if ((bp->b_flags & B_CLUSTER) != 0)
280 		return;
281 
282 	if (BUF_REFCNT(bp) < 1) {
283 		if (vfs_badlock_print)
284 			printf("VOP_STRATEGY: bp is not locked but should be.\n");
285 		if (vfs_badlock_panic)
286 			Debugger("Lock violation.\n");
287 	}
288 }
289 
290 void
291 vop_lookup_pre(void *ap)
292 {
293 	struct vop_lookup_args *a = ap;
294 	struct vnode *dvp;
295 
296 	dvp = a->a_dvp;
297 
298 	ASSERT_VI_UNLOCKED(dvp);
299 	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
300 }
301 
302 void
303 vop_lookup_post(void *ap, int rc)
304 {
305 	struct vop_lookup_args *a = ap;
306 	struct componentname *cnp;
307 	struct vnode *dvp;
308 	struct vnode *vp;
309 	int flags;
310 
311 	dvp = a->a_dvp;
312 	cnp = a->a_cnp;
313 	vp = *(a->a_vpp);
314 	flags = cnp->cn_flags;
315 
316 
317 	ASSERT_VI_UNLOCKED(dvp);
318 	/*
319 	 * If this is the last path component for this lookup and LOCKPARENT
320 	 * is set, or if there is an error, the directory has to be locked.
321 	 */
322 	if ((flags & LOCKPARENT) && (flags & ISLASTCN))
323 		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)");
324 	else if (rc != 0)
325 		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)");
326 	else if (dvp != vp)
327 		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)");
328 
329 	if (flags & PDIRUNLOCK)
330 		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)");
331 
332 	if (rc == 0) {
333 		ASSERT_VI_UNLOCKED(vp);
334 		ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (vpp)");
335 	}
336 }
337 
338 void
339 vop_unlock_pre(void *ap)
340 {
341 	struct vop_unlock_args *a = ap;
342 
343 	if ((a->a_flags & LK_INTERLOCK) == 0)
344 		ASSERT_VI_UNLOCKED(a->a_vp);
345 	else
346 		ASSERT_VI_LOCKED(a->a_vp);
347 }
348 
349 void
350 vop_unlock_post(void *ap, int rc)
351 {
352 	struct vop_unlock_args *a = ap;
353 
354 	ASSERT_VI_UNLOCKED(a->a_vp);
355 }
356 
357 void
358 vop_lock_pre(void *ap)
359 {
360 	struct vop_lock_args *a = ap;
361 
362 	if ((a->a_flags & LK_INTERLOCK) == 0)
363 		ASSERT_VI_UNLOCKED(a->a_vp);
364 	else
365 		ASSERT_VI_LOCKED(a->a_vp);
366 }
367 
368 void
369 vop_lock_post(void *ap, int rc)
370 {
371 	struct vop_lock_args *a = ap;
372 
373 	ASSERT_VI_UNLOCKED(a->a_vp);
374 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
375 }
376 
377 #endif	/* DEBUG_VFS_LOCKS */
378 
379 void
380 v_addpollinfo(struct vnode *vp)
381 {
382 	vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
383 	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
384 }
385 
386 /*
387  * Initialize the vnode management data structures.
388  */
389 static void
390 vntblinit(void *dummy __unused)
391 {
392 
393 	desiredvnodes = maxproc + cnt.v_page_count / 4;
394 	minvnodes = desiredvnodes / 4;
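	/*
	 * Illustrative sizing (hypothetical machine, not computed here):
	 * with maxproc of 512 and 262144 physical pages this works out to
	 * desiredvnodes = 512 + 262144 / 4 = 66048 and minvnodes = 16512.
	 */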
395 	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
396 	mtx_init(&mntvnode_mtx, "mntvnode", NULL, MTX_DEF);
397 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
398 	mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
399 	TAILQ_INIT(&vnode_free_list);
400 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
401 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
402 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
403 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
404 	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
405 	/*
406 	 * Initialize the filesystem syncer.
407 	 */
408 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
409 		&syncer_mask);
410 	syncer_maxdelay = syncer_mask + 1;
411 }
412 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
413 
414 
415 /*
416  * Mark a mount point as busy. Used to synchronize access and to delay
417  * unmounting. Interlock is not released on failure.
418  */
419 int
420 vfs_busy(mp, flags, interlkp, td)
421 	struct mount *mp;
422 	int flags;
423 	struct mtx *interlkp;
424 	struct thread *td;
425 {
426 	int lkflags;
427 
428 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
429 		if (flags & LK_NOWAIT)
430 			return (ENOENT);
431 		mp->mnt_kern_flag |= MNTK_MWAIT;
432 		/*
433 		 * Since all busy locks are shared except the exclusive
434 		 * lock granted when unmounting, the only place that a
435 		 * wakeup needs to be done is at the release of the
436 		 * exclusive lock at the end of dounmount.
437 		 */
438 		msleep(mp, interlkp, PVFS, "vfs_busy", 0);
439 		return (ENOENT);
440 	}
441 	lkflags = LK_SHARED | LK_NOPAUSE;
442 	if (interlkp)
443 		lkflags |= LK_INTERLOCK;
444 	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
445 		panic("vfs_busy: unexpected lock failure");
446 	return (0);
447 }
448 
449 /*
450  * Free a busy filesystem.
451  */
452 void
453 vfs_unbusy(mp, td)
454 	struct mount *mp;
455 	struct thread *td;
456 {
457 
458 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
459 }
460 
461 /*
462  * Lookup a mount point by filesystem identifier.
463  */
464 struct mount *
465 vfs_getvfs(fsid)
466 	fsid_t *fsid;
467 {
468 	register struct mount *mp;
469 
470 	mtx_lock(&mountlist_mtx);
471 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
472 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
473 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
474 			mtx_unlock(&mountlist_mtx);
475 			return (mp);
476 		}
477 	}
478 	mtx_unlock(&mountlist_mtx);
479 	return ((struct mount *) 0);
480 }
481 
482 /*
483  * Get a new unique fsid.  Try to make its val[0] unique, since this value
484  * will be used to create fake device numbers for stat().  Also try (but
485  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
486  * support 16-bit device numbers.  We end up with unique val[0]'s for the
487  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
488  *
489  * Keep in mind that several mounts may be running in parallel.  Starting
490  * the search one past where the previous search terminated is both a
491  * micro-optimization and a defense against returning the same fsid to
492  * different mounts.
493  */
494 void
495 vfs_getnewfsid(mp)
496 	struct mount *mp;
497 {
498 	static u_int16_t mntid_base;
499 	fsid_t tfsid;
500 	int mtype;
501 
502 	mtx_lock(&mntid_mtx);
503 	mtype = mp->mnt_vfc->vfc_typenum;
504 	tfsid.val[1] = mtype;
505 	mtype = (mtype & 0xFF) << 24;
506 	for (;;) {
507 		tfsid.val[0] = makeudev(255,
508 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
509 		mntid_base++;
510 		if (vfs_getvfs(&tfsid) == NULL)
511 			break;
512 	}
513 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
514 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
515 	mtx_unlock(&mntid_mtx);
516 }
517 
518 /*
519  * Knob to control the precision of file timestamps:
520  *
521  *   0 = seconds only; nanoseconds zeroed.
522  *   1 = seconds and nanoseconds, accurate within 1/HZ.
523  *   2 = seconds and nanoseconds, truncated to microseconds.
524  * >=3 = seconds and nanoseconds, maximum precision.
525  */
526 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
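/*
 * Example of the effect of each setting (hypothetical input time): a raw
 * time of 1234.123456789 seconds is recorded as 1234.000000000 at TSP_SEC,
 * as 1234.123456000 at TSP_USEC, and at full precision at TSP_NSEC.
 * TSP_HZ returns a cached timestamp that is only updated periodically,
 * so it may lag by up to 1/HZ.
 */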
527 
528 static int timestamp_precision = TSP_SEC;
529 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
530     &timestamp_precision, 0, "");
531 
532 /*
533  * Get a current timestamp.
534  */
535 void
536 vfs_timestamp(tsp)
537 	struct timespec *tsp;
538 {
539 	struct timeval tv;
540 
541 	switch (timestamp_precision) {
542 	case TSP_SEC:
543 		tsp->tv_sec = time_second;
544 		tsp->tv_nsec = 0;
545 		break;
546 	case TSP_HZ:
547 		getnanotime(tsp);
548 		break;
549 	case TSP_USEC:
550 		microtime(&tv);
551 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
552 		break;
553 	case TSP_NSEC:
554 	default:
555 		nanotime(tsp);
556 		break;
557 	}
558 }
559 
560 /*
561  * Set vnode attributes to VNOVAL
562  */
563 void
564 vattr_null(vap)
565 	register struct vattr *vap;
566 {
567 
568 	vap->va_type = VNON;
569 	vap->va_size = VNOVAL;
570 	vap->va_bytes = VNOVAL;
571 	vap->va_mode = VNOVAL;
572 	vap->va_nlink = VNOVAL;
573 	vap->va_uid = VNOVAL;
574 	vap->va_gid = VNOVAL;
575 	vap->va_fsid = VNOVAL;
576 	vap->va_fileid = VNOVAL;
577 	vap->va_blocksize = VNOVAL;
578 	vap->va_rdev = VNOVAL;
579 	vap->va_atime.tv_sec = VNOVAL;
580 	vap->va_atime.tv_nsec = VNOVAL;
581 	vap->va_mtime.tv_sec = VNOVAL;
582 	vap->va_mtime.tv_nsec = VNOVAL;
583 	vap->va_ctime.tv_sec = VNOVAL;
584 	vap->va_ctime.tv_nsec = VNOVAL;
585 	vap->va_birthtime.tv_sec = VNOVAL;
586 	vap->va_birthtime.tv_nsec = VNOVAL;
587 	vap->va_flags = VNOVAL;
588 	vap->va_gen = VNOVAL;
589 	vap->va_vaflags = 0;
590 }
591 
592 /*
593  * This routine is called when we have too many vnodes.  It attempts
594  * to free <count> vnodes and will potentially free vnodes that still
595  * have VM backing store (VM backing store is typically the cause
596  * of a vnode blowout so we want to do this).  Therefore, this operation
597  * is not considered cheap.
598  *
599  * A number of conditions may prevent a vnode from being reclaimed.
600  * The buffer cache may have references on the vnode, a directory
601  * vnode may still have references due to the namei cache representing
602  * underlying files, or the vnode may be in active use.  It is not
603  * desirable to reuse such vnodes.  These conditions may cause the
604  * number of vnodes to reach some minimum value regardless of what
605  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
606  */
607 static int
608 vlrureclaim(struct mount *mp, int count)
609 {
610 	struct vnode *vp;
611 	int done;
612 	int trigger;
613 	int usevnodes;
614 
615 	/*
616 	 * Calculate the trigger point, don't allow user
617 	 * screwups to blow us up.   This prevents us from
618 	 * recycling vnodes with lots of resident pages.  We
619 	 * aren't trying to free memory, we are trying to
620 	 * free vnodes.
621 	 */
622 	usevnodes = desiredvnodes;
623 	if (usevnodes <= 0)
624 		usevnodes = 1;
625 	trigger = cnt.v_page_count * 2 / usevnodes;
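	/*
	 * Illustrative numbers (hypothetical, not computed here): with
	 * 262144 physical pages and desiredvnodes of 32768 the trigger is
	 * 16, so only vnodes backed by fewer than 16 resident pages (or by
	 * no VM object at all) are eligible for recycling below.
	 */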
626 
627 	done = 0;
628 	mtx_lock(&mntvnode_mtx);
629 	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
630 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
631 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
632 
633 		if (vp->v_type != VNON &&
634 		    vp->v_type != VBAD &&
635 		    VI_TRYLOCK(vp)) {
636 			if (VMIGHTFREE(vp) &&           /* critical path opt */
637 			    (vp->v_object == NULL ||
638 			    vp->v_object->resident_page_count < trigger)) {
639 				mtx_unlock(&mntvnode_mtx);
640 				vgonel(vp, curthread);
641 				done++;
642 				mtx_lock(&mntvnode_mtx);
643 			} else
644 				VI_UNLOCK(vp);
645 		}
646 		--count;
647 	}
648 	mtx_unlock(&mntvnode_mtx);
649 	return done;
650 }
651 
652 /*
653  * Attempt to recycle vnodes in a context that is always safe to block.
654  * Calling vlrurecycle() from the bowels of filesystem code has some
655  * interesting deadlock problems.
656  */
657 static struct proc *vnlruproc;
658 static int vnlruproc_sig;
659 
660 static void
661 vnlru_proc(void)
662 {
663 	struct mount *mp, *nmp;
664 	int s;
665 	int done;
666 	struct proc *p = vnlruproc;
667 	struct thread *td = FIRST_THREAD_IN_PROC(p);	/* XXXKSE */
668 
669 	mtx_lock(&Giant);
670 
671 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
672 	    SHUTDOWN_PRI_FIRST);
673 
674 	s = splbio();
675 	for (;;) {
676 		kthread_suspend_check(p);
677 		mtx_lock(&vnode_free_list_mtx);
678 		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
679 			mtx_unlock(&vnode_free_list_mtx);
680 			vnlruproc_sig = 0;
681 			tsleep(vnlruproc, PVFS, "vlruwt", 0);
682 			continue;
683 		}
684 		mtx_unlock(&vnode_free_list_mtx);
685 		done = 0;
686 		mtx_lock(&mountlist_mtx);
687 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
688 			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
689 				nmp = TAILQ_NEXT(mp, mnt_list);
690 				continue;
691 			}
692 			done += vlrureclaim(mp, 10);
693 			mtx_lock(&mountlist_mtx);
694 			nmp = TAILQ_NEXT(mp, mnt_list);
695 			vfs_unbusy(mp, td);
696 		}
697 		mtx_unlock(&mountlist_mtx);
698 		if (done == 0) {
699 #if 0
700 			/* These messages are temporary debugging aids */
701 			if (vnlru_nowhere < 5)
702 				printf("vnlru process getting nowhere..\n");
703 			else if (vnlru_nowhere == 5)
704 				printf("vnlru process messages stopped.\n");
705 #endif
706 			vnlru_nowhere++;
707 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
708 		}
709 	}
710 	splx(s);
711 }
712 
713 static struct kproc_desc vnlru_kp = {
714 	"vnlru",
715 	vnlru_proc,
716 	&vnlruproc
717 };
718 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
719 
720 
721 /*
722  * Routines having to do with the management of the vnode table.
723  */
724 
725 /*
726  * Check to see if a free vnode can be recycled.  If it can, return it locked
727  * with the vn lock but not the interlock.  Otherwise indicate the error.
728  */
729 static int
730 vcanrecycle(struct vnode *vp)
731 {
732 	struct thread *td = curthread;
733 	vm_object_t object;
734 	int error;
735 
736 	/* Don't recycle if we can't get the interlock */
737 	if (!mtx_trylock(&vp->v_interlock))
738 		return (EWOULDBLOCK);
739 
740 	/* We should be able to immediately acquire this */
741 	/* XXX This looks like it should panic if it fails */
742 	if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td) != 0) {
743 		VI_UNLOCK(vp);
744 		return (EWOULDBLOCK);
745 	}
746 	/*
747 	 * Don't recycle if we still have cached pages.
748 	 */
749 	if (VOP_GETVOBJECT(vp, &object) == 0 &&
750 	     (object->resident_page_count ||
751 	      object->ref_count)) {
752 		error = EBUSY;
753 		goto done;
754 	}
755 	if (LIST_FIRST(&vp->v_cache_src)) {
756 		/*
757 		 * note: nameileafonly sysctl is temporary,
758 		 * for debugging only, and will eventually be
759 		 * removed.
760 		 */
761 		if (nameileafonly > 0) {
762 			/*
763 			 * Do not reuse namei-cached directory
764 			 * vnodes that have cached
765 			 * subdirectories.
766 			 */
767 			if (cache_leaf_test(vp) < 0) {
768 				error = EISDIR;
769 				goto done;
770 			}
771 		} else if (nameileafonly < 0 ||
772 			    vmiodirenable == 0) {
773 			/*
774 			 * Do not reuse namei-cached directory
775 			 * vnodes if nameileafonly is -1 or
776 			 * if VMIO backing for directories is
777 			 * turned off (otherwise we reuse them
778 			 * too quickly).
779 			 */
780 			error = EBUSY;
781 			goto done;
782 		}
783 	}
784 	return (0);
785 done:
786 	VOP_UNLOCK(vp, 0, td);
787 	return (error);
788 }
789 
790 /*
791  * Return the next vnode from the free list.
792  */
793 int
794 getnewvnode(tag, mp, vops, vpp)
795 	enum vtagtype tag;
796 	struct mount *mp;
797 	vop_t **vops;
798 	struct vnode **vpp;
799 {
800 	int s;
801 	struct thread *td = curthread;	/* XXX */
802 	struct vnode *vp = NULL;
803 	struct mount *vnmp;
804 
805 	s = splbio();
806 	mtx_lock(&vnode_free_list_mtx);
807 
808 	/*
809 	 * Try to reuse vnodes if we hit the max.  This situation only
810 	 * occurs in certain large-memory (2G+) situations.  We cannot
811 	 * attempt to directly reclaim vnodes due to nasty recursion
812 	 * problems.
813 	 */
814 	if (vnlruproc_sig == 0 && numvnodes - freevnodes > desiredvnodes) {
815 		vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
816 		wakeup(vnlruproc);
817 	}
818 
819 	/*
820 	 * Attempt to reuse a vnode already on the free list, allocating
821 	 * a new vnode if we can't find one or if we have not reached a
822 	 * good minimum for good LRU performance.
823 	 */
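	/*
	 * With the defaults elsewhere in this file (wantfreevnodes of 25,
	 * minvnodes set to desiredvnodes / 4 in vntblinit()), the free list
	 * is only consulted once at least 25 vnodes sit free and the total
	 * vnode count has reached a quarter of desiredvnodes; below that,
	 * we fall through to allocating a fresh vnode.
	 */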
824 
825 	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
826 		int error;
827 		int count;
828 
829 		for (count = 0; count < freevnodes; count++) {
830 			vp = TAILQ_FIRST(&vnode_free_list);
831 
832 			KASSERT(vp->v_usecount == 0,
833 			    ("getnewvnode: free vnode isn't"));
834 
835 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
836 			/*
837 			 * We have to drop the free list mtx to avoid lock
838 			 * order reversals with interlock.
839 			 */
840 			mtx_unlock(&vnode_free_list_mtx);
841 			error = vcanrecycle(vp);
842 			/*
843 			 * Skip over it if its filesystem is being suspended.
844 			 */
845 			if (error == 0 &&
846 			    vn_start_write(vp, &vnmp, V_NOWAIT) != 0)
847 				error = EBUSY;
848 
849 			mtx_lock(&vnode_free_list_mtx);
850 			if (error != 0)
851 				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
852 				    v_freelist);
853 			else
854 				break;
855 		}
856 	}
857 	/*
858 	 * Unlocked access to this vp is ok because we are assured that there
859 	 * are no other references to it.
860 	 */
861 	if (vp) {
862 		freevnodes--;
863 		mtx_unlock(&vnode_free_list_mtx);
864 
865 		vp->v_iflag |= VI_DOOMED;
866 		vp->v_iflag &= ~VI_FREE;
867 		cache_purge(vp);
868 		if (vp->v_type != VBAD) {
869 			VOP_UNLOCK(vp, 0, td);
870 			vgone(vp);
871 		} else {
872 			VOP_UNLOCK(vp, 0, td);
873 		}
874 		vn_finished_write(vnmp);
875 
876 #ifdef INVARIANTS
877 		{
878 			if (vp->v_data)
879 				panic("cleaned vnode isn't");
880 			VI_LOCK(vp);
881 			if (vp->v_numoutput)
882 				panic("Clean vnode has pending I/O's");
883 			if (vp->v_writecount != 0)
884 				panic("Non-zero write count");
885 			VI_UNLOCK(vp);
886 		}
887 #endif
888 		if (vp->v_pollinfo) {
889 			mtx_destroy(&vp->v_pollinfo->vpi_lock);
890 			uma_zfree(vnodepoll_zone, vp->v_pollinfo);
891 		}
892 		vp->v_pollinfo = NULL;
893 #ifdef MAC
894 		mac_destroy_vnode(vp);
895 #endif
896 		vp->v_iflag = 0;
897 		vp->v_vflag = 0;
898 		vp->v_lastw = 0;
899 		vp->v_lasta = 0;
900 		vp->v_cstart = 0;
901 		vp->v_clen = 0;
902 		vp->v_socket = 0;
903 		KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
904 		KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
905 	} else {
906 		numvnodes++;
907 		mtx_unlock(&vnode_free_list_mtx);
908 
909 		vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
910 		mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
911 		vp->v_dd = vp;
912 		cache_purge(vp);
913 		LIST_INIT(&vp->v_cache_src);
914 		TAILQ_INIT(&vp->v_cache_dst);
915 	}
916 
917 	TAILQ_INIT(&vp->v_cleanblkhd);
918 	TAILQ_INIT(&vp->v_dirtyblkhd);
919 	vp->v_type = VNON;
920 	vp->v_tag = tag;
921 	vp->v_op = vops;
922 	lockinit(&vp->v_lock, PVFS, "vnlock", VLKTIMEOUT, LK_NOPAUSE);
923 #ifdef MAC
924 	mac_init_vnode(vp);
925 #endif
926 	insmntque(vp, mp);
927 	*vpp = vp;
928 	vp->v_usecount = 1;
929 	vp->v_data = 0;
930 	vp->v_cachedid = -1;
931 
932 	splx(s);
933 
934 #if 0
935 	mp_fixme("This code does not lock access to numvnodes && freevnodes.");
936 	vnodeallocs++;
937 	if (vnodeallocs % vnoderecycleperiod == 0 &&
938 	    freevnodes < vnoderecycleminfreevn &&
939 	    vnoderecyclemintotalvn < numvnodes) {
940 		/* Recycle vnodes. */
941 		cache_purgeleafdirs(vnoderecyclenumber);
942 	}
943 #endif
944 
945 	return (0);
946 }
947 
948 /*
949  * Move a vnode from one mount queue to another.
950  */
951 static void
952 insmntque(vp, mp)
953 	register struct vnode *vp;
954 	register struct mount *mp;
955 {
956 
957 	mtx_lock(&mntvnode_mtx);
958 	/*
959 	 * Delete from old mount point vnode list, if on one.
960 	 */
961 	if (vp->v_mount != NULL)
962 		TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
963 	/*
964 	 * Insert into list of vnodes for the new mount point, if available.
965 	 */
966 	if ((vp->v_mount = mp) == NULL) {
967 		mtx_unlock(&mntvnode_mtx);
968 		return;
969 	}
970 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
971 	mtx_unlock(&mntvnode_mtx);
972 }
973 
974 /*
975  * Update outstanding I/O count and do wakeup if requested.
976  */
977 void
978 vwakeup(bp)
979 	register struct buf *bp;
980 {
981 	register struct vnode *vp;
982 
983 	bp->b_flags &= ~B_WRITEINPROG;
984 	if ((vp = bp->b_vp)) {
985 		VI_LOCK(vp);
986 		vp->v_numoutput--;
987 		if (vp->v_numoutput < 0)
988 			panic("vwakeup: neg numoutput");
989 		if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
990 			vp->v_iflag &= ~VI_BWAIT;
991 			wakeup(&vp->v_numoutput);
992 		}
993 		VI_UNLOCK(vp);
994 	}
995 }
996 
997 /*
998  * Flush out and invalidate all buffers associated with a vnode.
999  * Called with the underlying object locked.
1000  */
1001 int
1002 vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
1003 	struct vnode *vp;
1004 	int flags;
1005 	struct ucred *cred;
1006 	struct thread *td;
1007 	int slpflag, slptimeo;
1008 {
1009 	struct buf *blist;
1010 	int s, error;
1011 	vm_object_t object;
1012 
1013 	GIANT_REQUIRED;
1014 
1015 	if (flags & V_SAVE) {
1016 		s = splbio();
1017 		VI_LOCK(vp);
1018 		while (vp->v_numoutput) {
1019 			vp->v_iflag |= VI_BWAIT;
1020 			error = msleep(&vp->v_numoutput, VI_MTX(vp),
1021 			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
1022 			if (error) {
1023 				VI_UNLOCK(vp);
1024 				splx(s);
1025 				return (error);
1026 			}
1027 		}
1028 		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1029 			splx(s);
1030 			VI_UNLOCK(vp);
1031 			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
1032 				return (error);
1033 			/*
1034 			 * XXX We could save a lock/unlock if this was only
1035 			 * enabled under INVARIANTS
1036 			 */
1037 			VI_LOCK(vp);
1038 			s = splbio();
1039 			if (vp->v_numoutput > 0 ||
1040 			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
1041 				panic("vinvalbuf: dirty bufs");
1042 		}
1043 		VI_UNLOCK(vp);
1044 		splx(s);
1045 	}
1046 	s = splbio();
1047 	for (error = 0;;) {
1048 		if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
1049 		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
1050 			if (error)
1051 				break;
1052 			continue;
1053 		}
1054 		if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
1055 		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
1056 			if (error)
1057 				break;
1058 			continue;
1059 		}
1060 		break;
1061 	}
1062 	if (error) {
1063 		splx(s);
1064 		return (error);
1065 	}
1066 
1067 	/*
1068 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1069 	 * have write I/O in-progress but if there is a VM object then the
1070 	 * VM object can also have read-I/O in-progress.
1071 	 */
1072 	VI_LOCK(vp);
1073 	do {
1074 		while (vp->v_numoutput > 0) {
1075 			vp->v_iflag |= VI_BWAIT;
1076 			msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
1077 		}
1078 		VI_UNLOCK(vp);
1079 		if (VOP_GETVOBJECT(vp, &object) == 0) {
1080 			while (object->paging_in_progress)
1081 				vm_object_pip_sleep(object, "vnvlbx");
1082 		}
1083 		VI_LOCK(vp);
1084 	} while (vp->v_numoutput > 0);
1085 	VI_UNLOCK(vp);
1086 
1087 	splx(s);
1088 
1089 	/*
1090 	 * Destroy the copy in the VM cache, too.
1091 	 */
1092 	if (VOP_GETVOBJECT(vp, &object) == 0) {
1093 		vm_object_page_remove(object, 0, 0,
1094 			(flags & V_SAVE) ? TRUE : FALSE);
1095 	}
1096 
1097 	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
1098 	    (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
1099 	     !TAILQ_EMPTY(&vp->v_cleanblkhd)))
1100 		panic("vinvalbuf: flush failed");
1101 	return (0);
1102 }
1103 
1104 /*
1105  * Flush out buffers on the specified list.
1106  */
1107 static int
1108 flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
1109 	struct buf *blist;
1110 	int flags;
1111 	struct vnode *vp;
1112 	int slpflag, slptimeo;
1113 	int *errorp;
1114 {
1115 	struct buf *bp, *nbp;
1116 	int found, error;
1117 
1118 	for (found = 0, bp = blist; bp; bp = nbp) {
1119 		nbp = TAILQ_NEXT(bp, b_vnbufs);
1120 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1121 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))
1122 			continue;
1123 		found += 1;
1124 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1125 			error = BUF_TIMELOCK(bp,
1126 			    LK_EXCLUSIVE | LK_SLEEPFAIL,
1127 			    "flushbuf", slpflag, slptimeo);
1128 			if (error != ENOLCK)
1129 				*errorp = error;
1130 			return (found);
1131 		}
1132 		/*
1133 		 * XXX Since there are no node locks for NFS, I
1134 		 * believe there is a slight chance that a delayed
1135 		 * write will occur while sleeping just above, so
1136 		 * check for it.  Note that vfs_bio_awrite expects
1137 		 * buffers to reside on a queue, while BUF_WRITE and
1138 		 * brelse do not.
1139 		 */
1140 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1141 			(flags & V_SAVE)) {
1142 
1143 			if (bp->b_vp == vp) {
1144 				if (bp->b_flags & B_CLUSTEROK) {
1145 					BUF_UNLOCK(bp);
1146 					vfs_bio_awrite(bp);
1147 				} else {
1148 					bremfree(bp);
1149 					bp->b_flags |= B_ASYNC;
1150 					BUF_WRITE(bp);
1151 				}
1152 			} else {
1153 				bremfree(bp);
1154 				(void) BUF_WRITE(bp);
1155 			}
1156 			return (found);
1157 		}
1158 		bremfree(bp);
1159 		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
1160 		bp->b_flags &= ~B_ASYNC;
1161 		brelse(bp);
1162 	}
1163 	return (found);
1164 }
1165 
1166 /*
1167  * Truncate a file's buffer and pages to a specified length.  This
1168  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1169  * sync activity.
1170  */
1171 int
1172 vtruncbuf(vp, cred, td, length, blksize)
1173 	register struct vnode *vp;
1174 	struct ucred *cred;
1175 	struct thread *td;
1176 	off_t length;
1177 	int blksize;
1178 {
1179 	register struct buf *bp;
1180 	struct buf *nbp;
1181 	int s, anyfreed;
1182 	int trunclbn;
1183 
1184 	/*
1185 	 * Round up to the *next* lbn.
1186 	 */
1187 	trunclbn = (length + blksize - 1) / blksize;
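	/*
	 * Illustrative numbers (hypothetical, not computed here): truncating
	 * to a length of 10000 bytes with a 4096-byte block size gives
	 * trunclbn = (10000 + 4095) / 4096 == 3, so buffers at logical
	 * block 3 and beyond are invalidated while blocks 0-2 are kept.
	 */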
1188 
1189 	s = splbio();
1190 restart:
1191 	anyfreed = 1;
1192 	for (;anyfreed;) {
1193 		anyfreed = 0;
1194 		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1195 			nbp = TAILQ_NEXT(bp, b_vnbufs);
1196 			if (bp->b_lblkno >= trunclbn) {
1197 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1198 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1199 					goto restart;
1200 				} else {
1201 					bremfree(bp);
1202 					bp->b_flags |= (B_INVAL | B_RELBUF);
1203 					bp->b_flags &= ~B_ASYNC;
1204 					brelse(bp);
1205 					anyfreed = 1;
1206 				}
1207 				if (nbp &&
1208 				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1209 				    (nbp->b_vp != vp) ||
1210 				    (nbp->b_flags & B_DELWRI))) {
1211 					goto restart;
1212 				}
1213 			}
1214 		}
1215 
1216 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1217 			nbp = TAILQ_NEXT(bp, b_vnbufs);
1218 			if (bp->b_lblkno >= trunclbn) {
1219 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1220 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1221 					goto restart;
1222 				} else {
1223 					bremfree(bp);
1224 					bp->b_flags |= (B_INVAL | B_RELBUF);
1225 					bp->b_flags &= ~B_ASYNC;
1226 					brelse(bp);
1227 					anyfreed = 1;
1228 				}
1229 				if (nbp &&
1230 				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1231 				    (nbp->b_vp != vp) ||
1232 				    (nbp->b_flags & B_DELWRI) == 0)) {
1233 					goto restart;
1234 				}
1235 			}
1236 		}
1237 	}
1238 
1239 	if (length > 0) {
1240 restartsync:
1241 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1242 			nbp = TAILQ_NEXT(bp, b_vnbufs);
1243 			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
1244 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1245 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1246 					goto restart;
1247 				} else {
1248 					bremfree(bp);
1249 					if (bp->b_vp == vp) {
1250 						bp->b_flags |= B_ASYNC;
1251 					} else {
1252 						bp->b_flags &= ~B_ASYNC;
1253 					}
1254 					BUF_WRITE(bp);
1255 				}
1256 				goto restartsync;
1257 			}
1258 
1259 		}
1260 	}
1261 
1262 	VI_LOCK(vp);
1263 	while (vp->v_numoutput > 0) {
1264 		vp->v_iflag |= VI_BWAIT;
1265 		msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
1266 	}
1267 	VI_UNLOCK(vp);
1268 	splx(s);
1269 
1270 	vnode_pager_setsize(vp, length);
1271 
1272 	return (0);
1273 }
1274 
1275 /*
1276  * buf_splay() - splay tree core for the clean/dirty list of buffers in
1277  * 		 a vnode.
1278  *
1279  *	NOTE: We have to deal with the special case of a background bitmap
1280  *	buffer, a situation where two buffers will have the same logical
1281  *	block offset.  We want (1) only the foreground buffer to be accessed
1282 	 *	in a lookup and (2) to differentiate between the foreground and
1283  *	background buffer in the splay tree algorithm because the splay
1284  *	tree cannot normally handle multiple entities with the same 'index'.
1285  *	We accomplish this by adding differentiating flags to the splay tree's
1286  *	numerical domain.
1287  */
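/*
 * Ordering detail (inferred from the comparisons below): at equal
 * b_lblkno a buffer without BX_BKGRDMARKER compares as "less than" one
 * carrying the flag, so a background shadow sorts just after its
 * foreground counterpart.  gbincore() splays with an xflags of 0 and
 * rejects any BX_BKGRDMARKER root it lands on, which is what keeps
 * background bitmap buffers out of normal lookups.
 */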
1288 static
1289 struct buf *
1290 buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1291 {
1292 	struct buf dummy;
1293 	struct buf *lefttreemax, *righttreemin, *y;
1294 
1295 	if (root == NULL)
1296 		return (NULL);
1297 	lefttreemax = righttreemin = &dummy;
1298 	for (;;) {
1299 		if (lblkno < root->b_lblkno ||
1300 		    (lblkno == root->b_lblkno &&
1301 		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1302 			if ((y = root->b_left) == NULL)
1303 				break;
1304 			if (lblkno < y->b_lblkno) {
1305 				/* Rotate right. */
1306 				root->b_left = y->b_right;
1307 				y->b_right = root;
1308 				root = y;
1309 				if ((y = root->b_left) == NULL)
1310 					break;
1311 			}
1312 			/* Link into the new root's right tree. */
1313 			righttreemin->b_left = root;
1314 			righttreemin = root;
1315 		} else if (lblkno > root->b_lblkno ||
1316 		    (lblkno == root->b_lblkno &&
1317 		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1318 			if ((y = root->b_right) == NULL)
1319 				break;
1320 			if (lblkno > y->b_lblkno) {
1321 				/* Rotate left. */
1322 				root->b_right = y->b_left;
1323 				y->b_left = root;
1324 				root = y;
1325 				if ((y = root->b_right) == NULL)
1326 					break;
1327 			}
1328 			/* Link into the new root's left tree. */
1329 			lefttreemax->b_right = root;
1330 			lefttreemax = root;
1331 		} else {
1332 			break;
1333 		}
1334 		root = y;
1335 	}
1336 	/* Assemble the new root. */
1337 	lefttreemax->b_right = root->b_left;
1338 	righttreemin->b_left = root->b_right;
1339 	root->b_left = dummy.b_right;
1340 	root->b_right = dummy.b_left;
1341 	return (root);
1342 }
1343 
1344 static
1345 void
1346 buf_vlist_remove(struct buf *bp)
1347 {
1348 	struct vnode *vp = bp->b_vp;
1349 	struct buf *root;
1350 
1351 	if (bp->b_xflags & BX_VNDIRTY) {
1352 		if (bp != vp->v_dirtyblkroot) {
1353 			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1354 			KASSERT(root == bp, ("splay lookup failed during dirty remove"));
1355 		}
1356 		if (bp->b_left == NULL) {
1357 			root = bp->b_right;
1358 		} else {
1359 			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1360 			root->b_right = bp->b_right;
1361 		}
1362 		vp->v_dirtyblkroot = root;
1363 		TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
1364 	} else {
1365 		/* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
1366 		if (bp != vp->v_cleanblkroot) {
1367 			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1368 			KASSERT(root == bp, ("splay lookup failed during clean remove"));
1369 		}
1370 		if (bp->b_left == NULL) {
1371 			root = bp->b_right;
1372 		} else {
1373 			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1374 			root->b_right = bp->b_right;
1375 		}
1376 		vp->v_cleanblkroot = root;
1377 		TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
1378 	}
1379 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1380 }
1381 
1382 /*
1383  * Add the buffer to the sorted clean or dirty block list using a
1384  * splay tree algorithm.
1385  *
1386  * NOTE: xflags is passed as a constant, optimizing this inline function!
1387  */
1388 static
1389 void
1390 buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
1391 {
1392 	struct buf *root;
1393 
1394 	bp->b_xflags |= xflags;
1395 	if (xflags & BX_VNDIRTY) {
1396 		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1397 		if (root == NULL) {
1398 			bp->b_left = NULL;
1399 			bp->b_right = NULL;
1400 			TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
1401 		} else if (bp->b_lblkno < root->b_lblkno ||
1402 		    (bp->b_lblkno == root->b_lblkno &&
1403 		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1404 			bp->b_left = root->b_left;
1405 			bp->b_right = root;
1406 			root->b_left = NULL;
1407 			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1408 		} else {
1409 			bp->b_right = root->b_right;
1410 			bp->b_left = root;
1411 			root->b_right = NULL;
1412 			TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
1413 			    root, bp, b_vnbufs);
1414 		}
1415 		vp->v_dirtyblkroot = bp;
1416 	} else {
1417 		/* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
1418 		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1419 		if (root == NULL) {
1420 			bp->b_left = NULL;
1421 			bp->b_right = NULL;
1422 			TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
1423 		} else if (bp->b_lblkno < root->b_lblkno ||
1424 		    (bp->b_lblkno == root->b_lblkno &&
1425 		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1426 			bp->b_left = root->b_left;
1427 			bp->b_right = root;
1428 			root->b_left = NULL;
1429 			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1430 		} else {
1431 			bp->b_right = root->b_right;
1432 			bp->b_left = root;
1433 			root->b_right = NULL;
1434 			TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
1435 			    root, bp, b_vnbufs);
1436 		}
1437 		vp->v_cleanblkroot = bp;
1438 	}
1439 }
1440 
1441 #ifndef USE_BUFHASH
1442 
1443 /*
1444  * Lookup a buffer using the splay tree.  Note that we specifically avoid
1445  * shadow buffers used in background bitmap writes.
1446  *
1447  * This code isn't quite as efficient as it could be because we are maintaining
1448  * two sorted lists and do not know which list the block resides in.
1449  */
1450 struct buf *
1451 gbincore(struct vnode *vp, daddr_t lblkno)
1452 {
1453 	struct buf *bp;
1454 
1455 	GIANT_REQUIRED;
1456 
1457 	bp = vp->v_cleanblkroot = buf_splay(lblkno, 0, vp->v_cleanblkroot);
1458 	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1459 		return(bp);
1460 	bp = vp->v_dirtyblkroot = buf_splay(lblkno, 0, vp->v_dirtyblkroot);
1461 	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1462 		return(bp);
1463 	return(NULL);
1464 }
1465 
1466 #endif
1467 
1468 /*
1469  * Associate a buffer with a vnode.
1470  */
1471 void
1472 bgetvp(vp, bp)
1473 	register struct vnode *vp;
1474 	register struct buf *bp;
1475 {
1476 	int s;
1477 
1478 	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
1479 
1480 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1481 	    ("bgetvp: bp already attached! %p", bp));
1482 
1483 	vhold(vp);
1484 	bp->b_vp = vp;
1485 	bp->b_dev = vn_todev(vp);
1486 	/*
1487 	 * Insert onto list for new vnode.
1488 	 */
1489 	s = splbio();
1490 	buf_vlist_add(bp, vp, BX_VNCLEAN);
1491 	splx(s);
1492 }
1493 
1494 /*
1495  * Disassociate a buffer from a vnode.
1496  */
1497 void
1498 brelvp(bp)
1499 	register struct buf *bp;
1500 {
1501 	struct vnode *vp;
1502 	int s;
1503 
1504 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1505 
1506 	/*
1507 	 * Delete from old vnode list, if on one.
1508 	 */
1509 	vp = bp->b_vp;
1510 	s = splbio();
1511 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1512 		buf_vlist_remove(bp);
1513 	VI_LOCK(vp);
1514 	if ((vp->v_iflag & VI_ONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1515 		vp->v_iflag &= ~VI_ONWORKLST;
1516 		LIST_REMOVE(vp, v_synclist);
1517 	}
1518 	VI_UNLOCK(vp);
1519 	splx(s);
1520 	bp->b_vp = (struct vnode *) 0;
1521 	vdrop(vp);
1522 	if (bp->b_object)
1523 		bp->b_object = NULL;
1524 }
1525 
1526 /*
1527  * Add an item to the syncer work queue.
1528  */
1529 static void
1530 vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1531 {
1532 	int s, slot;
1533 
1534 	s = splbio();
1535 	ASSERT_VI_LOCKED(vp);
1536 
1537 	if (vp->v_iflag & VI_ONWORKLST)
1538 		LIST_REMOVE(vp, v_synclist);
1539 	else
1540 		vp->v_iflag |= VI_ONWORKLST;
1541 
1542 	if (delay > syncer_maxdelay - 2)
1543 		delay = syncer_maxdelay - 2;
1544 	slot = (syncer_delayno + delay) & syncer_mask;
1545 
1546 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1547 
1548 	splx(s);
1549 }
1550 
1551 struct  proc *updateproc;
1552 static void sched_sync(void);
1553 static struct kproc_desc up_kp = {
1554 	"syncer",
1555 	sched_sync,
1556 	&updateproc
1557 };
1558 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1559 
1560 /*
1561  * System filesystem synchronizer daemon.
1562  */
1563 void
1564 sched_sync(void)
1565 {
1566 	struct synclist *slp;
1567 	struct vnode *vp;
1568 	struct mount *mp;
1569 	long starttime;
1570 	int s;
1571 	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);  /* XXXKSE */
1572 
1573 	mtx_lock(&Giant);
1574 
1575 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
1576 	    SHUTDOWN_PRI_LAST);
1577 
1578 	for (;;) {
1579 		kthread_suspend_check(td->td_proc);
1580 
1581 		starttime = time_second;
1582 
1583 		/*
1584 		 * Push files whose dirty time has expired.  Be careful
1585 		 * of interrupt race on slp queue.
1586 		 */
1587 		s = splbio();
1588 		slp = &syncer_workitem_pending[syncer_delayno];
1589 		syncer_delayno += 1;
1590 		if (syncer_delayno == syncer_maxdelay)
1591 			syncer_delayno = 0;
1592 		splx(s);
1593 
1594 		while ((vp = LIST_FIRST(slp)) != NULL) {
1595 			if (VOP_ISLOCKED(vp, NULL) == 0 &&
1596 			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
1597 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1598 				(void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
1599 				VOP_UNLOCK(vp, 0, td);
1600 				vn_finished_write(mp);
1601 			}
1602 			s = splbio();
1603 			if (LIST_FIRST(slp) == vp) {
1604 				/*
1605 				 * Note: v_tag VT_VFS vps can remain on the
1606 				 * worklist too with no dirty blocks, but
1607 				 * since sync_fsync() moves them to a different
1608 				 * slot we are safe.
1609 				 */
1610 				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1611 				    !vn_isdisk(vp, NULL))
1612 					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1613 				/*
1614 				 * Put us back on the worklist.  The worklist
1615 				 * routine will remove us from our current
1616 				 * position and then add us back in at a later
1617 				 * position.
1618 				 */
1619 				VI_LOCK(vp);
1620 				vn_syncer_add_to_worklist(vp, syncdelay);
1621 				VI_UNLOCK(vp);
1622 			}
1623 			splx(s);
1624 		}
1625 
1626 		/*
1627 		 * Do soft update processing.
1628 		 */
1629 		if (softdep_process_worklist_hook != NULL)
1630 			(*softdep_process_worklist_hook)(NULL);
1631 
1632 		/*
1633 		 * The variable rushjob allows the kernel to speed up the
1634 		 * processing of the filesystem syncer process. A rushjob
1635 		 * value of N tells the filesystem syncer to process the next
1636 		 * N seconds worth of work on its queue ASAP. Currently rushjob
1637 		 * is used by the soft update code to speed up the filesystem
1638 		 * syncer process when the incore state is getting so far
1639 		 * ahead of the disk that the kernel memory pool is being
1640 		 * threatened with exhaustion.
1641 		 */
1642 		if (rushjob > 0) {
1643 			rushjob -= 1;
1644 			continue;
1645 		}
1646 		/*
1647 		 * If it has taken us less than a second to process the
1648 		 * current work, then wait. Otherwise start right over
1649 		 * again. We can still lose time if any single round
1650 		 * takes more than two seconds, but it does not really
1651 		 * matter as we are just trying to generally pace the
1652 		 * filesystem activity.
1653 		 */
1654 		if (time_second == starttime)
1655 			tsleep(&lbolt, PPAUSE, "syncer", 0);
1656 	}
1657 }
1658 
1659 /*
1660  * Request the syncer daemon to speed up its work.
1661  * We never push it to speed up more than half of its
1662  * normal turn time; otherwise it could take over the cpu.
1663  * XXXKSE  only one update?
1664  */
1665 int
1666 speedup_syncer()
1667 {
1668 
1669 	mtx_lock_spin(&sched_lock);
1670 	if (FIRST_THREAD_IN_PROC(updateproc)->td_wchan == &lbolt) /* XXXKSE */
1671 		setrunnable(FIRST_THREAD_IN_PROC(updateproc));
1672 	mtx_unlock_spin(&sched_lock);
1673 	if (rushjob < syncdelay / 2) {
1674 		rushjob += 1;
1675 		stat_rush_requests += 1;
1676 		return (1);
1677 	}
1678 	return(0);
1679 }
1680 
1681 /*
1682  * Associate a p-buffer with a vnode.
1683  *
1684  * Also sets B_PAGING flag to indicate that vnode is not fully associated
1685  * with the buffer.  i.e. the bp has not been linked into the vnode or
1686  * ref-counted.
1687  */
1688 void
1689 pbgetvp(vp, bp)
1690 	register struct vnode *vp;
1691 	register struct buf *bp;
1692 {
1693 
1694 	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1695 
1696 	bp->b_vp = vp;
1697 	bp->b_flags |= B_PAGING;
1698 	bp->b_dev = vn_todev(vp);
1699 }
1700 
1701 /*
1702  * Disassociate a p-buffer from a vnode.
1703  */
1704 void
1705 pbrelvp(bp)
1706 	register struct buf *bp;
1707 {
1708 
1709 	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1710 
1711 	/* XXX REMOVE ME */
1712 	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1713 		panic(
1714 		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1715 		    bp,
1716 		    (int)bp->b_flags
1717 		);
1718 	}
1719 	bp->b_vp = (struct vnode *) 0;
1720 	bp->b_flags &= ~B_PAGING;
1721 }
1722 
1723 /*
1724  * Reassign a buffer from one vnode to another.
1725  * Used to assign file specific control information
1726  * (indirect blocks) to the vnode to which they belong.
1727  */
1728 void
1729 reassignbuf(bp, newvp)
1730 	register struct buf *bp;
1731 	register struct vnode *newvp;
1732 {
1733 	int delay;
1734 	int s;
1735 
1736 	if (newvp == NULL) {
1737 		printf("reassignbuf: NULL");
1738 		return;
1739 	}
1740 	++reassignbufcalls;
1741 
1742 	/*
1743 	 * B_PAGING flagged buffers cannot be reassigned because their vp
1744 	 * is not fully linked in.
1745 	 */
1746 	if (bp->b_flags & B_PAGING)
1747 		panic("cannot reassign paging buffer");
1748 
1749 	s = splbio();
1750 	/*
1751 	 * Delete from old vnode list, if on one.
1752 	 */
1753 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1754 		buf_vlist_remove(bp);
1755 		if (bp->b_vp != newvp) {
1756 			vdrop(bp->b_vp);
1757 			bp->b_vp = NULL;	/* for clarification */
1758 		}
1759 	}
1760 	/*
1761 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1762 	 * of clean buffers.
1763 	 */
1764 	if (bp->b_flags & B_DELWRI) {
1765 		VI_LOCK(newvp);
1766 		if ((newvp->v_iflag & VI_ONWORKLST) == 0) {
1767 			switch (newvp->v_type) {
1768 			case VDIR:
1769 				delay = dirdelay;
1770 				break;
1771 			case VCHR:
1772 				if (newvp->v_rdev->si_mountpoint != NULL) {
1773 					delay = metadelay;
1774 					break;
1775 				}
1776 				/* fall through */
1777 			default:
1778 				delay = filedelay;
1779 			}
1780 			vn_syncer_add_to_worklist(newvp, delay);
1781 		}
1782 		VI_UNLOCK(newvp);
1783 		buf_vlist_add(bp, newvp, BX_VNDIRTY);
1784 	} else {
1785 		buf_vlist_add(bp, newvp, BX_VNCLEAN);
1786 
1787 		VI_LOCK(newvp);
1788 		if ((newvp->v_iflag & VI_ONWORKLST) &&
1789 		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1790 			newvp->v_iflag &= ~VI_ONWORKLST;
1791 			LIST_REMOVE(newvp, v_synclist);
1792 		}
1793 		VI_UNLOCK(newvp);
1794 	}
1795 	if (bp->b_vp != newvp) {
1796 		bp->b_vp = newvp;
1797 		vhold(bp->b_vp);
1798 	}
1799 	splx(s);
1800 }
1801 
1802 /*
1803  * Create a vnode for a device.
1804  * Used for mounting the root filesystem.
1805  */
1806 int
1807 bdevvp(dev, vpp)
1808 	dev_t dev;
1809 	struct vnode **vpp;
1810 {
1811 	register struct vnode *vp;
1812 	struct vnode *nvp;
1813 	int error;
1814 
1815 	if (dev == NODEV) {
1816 		*vpp = NULLVP;
1817 		return (ENXIO);
1818 	}
1819 	if (vfinddev(dev, VCHR, vpp))
1820 		return (0);
1821 	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1822 	if (error) {
1823 		*vpp = NULLVP;
1824 		return (error);
1825 	}
1826 	vp = nvp;
1827 	vp->v_type = VCHR;
1828 	addalias(vp, dev);
1829 	*vpp = vp;
1830 	return (0);
1831 }
1832 
1833 /*
1834  * Add vnode to the alias list hung off the dev_t.
1835  *
1836  * The reason for this gunk is that multiple vnodes can reference
1837  * the same physical device, so checking vp->v_usecount to see
1838  * how many users there are is inadequate; the v_usecounts of all
1839  * the vnodes need to be accumulated.  vcount() does that.
1840  */
1841 struct vnode *
1842 addaliasu(nvp, nvp_rdev)
1843 	struct vnode *nvp;
1844 	udev_t nvp_rdev;
1845 {
1846 	struct vnode *ovp;
1847 	vop_t **ops;
1848 	dev_t dev;
1849 
1850 	if (nvp->v_type == VBLK)
1851 		return (nvp);
1852 	if (nvp->v_type != VCHR)
1853 		panic("addaliasu on non-special vnode");
1854 	dev = udev2dev(nvp_rdev, 0);
1855 	/*
1856 	 * Check to see if we have a bdevvp vnode with no associated
1857 	 * filesystem. If so, we want to associate the filesystem of
1858  * the newly instantiated vnode with the bdevvp vnode and
1859 	 * discard the newly created vnode rather than leaving the
1860 	 * bdevvp vnode lying around with no associated filesystem.
1861 	 */
1862 	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1863 		addalias(nvp, dev);
1864 		return (nvp);
1865 	}
1866 	/*
1867 	 * Discard unneeded vnode, but save its node specific data.
1868 	 * Note that if there is a lock, it is carried over in the
1869 	 * node specific data to the replacement vnode.
1870 	 */
1871 	vref(ovp);
1872 	ovp->v_data = nvp->v_data;
1873 	ovp->v_tag = nvp->v_tag;
1874 	nvp->v_data = NULL;
1875 	lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
1876 	    nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
1877 	if (nvp->v_vnlock)
1878 		ovp->v_vnlock = &ovp->v_lock;
1879 	ops = ovp->v_op;
1880 	ovp->v_op = nvp->v_op;
1881 	if (VOP_ISLOCKED(nvp, curthread)) {
1882 		VOP_UNLOCK(nvp, 0, curthread);
1883 		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
1884 	}
1885 	nvp->v_op = ops;
1886 	insmntque(ovp, nvp->v_mount);
1887 	vrele(nvp);
1888 	vgone(nvp);
1889 	return (ovp);
1890 }
1891 
1892 /* This is a local helper function that does the same as addaliasu, but
1893  * for a dev_t instead of a udev_t. */
1894 static void
1895 addalias(nvp, dev)
1896 	struct vnode *nvp;
1897 	dev_t dev;
1898 {
1899 
1900 	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
1901 	nvp->v_rdev = dev;
1902 	mtx_lock(&spechash_mtx);
1903 	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1904 	mtx_unlock(&spechash_mtx);
1905 }
1906 
1907 /*
1908  * Grab a particular vnode from the free list, increment its
1909  * reference count and lock it. The vnode lock bit is set if the
1910  * vnode is being eliminated in vgone. The process is awakened
1911  * when the transition is completed, and an error returned to
1912  * indicate that the vnode is no longer usable (possibly having
1913  * been changed to a new filesystem type).
1914  */
1915 int
1916 vget(vp, flags, td)
1917 	register struct vnode *vp;
1918 	int flags;
1919 	struct thread *td;
1920 {
1921 	int error;
1922 
1923 	/*
1924 	 * If the vnode is in the process of being cleaned out for
1925 	 * another use, we wait for the cleaning to finish and then
1926 	 * return failure. Cleaning is determined by checking that
1927 	 * the VI_XLOCK flag is set.
1928 	 */
1929 	if ((flags & LK_INTERLOCK) == 0)
1930 		VI_LOCK(vp);
1931 	if (vp->v_iflag & VI_XLOCK) {
1932 		if (vp->v_vxproc == curthread) {
1933 #if 0
1934 			/* this can now occur in normal operation */
1935 			log(LOG_INFO, "VXLOCK interlock avoided\n");
1936 #endif
1937 		} else {
1938 			vp->v_iflag |= VI_XWANT;
1939 			msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
1940 			mp_fixme("interlock not released.");
1941 			return (ENOENT);
1942 		}
1943 	}
1944 
1945 	vp->v_usecount++;
1946 
1947 	if (VSHOULDBUSY(vp))
1948 		vbusy(vp);
1949 	if (flags & LK_TYPE_MASK) {
1950 		if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
1951 			/*
1952 			 * must expand vrele here because we do not want
1953 			 * to call VOP_INACTIVE if the reference count
1954 			 * drops back to zero since it was never really
1955 			 * active. We must remove it from the free list
1956 			 * before sleeping so that multiple processes do
1957 			 * not try to recycle it.
1958 			 */
1959 			VI_LOCK(vp);
1960 			vp->v_usecount--;
1961 			if (VSHOULDFREE(vp))
1962 				vfree(vp);
1963 			else
1964 				vlruvp(vp);
1965 			VI_UNLOCK(vp);
1966 		}
1967 		return (error);
1968 	}
1969 	VI_UNLOCK(vp);
1970 	return (0);
1971 }
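
/*
 * Illustrative sketch (not compiled): a typical vget()/vput() pairing.
 * A caller that wants the vnode referenced and exclusively locked passes
 * LK_EXCLUSIVE and later drops both the lock and the reference with a
 * single vput().  The helper name is hypothetical.
 */
#if 0
static int
example_use_vnode(struct vnode *vp, struct thread *td)
{
	int error;

	/* ENOENT here means the vnode was reclaimed out from under us. */
	error = vget(vp, LK_EXCLUSIVE, td);
	if (error)
		return (error);
	/* ... operate on the locked, referenced vnode ... */
	vput(vp);
	return (0);
}
#endif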
1972 
1973 /*
1974  * Increase the reference count of a vnode.
1975  */
1976 void
1977 vref(struct vnode *vp)
1978 {
1979 	mtx_lock(&vp->v_interlock);
1980 	vp->v_usecount++;
1981 	mtx_unlock(&vp->v_interlock);
1982 }
1983 
1984 /*
1985  * Vnode put/release.
1986  * If count drops to zero, call inactive routine and return to freelist.
1987  */
1988 void
1989 vrele(vp)
1990 	struct vnode *vp;
1991 {
1992 	struct thread *td = curthread;	/* XXX */
1993 
1994 	KASSERT(vp != NULL, ("vrele: null vp"));
1995 
1996 	VI_LOCK(vp);
1997 
1998 	/* Skip this v_writecount check if we're going to panic below. */
1999 	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2000 	    ("vrele: missed vn_close"));
2001 
2002 	if (vp->v_usecount > 1) {
2003 
2004 		vp->v_usecount--;
2005 		VI_UNLOCK(vp);
2006 
2007 		return;
2008 	}
2009 
2010 	if (vp->v_usecount == 1) {
2011 		vp->v_usecount--;
2012 		/*
2013 		 * We must call VOP_INACTIVE with the node locked.
2014 		 * If we are doing a vput, the node is already locked,
2015 		 * but, in the case of vrele, we must explicitly lock
2016 		 * the vnode before calling VOP_INACTIVE.
2017 		 */
2018 		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0)
2019 			VOP_INACTIVE(vp, td);
2020 		VI_LOCK(vp);
2021 		if (VSHOULDFREE(vp))
2022 			vfree(vp);
2023 		else
2024 			vlruvp(vp);
2025 		VI_UNLOCK(vp);
2026 
2027 	} else {
2028 #ifdef DIAGNOSTIC
2029 		vprint("vrele: negative ref count", vp);
2030 		VI_UNLOCK(vp);
2031 #endif
2032 		panic("vrele: negative ref cnt");
2033 	}
2034 }
2035 
2036 /*
2037  * Release an already locked vnode.  This gives the same effect as
2038  * unlock+vrele(), but takes less time and avoids releasing and
2039  * re-acquiring the lock (as vrele() acquires the lock internally).
2040  */
2041 void
2042 vput(vp)
2043 	struct vnode *vp;
2044 {
2045 	struct thread *td = curthread;	/* XXX */
2046 
2047 	GIANT_REQUIRED;
2048 
2049 	KASSERT(vp != NULL, ("vput: null vp"));
2050 	mtx_lock(&vp->v_interlock);
2051 	/* Skip this v_writecount check if we're going to panic below. */
2052 	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2053 	    ("vput: missed vn_close"));
2054 
2055 	if (vp->v_usecount > 1) {
2056 		vp->v_usecount--;
2057 		VOP_UNLOCK(vp, LK_INTERLOCK, td);
2058 		return;
2059 	}
2060 
2061 	if (vp->v_usecount == 1) {
2062 		vp->v_usecount--;
2063 		/*
2064 		 * We must call VOP_INACTIVE with the node locked.
2065 		 * If we are doing a vput, the node is already locked,
2066 		 * so we just need to release the vnode mutex.
2067 		 */
2068 		VI_UNLOCK(vp);
2069 		VOP_INACTIVE(vp, td);
2070 		VI_LOCK(vp);
2071 		if (VSHOULDFREE(vp))
2072 			vfree(vp);
2073 		else
2074 			vlruvp(vp);
2075 		VI_UNLOCK(vp);
2076 
2077 	} else {
2078 #ifdef DIAGNOSTIC
2079 		vprint("vput: negative ref count", vp);
2080 #endif
2081 		panic("vput: negative ref cnt");
2082 	}
2083 }
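
/*
 * Illustrative sketch (not compiled): the two equivalent ways of dropping
 * a locked, referenced vnode described above.  The helper name is
 * hypothetical.
 */
#if 0
static void
example_drop_locked_vnode(struct vnode *vp, struct thread *td, int use_vput)
{

	if (use_vput) {
		/* Preferred: unlock and release in a single call. */
		vput(vp);
	} else {
		/* Equivalent, but vrele() reacquires the interlock. */
		VOP_UNLOCK(vp, 0, td);
		vrele(vp);
	}
}
#endif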
2084 
2085 /*
2086  * Somebody doesn't want the vnode recycled.
2087  */
2088 void
2089 vhold(vp)
2090 	register struct vnode *vp;
2091 {
2092 	int s;
2093 
2094 	s = splbio();
2095 	vp->v_holdcnt++;
2096 	VI_LOCK(vp);
2097 	if (VSHOULDBUSY(vp))
2098 		vbusy(vp);
2099 	VI_UNLOCK(vp);
2100 	splx(s);
2101 }
2102 
2103 /*
2104  * Note that there is one less who cares about this vnode.  vdrop() is the
2105  * opposite of vhold().
2106  */
2107 void
2108 vdrop(vp)
2109 	register struct vnode *vp;
2110 {
2111 	int s;
2112 
2113 	s = splbio();
2114 	if (vp->v_holdcnt <= 0)
2115 		panic("vdrop: holdcnt");
2116 	vp->v_holdcnt--;
2117 	VI_LOCK(vp);
2118 	if (VSHOULDFREE(vp))
2119 		vfree(vp);
2120 	else
2121 		vlruvp(vp);
2122 	VI_UNLOCK(vp);
2123 	splx(s);
2124 }
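
/*
 * Illustrative sketch (not compiled): a vhold()/vdrop() pairing.  A hold
 * keeps the vnode from being recycled without adding a use reference,
 * much as the buffer code does while a buffer stays associated with a
 * vnode.  The helper name is hypothetical.
 */
#if 0
static void
example_pin_vnode(struct vnode *vp)
{

	vhold(vp);
	/* ... the vnode cannot be recycled while the hold is in place ... */
	vdrop(vp);
}
#endif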
2125 
2126 /*
2127  * Remove any vnodes in the vnode table belonging to mount point mp.
2128  *
2129  * If FORCECLOSE is not specified, there should not be any active ones;
2130  * an error is returned if any are found (nb: this is a user error, not a
2131  * system error). If FORCECLOSE is specified, detach any active vnodes
2132  * that are found.
2133  *
2134  * If WRITECLOSE is set, only flush out regular file vnodes open for
2135  * writing.
2136  *
2137  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2138  *
2139  * `rootrefs' specifies the base reference count for the root vnode
2140  * of this filesystem. The root vnode is considered busy if its
2141  * v_usecount exceeds this value. On a successful return, vflush()
2142  * will call vrele() on the root vnode exactly rootrefs times.
2143  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2144  * be zero.
2145  */
2146 #ifdef DIAGNOSTIC
2147 static int busyprt = 0;		/* print out busy vnodes */
2148 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2149 #endif
2150 
2151 int
2152 vflush(mp, rootrefs, flags)
2153 	struct mount *mp;
2154 	int rootrefs;
2155 	int flags;
2156 {
2157 	struct thread *td = curthread;	/* XXX */
2158 	struct vnode *vp, *nvp, *rootvp = NULL;
2159 	struct vattr vattr;
2160 	int busy = 0, error;
2161 
2162 	if (rootrefs > 0) {
2163 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2164 		    ("vflush: bad args"));
2165 		/*
2166 		 * Get the filesystem root vnode. We can vput() it
2167 		 * immediately, since with rootrefs > 0, it won't go away.
2168 		 */
2169 		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
2170 			return (error);
2171 		vput(rootvp);
2172 
2173 	}
2174 	mtx_lock(&mntvnode_mtx);
2175 loop:
2176 	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
2177 		/*
2178 		 * Make sure this vnode wasn't reclaimed in getnewvnode().
2179 		 * Start over if it has (it won't be on the list anymore).
2180 		 */
2181 		if (vp->v_mount != mp)
2182 			goto loop;
2183 		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2184 
2185 		mtx_unlock(&mntvnode_mtx);
2186 		VI_LOCK(vp);
2187 		/*
2188 		 * Skip over a vnodes marked VV_SYSTEM.
2189 		 */
2190 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2191 			VI_UNLOCK(vp);
2192 			mtx_lock(&mntvnode_mtx);
2193 			continue;
2194 		}
2195 		/*
2196 		 * If WRITECLOSE is set, flush out unlinked but still open
2197 		 * files (even if open only for reading) and regular file
2198 		 * vnodes open for writing.
2199 		 */
2200 		mp_fixme("Getattr called with interlock held!");
2201 		if ((flags & WRITECLOSE) &&
2202 		    (vp->v_type == VNON ||
2203 		    (VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 &&
2204 		    vattr.va_nlink > 0)) &&
2205 		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2206 			mtx_unlock(&vp->v_interlock);
2207 			mtx_lock(&mntvnode_mtx);
2208 			continue;
2209 		}
2210 
2211 		/*
2212 		 * With v_usecount == 0, all we need to do is clear out the
2213 		 * vnode data structures and we are done.
2214 		 */
2215 		if (vp->v_usecount == 0) {
2216 			vgonel(vp, td);
2217 			mtx_lock(&mntvnode_mtx);
2218 			continue;
2219 		}
2220 
2221 		/*
2222 		 * If FORCECLOSE is set, forcibly close the vnode. For block
2223 		 * or character devices, revert to an anonymous device. For
2224 		 * all other files, just kill them.
2225 		 */
2226 		if (flags & FORCECLOSE) {
2227 			if (vp->v_type != VCHR) {
2228 				vgonel(vp, td);
2229 			} else {
2230 				vclean(vp, 0, td);
2231 				VI_UNLOCK(vp);
2232 				vp->v_op = spec_vnodeop_p;
2233 				insmntque(vp, (struct mount *) 0);
2234 			}
2235 			mtx_lock(&mntvnode_mtx);
2236 			continue;
2237 		}
2238 #ifdef DIAGNOSTIC
2239 		if (busyprt)
2240 			vprint("vflush: busy vnode", vp);
2241 #endif
2242 		VI_UNLOCK(vp);
2243 		mtx_lock(&mntvnode_mtx);
2244 		busy++;
2245 	}
2246 	mtx_unlock(&mntvnode_mtx);
2247 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2248 		/*
2249 		 * If just the root vnode is busy, and if its refcount
2250 		 * is equal to `rootrefs', then go ahead and kill it.
2251 		 */
2252 		mtx_lock(&rootvp->v_interlock);
2253 		KASSERT(busy > 0, ("vflush: not busy"));
2254 		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
2255 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2256 			vgonel(rootvp, td);
2257 			busy = 0;
2258 		} else
2259 			mtx_unlock(&rootvp->v_interlock);
2260 	}
2261 	if (busy)
2262 		return (EBUSY);
2263 	for (; rootrefs > 0; rootrefs--)
2264 		vrele(rootvp);
2265 	return (0);
2266 }
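
/*
 * Illustrative sketch (not compiled): how a filesystem's unmount path
 * might call vflush().  rootrefs is 1 here on the assumption that this
 * hypothetical filesystem keeps one long-term reference on its root
 * vnode; the helper name is likewise hypothetical.
 */
#if 0
static int
example_unmount_flush(struct mount *mp, int mntflags)
{
	int flags;

	flags = 0;
	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;
	/* Fails with EBUSY if vnodes other than the root remain active. */
	return (vflush(mp, 1, flags));
}
#endif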
2267 
2268 /*
2269  * This moves a now (likely recyclable) vnode to the end of its
2270  * mount's vnode list.  XXX However, it is temporarily disabled until
2271  * we can clean up ffs_sync() and friends, which have loop-restart
2272  * conditions that this code causes to run in O(N^2) time.
2273  */
2274 static void
2275 vlruvp(struct vnode *vp)
2276 {
2277 #if 0
2278 	struct mount *mp;
2279 
2280 	if ((mp = vp->v_mount) != NULL) {
2281 		mtx_lock(&mntvnode_mtx);
2282 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2283 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2284 		mtx_unlock(&mntvnode_mtx);
2285 	}
2286 #endif
2287 }
2288 
2289 /*
2290  * Disassociate the underlying filesystem from a vnode.
2291  */
2292 static void
2293 vclean(vp, flags, td)
2294 	struct vnode *vp;
2295 	int flags;
2296 	struct thread *td;
2297 {
2298 	int active;
2299 
2300 	ASSERT_VI_LOCKED(vp);
2301 	/*
2302 	 * Check to see if the vnode is in use. If so we have to reference it
2303 	 * before we clean it out so that its count cannot fall to zero and
2304 	 * generate a race against ourselves to recycle it.
2305 	 */
2306 	if ((active = vp->v_usecount))
2307 		vp->v_usecount++;
2308 
2309 	/*
2310 	 * Prevent the vnode from being recycled or brought into use while we
2311 	 * clean it out.
2312 	 */
2313 	if (vp->v_iflag & VI_XLOCK)
2314 		panic("vclean: deadlock");
2315 	vp->v_iflag |= VI_XLOCK;
2316 	vp->v_vxproc = curthread;
2317 	/*
2318 	 * Even if the count is zero, the VOP_INACTIVE routine may still
2319 	 * have the object locked while it cleans it out. The VOP_LOCK
2320 	 * ensures that the VOP_INACTIVE routine is done with its work.
2321 	 * For active vnodes, it ensures that no other activity can
2322 	 * occur while the underlying object is being cleaned out.
2323 	 */
2324 	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2325 
2326 	/*
2327 	 * Clean out any buffers associated with the vnode.
2328 	 * If the flush fails, just toss the buffers.
2329 	 */
2330 	if (flags & DOCLOSE) {
2331 		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
2332 			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
2333 		if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
2334 			vinvalbuf(vp, 0, NOCRED, td, 0, 0);
2335 	}
2336 
2337 	VOP_DESTROYVOBJECT(vp);
2338 
2339 	/*
2340 	 * Any other processes trying to obtain this lock must first
2341 	 * wait for VI_XLOCK to clear, then call the new lock operation.
2342 	 */
2343 	VOP_UNLOCK(vp, 0, td);
2344 
2345 	/*
2346 	 * If purging an active vnode, it must be closed and
2347 	 * deactivated before being reclaimed. Note that the
2348 	 * VOP_INACTIVE will unlock the vnode.
2349 	 */
2350 	if (active) {
2351 		if (flags & DOCLOSE)
2352 			VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2353 		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
2354 			panic("vclean: cannot relock.");
2355 		VOP_INACTIVE(vp, td);
2356 	}
2357 
2358 	/*
2359 	 * Reclaim the vnode.
2360 	 */
2361 	if (VOP_RECLAIM(vp, td))
2362 		panic("vclean: cannot reclaim");
2363 
2364 	if (active) {
2365 		/*
2366 		 * Inline copy of vrele() since VOP_INACTIVE
2367 		 * has already been called.
2368 		 */
2369 		VI_LOCK(vp);
2370 		if (--vp->v_usecount <= 0) {
2371 #ifdef DIAGNOSTIC
2372 			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
2373 				vprint("vclean: bad ref count", vp);
2374 				panic("vclean: ref cnt");
2375 			}
2376 #endif
2377 			vfree(vp);
2378 		}
2379 		VI_UNLOCK(vp);
2380 	}
2381 
2382 	cache_purge(vp);
2383 	vp->v_vnlock = NULL;
2384 	lockdestroy(&vp->v_lock);
2385 
2386 	VI_LOCK(vp);
2387 	if (VSHOULDFREE(vp))
2388 		vfree(vp);
2389 
2390 	/*
2391 	 * Done with purge, notify sleepers of the grim news.
2392 	 */
2393 	vp->v_op = dead_vnodeop_p;
2394 	if (vp->v_pollinfo != NULL)
2395 		vn_pollgone(vp);
2396 	vp->v_tag = VT_NON;
2397 	vp->v_iflag &= ~VI_XLOCK;
2398 	vp->v_vxproc = NULL;
2399 	if (vp->v_iflag & VI_XWANT) {
2400 		vp->v_iflag &= ~VI_XWANT;
2401 		wakeup(vp);
2402 	}
2403 }
2404 
2405 /*
2406  * Eliminate all activity associated with the requested vnode
2407  * and with all vnodes aliased to the requested vnode.
2408  */
2409 int
2410 vop_revoke(ap)
2411 	struct vop_revoke_args /* {
2412 		struct vnode *a_vp;
2413 		int a_flags;
2414 	} */ *ap;
2415 {
2416 	struct vnode *vp, *vq;
2417 	dev_t dev;
2418 
2419 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
2420 
2421 	vp = ap->a_vp;
2422 	VI_LOCK(vp);
2423 	/*
2424 	 * If a vgone (or vclean) is already in progress,
2425 	 * wait until it is done and return.
2426 	 */
2427 	if (vp->v_iflag & VI_XLOCK) {
2428 		vp->v_iflag |= VI_XWANT;
2429 		msleep(vp, VI_MTX(vp), PINOD | PDROP,
2430 		    "vop_revokeall", 0);
2431 		VI_UNLOCK(vp);
2432 		return (0);
2433 	}
2434 	VI_UNLOCK(vp);
2435 	dev = vp->v_rdev;
2436 	for (;;) {
2437 		mtx_lock(&spechash_mtx);
2438 		vq = SLIST_FIRST(&dev->si_hlist);
2439 		mtx_unlock(&spechash_mtx);
2440 		if (!vq)
2441 			break;
2442 		vgone(vq);
2443 	}
2444 	return (0);
2445 }
2446 
2447 /*
2448  * Recycle an unused vnode to the front of the free list.
2449  * Release the passed interlock if the vnode will be recycled.
2450  */
2451 int
2452 vrecycle(vp, inter_lkp, td)
2453 	struct vnode *vp;
2454 	struct mtx *inter_lkp;
2455 	struct thread *td;
2456 {
2457 
2458 	mtx_lock(&vp->v_interlock);
2459 	if (vp->v_usecount == 0) {
2460 		if (inter_lkp) {
2461 			mtx_unlock(inter_lkp);
2462 		}
2463 		vgonel(vp, td);
2464 		return (1);
2465 	}
2466 	mtx_unlock(&vp->v_interlock);
2467 	return (0);
2468 }
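
/*
 * Illustrative sketch (not compiled): a filesystem's VOP_INACTIVE routine
 * can use vrecycle() to throw away a vnode whose backing object is gone,
 * e.g. an unlinked file.  Both helper names are hypothetical.
 */
#if 0
static void
example_inactive_recycle(struct vnode *vp, struct thread *td)
{

	/* vrecycle() only acts if the use count has dropped to zero. */
	if (example_node_is_unlinked(vp))
		(void) vrecycle(vp, NULL, td);
}
#endif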
2469 
2470 /*
2471  * Eliminate all activity associated with a vnode
2472  * in preparation for reuse.
2473  */
2474 void
2475 vgone(vp)
2476 	register struct vnode *vp;
2477 {
2478 	struct thread *td = curthread;	/* XXX */
2479 
2480 	VI_LOCK(vp);
2481 	vgonel(vp, td);
2482 }
2483 
2484 /*
2485  * vgone, with the vp interlock held.
2486  */
2487 void
2488 vgonel(vp, td)
2489 	struct vnode *vp;
2490 	struct thread *td;
2491 {
2492 	int s;
2493 
2494 	/*
2495 	 * If a vgone (or vclean) is already in progress,
2496 	 * wait until it is done and return.
2497 	 */
2498 	ASSERT_VI_LOCKED(vp);
2499 	if (vp->v_iflag & VI_XLOCK) {
2500 		vp->v_iflag |= VI_XWANT;
2501 		VI_UNLOCK(vp);
2502 		tsleep(vp, PINOD | PDROP, "vgone", 0);
2503 		return;
2504 	}
2505 
2506 	/*
2507 	 * Clean out the filesystem specific data.
2508 	 */
2509 	vclean(vp, DOCLOSE, td);
2510 	VI_UNLOCK(vp);
2511 
2512 	/*
2513 	 * Delete from old mount point vnode list, if on one.
2514 	 */
2515 	if (vp->v_mount != NULL)
2516 		insmntque(vp, (struct mount *)0);
2517 	/*
2518 	 * If it is a special device, remove it from the special device
2519 	 * alias list if it is on one.
2520 	 */
2521 	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
2522 		mtx_lock(&spechash_mtx);
2523 		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
2524 		freedev(vp->v_rdev);
2525 		mtx_unlock(&spechash_mtx);
2526 		vp->v_rdev = NULL;
2527 	}
2528 
2529 	/*
2530 	 * If it is on the freelist and not already at the head,
2531 	 * move it to the head of the list. The test of the
2532 	 * VI_DOOMED flag and the reference count of zero is because
2533 	 * it will be removed from the free list by getnewvnode,
2534 	 * but will not have its reference count incremented until
2535 	 * after calling vgone. If the reference count were
2536 	 * incremented first, vgone would (incorrectly) try to
2537 	 * close the previous instance of the underlying object.
2538 	 */
2539 	VI_LOCK(vp);
2540 	if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
2541 		s = splbio();
2542 		mtx_lock(&vnode_free_list_mtx);
2543 		if (vp->v_iflag & VI_FREE) {
2544 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2545 		} else {
2546 			vp->v_iflag |= VI_FREE;
2547 			freevnodes++;
2548 		}
2549 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2550 		mtx_unlock(&vnode_free_list_mtx);
2551 		splx(s);
2552 	}
2553 
2554 	vp->v_type = VBAD;
2555 	VI_UNLOCK(vp);
2556 }
2557 
2558 /*
2559  * Lookup a vnode by device number.
2560  */
2561 int
2562 vfinddev(dev, type, vpp)
2563 	dev_t dev;
2564 	enum vtype type;
2565 	struct vnode **vpp;
2566 {
2567 	struct vnode *vp;
2568 
2569 	mtx_lock(&spechash_mtx);
2570 	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2571 		if (type == vp->v_type) {
2572 			*vpp = vp;
2573 			mtx_unlock(&spechash_mtx);
2574 			return (1);
2575 		}
2576 	}
2577 	mtx_unlock(&spechash_mtx);
2578 	return (0);
2579 }
2580 
2581 /*
2582  * Calculate the total number of references to a special device.
2583  */
2584 int
2585 vcount(vp)
2586 	struct vnode *vp;
2587 {
2588 	struct vnode *vq;
2589 	int count;
2590 
2591 	count = 0;
2592 	mtx_lock(&spechash_mtx);
2593 	SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
2594 		count += vq->v_usecount;
2595 	mtx_unlock(&spechash_mtx);
2596 	return (count);
2597 }
2598 
2599 /*
2600  * Same as above, but using the dev_t as the argument.
2601  */
2602 int
2603 count_dev(dev)
2604 	dev_t dev;
2605 {
2606 	struct vnode *vp;
2607 
2608 	vp = SLIST_FIRST(&dev->si_hlist);
2609 	if (vp == NULL)
2610 		return (0);
2611 	return(vcount(vp));
2612 }
2613 
2614 /*
2615  * Print out a description of a vnode.
2616  */
2617 static char *typename[] =
2618 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2619 
2620 void
2621 vprint(label, vp)
2622 	char *label;
2623 	struct vnode *vp;
2624 {
2625 	char buf[96];
2626 
2627 	if (label != NULL)
2628 		printf("%s: %p: ", label, (void *)vp);
2629 	else
2630 		printf("%p: ", (void *)vp);
2631 	printf("type %s, usecount %d, writecount %d, refcount %d,",
2632 	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
2633 	    vp->v_holdcnt);
2634 	buf[0] = '\0';
2635 	if (vp->v_vflag & VV_ROOT)
2636 		strcat(buf, "|VV_ROOT");
2637 	if (vp->v_vflag & VV_TEXT)
2638 		strcat(buf, "|VV_TEXT");
2639 	if (vp->v_vflag & VV_SYSTEM)
2640 		strcat(buf, "|VV_SYSTEM");
2641 	if (vp->v_iflag & VI_XLOCK)
2642 		strcat(buf, "|VI_XLOCK");
2643 	if (vp->v_iflag & VI_XWANT)
2644 		strcat(buf, "|VI_XWANT");
2645 	if (vp->v_iflag & VI_BWAIT)
2646 		strcat(buf, "|VI_BWAIT");
2647 	if (vp->v_iflag & VI_DOOMED)
2648 		strcat(buf, "|VI_DOOMED");
2649 	if (vp->v_iflag & VI_FREE)
2650 		strcat(buf, "|VI_FREE");
2651 	if (vp->v_vflag & VV_OBJBUF)
2652 		strcat(buf, "|VV_OBJBUF");
2653 	if (buf[0] != '\0')
2654 		printf(" flags (%s)", &buf[1]);
2655 	if (vp->v_data == NULL) {
2656 		printf("\n");
2657 	} else {
2658 		printf("\n\t");
2659 		VOP_PRINT(vp);
2660 	}
2661 }
2662 
2663 #ifdef DDB
2664 #include <ddb/ddb.h>
2665 /*
2666  * List all of the locked vnodes in the system.
2667  * Called when debugging the kernel.
2668  */
2669 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2670 {
2671 	struct thread *td = curthread;	/* XXX */
2672 	struct mount *mp, *nmp;
2673 	struct vnode *vp;
2674 
2675 	printf("Locked vnodes\n");
2676 	mtx_lock(&mountlist_mtx);
2677 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2678 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
2679 			nmp = TAILQ_NEXT(mp, mnt_list);
2680 			continue;
2681 		}
2682 		mtx_lock(&mntvnode_mtx);
2683 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2684 			if (VOP_ISLOCKED(vp, NULL))
2685 				vprint((char *)0, vp);
2686 		}
2687 		mtx_unlock(&mntvnode_mtx);
2688 		mtx_lock(&mountlist_mtx);
2689 		nmp = TAILQ_NEXT(mp, mnt_list);
2690 		vfs_unbusy(mp, td);
2691 	}
2692 	mtx_unlock(&mountlist_mtx);
2693 }
2694 #endif
2695 
2696 /*
2697  * Fill in a struct xvfsconf based on a struct vfsconf.
2698  */
2699 static void
2700 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2701 {
2702 
2703 	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2704 	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2705 	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2706 	xvfsp->vfc_flags = vfsp->vfc_flags;
2707 	/*
2708 	 * These are unused in userland; we keep them
2709 	 * so as not to break binary compatibility.
2710 	 */
2711 	xvfsp->vfc_vfsops = NULL;
2712 	xvfsp->vfc_next = NULL;
2713 }
2714 
2715 static int
2716 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2717 {
2718 	struct vfsconf *vfsp;
2719 	struct xvfsconf *xvfsp;
2720 	int cnt, error, i;
2721 
2722 	cnt = 0;
2723 	for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
2724 		cnt++;
2725 	xvfsp = malloc(sizeof(struct xvfsconf) * cnt, M_TEMP, M_WAITOK);
2726 	/*
2727 	 * Handle the race that we will have here once struct vfsconf
2728 	 * is locked down, by using both cnt and a check of vfc_next
2729 	 * against NULL to determine the end of the loop.  The race will
2730 	 * arise because we will have to unlock before calling malloc().
2731 	 * We are protected by Giant for now.
2732 	 */
2733 	i = 0;
2734 	for (vfsp = vfsconf; vfsp != NULL && i < cnt; vfsp = vfsp->vfc_next) {
2735 		vfsconf2x(vfsp, xvfsp + i);
2736 		i++;
2737 	}
2738 	error = SYSCTL_OUT(req, xvfsp, sizeof(struct xvfsconf) * i);
2739 	free(xvfsp, M_TEMP);
2740 	return (error);
2741 }
2742 
2743 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2744     "S,xvfsconf", "List of all configured filesystems");
2745 
2746 /*
2747  * Top level filesystem related information gathering.
2748  */
2749 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2750 
2751 static int
2752 vfs_sysctl(SYSCTL_HANDLER_ARGS)
2753 {
2754 	int *name = (int *)arg1 - 1;	/* XXX */
2755 	u_int namelen = arg2 + 1;	/* XXX */
2756 	struct vfsconf *vfsp;
2757 	struct xvfsconf xvfsp;
2758 
2759 	printf("WARNING: userland calling deprecated sysctl, "
2760 	    "please rebuild world\n");
2761 
2762 #if 1 || defined(COMPAT_PRELITE2)
2763 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2764 	if (namelen == 1)
2765 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2766 #endif
2767 
2768 	switch (name[1]) {
2769 	case VFS_MAXTYPENUM:
2770 		if (namelen != 2)
2771 			return (ENOTDIR);
2772 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2773 	case VFS_CONF:
2774 		if (namelen != 3)
2775 			return (ENOTDIR);	/* overloaded */
2776 		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2777 			if (vfsp->vfc_typenum == name[2])
2778 				break;
2779 		if (vfsp == NULL)
2780 			return (EOPNOTSUPP);
2781 		vfsconf2x(vfsp, &xvfsp);
2782 		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2783 	}
2784 	return (EOPNOTSUPP);
2785 }
2786 
2787 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
2788 	"Generic filesystem");
2789 
2790 #if 1 || defined(COMPAT_PRELITE2)
2791 
2792 static int
2793 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2794 {
2795 	int error;
2796 	struct vfsconf *vfsp;
2797 	struct ovfsconf ovfs;
2798 
2799 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2800 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2801 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2802 		ovfs.vfc_index = vfsp->vfc_typenum;
2803 		ovfs.vfc_refcount = vfsp->vfc_refcount;
2804 		ovfs.vfc_flags = vfsp->vfc_flags;
2805 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2806 		if (error)
2807 			return error;
2808 	}
2809 	return 0;
2810 }
2811 
2812 #endif /* 1 || COMPAT_PRELITE2 */
2813 
2814 #define KINFO_VNODESLOP		10
2815 /*
2816  * Dump vnode list (via sysctl).
2817  */
2818 /* ARGSUSED */
2819 static int
2820 sysctl_vnode(SYSCTL_HANDLER_ARGS)
2821 {
2822 	struct xvnode *xvn;
2823 	struct thread *td = req->td;
2824 	struct mount *mp;
2825 	struct vnode *vp;
2826 	int error, len, n;
2827 
2828 	/*
2829 	 * Stale numvnodes access is not fatal here.
2830 	 */
2831 	req->lock = 0;
2832 	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2833 	if (!req->oldptr)
2834 		/* Make an estimate */
2835 		return (SYSCTL_OUT(req, 0, len));
2836 
2837 	sysctl_wire_old_buffer(req, 0);
2838 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2839 	n = 0;
2840 	mtx_lock(&mountlist_mtx);
2841 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2842 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
2843 			continue;
2844 		mtx_lock(&mntvnode_mtx);
2845 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2846 			if (n == len)
2847 				break;
2848 			vref(vp);
2849 			xvn[n].xv_size = sizeof *xvn;
2850 			xvn[n].xv_vnode = vp;
2851 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
2852 			XV_COPY(usecount);
2853 			XV_COPY(writecount);
2854 			XV_COPY(holdcnt);
2855 			XV_COPY(id);
2856 			XV_COPY(mount);
2857 			XV_COPY(numoutput);
2858 			XV_COPY(type);
2859 #undef XV_COPY
2860 			xvn[n].xv_flag = vp->v_vflag;
2861 
2862 			switch (vp->v_type) {
2863 			case VREG:
2864 			case VDIR:
2865 			case VLNK:
2866 				xvn[n].xv_dev = vp->v_cachedfs;
2867 				xvn[n].xv_ino = vp->v_cachedid;
2868 				break;
2869 			case VBLK:
2870 			case VCHR:
2871 				if (vp->v_rdev == NULL) {
2872 					vrele(vp);
2873 					continue;
2874 				}
2875 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
2876 				break;
2877 			case VSOCK:
2878 				xvn[n].xv_socket = vp->v_socket;
2879 				break;
2880 			case VFIFO:
2881 				xvn[n].xv_fifo = vp->v_fifoinfo;
2882 				break;
2883 			case VNON:
2884 			case VBAD:
2885 			default:
2886 				/* shouldn't happen? */
2887 				vrele(vp);
2888 				continue;
2889 			}
2890 			vrele(vp);
2891 			++n;
2892 		}
2893 		mtx_unlock(&mntvnode_mtx);
2894 		mtx_lock(&mountlist_mtx);
2895 		vfs_unbusy(mp, td);
2896 		if (n == len)
2897 			break;
2898 	}
2899 	mtx_unlock(&mountlist_mtx);
2900 
2901 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
2902 	free(xvn, M_TEMP);
2903 	return (error);
2904 }
2905 
2906 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2907 	0, 0, sysctl_vnode, "S,xvnode", "");
2908 
2909 /*
2910  * Check to see if a filesystem is mounted on a block device.
2911  */
2912 int
2913 vfs_mountedon(vp)
2914 	struct vnode *vp;
2915 {
2916 
2917 	if (vp->v_rdev->si_mountpoint != NULL)
2918 		return (EBUSY);
2919 	return (0);
2920 }
2921 
2922 /*
2923  * Unmount all filesystems. The list is traversed in reverse order
2924  * of mounting to avoid dependencies.
2925  */
2926 void
2927 vfs_unmountall()
2928 {
2929 	struct mount *mp;
2930 	struct thread *td;
2931 	int error;
2932 
2933 	if (curthread != NULL)
2934 		td = curthread;
2935 	else
2936 		td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
2937 	/*
2938 	 * Since this only runs when rebooting, it is not interlocked.
2939 	 */
2940 	while(!TAILQ_EMPTY(&mountlist)) {
2941 		mp = TAILQ_LAST(&mountlist, mntlist);
2942 		error = dounmount(mp, MNT_FORCE, td);
2943 		if (error) {
2944 			TAILQ_REMOVE(&mountlist, mp, mnt_list);
2945 			printf("unmount of %s failed (",
2946 			    mp->mnt_stat.f_mntonname);
2947 			if (error == EBUSY)
2948 				printf("BUSY)\n");
2949 			else
2950 				printf("%d)\n", error);
2951 		} else {
2952 			/* The unmount has removed mp from the mountlist */
2953 		}
2954 	}
2955 }
2956 
2957 /*
2958  * Perform msync on all vnodes under a mount point.
2959  * The mount point must be locked.
2960  */
2961 void
2962 vfs_msync(struct mount *mp, int flags)
2963 {
2964 	struct vnode *vp, *nvp;
2965 	struct vm_object *obj;
2966 	int tries;
2967 
2968 	GIANT_REQUIRED;
2969 
2970 	tries = 5;
2971 	mtx_lock(&mntvnode_mtx);
2972 loop:
2973 	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
2974 		if (vp->v_mount != mp) {
2975 			if (--tries > 0)
2976 				goto loop;
2977 			break;
2978 		}
2979 		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2980 
2981 		mp_fixme("What locks do we need here?");
2982 		if (vp->v_iflag & VI_XLOCK)	/* XXX: what if MNT_WAIT? */
2983 			continue;
2984 
2985 		if (vp->v_vflag & VV_NOSYNC)	/* unlinked, skip it */
2986 			continue;
2987 
2988 		if ((vp->v_iflag & VI_OBJDIRTY) &&
2989 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
2990 			mtx_unlock(&mntvnode_mtx);
2991 			if (!vget(vp,
2992 			    LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) {
2993 				if (VOP_GETVOBJECT(vp, &obj) == 0) {
2994 					vm_object_page_clean(obj, 0, 0,
2995 					    flags == MNT_WAIT ?
2996 					    OBJPC_SYNC : OBJPC_NOSYNC);
2997 				}
2998 				vput(vp);
2999 			}
3000 			mtx_lock(&mntvnode_mtx);
3001 			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
3002 				if (--tries > 0)
3003 					goto loop;
3004 				break;
3005 			}
3006 		}
3007 	}
3008 	mtx_unlock(&mntvnode_mtx);
3009 }
3010 
3011 /*
3012  * Create the VM object needed for VMIO and mmap support.  This
3013  * is done for all VREG files in the system.  Some filesystems can
3014  * also gain the additional metadata buffering capability of the
3015  * VMIO code by putting the device node into VMIO mode as well.
3016  *
3017  * vp must be locked when vfs_object_create is called.
3018  */
3019 int
3020 vfs_object_create(vp, td, cred)
3021 	struct vnode *vp;
3022 	struct thread *td;
3023 	struct ucred *cred;
3024 {
3025 	GIANT_REQUIRED;
3026 	return (VOP_CREATEVOBJECT(vp, cred, td));
3027 }
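
/*
 * Illustrative sketch (not compiled): a caller enabling VMIO on a regular
 * file after opening it.  The helper name is hypothetical; vp is assumed
 * to be locked as required above.
 */
#if 0
static int
example_enable_vmio(struct vnode *vp, struct thread *td, struct ucred *cred)
{

	if (vp->v_type == VREG)
		return (vfs_object_create(vp, td, cred));
	return (0);
}
#endif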
3028 
3029 /*
3030  * Mark a vnode as free, putting it up for recycling.
3031  */
3032 void
3033 vfree(vp)
3034 	struct vnode *vp;
3035 {
3036 	int s;
3037 
3038 	ASSERT_VI_LOCKED(vp);
3039 	s = splbio();
3040 	mtx_lock(&vnode_free_list_mtx);
3041 	KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
3042 	if (vp->v_iflag & VI_AGE) {
3043 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3044 	} else {
3045 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3046 	}
3047 	freevnodes++;
3048 	mtx_unlock(&vnode_free_list_mtx);
3049 	vp->v_iflag &= ~VI_AGE;
3050 	vp->v_iflag |= VI_FREE;
3051 	splx(s);
3052 }
3053 
3054 /*
3055  * Opposite of vfree() - mark a vnode as in use.
3056  */
3057 void
3058 vbusy(vp)
3059 	struct vnode *vp;
3060 {
3061 	int s;
3062 
3063 	s = splbio();
3064 	ASSERT_VI_LOCKED(vp);
3065 	mtx_lock(&vnode_free_list_mtx);
3066 	KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
3067 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3068 	freevnodes--;
3069 	mtx_unlock(&vnode_free_list_mtx);
3070 	vp->v_iflag &= ~(VI_FREE|VI_AGE);
3071 	splx(s);
3072 }
3073 
3074 /*
3075  * Record a process's interest in events which might happen to
3076  * a vnode.  Because poll uses the historic select-style interface
3077  * internally, this routine serves as both the ``check for any
3078  * pending events'' and the ``record my interest in future events''
3079  * functions.  (These are done together, while the lock is held,
3080  * to avoid race conditions.)
3081  */
3082 int
3083 vn_pollrecord(vp, td, events)
3084 	struct vnode *vp;
3085 	struct thread *td;
3086 	short events;
3087 {
3088 
3089 	if (vp->v_pollinfo == NULL)
3090 		v_addpollinfo(vp);
3091 	mtx_lock(&vp->v_pollinfo->vpi_lock);
3092 	if (vp->v_pollinfo->vpi_revents & events) {
3093 		/*
3094 		 * This leaves events we are not interested
3095 		 * in available for the other process which
3096 		 * presumably had requested them
3097 		 * (otherwise they would never have been
3098 		 * recorded).
3099 		 */
3100 		events &= vp->v_pollinfo->vpi_revents;
3101 		vp->v_pollinfo->vpi_revents &= ~events;
3102 
3103 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3104 		return events;
3105 	}
3106 	vp->v_pollinfo->vpi_events |= events;
3107 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3108 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3109 	return 0;
3110 }
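
/*
 * Illustrative sketch (not compiled): the shape of a filesystem VOP_POLL
 * implementation built directly on vn_pollrecord().  The function name is
 * hypothetical.
 */
#if 0
static int
example_poll(struct vop_poll_args *ap)
{

	/* Report pending events and/or record interest in future ones. */
	return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
}
#endif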
3111 
3112 /*
3113  * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
3114  * it is possible for us to miss an event due to race conditions, but
3115  * that condition is expected to be rare, so for the moment it is the
3116  * preferred interface.
3117  */
3118 void
3119 vn_pollevent(vp, events)
3120 	struct vnode *vp;
3121 	short events;
3122 {
3123 
3124 	if (vp->v_pollinfo == NULL)
3125 		v_addpollinfo(vp);
3126 	mtx_lock(&vp->v_pollinfo->vpi_lock);
3127 	if (vp->v_pollinfo->vpi_events & events) {
3128 		/*
3129 		 * We clear vpi_events so that we don't
3130 		 * call selwakeup() twice if two events are
3131 		 * posted before the polling process(es) is
3132 		 * awakened.  This also ensures that we take at
3133 		 * most one selwakeup() if the polling process
3134 		 * is no longer interested.  However, it does
3135 		 * mean that only one event can be noticed at
3136 		 * a time.  (Perhaps we should only clear those
3137 		 * event bits which we note?) XXX
3138 		 */
3139 		vp->v_pollinfo->vpi_events = 0;	/* &= ~events ??? */
3140 		vp->v_pollinfo->vpi_revents |= events;
3141 		selwakeup(&vp->v_pollinfo->vpi_selinfo);
3142 	}
3143 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3144 }
3145 
3146 /*
3147  * Wake up anyone polling on vp because it is being revoked.
3148  * This depends on dead_poll() returning POLLHUP for correct
3149  * behavior.
3150  */
3151 void
3152 vn_pollgone(vp)
3153 	struct vnode *vp;
3154 {
3155 
3156 	mtx_lock(&vp->v_pollinfo->vpi_lock);
3157 	VN_KNOTE(vp, NOTE_REVOKE);
3158 	if (vp->v_pollinfo->vpi_events) {
3159 		vp->v_pollinfo->vpi_events = 0;
3160 		selwakeup(&vp->v_pollinfo->vpi_selinfo);
3161 	}
3162 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3163 }
3164 
3165 
3166 
3167 /*
3168  * Routine to create and manage a filesystem syncer vnode.
3169  */
3170 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
3171 static int	sync_fsync(struct  vop_fsync_args *);
3172 static int	sync_inactive(struct  vop_inactive_args *);
3173 static int	sync_reclaim(struct  vop_reclaim_args *);
3174 static int	sync_print(struct vop_print_args *);
3175 
3176 static vop_t **sync_vnodeop_p;
3177 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
3178 	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
3179 	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
3180 	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
3181 	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
3182 	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
3183 	{ &vop_lock_desc,	(vop_t *) vop_stdlock },	/* lock */
3184 	{ &vop_unlock_desc,	(vop_t *) vop_stdunlock },	/* unlock */
3185 	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
3186 	{ &vop_islocked_desc,	(vop_t *) vop_stdislocked },	/* islocked */
3187 	{ NULL, NULL }
3188 };
3189 static struct vnodeopv_desc sync_vnodeop_opv_desc =
3190 	{ &sync_vnodeop_p, sync_vnodeop_entries };
3191 
3192 VNODEOP_SET(sync_vnodeop_opv_desc);
3193 
3194 /*
3195  * Create a new filesystem syncer vnode for the specified mount point.
3196  */
3197 int
3198 vfs_allocate_syncvnode(mp)
3199 	struct mount *mp;
3200 {
3201 	struct vnode *vp;
3202 	static long start, incr, next;
3203 	int error;
3204 
3205 	/* Allocate a new vnode */
3206 	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
3207 		mp->mnt_syncer = NULL;
3208 		return (error);
3209 	}
3210 	vp->v_type = VNON;
3211 	/*
3212 	 * Place the vnode onto the syncer worklist. We attempt to
3213 	 * scatter them about on the list so that they will go off
3214 	 * at evenly distributed times even if all the filesystems
3215 	 * are mounted at once.
3216 	 */
3217 	next += incr;
3218 	if (next == 0 || next > syncer_maxdelay) {
3219 		start /= 2;
3220 		incr /= 2;
3221 		if (start == 0) {
3222 			start = syncer_maxdelay / 2;
3223 			incr = syncer_maxdelay;
3224 		}
3225 		next = start;
3226 	}
3227 	VI_LOCK(vp);
3228 	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
3229 	VI_UNLOCK(vp);
3230 	mp->mnt_syncer = vp;
3231 	return (0);
3232 }
3233 
3234 /*
3235  * Do a lazy sync of the filesystem.
3236  */
3237 static int
3238 sync_fsync(ap)
3239 	struct vop_fsync_args /* {
3240 		struct vnode *a_vp;
3241 		struct ucred *a_cred;
3242 		int a_waitfor;
3243 		struct thread *a_td;
3244 	} */ *ap;
3245 {
3246 	struct vnode *syncvp = ap->a_vp;
3247 	struct mount *mp = syncvp->v_mount;
3248 	struct thread *td = ap->a_td;
3249 	int asyncflag;
3250 
3251 	/*
3252 	 * We only need to do something if this is a lazy evaluation.
3253 	 */
3254 	if (ap->a_waitfor != MNT_LAZY)
3255 		return (0);
3256 
3257 	/*
3258 	 * Move ourselves to the back of the sync list.
3259 	 */
3260 	VI_LOCK(syncvp);
3261 	vn_syncer_add_to_worklist(syncvp, syncdelay);
3262 	VI_UNLOCK(syncvp);
3263 
3264 	/*
3265 	 * Walk the list of vnodes pushing all that are dirty and
3266 	 * not already on the sync list.
3267 	 */
3268 	mtx_lock(&mountlist_mtx);
3269 	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3270 		mtx_unlock(&mountlist_mtx);
3271 		return (0);
3272 	}
3273 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3274 		vfs_unbusy(mp, td);
3275 		return (0);
3276 	}
3277 	asyncflag = mp->mnt_flag & MNT_ASYNC;
3278 	mp->mnt_flag &= ~MNT_ASYNC;
3279 	vfs_msync(mp, MNT_NOWAIT);
3280 	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
3281 	if (asyncflag)
3282 		mp->mnt_flag |= MNT_ASYNC;
3283 	vn_finished_write(mp);
3284 	vfs_unbusy(mp, td);
3285 	return (0);
3286 }
3287 
3288 /*
3289  * The syncer vnode is no longer referenced.
3290  */
3291 static int
3292 sync_inactive(ap)
3293 	struct vop_inactive_args /* {
3294 		struct vnode *a_vp;
3295 		struct thread *a_td;
3296 	} */ *ap;
3297 {
3298 
3299 	VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
3300 	vgone(ap->a_vp);
3301 	return (0);
3302 }
3303 
3304 /*
3305  * The syncer vnode is no longer needed and is being decommissioned.
3306  *
3307  * Modifications to the worklist must be protected at splbio().
3308  */
3309 static int
3310 sync_reclaim(ap)
3311 	struct vop_reclaim_args /* {
3312 		struct vnode *a_vp;
3313 	} */ *ap;
3314 {
3315 	struct vnode *vp = ap->a_vp;
3316 	int s;
3317 
3318 	s = splbio();
3319 	vp->v_mount->mnt_syncer = NULL;
3320 	VI_LOCK(vp);
3321 	if (vp->v_iflag & VI_ONWORKLST) {
3322 		LIST_REMOVE(vp, v_synclist);
3323 		vp->v_iflag &= ~VI_ONWORKLST;
3324 	}
3325 	VI_UNLOCK(vp);
3326 	splx(s);
3327 
3328 	return (0);
3329 }
3330 
3331 /*
3332  * Print out a syncer vnode.
3333  */
3334 static int
3335 sync_print(ap)
3336 	struct vop_print_args /* {
3337 		struct vnode *a_vp;
3338 	} */ *ap;
3339 {
3340 	struct vnode *vp = ap->a_vp;
3341 
3342 	printf("syncer vnode");
3343 	if (vp->v_vnlock != NULL)
3344 		lockmgr_printinfo(vp->v_vnlock);
3345 	printf("\n");
3346 	return (0);
3347 }
3348 
3349 /*
3350  * Extract the dev_t from a VCHR vnode.
3351  */
3352 dev_t
3353 vn_todev(vp)
3354 	struct vnode *vp;
3355 {
3356 	if (vp->v_type != VCHR)
3357 		return (NODEV);
3358 	return (vp->v_rdev);
3359 }
3360 
3361 /*
3362  * Check whether a vnode represents a disk device.
3363  */
3364 int
3365 vn_isdisk(vp, errp)
3366 	struct vnode *vp;
3367 	int *errp;
3368 {
3369 	struct cdevsw *cdevsw;
3370 
3371 	if (vp->v_type != VCHR) {
3372 		if (errp != NULL)
3373 			*errp = ENOTBLK;
3374 		return (0);
3375 	}
3376 	if (vp->v_rdev == NULL) {
3377 		if (errp != NULL)
3378 			*errp = ENXIO;
3379 		return (0);
3380 	}
3381 	cdevsw = devsw(vp->v_rdev);
3382 	if (cdevsw == NULL) {
3383 		if (errp != NULL)
3384 			*errp = ENXIO;
3385 		return (0);
3386 	}
3387 	if (!(cdevsw->d_flags & D_DISK)) {
3388 		if (errp != NULL)
3389 			*errp = ENOTBLK;
3390 		return (0);
3391 	}
3392 	if (errp != NULL)
3393 		*errp = 0;
3394 	return (1);
3395 }
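
/*
 * Illustrative sketch (not compiled): a mount routine rejecting device
 * vnodes that are not disks, in the way disk-based filesystems use
 * vn_isdisk().  The helper name is hypothetical.
 */
#if 0
static int
example_check_disk(struct vnode *devvp)
{
	int error;

	if (!vn_isdisk(devvp, &error))
		return (error);
	return (0);
}
#endif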
3396 
3397 /*
3398  * Free data allocated by namei(); see namei(9) for details.
3399  */
3400 void
3401 NDFREE(ndp, flags)
3402      struct nameidata *ndp;
3403      const uint flags;
3404 {
3405 	if (!(flags & NDF_NO_FREE_PNBUF) &&
3406 	    (ndp->ni_cnd.cn_flags & HASBUF)) {
3407 		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
3408 		ndp->ni_cnd.cn_flags &= ~HASBUF;
3409 	}
3410 	if (!(flags & NDF_NO_DVP_UNLOCK) &&
3411 	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
3412 	    ndp->ni_dvp != ndp->ni_vp)
3413 		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
3414 	if (!(flags & NDF_NO_DVP_RELE) &&
3415 	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
3416 		vrele(ndp->ni_dvp);
3417 		ndp->ni_dvp = NULL;
3418 	}
3419 	if (!(flags & NDF_NO_VP_UNLOCK) &&
3420 	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
3421 		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
3422 	if (!(flags & NDF_NO_VP_RELE) &&
3423 	    ndp->ni_vp) {
3424 		vrele(ndp->ni_vp);
3425 		ndp->ni_vp = NULL;
3426 	}
3427 	if (!(flags & NDF_NO_STARTDIR_RELE) &&
3428 	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
3429 		vrele(ndp->ni_startdir);
3430 		ndp->ni_startdir = NULL;
3431 	}
3432 }
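
/*
 * Illustrative sketch (not compiled): a typical namei()/NDFREE() pairing
 * for an in-kernel lookup.  NDF_ONLY_PNBUF releases just the pathname
 * buffer, leaving the locked leaf vnode for the caller.  The helper name
 * is hypothetical.
 */
#if 0
static int
example_lookup(char *path, struct thread *td, struct vnode **vpp)
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td);
	error = namei(&nd);
	if (error)
		return (error);
	*vpp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	return (0);
}
#endif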
3433 
3434 /*
3435  * Common filesystem object access control check routine.  Accepts a
3436  * vnode's type, "mode", uid and gid, requested access mode, credentials,
3437  * and optional call-by-reference privused argument allowing vaccess()
3438  * to indicate to the caller whether privilege was used to satisfy the
3439  * request (obsoleted).  Returns 0 on success, or an errno on failure.
3440  */
3441 int
3442 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3443 	enum vtype type;
3444 	mode_t file_mode;
3445 	uid_t file_uid;
3446 	gid_t file_gid;
3447 	mode_t acc_mode;
3448 	struct ucred *cred;
3449 	int *privused;
3450 {
3451 	mode_t dac_granted;
3452 #ifdef CAPABILITIES
3453 	mode_t cap_granted;
3454 #endif
3455 
3456 	/*
3457 	 * Look for a normal, non-privileged way to access the file/directory
3458 	 * as requested.  If it exists, go with that.
3459 	 */
3460 
3461 	if (privused != NULL)
3462 		*privused = 0;
3463 
3464 	dac_granted = 0;
3465 
3466 	/* Check the owner. */
3467 	if (cred->cr_uid == file_uid) {
3468 		dac_granted |= VADMIN;
3469 		if (file_mode & S_IXUSR)
3470 			dac_granted |= VEXEC;
3471 		if (file_mode & S_IRUSR)
3472 			dac_granted |= VREAD;
3473 		if (file_mode & S_IWUSR)
3474 			dac_granted |= (VWRITE | VAPPEND);
3475 
3476 		if ((acc_mode & dac_granted) == acc_mode)
3477 			return (0);
3478 
3479 		goto privcheck;
3480 	}
3481 
3482 	/* Otherwise, check the groups (first match) */
3483 	if (groupmember(file_gid, cred)) {
3484 		if (file_mode & S_IXGRP)
3485 			dac_granted |= VEXEC;
3486 		if (file_mode & S_IRGRP)
3487 			dac_granted |= VREAD;
3488 		if (file_mode & S_IWGRP)
3489 			dac_granted |= (VWRITE | VAPPEND);
3490 
3491 		if ((acc_mode & dac_granted) == acc_mode)
3492 			return (0);
3493 
3494 		goto privcheck;
3495 	}
3496 
3497 	/* Otherwise, check everyone else. */
3498 	if (file_mode & S_IXOTH)
3499 		dac_granted |= VEXEC;
3500 	if (file_mode & S_IROTH)
3501 		dac_granted |= VREAD;
3502 	if (file_mode & S_IWOTH)
3503 		dac_granted |= (VWRITE | VAPPEND);
3504 	if ((acc_mode & dac_granted) == acc_mode)
3505 		return (0);
3506 
3507 privcheck:
3508 	if (!suser_cred(cred, PRISON_ROOT)) {
3509 		/* XXX audit: privilege used */
3510 		if (privused != NULL)
3511 			*privused = 1;
3512 		return (0);
3513 	}
3514 
3515 #ifdef CAPABILITIES
3516 	/*
3517 	 * Build a capability mask to determine if the set of capabilities
3518 	 * satisfies the requirements when combined with the granted mask
3519 	 * from above.
3520 	 * For each capability, if the capability is required, bitwise
3521 	 * or the request type onto the cap_granted mask.
3522 	 */
3523 	cap_granted = 0;
3524 
3525 	if (type == VDIR) {
3526 		/*
3527 		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3528 		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3529 		 */
3530 		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3531 		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3532 			cap_granted |= VEXEC;
3533 	} else {
3534 		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3535 		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3536 			cap_granted |= VEXEC;
3537 	}
3538 
3539 	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3540 	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3541 		cap_granted |= VREAD;
3542 
3543 	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3544 	    !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3545 		cap_granted |= (VWRITE | VAPPEND);
3546 
3547 	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3548 	    !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3549 		cap_granted |= VADMIN;
3550 
3551 	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3552 		/* XXX audit: privilege used */
3553 		if (privused != NULL)
3554 			*privused = 1;
3555 		return (0);
3556 	}
3557 #endif
3558 
3559 	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3560 }
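
/*
 * Illustrative sketch (not compiled): the usual shape of a filesystem's
 * VOP_ACCESS implementation, which feeds the mode/uid/gid of its on-disk
 * node into vaccess().  The node structure and names are hypothetical.
 */
#if 0
struct example_node {			/* hypothetical per-file metadata */
	mode_t	n_mode;
	uid_t	n_uid;
	gid_t	n_gid;
};

static int
example_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct example_node *np = vp->v_data;

	return (vaccess(vp->v_type, np->n_mode, np->n_uid, np->n_gid,
	    ap->a_mode, ap->a_cred, NULL));
}
#endif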
3561