xref: /freebsd/sys/kern/vfs_subr.c (revision d98dc34f523c46cf3aa2c86a70166af88a26b2cb)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39  * $FreeBSD$
40  */
41 
42 /*
43  * External virtual filesystem routines
44  */
45 #include "opt_ddb.h"
46 #include "opt_ffs.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/bio.h>
51 #include <sys/buf.h>
52 #include <sys/conf.h>
53 #include <sys/dirent.h>
54 #include <sys/domain.h>
55 #include <sys/eventhandler.h>
56 #include <sys/event.h>
57 #include <sys/fcntl.h>
58 #include <sys/kernel.h>
59 #include <sys/kthread.h>
60 #include <sys/ktr.h>
61 #include <sys/malloc.h>
62 #include <net/radix.h>
64 #include <sys/mount.h>
65 #include <sys/mutex.h>
66 #include <sys/namei.h>
67 #include <sys/proc.h>
68 #include <sys/reboot.h>
69 #include <sys/socket.h>
70 #include <sys/stat.h>
71 #include <sys/sysctl.h>
72 #include <sys/vmmeter.h>
73 #include <sys/vnode.h>
74 
75 #include <machine/limits.h>
76 
77 #include <vm/vm.h>
78 #include <vm/vm_object.h>
79 #include <vm/vm_extern.h>
80 #include <vm/pmap.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_pager.h>
84 #include <vm/vnode_pager.h>
85 #include <vm/vm_zone.h>
86 
87 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
88 
89 static void	addalias __P((struct vnode *vp, dev_t nvp_rdev));
90 static void	insmntque __P((struct vnode *vp, struct mount *mp));
91 static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
92 
93 /*
94  * Number of vnodes in existence.  Increased whenever getnewvnode()
95  * allocates a new vnode, never decreased.
96  */
97 static unsigned long	numvnodes;
98 SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
99 
100 /*
101  * Conversion tables for conversion from vnode types to inode formats
102  * and back.
103  */
104 enum vtype iftovt_tab[16] = {
105 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
106 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
107 };
108 int vttoif_tab[9] = {
109 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
110 	S_IFSOCK, S_IFIFO, S_IFMT,
111 };
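/*
 * For example, the IFTOVT() and VTTOIF() macros in sys/vnode.h index these
 * tables directly, so
 *
 *	IFTOVT(S_IFDIR)	== iftovt_tab[S_IFDIR >> 12] == VDIR
 *	VTTOIF(VREG)	== vttoif_tab[VREG]           == S_IFREG
 *
 * and the tables must stay in sync with the S_IF* encoding in sys/stat.h.
 */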
112 
113 /*
114  * List of vnodes that are ready for recycling.
115  */
116 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
117 
118 /*
119  * Minimum number of free vnodes.  If there are fewer free vnodes than this,
120  * getnewvnode() will return a newly allocated vnode.
121  */
122 static u_long wantfreevnodes = 25;
123 SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
124 /* Number of vnodes in the free list. */
125 static u_long freevnodes = 0;
126 SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
127 /* Number of vnode allocations. */
128 static u_long vnodeallocs = 0;
129 SYSCTL_LONG(_debug, OID_AUTO, vnodeallocs, CTLFLAG_RD, &vnodeallocs, 0, "");
130 /* How often, measured in vnode allocations, to recycle vnodes from the namecache. */
131 static u_long vnoderecycleperiod = 1000;
132 SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleperiod, CTLFLAG_RW, &vnoderecycleperiod, 0, "");
133 /* Minimum number of total vnodes required to invoke vnode recycle from namecache. */
134 static u_long vnoderecyclemintotalvn = 2000;
135 SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclemintotalvn, CTLFLAG_RW, &vnoderecyclemintotalvn, 0, "");
136 /* Minimum number of free vnodes required to invoke vnode recycle from namecache. */
137 static u_long vnoderecycleminfreevn = 2000;
138 SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleminfreevn, CTLFLAG_RW, &vnoderecycleminfreevn, 0, "");
139 /* Number of vnodes to attempt to recycle at a time. */
140 static u_long vnoderecyclenumber = 3000;
141 SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclenumber, CTLFLAG_RW, &vnoderecyclenumber, 0, "");
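/*
 * These recycling knobs live under the "debug" sysctl tree, so their
 * read-write members can be tuned at run time, for example:
 *
 *	sysctl debug.wantfreevnodes=50
 *	sysctl debug.vnoderecycleperiod=500
 *
 * The values shown are arbitrary illustrations, not recommendations.
 */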
142 
143 /*
144  * Various variables used for debugging the new implementation of
145  * reassignbuf().
146  * XXX these are probably of (very) limited utility now.
147  */
148 static int reassignbufcalls;
149 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
150 static int reassignbufloops;
151 SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
152 static int reassignbufsortgood;
153 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
154 static int reassignbufsortbad;
155 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
156 /* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
157 static int reassignbufmethod = 1;
158 SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
159 
160 #ifdef ENABLE_VFS_IOOPT
161 /* See NOTES for a description of this setting. */
162 int vfs_ioopt = 0;
163 SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
164 #endif
165 
166 /* List of mounted filesystems. */
167 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
168 
169 /* For any iteration/modification of mountlist */
170 struct mtx mountlist_mtx;
171 
172 /* For any iteration/modification of mnt_vnodelist */
173 struct mtx mntvnode_mtx;
174 
175 /*
176  * Cache for the mount type id assigned to NFS.  This is used for
177  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
178  */
179 int	nfs_mount_type = -1;
180 
181 /* To keep more than one thread at a time from running vfs_getnewfsid */
182 static struct mtx mntid_mtx;
183 
184 /* For any iteration/modification of vnode_free_list */
185 static struct mtx vnode_free_list_mtx;
186 
187 /*
188  * For any iteration/modification of dev->si_hlist (linked through
189  * v_specnext)
190  */
191 static struct mtx spechash_mtx;
192 
193 /* Publicly exported FS */
194 struct nfs_public nfs_pub;
195 
196 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
197 static vm_zone_t vnode_zone;
198 
199 /* Set to 1 to print out reclaim of active vnodes */
200 int	prtactive = 0;
201 
202 /*
203  * The workitem queue.
204  *
205  * It is useful to delay writes of file data and filesystem metadata
206  * for tens of seconds so that quickly created and deleted files need
207  * not waste disk bandwidth being created and removed. To realize this,
208  * we append vnodes to a "workitem" queue. When running with a soft
209  * updates implementation, most pending metadata dependencies should
210  * not wait for more than a few seconds. Thus, metadata for filesystems
211  * mounted on block devices is delayed only about half the time that file
212  * data is delayed. Similarly, directory updates are more critical, so
213  * they are delayed only about a third as long as file data. Thus, there are
214  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
215  * one each second (driven off the filesystem syncer process). The
216  * syncer_delayno variable indicates the next queue that is to be processed.
217  * Items that need to be processed soon are placed in this queue:
218  *
219  *	syncer_workitem_pending[syncer_delayno]
220  *
221  * A delay of fifteen seconds is done by placing the request fifteen
222  * entries later in the queue:
223  *
224  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
225  *
226  */
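/*
 * A small worked example, assuming hashinit() in vntblinit() yields a
 * 32-entry table here so that syncer_mask == 31: if syncer_delayno is
 * currently 20, a request delayed by 15 seconds is placed in slot
 * (20 + 15) & 31 == 3, wrapping around the wheel, and sched_sync()
 * reaches that slot about 15 one-second iterations later.
 */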
227 static int syncer_delayno = 0;
228 static long syncer_mask;
229 LIST_HEAD(synclist, vnode);
230 static struct synclist *syncer_workitem_pending;
231 
232 #define SYNCER_MAXDELAY		32
233 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
234 time_t syncdelay = 30;		/* max time to delay syncing data */
235 time_t filedelay = 30;		/* time to delay syncing files */
236 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
237 time_t dirdelay = 29;		/* time to delay syncing directories */
238 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
239 time_t metadelay = 28;		/* time to delay syncing metadata */
240 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
241 static int rushjob;		/* number of slots to run ASAP */
242 static int stat_rush_requests;	/* number of times I/O speeded up */
243 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
244 
245 /*
246  * Number of vnodes we want to exist at any one time.  This is mostly used
247  * to size hash tables in vnode-related code.  It is normally not used in
248  * getnewvnode(), as wantfreevnodes is normally nonzero.
249  *
250  * XXX desiredvnodes is historical cruft and should not exist.
251  */
252 int desiredvnodes;
253 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
254     &desiredvnodes, 0, "Maximum number of vnodes");
255 
256 static void	vfs_free_addrlist __P((struct netexport *nep));
257 static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
258 static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
259 				       struct export_args *argp));
260 
261 /*
262  * Initialize the vnode management data structures.
263  */
264 static void
265 vntblinit(void *dummy __unused)
266 {
267 
268 	desiredvnodes = maxproc + cnt.v_page_count / 4;
269 	mtx_init(&mountlist_mtx, "mountlist", MTX_DEF);
270 	mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF);
271 	mtx_init(&mntid_mtx, "mntid", MTX_DEF);
272 	mtx_init(&spechash_mtx, "spechash", MTX_DEF);
273 	TAILQ_INIT(&vnode_free_list);
274 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF);
275 	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
276 	/*
277 	 * Initialize the filesystem syncer.
278 	 */
279 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
280 		&syncer_mask);
281 	syncer_maxdelay = syncer_mask + 1;
282 }
283 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
284 
285 
286 /*
287  * Mark a mount point as busy. Used to synchronize access and to delay
288  * unmounting. Interlock is not released on failure.
289  */
290 int
291 vfs_busy(mp, flags, interlkp, p)
292 	struct mount *mp;
293 	int flags;
294 	struct mtx *interlkp;
295 	struct proc *p;
296 {
297 	int lkflags;
298 
299 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
300 		if (flags & LK_NOWAIT)
301 			return (ENOENT);
302 		mp->mnt_kern_flag |= MNTK_MWAIT;
303 		/*
304 		 * Since all busy locks are shared except the exclusive
305 		 * lock granted when unmounting, the only place that a
306 		 * wakeup needs to be done is at the release of the
307 		 * exclusive lock at the end of dounmount.
308 		 */
309 		msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0);
310 		return (ENOENT);
311 	}
312 	lkflags = LK_SHARED | LK_NOPAUSE;
313 	if (interlkp)
314 		lkflags |= LK_INTERLOCK;
315 	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
316 		panic("vfs_busy: unexpected lock failure");
317 	return (0);
318 }
319 
320 /*
321  * Free a busy filesystem.
322  */
323 void
324 vfs_unbusy(mp, p)
325 	struct mount *mp;
326 	struct proc *p;
327 {
328 
329 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
330 }
331 
332 /*
333  * Lookup a filesystem type, and if found allocate and initialize
334  * a mount structure for it.
335  *
336  * Devname is usually updated by mount(8) after booting.
337  */
338 int
339 vfs_rootmountalloc(fstypename, devname, mpp)
340 	char *fstypename;
341 	char *devname;
342 	struct mount **mpp;
343 {
344 	struct proc *p = curproc;	/* XXX */
345 	struct vfsconf *vfsp;
346 	struct mount *mp;
347 
348 	if (fstypename == NULL)
349 		return (ENODEV);
350 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
351 		if (!strcmp(vfsp->vfc_name, fstypename))
352 			break;
353 	if (vfsp == NULL)
354 		return (ENODEV);
355 	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
356 	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
357 	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
358 	LIST_INIT(&mp->mnt_vnodelist);
359 	mp->mnt_vfc = vfsp;
360 	mp->mnt_op = vfsp->vfc_vfsops;
361 	mp->mnt_flag = MNT_RDONLY;
362 	mp->mnt_vnodecovered = NULLVP;
363 	vfsp->vfc_refcount++;
364 	mp->mnt_iosize_max = DFLTPHYS;
365 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
366 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
367 	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
368 	mp->mnt_stat.f_mntonname[0] = '/';
369 	mp->mnt_stat.f_mntonname[1] = 0;
370 	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
371 	*mpp = mp;
372 	return (0);
373 }
374 
375 /*
376  * Find an appropriate filesystem to use for the root. If a filesystem
377  * has not been preselected, walk through the list of known filesystems
378  * trying those that have mountroot routines, and try them until one
379  * works or we have tried them all.
380  */
381 #ifdef notdef	/* XXX JH */
382 int
383 lite2_vfs_mountroot()
384 {
385 	struct vfsconf *vfsp;
386 	extern int (*lite2_mountroot) __P((void));
387 	int error;
388 
389 	if (lite2_mountroot != NULL)
390 		return ((*lite2_mountroot)());
391 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
392 		if (vfsp->vfc_mountroot == NULL)
393 			continue;
394 		if ((error = (*vfsp->vfc_mountroot)()) == 0)
395 			return (0);
396 		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
397 	}
398 	return (ENODEV);
399 }
400 #endif
401 
402 /*
403  * Lookup a mount point by filesystem identifier.
404  */
405 struct mount *
406 vfs_getvfs(fsid)
407 	fsid_t *fsid;
408 {
409 	register struct mount *mp;
410 
411 	mtx_lock(&mountlist_mtx);
412 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
413 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
414 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
415 			mtx_unlock(&mountlist_mtx);
416 			return (mp);
417 		}
418 	}
419 	mtx_unlock(&mountlist_mtx);
420 	return ((struct mount *) 0);
421 }
422 
423 /*
424  * Get a new unique fsid.  Try to make its val[0] unique, since this value
425  * will be used to create fake device numbers for stat().  Also try (but
426  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
427  * support 16-bit device numbers.  We end up with unique val[0]'s for the
428  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
429  *
430  * Keep in mind that several mounts may be running in parallel.  Starting
431  * the search one past where the previous search terminated is both a
432  * micro-optimization and a defense against returning the same fsid to
433  * different mounts.
434  */
435 void
436 vfs_getnewfsid(mp)
437 	struct mount *mp;
438 {
439 	static u_int16_t mntid_base;
440 	fsid_t tfsid;
441 	int mtype;
442 
443 	mtx_lock(&mntid_mtx);
444 	mtype = mp->mnt_vfc->vfc_typenum;
445 	tfsid.val[1] = mtype;
446 	mtype = (mtype & 0xFF) << 24;
447 	for (;;) {
448 		tfsid.val[0] = makeudev(255,
449 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
450 		mntid_base++;
451 		if (vfs_getvfs(&tfsid) == NULL)
452 			break;
453 	}
454 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
455 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
456 	mtx_unlock(&mntid_mtx);
457 }
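/*
 * A worked example of the encoding above, using illustrative values: with
 * vfc_typenum == 5 and mntid_base == 0x1234, the minor number handed to
 * makeudev() is
 *
 *	(5 << 24) | ((0x1234 & 0xFF00) << 8) | (0x1234 & 0xFF) == 0x05120034
 *
 * Bits 8-15 are left clear for the major, and only the low byte of
 * mntid_base (0x34) reaches the low 16 bits, which is why val[0] is unique
 * mod 2^16 only for the first 2^8 calls, while the full value stays unique
 * for the first 2^16 calls.
 */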
458 
459 /*
460  * Knob to control the precision of file timestamps:
461  *
462  *   0 = seconds only; nanoseconds zeroed.
463  *   1 = seconds and nanoseconds, accurate within 1/HZ.
464  *   2 = seconds and nanoseconds, truncated to microseconds.
465  * >=3 = seconds and nanoseconds, maximum precision.
466  */
467 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
468 
469 static int timestamp_precision = TSP_SEC;
470 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
471     &timestamp_precision, 0, "");
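/*
 * This knob is exported read-write as vfs.timestamp_precision, so (for
 * example) truncation to microseconds can be selected at run time with
 *
 *	sysctl vfs.timestamp_precision=2
 *
 * where 2 corresponds to TSP_USEC in the enum above; the default remains
 * seconds-only (TSP_SEC).
 */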
472 
473 /*
474  * Get a current timestamp.
475  */
476 void
477 vfs_timestamp(tsp)
478 	struct timespec *tsp;
479 {
480 	struct timeval tv;
481 
482 	switch (timestamp_precision) {
483 	case TSP_SEC:
484 		tsp->tv_sec = time_second;
485 		tsp->tv_nsec = 0;
486 		break;
487 	case TSP_HZ:
488 		getnanotime(tsp);
489 		break;
490 	case TSP_USEC:
491 		microtime(&tv);
492 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
493 		break;
494 	case TSP_NSEC:
495 	default:
496 		nanotime(tsp);
497 		break;
498 	}
499 }
500 
501 /*
502  * Set vnode attributes to VNOVAL
503  */
504 void
505 vattr_null(vap)
506 	register struct vattr *vap;
507 {
508 
509 	vap->va_type = VNON;
510 	vap->va_size = VNOVAL;
511 	vap->va_bytes = VNOVAL;
512 	vap->va_mode = VNOVAL;
513 	vap->va_nlink = VNOVAL;
514 	vap->va_uid = VNOVAL;
515 	vap->va_gid = VNOVAL;
516 	vap->va_fsid = VNOVAL;
517 	vap->va_fileid = VNOVAL;
518 	vap->va_blocksize = VNOVAL;
519 	vap->va_rdev = VNOVAL;
520 	vap->va_atime.tv_sec = VNOVAL;
521 	vap->va_atime.tv_nsec = VNOVAL;
522 	vap->va_mtime.tv_sec = VNOVAL;
523 	vap->va_mtime.tv_nsec = VNOVAL;
524 	vap->va_ctime.tv_sec = VNOVAL;
525 	vap->va_ctime.tv_nsec = VNOVAL;
526 	vap->va_flags = VNOVAL;
527 	vap->va_gen = VNOVAL;
528 	vap->va_vaflags = 0;
529 }
530 
531 /*
532  * Routines having to do with the management of the vnode table.
533  */
534 
535 /*
536  * Return the next vnode from the free list.
537  */
538 int
539 getnewvnode(tag, mp, vops, vpp)
540 	enum vtagtype tag;
541 	struct mount *mp;
542 	vop_t **vops;
543 	struct vnode **vpp;
544 {
545 	int s, count;
546 	struct proc *p = curproc;	/* XXX */
547 	struct vnode *vp = NULL;
548 	struct mount *vnmp;
549 	vm_object_t object;
550 
551 	/*
552 	 * We take the least recently used vnode from the freelist
553 	 * if we can get it, it has no cached pages, and no
554 	 * namecache entries are relative to it.
555 	 * Otherwise we allocate a new vnode.
556 	 */
557 
558 	s = splbio();
559 	mtx_lock(&vnode_free_list_mtx);
560 
561 	if (wantfreevnodes && freevnodes < wantfreevnodes) {
562 		vp = NULL;
563 	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
564 		/*
565 		 * XXX: this is only here to be backwards compatible
566 		 */
567 		vp = NULL;
568 	} else for (count = 0; count < freevnodes; count++) {
569 		vp = TAILQ_FIRST(&vnode_free_list);
570 		if (vp == NULL || vp->v_usecount)
571 			panic("getnewvnode: free vnode isn't");
572 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
573 
574 		/*
575 		 * Don't recycle if active in the namecache or
576 		 * if it still has cached pages or we cannot get
577 		 * its interlock.
578 		 */
579 		if (LIST_FIRST(&vp->v_cache_src) != NULL ||
580 		    (VOP_GETVOBJECT(vp, &object) == 0 &&
581 		     (object->resident_page_count || object->ref_count)) ||
582 		    !mtx_trylock(&vp->v_interlock)) {
583 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
584 			vp = NULL;
585 			continue;
586 		}
587 		/*
588 		 * Skip over it if its filesystem is being suspended.
589 		 */
590 		if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
591 			break;
592 		mtx_unlock(&vp->v_interlock);
593 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
594 		vp = NULL;
595 	}
596 	if (vp) {
597 		vp->v_flag |= VDOOMED;
598 		vp->v_flag &= ~VFREE;
599 		freevnodes--;
600 		mtx_unlock(&vnode_free_list_mtx);
601 		cache_purge(vp);
602 		vp->v_lease = NULL;
603 		if (vp->v_type != VBAD) {
604 			vgonel(vp, p);
605 		} else {
606 			mtx_unlock(&vp->v_interlock);
607 		}
608 		vn_finished_write(vnmp);
609 
610 #ifdef INVARIANTS
611 		{
612 			int s;
613 
614 			if (vp->v_data)
615 				panic("cleaned vnode isn't");
616 			s = splbio();
617 			if (vp->v_numoutput)
618 				panic("Clean vnode has pending I/O's");
619 			splx(s);
620 			if (vp->v_writecount != 0)
621 				panic("Non-zero write count");
622 		}
623 #endif
624 		vp->v_flag = 0;
625 		vp->v_lastw = 0;
626 		vp->v_lasta = 0;
627 		vp->v_cstart = 0;
628 		vp->v_clen = 0;
629 		vp->v_socket = 0;
630 	} else {
631 		mtx_unlock(&vnode_free_list_mtx);
632 		vp = (struct vnode *) zalloc(vnode_zone);
633 		bzero((char *) vp, sizeof *vp);
634 		mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF);
635 		vp->v_dd = vp;
636 		mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF);
637 		cache_purge(vp);
638 		LIST_INIT(&vp->v_cache_src);
639 		TAILQ_INIT(&vp->v_cache_dst);
640 		numvnodes++;
641 	}
642 
643 	TAILQ_INIT(&vp->v_cleanblkhd);
644 	TAILQ_INIT(&vp->v_dirtyblkhd);
645 	vp->v_type = VNON;
646 	vp->v_tag = tag;
647 	vp->v_op = vops;
648 	lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE);
649 	insmntque(vp, mp);
650 	*vpp = vp;
651 	vp->v_usecount = 1;
652 	vp->v_data = 0;
653 
654 	splx(s);
655 
656 	vfs_object_create(vp, p, p->p_ucred);
657 
658 	vnodeallocs++;
659 	if (vnodeallocs % vnoderecycleperiod == 0 &&
660 	    freevnodes < vnoderecycleminfreevn &&
661 	    vnoderecyclemintotalvn < numvnodes) {
662 		/* Recycle vnodes. */
663 		cache_purgeleafdirs(vnoderecyclenumber);
664 	}
665 
666 	return (0);
667 }
668 
669 /*
670  * Move a vnode from one mount queue to another.
671  */
672 static void
673 insmntque(vp, mp)
674 	register struct vnode *vp;
675 	register struct mount *mp;
676 {
677 
678 	mtx_lock(&mntvnode_mtx);
679 	/*
680 	 * Delete from old mount point vnode list, if on one.
681 	 */
682 	if (vp->v_mount != NULL)
683 		LIST_REMOVE(vp, v_mntvnodes);
684 	/*
685 	 * Insert into list of vnodes for the new mount point, if available.
686 	 */
687 	if ((vp->v_mount = mp) == NULL) {
688 		mtx_unlock(&mntvnode_mtx);
689 		return;
690 	}
691 	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
692 	mtx_unlock(&mntvnode_mtx);
693 }
694 
695 /*
696  * Update outstanding I/O count and do wakeup if requested.
697  */
698 void
699 vwakeup(bp)
700 	register struct buf *bp;
701 {
702 	register struct vnode *vp;
703 
704 	bp->b_flags &= ~B_WRITEINPROG;
705 	if ((vp = bp->b_vp)) {
706 		vp->v_numoutput--;
707 		if (vp->v_numoutput < 0)
708 			panic("vwakeup: neg numoutput");
709 		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
710 			vp->v_flag &= ~VBWAIT;
711 			wakeup((caddr_t) &vp->v_numoutput);
712 		}
713 	}
714 }
715 
716 /*
717  * Flush out and invalidate all buffers associated with a vnode.
718  * Called with the underlying object locked.
719  */
720 int
721 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
722 	register struct vnode *vp;
723 	int flags;
724 	struct ucred *cred;
725 	struct proc *p;
726 	int slpflag, slptimeo;
727 {
728 	register struct buf *bp;
729 	struct buf *nbp, *blist;
730 	int s, error;
731 	vm_object_t object;
732 
733 	if (flags & V_SAVE) {
734 		s = splbio();
735 		while (vp->v_numoutput) {
736 			vp->v_flag |= VBWAIT;
737 			error = tsleep((caddr_t)&vp->v_numoutput,
738 			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
739 			if (error) {
740 				splx(s);
741 				return (error);
742 			}
743 		}
744 		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
745 			splx(s);
746 			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
747 				return (error);
748 			s = splbio();
749 			if (vp->v_numoutput > 0 ||
750 			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
751 				panic("vinvalbuf: dirty bufs");
752 		}
753 		splx(s);
754   	}
755 	s = splbio();
756 	for (;;) {
757 		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
758 		if (!blist)
759 			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
760 		if (!blist)
761 			break;
762 
763 		for (bp = blist; bp; bp = nbp) {
764 			nbp = TAILQ_NEXT(bp, b_vnbufs);
765 			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
766 				error = BUF_TIMELOCK(bp,
767 				    LK_EXCLUSIVE | LK_SLEEPFAIL,
768 				    "vinvalbuf", slpflag, slptimeo);
769 				if (error == ENOLCK)
770 					break;
771 				splx(s);
772 				return (error);
773 			}
774 			/*
775 			 * XXX Since there are no node locks for NFS, I
776 			 * believe there is a slight chance that a delayed
777 			 * write will occur while sleeping just above, so
778 			 * check for it.  Note that vfs_bio_awrite expects
779 			 * buffers to reside on a queue, while BUF_WRITE and
780 			 * brelse do not.
781 			 */
782 			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
783 				(flags & V_SAVE)) {
784 
785 				if (bp->b_vp == vp) {
786 					if (bp->b_flags & B_CLUSTEROK) {
787 						BUF_UNLOCK(bp);
788 						vfs_bio_awrite(bp);
789 					} else {
790 						bremfree(bp);
791 						bp->b_flags |= B_ASYNC;
792 						BUF_WRITE(bp);
793 					}
794 				} else {
795 					bremfree(bp);
796 					(void) BUF_WRITE(bp);
797 				}
798 				break;
799 			}
800 			bremfree(bp);
801 			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
802 			bp->b_flags &= ~B_ASYNC;
803 			brelse(bp);
804 		}
805 	}
806 
807 	while (vp->v_numoutput > 0) {
808 		vp->v_flag |= VBWAIT;
809 		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
810 	}
811 
812 	splx(s);
813 
814 	/*
815 	 * Destroy the copy in the VM cache, too.
816 	 */
817 	mtx_lock(&vp->v_interlock);
818 	if (VOP_GETVOBJECT(vp, &object) == 0) {
819 		vm_object_page_remove(object, 0, 0,
820 			(flags & V_SAVE) ? TRUE : FALSE);
821 	}
822 	mtx_unlock(&vp->v_interlock);
823 
824 	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
825 		panic("vinvalbuf: flush failed");
826 	return (0);
827 }
828 
829 /*
830  * Truncate a file's buffer and pages to a specified length.  This
831  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
832  * sync activity.
833  */
834 int
835 vtruncbuf(vp, cred, p, length, blksize)
836 	register struct vnode *vp;
837 	struct ucred *cred;
838 	struct proc *p;
839 	off_t length;
840 	int blksize;
841 {
842 	register struct buf *bp;
843 	struct buf *nbp;
844 	int s, anyfreed;
845 	int trunclbn;
846 
847 	/*
848 	 * Round up to the *next* lbn.
849 	 */
850 	trunclbn = (length + blksize - 1) / blksize;
851 
852 	s = splbio();
853 restart:
854 	anyfreed = 1;
855 	for (;anyfreed;) {
856 		anyfreed = 0;
857 		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
858 			nbp = TAILQ_NEXT(bp, b_vnbufs);
859 			if (bp->b_lblkno >= trunclbn) {
860 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
861 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
862 					goto restart;
863 				} else {
864 					bremfree(bp);
865 					bp->b_flags |= (B_INVAL | B_RELBUF);
866 					bp->b_flags &= ~B_ASYNC;
867 					brelse(bp);
868 					anyfreed = 1;
869 				}
870 				if (nbp &&
871 				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
872 				    (nbp->b_vp != vp) ||
873 				    (nbp->b_flags & B_DELWRI))) {
874 					goto restart;
875 				}
876 			}
877 		}
878 
879 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
880 			nbp = TAILQ_NEXT(bp, b_vnbufs);
881 			if (bp->b_lblkno >= trunclbn) {
882 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
883 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
884 					goto restart;
885 				} else {
886 					bremfree(bp);
887 					bp->b_flags |= (B_INVAL | B_RELBUF);
888 					bp->b_flags &= ~B_ASYNC;
889 					brelse(bp);
890 					anyfreed = 1;
891 				}
892 				if (nbp &&
893 				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
894 				    (nbp->b_vp != vp) ||
895 				    (nbp->b_flags & B_DELWRI) == 0)) {
896 					goto restart;
897 				}
898 			}
899 		}
900 	}
901 
902 	if (length > 0) {
903 restartsync:
904 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
905 			nbp = TAILQ_NEXT(bp, b_vnbufs);
906 			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
907 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
908 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
909 					goto restart;
910 				} else {
911 					bremfree(bp);
912 					if (bp->b_vp == vp) {
913 						bp->b_flags |= B_ASYNC;
914 					} else {
915 						bp->b_flags &= ~B_ASYNC;
916 					}
917 					BUF_WRITE(bp);
918 				}
919 				goto restartsync;
920 			}
921 
922 		}
923 	}
924 
925 	while (vp->v_numoutput > 0) {
926 		vp->v_flag |= VBWAIT;
927 		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
928 	}
929 
930 	splx(s);
931 
932 	vnode_pager_setsize(vp, length);
933 
934 	return (0);
935 }
936 
937 /*
938  * Associate a buffer with a vnode.
939  */
940 void
941 bgetvp(vp, bp)
942 	register struct vnode *vp;
943 	register struct buf *bp;
944 {
945 	int s;
946 
947 	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
948 
949 	vhold(vp);
950 	bp->b_vp = vp;
951 	bp->b_dev = vn_todev(vp);
952 	/*
953 	 * Insert onto list for new vnode.
954 	 */
955 	s = splbio();
956 	bp->b_xflags |= BX_VNCLEAN;
957 	bp->b_xflags &= ~BX_VNDIRTY;
958 	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
959 	splx(s);
960 }
961 
962 /*
963  * Disassociate a buffer from a vnode.
964  */
965 void
966 brelvp(bp)
967 	register struct buf *bp;
968 {
969 	struct vnode *vp;
970 	struct buflists *listheadp;
971 	int s;
972 
973 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
974 
975 	/*
976 	 * Delete from old vnode list, if on one.
977 	 */
978 	vp = bp->b_vp;
979 	s = splbio();
980 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
981 		if (bp->b_xflags & BX_VNDIRTY)
982 			listheadp = &vp->v_dirtyblkhd;
983 		else
984 			listheadp = &vp->v_cleanblkhd;
985 		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
986 		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
987 	}
988 	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
989 		vp->v_flag &= ~VONWORKLST;
990 		LIST_REMOVE(vp, v_synclist);
991 	}
992 	splx(s);
993 	bp->b_vp = (struct vnode *) 0;
994 	vdrop(vp);
995 }
996 
997 /*
998  * Add an item to the syncer work queue.
999  */
1000 static void
1001 vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1002 {
1003 	int s, slot;
1004 
1005 	s = splbio();
1006 
1007 	if (vp->v_flag & VONWORKLST) {
1008 		LIST_REMOVE(vp, v_synclist);
1009 	}
1010 
1011 	if (delay > syncer_maxdelay - 2)
1012 		delay = syncer_maxdelay - 2;
1013 	slot = (syncer_delayno + delay) & syncer_mask;
1014 
1015 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1016 	vp->v_flag |= VONWORKLST;
1017 	splx(s);
1018 }
1019 
1020 struct  proc *updateproc;
1021 static void sched_sync __P((void));
1022 static struct kproc_desc up_kp = {
1023 	"syncer",
1024 	sched_sync,
1025 	&updateproc
1026 };
1027 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1028 
1029 /*
1030  * System filesystem synchronizer daemon.
1031  */
1032 void
1033 sched_sync(void)
1034 {
1035 	struct synclist *slp;
1036 	struct vnode *vp;
1037 	struct mount *mp;
1038 	long starttime;
1039 	int s;
1040 	struct proc *p = updateproc;
1041 
1042 	mtx_lock(&Giant);
1043 
1044 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
1045 	    SHUTDOWN_PRI_LAST);
1046 
1047 	for (;;) {
1048 		kthread_suspend_check(p);
1049 
1050 		starttime = time_second;
1051 
1052 		/*
1053 		 * Push files whose dirty time has expired.  Be careful
1054 		 * of interrupt race on slp queue.
1055 		 */
1056 		s = splbio();
1057 		slp = &syncer_workitem_pending[syncer_delayno];
1058 		syncer_delayno += 1;
1059 		if (syncer_delayno == syncer_maxdelay)
1060 			syncer_delayno = 0;
1061 		splx(s);
1062 
1063 		while ((vp = LIST_FIRST(slp)) != NULL) {
1064 			if (VOP_ISLOCKED(vp, NULL) == 0 &&
1065 			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
1066 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
1067 				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
1068 				VOP_UNLOCK(vp, 0, p);
1069 				vn_finished_write(mp);
1070 			}
1071 			s = splbio();
1072 			if (LIST_FIRST(slp) == vp) {
1073 				/*
1074 				 * Note: v_tag VT_VFS vps can remain on the
1075 				 * worklist too with no dirty blocks, but
1076 				 * since sync_fsync() moves them to a different
1077 				 * slot we are safe.
1078 				 */
1079 				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1080 				    !vn_isdisk(vp, NULL))
1081 					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1082 				/*
1083 				 * Put us back on the worklist.  The worklist
1084 				 * routine will remove us from our current
1085 				 * position and then add us back in at a later
1086 				 * position.
1087 				 */
1088 				vn_syncer_add_to_worklist(vp, syncdelay);
1089 			}
1090 			splx(s);
1091 		}
1092 
1093 		/*
1094 		 * Do soft update processing.
1095 		 */
1096 #ifdef SOFTUPDATES
1097 		softdep_process_worklist(NULL);
1098 #endif
1099 
1100 		/*
1101 		 * The variable rushjob allows the kernel to speed up the
1102 		 * processing of the filesystem syncer process. A rushjob
1103 		 * value of N tells the filesystem syncer to process the next
1104 		 * N seconds worth of work on its queue ASAP. Currently rushjob
1105 		 * is used by the soft update code to speed up the filesystem
1106 		 * syncer process when the incore state is getting so far
1107 		 * ahead of the disk that the kernel memory pool is being
1108 		 * threatened with exhaustion.
1109 		 */
1110 		if (rushjob > 0) {
1111 			rushjob -= 1;
1112 			continue;
1113 		}
1114 		/*
1115 		 * If it has taken us less than a second to process the
1116 		 * current work, then wait. Otherwise start right over
1117 		 * again. We can still lose time if any single round
1118 		 * takes more than two seconds, but it does not really
1119 		 * matter as we are just trying to generally pace the
1120 		 * filesystem activity.
1121 		 */
1122 		if (time_second == starttime)
1123 			tsleep(&lbolt, PPAUSE, "syncer", 0);
1124 	}
1125 }
1126 
1127 /*
1128  * Request the syncer daemon to speed up its work.
1129  * We never push it to speed up more than half of its
1130  * normal turn time, otherwise it could take over the cpu.
1131  */
1132 int
1133 speedup_syncer()
1134 {
1135 
1136 	mtx_lock_spin(&sched_lock);
1137 	if (updateproc->p_wchan == &lbolt)
1138 		setrunnable(updateproc);
1139 	mtx_unlock_spin(&sched_lock);
1140 	if (rushjob < syncdelay / 2) {
1141 		rushjob += 1;
1142 		stat_rush_requests += 1;
1143 		return (1);
1144 	}
1145 	return(0);
1146 }
1147 
1148 /*
1149  * Associate a p-buffer with a vnode.
1150  *
1151  * Also sets B_PAGING flag to indicate that vnode is not fully associated
1152  * with the buffer.  i.e. the bp has not been linked into the vnode or
1153  * ref-counted.
1154  */
1155 void
1156 pbgetvp(vp, bp)
1157 	register struct vnode *vp;
1158 	register struct buf *bp;
1159 {
1160 
1161 	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1162 
1163 	bp->b_vp = vp;
1164 	bp->b_flags |= B_PAGING;
1165 	bp->b_dev = vn_todev(vp);
1166 }
1167 
1168 /*
1169  * Disassociate a p-buffer from a vnode.
1170  */
1171 void
1172 pbrelvp(bp)
1173 	register struct buf *bp;
1174 {
1175 
1176 	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1177 
1178 	/* XXX REMOVE ME */
1179 	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1180 		panic(
1181 		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1182 		    bp,
1183 		    (int)bp->b_flags
1184 		);
1185 	}
1186 	bp->b_vp = (struct vnode *) 0;
1187 	bp->b_flags &= ~B_PAGING;
1188 }
1189 
1190 /*
1191  * Change the vnode a pager buffer is associated with.
1192  */
1193 void
1194 pbreassignbuf(bp, newvp)
1195 	struct buf *bp;
1196 	struct vnode *newvp;
1197 {
1198 
1199 	KASSERT(bp->b_flags & B_PAGING,
1200 	    ("pbreassignbuf() on non phys bp %p", bp));
1201 	bp->b_vp = newvp;
1202 }
1203 
1204 /*
1205  * Reassign a buffer from one vnode to another.
1206  * Used to assign file specific control information
1207  * (indirect blocks) to the vnode to which they belong.
1208  */
1209 void
1210 reassignbuf(bp, newvp)
1211 	register struct buf *bp;
1212 	register struct vnode *newvp;
1213 {
1214 	struct buflists *listheadp;
1215 	int delay;
1216 	int s;
1217 
1218 	if (newvp == NULL) {
1219 		printf("reassignbuf: NULL");
1220 		return;
1221 	}
1222 	++reassignbufcalls;
1223 
1224 	/*
1225 	 * B_PAGING flagged buffers cannot be reassigned because their vp
1226 	 * is not fully linked in.
1227 	 */
1228 	if (bp->b_flags & B_PAGING)
1229 		panic("cannot reassign paging buffer");
1230 
1231 	s = splbio();
1232 	/*
1233 	 * Delete from old vnode list, if on one.
1234 	 */
1235 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1236 		if (bp->b_xflags & BX_VNDIRTY)
1237 			listheadp = &bp->b_vp->v_dirtyblkhd;
1238 		else
1239 			listheadp = &bp->b_vp->v_cleanblkhd;
1240 		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1241 		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1242 		if (bp->b_vp != newvp) {
1243 			vdrop(bp->b_vp);
1244 			bp->b_vp = NULL;	/* for clarification */
1245 		}
1246 	}
1247 	/*
1248 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1249 	 * of clean buffers.
1250 	 */
1251 	if (bp->b_flags & B_DELWRI) {
1252 		struct buf *tbp;
1253 
1254 		listheadp = &newvp->v_dirtyblkhd;
1255 		if ((newvp->v_flag & VONWORKLST) == 0) {
1256 			switch (newvp->v_type) {
1257 			case VDIR:
1258 				delay = dirdelay;
1259 				break;
1260 			case VCHR:
1261 				if (newvp->v_rdev->si_mountpoint != NULL) {
1262 					delay = metadelay;
1263 					break;
1264 				}
1265 				/* fall through */
1266 			default:
1267 				delay = filedelay;
1268 			}
1269 			vn_syncer_add_to_worklist(newvp, delay);
1270 		}
1271 		bp->b_xflags |= BX_VNDIRTY;
1272 		tbp = TAILQ_FIRST(listheadp);
1273 		if (tbp == NULL ||
1274 		    bp->b_lblkno == 0 ||
1275 		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
1276 		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1277 			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1278 			++reassignbufsortgood;
1279 		} else if (bp->b_lblkno < 0) {
1280 			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1281 			++reassignbufsortgood;
1282 		} else if (reassignbufmethod == 1) {
1283 			/*
1284 			 * New sorting algorithm, only handle sequential case,
1285 			 * otherwise append to end (but before metadata)
1286 			 */
1287 			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1288 			    (tbp->b_xflags & BX_VNDIRTY)) {
1289 				/*
1290 				 * Found the best place to insert the buffer
1291 				 */
1292 				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1293 				++reassignbufsortgood;
1294 			} else {
1295 				/*
1296 				 * Missed, append to end, but before meta-data.
1297 				 * We know that the head buffer in the list is
1298 				 * not meta-data due to prior conditionals.
1299 				 *
1300 				 * Indirect effects:  NFS second stage write
1301 				 * tends to wind up here, giving maximum
1302 				 * distance between the unstable write and the
1303 				 * commit rpc.
1304 				 */
1305 				tbp = TAILQ_LAST(listheadp, buflists);
1306 				while (tbp && tbp->b_lblkno < 0)
1307 					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
1308 				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1309 				++reassignbufsortbad;
1310 			}
1311 		} else {
1312 			/*
1313 			 * Old sorting algorithm, scan queue and insert
1314 			 */
1315 			struct buf *ttbp;
1316 			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1317 			    (ttbp->b_lblkno < bp->b_lblkno)) {
1318 				++reassignbufloops;
1319 				tbp = ttbp;
1320 			}
1321 			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1322 		}
1323 	} else {
1324 		bp->b_xflags |= BX_VNCLEAN;
1325 		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1326 		if ((newvp->v_flag & VONWORKLST) &&
1327 		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1328 			newvp->v_flag &= ~VONWORKLST;
1329 			LIST_REMOVE(newvp, v_synclist);
1330 		}
1331 	}
1332 	if (bp->b_vp != newvp) {
1333 		bp->b_vp = newvp;
1334 		vhold(bp->b_vp);
1335 	}
1336 	splx(s);
1337 }
1338 
1339 /*
1340  * Create a vnode for a device.
1341  * Used for mounting the root file system.
1342  */
1343 int
1344 bdevvp(dev, vpp)
1345 	dev_t dev;
1346 	struct vnode **vpp;
1347 {
1348 	register struct vnode *vp;
1349 	struct vnode *nvp;
1350 	int error;
1351 
1352 	if (dev == NODEV) {
1353 		*vpp = NULLVP;
1354 		return (ENXIO);
1355 	}
1356 	if (vfinddev(dev, VCHR, vpp))
1357 		return (0);
1358 	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1359 	if (error) {
1360 		*vpp = NULLVP;
1361 		return (error);
1362 	}
1363 	vp = nvp;
1364 	vp->v_type = VCHR;
1365 	addalias(vp, dev);
1366 	*vpp = vp;
1367 	return (0);
1368 }
1369 
1370 /*
1371  * Add vnode to the alias list hung off the dev_t.
1372  *
1373  * The reason for this gunk is that multiple vnodes can reference
1374  * the same physical device, so checking vp->v_usecount to see
1375  * how many users there are is inadequate; the v_usecounts of all
1376  * the vnodes need to be accumulated.  vcount() does that.
1377  */
1378 struct vnode *
1379 addaliasu(nvp, nvp_rdev)
1380 	struct vnode *nvp;
1381 	udev_t nvp_rdev;
1382 {
1383 	struct vnode *ovp;
1384 	vop_t **ops;
1385 	dev_t dev;
1386 
1387 	if (nvp->v_type == VBLK)
1388 		return (nvp);
1389 	if (nvp->v_type != VCHR)
1390 		panic("addaliasu on non-special vnode");
1391 	dev = udev2dev(nvp_rdev, 0);
1392 	/*
1393 	 * Check to see if we have a bdevvp vnode with no associated
1394 	 * filesystem. If so, we want to associate the filesystem of
1395 	 * the newly instigated vnode with the bdevvp vnode and
1396 	 * discard the newly created vnode rather than leaving the
1397 	 * bdevvp vnode lying around with no associated filesystem.
1398 	 */
1399 	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1400 		addalias(nvp, dev);
1401 		return (nvp);
1402 	}
1403 	/*
1404 	 * Discard unneeded vnode, but save its node specific data.
1405 	 * Note that if there is a lock, it is carried over in the
1406 	 * node specific data to the replacement vnode.
1407 	 */
1408 	vref(ovp);
1409 	ovp->v_data = nvp->v_data;
1410 	ovp->v_tag = nvp->v_tag;
1411 	nvp->v_data = NULL;
1412 	lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
1413 	    nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
1414 	if (nvp->v_vnlock)
1415 		ovp->v_vnlock = &ovp->v_lock;
1416 	ops = ovp->v_op;
1417 	ovp->v_op = nvp->v_op;
1418 	if (VOP_ISLOCKED(nvp, curproc)) {
1419 		VOP_UNLOCK(nvp, 0, curproc);
1420 		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curproc);
1421 	}
1422 	nvp->v_op = ops;
1423 	insmntque(ovp, nvp->v_mount);
1424 	vrele(nvp);
1425 	vgone(nvp);
1426 	return (ovp);
1427 }
1428 
1429 /* This is a local helper function that does the same as addaliasu, but
1430  * takes a dev_t instead of a udev_t. */
1431 static void
1432 addalias(nvp, dev)
1433 	struct vnode *nvp;
1434 	dev_t dev;
1435 {
1436 
1437 	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
1438 	nvp->v_rdev = dev;
1439 	mtx_lock(&spechash_mtx);
1440 	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1441 	mtx_unlock(&spechash_mtx);
1442 }
1443 
1444 /*
1445  * Grab a particular vnode from the free list, increment its
1446  * reference count and lock it. The vnode lock bit is set if the
1447  * vnode is being eliminated in vgone. The process is awakened
1448  * when the transition is completed, and an error returned to
1449  * indicate that the vnode is no longer usable (possibly having
1450  * been changed to a new file system type).
1451  */
1452 int
1453 vget(vp, flags, p)
1454 	register struct vnode *vp;
1455 	int flags;
1456 	struct proc *p;
1457 {
1458 	int error;
1459 
1460 	/*
1461 	 * If the vnode is in the process of being cleaned out for
1462 	 * another use, we wait for the cleaning to finish and then
1463 	 * return failure. Cleaning is determined by checking that
1464 	 * the VXLOCK flag is set.
1465 	 */
1466 	if ((flags & LK_INTERLOCK) == 0)
1467 		mtx_lock(&vp->v_interlock);
1468 	if (vp->v_flag & VXLOCK) {
1469 		if (vp->v_vxproc == curproc) {
1470 			printf("VXLOCK interlock avoided\n");
1471 		} else {
1472 			vp->v_flag |= VXWANT;
1473 			msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
1474 			    "vget", 0);
1475 			return (ENOENT);
1476 		}
1477 	}
1478 
1479 	vp->v_usecount++;
1480 
1481 	if (VSHOULDBUSY(vp))
1482 		vbusy(vp);
1483 	if (flags & LK_TYPE_MASK) {
1484 		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1485 			/*
1486 			 * must expand vrele here because we do not want
1487 			 * to call VOP_INACTIVE if the reference count
1488 			 * drops back to zero since it was never really
1489 			 * active. We must remove it from the free list
1490 			 * before sleeping so that multiple processes do
1491 			 * not try to recycle it.
1492 			 */
1493 			mtx_lock(&vp->v_interlock);
1494 			vp->v_usecount--;
1495 			if (VSHOULDFREE(vp))
1496 				vfree(vp);
1497 			mtx_unlock(&vp->v_interlock);
1498 		}
1499 		return (error);
1500 	}
1501 	mtx_unlock(&vp->v_interlock);
1502 	return (0);
1503 }
1504 
1505 /*
1506  * Increase the reference count of a vnode.
1507  */
1508 void
1509 vref(struct vnode *vp)
1510 {
1511 	mtx_lock(&vp->v_interlock);
1512 	vp->v_usecount++;
1513 	mtx_unlock(&vp->v_interlock);
1514 }
1515 
1516 /*
1517  * Vnode put/release.
1518  * If count drops to zero, call inactive routine and return to freelist.
1519  */
1520 void
1521 vrele(vp)
1522 	struct vnode *vp;
1523 {
1524 	struct proc *p = curproc;	/* XXX */
1525 
1526 	KASSERT(vp != NULL, ("vrele: null vp"));
1527 
1528 	mtx_lock(&vp->v_interlock);
1529 
1530 	KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));
1531 
1532 	if (vp->v_usecount > 1) {
1533 
1534 		vp->v_usecount--;
1535 		mtx_unlock(&vp->v_interlock);
1536 
1537 		return;
1538 	}
1539 
1540 	if (vp->v_usecount == 1) {
1541 
1542 		vp->v_usecount--;
1543 		if (VSHOULDFREE(vp))
1544 			vfree(vp);
1545 	/*
1546 	 * If we are doing a vput, the node is already locked, and we must
1547 	 * call VOP_INACTIVE with the node locked.  So, in the case of
1548 	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1549 	 */
1550 		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1551 			VOP_INACTIVE(vp, p);
1552 		}
1553 
1554 	} else {
1555 #ifdef DIAGNOSTIC
1556 		vprint("vrele: negative ref count", vp);
1557 		mtx_unlock(&vp->v_interlock);
1558 #endif
1559 		panic("vrele: negative ref cnt");
1560 	}
1561 }
1562 
1563 /*
1564  * Release an already locked vnode.  This gives the same effect as
1565  * unlock+vrele(), but takes less time and avoids releasing and
1566  * re-acquiring the lock (as vrele() acquires the lock internally).
1567  */
1568 void
1569 vput(vp)
1570 	struct vnode *vp;
1571 {
1572 	struct proc *p = curproc;	/* XXX */
1573 
1574 	KASSERT(vp != NULL, ("vput: null vp"));
1575 	mtx_lock(&vp->v_interlock);
1576 	KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));
1577 
1578 	if (vp->v_usecount > 1) {
1579 
1580 		vp->v_usecount--;
1581 		VOP_UNLOCK(vp, LK_INTERLOCK, p);
1582 		return;
1583 
1584 	}
1585 
1586 	if (vp->v_usecount == 1) {
1587 
1588 		vp->v_usecount--;
1589 		if (VSHOULDFREE(vp))
1590 			vfree(vp);
1591 	/*
1592 	 * If we are doing a vput, the node is already locked, and we must
1593 	 * call VOP_INACTIVE with the node locked.  So, in the case of
1594 	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1595 	 */
1596 		mtx_unlock(&vp->v_interlock);
1597 		VOP_INACTIVE(vp, p);
1598 
1599 	} else {
1600 #ifdef DIAGNOSTIC
1601 		vprint("vput: negative ref count", vp);
1602 #endif
1603 		panic("vput: negative ref cnt");
1604 	}
1605 }
1606 
1607 /*
1608  * Somebody doesn't want the vnode recycled.
1609  */
1610 void
1611 vhold(vp)
1612 	register struct vnode *vp;
1613 {
1614 	int s;
1615 
1616   	s = splbio();
1617 	vp->v_holdcnt++;
1618 	if (VSHOULDBUSY(vp))
1619 		vbusy(vp);
1620 	splx(s);
1621 }
1622 
1623 /*
1624  * Note that there is one less who cares about this vnode.  vdrop() is the
1625  * opposite of vhold().
1626  */
1627 void
1628 vdrop(vp)
1629 	register struct vnode *vp;
1630 {
1631 	int s;
1632 
1633 	s = splbio();
1634 	if (vp->v_holdcnt <= 0)
1635 		panic("vdrop: holdcnt");
1636 	vp->v_holdcnt--;
1637 	if (VSHOULDFREE(vp))
1638 		vfree(vp);
1639 	splx(s);
1640 }
1641 
1642 /*
1643  * Remove any vnodes in the vnode table belonging to mount point mp.
1644  *
1645  * If MNT_NOFORCE is specified, there should not be any active ones,
1646  * return error if any are found (nb: this is a user error, not a
1647  * system error). If MNT_FORCE is specified, detach any active vnodes
1648  * that are found.
1649  */
1650 #ifdef DIAGNOSTIC
1651 static int busyprt = 0;		/* print out busy vnodes */
1652 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1653 #endif
1654 
1655 int
1656 vflush(mp, skipvp, flags)
1657 	struct mount *mp;
1658 	struct vnode *skipvp;
1659 	int flags;
1660 {
1661 	struct proc *p = curproc;	/* XXX */
1662 	struct vnode *vp, *nvp;
1663 	int busy = 0;
1664 
1665 	mtx_lock(&mntvnode_mtx);
1666 loop:
1667 	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1668 		/*
1669 		 * Make sure this vnode wasn't reclaimed in getnewvnode().
1670 		 * Start over if it was (it won't be on the list anymore).
1671 		 */
1672 		if (vp->v_mount != mp)
1673 			goto loop;
1674 		nvp = LIST_NEXT(vp, v_mntvnodes);
1675 		/*
1676 		 * Skip over a selected vnode.
1677 		 */
1678 		if (vp == skipvp)
1679 			continue;
1680 
1681 		mtx_lock(&vp->v_interlock);
1682 		/*
1683 		 * Skip over vnodes marked VSYSTEM.
1684 		 */
1685 		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1686 			mtx_unlock(&vp->v_interlock);
1687 			continue;
1688 		}
1689 		/*
1690 		 * If WRITECLOSE is set, only flush out regular file vnodes
1691 		 * open for writing.
1692 		 */
1693 		if ((flags & WRITECLOSE) &&
1694 		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1695 			mtx_unlock(&vp->v_interlock);
1696 			continue;
1697 		}
1698 
1699 		/*
1700 		 * With v_usecount == 0, all we need to do is clear out the
1701 		 * vnode data structures and we are done.
1702 		 */
1703 		if (vp->v_usecount == 0) {
1704 			mtx_unlock(&mntvnode_mtx);
1705 			vgonel(vp, p);
1706 			mtx_lock(&mntvnode_mtx);
1707 			continue;
1708 		}
1709 
1710 		/*
1711 		 * If FORCECLOSE is set, forcibly close the vnode. For block
1712 		 * or character devices, revert to an anonymous device. For
1713 		 * all other files, just kill them.
1714 		 */
1715 		if (flags & FORCECLOSE) {
1716 			mtx_unlock(&mntvnode_mtx);
1717 			if (vp->v_type != VCHR) {
1718 				vgonel(vp, p);
1719 			} else {
1720 				vclean(vp, 0, p);
1721 				vp->v_op = spec_vnodeop_p;
1722 				insmntque(vp, (struct mount *) 0);
1723 			}
1724 			mtx_lock(&mntvnode_mtx);
1725 			continue;
1726 		}
1727 #ifdef DIAGNOSTIC
1728 		if (busyprt)
1729 			vprint("vflush: busy vnode", vp);
1730 #endif
1731 		mtx_unlock(&vp->v_interlock);
1732 		busy++;
1733 	}
1734 	mtx_unlock(&mntvnode_mtx);
1735 	if (busy)
1736 		return (EBUSY);
1737 	return (0);
1738 }
1739 
1740 /*
1741  * Disassociate the underlying file system from a vnode.
1742  */
1743 static void
1744 vclean(vp, flags, p)
1745 	struct vnode *vp;
1746 	int flags;
1747 	struct proc *p;
1748 {
1749 	int active;
1750 
1751 	/*
1752 	 * Check to see if the vnode is in use. If so we have to reference it
1753 	 * before we clean it out so that its count cannot fall to zero and
1754 	 * generate a race against ourselves to recycle it.
1755 	 */
1756 	if ((active = vp->v_usecount))
1757 		vp->v_usecount++;
1758 
1759 	/*
1760 	 * Prevent the vnode from being recycled or brought into use while we
1761 	 * clean it out.
1762 	 */
1763 	if (vp->v_flag & VXLOCK)
1764 		panic("vclean: deadlock");
1765 	vp->v_flag |= VXLOCK;
1766 	vp->v_vxproc = curproc;
1767 	/*
1768 	 * Even if the count is zero, the VOP_INACTIVE routine may still
1769 	 * have the object locked while it cleans it out. The VOP_LOCK
1770 	 * ensures that the VOP_INACTIVE routine is done with its work.
1771 	 * For active vnodes, it ensures that no other activity can
1772 	 * occur while the underlying object is being cleaned out.
1773 	 */
1774 	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1775 
1776 	/*
1777 	 * Clean out any buffers associated with the vnode.
1778 	 * If the flush fails, just toss the buffers.
1779 	 */
1780 	if (flags & DOCLOSE) {
1781 		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
1782 			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
1783 		if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
1784 			vinvalbuf(vp, 0, NOCRED, p, 0, 0);
1785 	}
1786 
1787 	VOP_DESTROYVOBJECT(vp);
1788 
1789 	/*
1790 	 * If purging an active vnode, it must be closed and
1791 	 * deactivated before being reclaimed. Note that the
1792 	 * VOP_INACTIVE will unlock the vnode.
1793 	 */
1794 	if (active) {
1795 		if (flags & DOCLOSE)
1796 			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1797 		VOP_INACTIVE(vp, p);
1798 	} else {
1799 		/*
1800 		 * Any other processes trying to obtain this lock must first
1801 		 * wait for VXLOCK to clear, then call the new lock operation.
1802 		 */
1803 		VOP_UNLOCK(vp, 0, p);
1804 	}
1805 	/*
1806 	 * Reclaim the vnode.
1807 	 */
1808 	if (VOP_RECLAIM(vp, p))
1809 		panic("vclean: cannot reclaim");
1810 
1811 	if (active) {
1812 		/*
1813 		 * Inline copy of vrele() since VOP_INACTIVE
1814 		 * has already been called.
1815 		 */
1816 		mtx_lock(&vp->v_interlock);
1817 		if (--vp->v_usecount <= 0) {
1818 #ifdef DIAGNOSTIC
1819 			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1820 				vprint("vclean: bad ref count", vp);
1821 				panic("vclean: ref cnt");
1822 			}
1823 #endif
1824 			vfree(vp);
1825 		}
1826 		mtx_unlock(&vp->v_interlock);
1827 	}
1828 
1829 	cache_purge(vp);
1830 	vp->v_vnlock = NULL;
1831 	lockdestroy(&vp->v_lock);
1832 
1833 	if (VSHOULDFREE(vp))
1834 		vfree(vp);
1835 
1836 	/*
1837 	 * Done with purge, notify sleepers of the grim news.
1838 	 */
1839 	vp->v_op = dead_vnodeop_p;
1840 	vn_pollgone(vp);
1841 	vp->v_tag = VT_NON;
1842 	vp->v_flag &= ~VXLOCK;
1843 	vp->v_vxproc = NULL;
1844 	if (vp->v_flag & VXWANT) {
1845 		vp->v_flag &= ~VXWANT;
1846 		wakeup((caddr_t) vp);
1847 	}
1848 }
1849 
1850 /*
1851  * Eliminate all activity associated with the requested vnode
1852  * and with all vnodes aliased to the requested vnode.
1853  */
1854 int
1855 vop_revoke(ap)
1856 	struct vop_revoke_args /* {
1857 		struct vnode *a_vp;
1858 		int a_flags;
1859 	} */ *ap;
1860 {
1861 	struct vnode *vp, *vq;
1862 	dev_t dev;
1863 
1864 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1865 
1866 	vp = ap->a_vp;
1867 	/*
1868 	 * If a vgone (or vclean) is already in progress,
1869 	 * wait until it is done and return.
1870 	 */
1871 	if (vp->v_flag & VXLOCK) {
1872 		vp->v_flag |= VXWANT;
1873 		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
1874 		    "vop_revokeall", 0);
1875 		return (0);
1876 	}
1877 	dev = vp->v_rdev;
1878 	for (;;) {
1879 		mtx_lock(&spechash_mtx);
1880 		vq = SLIST_FIRST(&dev->si_hlist);
1881 		mtx_unlock(&spechash_mtx);
1882 		if (!vq)
1883 			break;
1884 		vgone(vq);
1885 	}
1886 	return (0);
1887 }
1888 
1889 /*
1890  * Recycle an unused vnode to the front of the free list.
1891  * Release the passed interlock if the vnode will be recycled.
1892  */
1893 int
1894 vrecycle(vp, inter_lkp, p)
1895 	struct vnode *vp;
1896 	struct mtx *inter_lkp;
1897 	struct proc *p;
1898 {
1899 
1900 	mtx_lock(&vp->v_interlock);
1901 	if (vp->v_usecount == 0) {
1902 		if (inter_lkp) {
1903 			mtx_unlock(inter_lkp);
1904 		}
1905 		vgonel(vp, p);
1906 		return (1);
1907 	}
1908 	mtx_unlock(&vp->v_interlock);
1909 	return (0);
1910 }
1911 
1912 /*
1913  * Eliminate all activity associated with a vnode
1914  * in preparation for reuse.
1915  */
1916 void
1917 vgone(vp)
1918 	register struct vnode *vp;
1919 {
1920 	struct proc *p = curproc;	/* XXX */
1921 
1922 	mtx_lock(&vp->v_interlock);
1923 	vgonel(vp, p);
1924 }
1925 
1926 /*
1927  * vgone, with the vp interlock held.
1928  */
1929 void
1930 vgonel(vp, p)
1931 	struct vnode *vp;
1932 	struct proc *p;
1933 {
1934 	int s;
1935 
1936 	/*
1937 	 * If a vgone (or vclean) is already in progress,
1938 	 * wait until it is done and return.
1939 	 */
1940 	if (vp->v_flag & VXLOCK) {
1941 		vp->v_flag |= VXWANT;
1942 		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
1943 		    "vgone", 0);
1944 		return;
1945 	}
1946 
1947 	/*
1948 	 * Clean out the filesystem specific data.
1949 	 */
1950 	vclean(vp, DOCLOSE, p);
1951 	mtx_lock(&vp->v_interlock);
1952 
1953 	/*
1954 	 * Delete from old mount point vnode list, if on one.
1955 	 */
1956 	if (vp->v_mount != NULL)
1957 		insmntque(vp, (struct mount *)0);
1958 	/*
1959 	 * If special device, remove it from special device alias list
1960 	 * if it is on one.
1961 	 */
1962 	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
1963 		mtx_lock(&spechash_mtx);
1964 		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
1965 		freedev(vp->v_rdev);
1966 		mtx_unlock(&spechash_mtx);
1967 		vp->v_rdev = NULL;
1968 	}
1969 
1970 	/*
1971 	 * If it is on the freelist and not already at the head,
1972 	 * move it to the head of the list. The test of the
1973 	 * VDOOMED flag and the reference count of zero is because
1974 	 * it will be removed from the free list by getnewvnode,
1975 	 * but will not have its reference count incremented until
1976 	 * after calling vgone. If the reference count were
1977 	 * incremented first, vgone would (incorrectly) try to
1978 	 * close the previous instance of the underlying object.
1979 	 */
1980 	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1981 		s = splbio();
1982 		mtx_lock(&vnode_free_list_mtx);
1983 		if (vp->v_flag & VFREE)
1984 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1985 		else
1986 			freevnodes++;
1987 		vp->v_flag |= VFREE;
1988 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1989 		mtx_unlock(&vnode_free_list_mtx);
1990 		splx(s);
1991 	}
1992 
1993 	vp->v_type = VBAD;
1994 	mtx_unlock(&vp->v_interlock);
1995 }
1996 
1997 /*
1998  * Lookup a vnode by device number.
1999  */
2000 int
2001 vfinddev(dev, type, vpp)
2002 	dev_t dev;
2003 	enum vtype type;
2004 	struct vnode **vpp;
2005 {
2006 	struct vnode *vp;
2007 
2008 	mtx_lock(&spechash_mtx);
2009 	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2010 		if (type == vp->v_type) {
2011 			*vpp = vp;
2012 			mtx_unlock(&spechash_mtx);
2013 			return (1);
2014 		}
2015 	}
2016 	mtx_unlock(&spechash_mtx);
2017 	return (0);
2018 }
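
/*
 * Illustrative sketch (not part of the original file): look up whichever
 * vnode currently aliases a character device and print it.  The helper
 * name is hypothetical.
 */
#if 0	/* example only, never compiled */
static void
example_show_dev_vnode(dev_t dev)
{
	struct vnode *vp;

	if (vfinddev(dev, VCHR, &vp))
		vprint("aliased by", vp);
	else
		printf("device has no vnode alias\n");
}
#endif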
2019 
2020 /*
2021  * Calculate the total number of references to a special device.
2022  */
2023 int
2024 vcount(vp)
2025 	struct vnode *vp;
2026 {
2027 	struct vnode *vq;
2028 	int count;
2029 
2030 	count = 0;
2031 	mtx_lock(&spechash_mtx);
2032 	SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
2033 		count += vq->v_usecount;
2034 	mtx_unlock(&spechash_mtx);
2035 	return (count);
2036 }
2037 
2038 /*
2039  * Same as above, but using the dev_t as argument
2040  */
2041 int
2042 count_dev(dev)
2043 	dev_t dev;
2044 {
2045 	struct vnode *vp;
2046 
2047 	vp = SLIST_FIRST(&dev->si_hlist);
2048 	if (vp == NULL)
2049 		return (0);
2050 	return(vcount(vp));
2051 }
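
/*
 * Illustrative sketch (not part of the original file): count_dev() lets
 * code that only has a dev_t (no vnode) ask whether the device is still
 * open anywhere.  The helper name is hypothetical.
 */
#if 0	/* example only, never compiled */
static int
example_dev_is_open(dev_t dev)
{

	return (count_dev(dev) > 0);
}
#endif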
2052 
2053 /*
2054  * Print out a description of a vnode.
2055  */
2056 static char *typename[] =
2057 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2058 
2059 void
2060 vprint(label, vp)
2061 	char *label;
2062 	struct vnode *vp;
2063 {
2064 	char buf[96];
2065 
2066 	if (label != NULL)
2067 		printf("%s: %p: ", label, (void *)vp);
2068 	else
2069 		printf("%p: ", (void *)vp);
2070 	printf("type %s, usecount %d, writecount %d, refcount %d,",
2071 	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
2072 	    vp->v_holdcnt);
2073 	buf[0] = '\0';
2074 	if (vp->v_flag & VROOT)
2075 		strcat(buf, "|VROOT");
2076 	if (vp->v_flag & VTEXT)
2077 		strcat(buf, "|VTEXT");
2078 	if (vp->v_flag & VSYSTEM)
2079 		strcat(buf, "|VSYSTEM");
2080 	if (vp->v_flag & VXLOCK)
2081 		strcat(buf, "|VXLOCK");
2082 	if (vp->v_flag & VXWANT)
2083 		strcat(buf, "|VXWANT");
2084 	if (vp->v_flag & VBWAIT)
2085 		strcat(buf, "|VBWAIT");
2086 	if (vp->v_flag & VDOOMED)
2087 		strcat(buf, "|VDOOMED");
2088 	if (vp->v_flag & VFREE)
2089 		strcat(buf, "|VFREE");
2090 	if (vp->v_flag & VOBJBUF)
2091 		strcat(buf, "|VOBJBUF");
2092 	if (buf[0] != '\0')
2093 		printf(" flags (%s)", &buf[1]);
2094 	if (vp->v_data == NULL) {
2095 		printf("\n");
2096 	} else {
2097 		printf("\n\t");
2098 		VOP_PRINT(vp);
2099 	}
2100 }
2101 
2102 #ifdef DDB
2103 #include <ddb/ddb.h>
2104 /*
2105  * List all of the locked vnodes in the system.
2106  * Called when debugging the kernel.
2107  */
2108 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
2109 {
2110 	struct proc *p = curproc;	/* XXX */
2111 	struct mount *mp, *nmp;
2112 	struct vnode *vp;
2113 
2114 	printf("Locked vnodes\n");
2115 	mtx_lock(&mountlist_mtx);
2116 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2117 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) {
2118 			nmp = TAILQ_NEXT(mp, mnt_list);
2119 			continue;
2120 		}
2121 		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2122 			if (VOP_ISLOCKED(vp, NULL))
2123 				vprint((char *)0, vp);
2124 		}
2125 		mtx_lock(&mountlist_mtx);
2126 		nmp = TAILQ_NEXT(mp, mnt_list);
2127 		vfs_unbusy(mp, p);
2128 	}
2129 	mtx_unlock(&mountlist_mtx);
2130 }
2131 #endif
2132 
2133 /*
2134  * Top level filesystem related information gathering.
2135  */
2136 static int	sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));
2137 
2138 static int
2139 vfs_sysctl(SYSCTL_HANDLER_ARGS)
2140 {
2141 	int *name = (int *)arg1 - 1;	/* XXX */
2142 	u_int namelen = arg2 + 1;	/* XXX */
2143 	struct vfsconf *vfsp;
2144 
2145 #if 1 || defined(COMPAT_PRELITE2)
2146 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2147 	if (namelen == 1)
2148 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2149 #endif
2150 
2151 	/* XXX the code below does not compile; no vfs_sysctl vfsop exists. */
2152 #ifdef notyet
2153 	/* all sysctl names at this level are at least name and field */
2154 	if (namelen < 2)
2155 		return (ENOTDIR);		/* overloaded */
2156 	if (name[0] != VFS_GENERIC) {
2157 		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2158 			if (vfsp->vfc_typenum == name[0])
2159 				break;
2160 		if (vfsp == NULL)
2161 			return (EOPNOTSUPP);
2162 		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2163 		    oldp, oldlenp, newp, newlen, p));
2164 	}
2165 #endif
2166 	switch (name[1]) {
2167 	case VFS_MAXTYPENUM:
2168 		if (namelen != 2)
2169 			return (ENOTDIR);
2170 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2171 	case VFS_CONF:
2172 		if (namelen != 3)
2173 			return (ENOTDIR);	/* overloaded */
2174 		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2175 			if (vfsp->vfc_typenum == name[2])
2176 				break;
2177 		if (vfsp == NULL)
2178 			return (EOPNOTSUPP);
2179 		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2180 	}
2181 	return (EOPNOTSUPP);
2182 }
2183 
2184 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2185 	"Generic filesystem");
2186 
2187 #if 1 || defined(COMPAT_PRELITE2)
2188 
2189 static int
2190 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2191 {
2192 	int error;
2193 	struct vfsconf *vfsp;
2194 	struct ovfsconf ovfs;
2195 
2196 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2197 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2198 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2199 		ovfs.vfc_index = vfsp->vfc_typenum;
2200 		ovfs.vfc_refcount = vfsp->vfc_refcount;
2201 		ovfs.vfc_flags = vfsp->vfc_flags;
2202 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2203 		if (error)
2204 			return error;
2205 	}
2206 	return 0;
2207 }
2208 
2209 #endif /* 1 || COMPAT_PRELITE2 */
2210 
2211 #if COMPILING_LINT
2212 #define KINFO_VNODESLOP	10
2213 /*
2214  * Dump vnode list (via sysctl).
2215  * Copyout address of vnode followed by vnode.
2216  */
2217 /* ARGSUSED */
2218 static int
2219 sysctl_vnode(SYSCTL_HANDLER_ARGS)
2220 {
2221 	struct proc *p = curproc;	/* XXX */
2222 	struct mount *mp, *nmp;
2223 	struct vnode *nvp, *vp;
2224 	int error;
2225 
2226 #define VPTRSZ	sizeof (struct vnode *)
2227 #define VNODESZ	sizeof (struct vnode)
2228 
2229 	req->lock = 0;
2230 	if (!req->oldptr) /* Make an estimate */
2231 		return (SYSCTL_OUT(req, 0,
2232 			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2233 
2234 	mtx_lock(&mountlist_mtx);
2235 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2236 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) {
2237 			nmp = TAILQ_NEXT(mp, mnt_list);
2238 			continue;
2239 		}
2240 again:
2241 		mtx_lock(&mntvnode_mtx);
2242 		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
2243 		     vp != NULL;
2244 		     vp = nvp) {
2245 			/*
2246 			 * Check that the vp is still associated with
2247 			 * this filesystem.  RACE: could have been
2248 			 * recycled onto the same filesystem.
2249 			 */
2250 			if (vp->v_mount != mp) {
2251 				mtx_unlock(&mntvnode_mtx);
2252 				goto again;
2253 			}
2254 			nvp = LIST_NEXT(vp, v_mntvnodes);
2255 			mtx_unlock(&mntvnode_mtx);
2256 			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2257 			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
2258 				return (error);
2259 			mtx_lock(&mntvnode_mtx);
2260 		}
2261 		mtx_unlock(&mntvnode_mtx);
2262 		mtx_lock(&mountlist_mtx);
2263 		nmp = TAILQ_NEXT(mp, mnt_list);
2264 		vfs_unbusy(mp, p);
2265 	}
2266 	mtx_unlock(&mountlist_mtx);
2267 
2268 	return (0);
2269 }
2270 
2271 /*
2272  * XXX
2273  * Exporting the vnode list on large systems causes them to crash.
2274  * Exporting the vnode list on medium systems causes sysctl to coredump.
2275  */
2276 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2277 	0, 0, sysctl_vnode, "S,vnode", "");
2278 #endif
2279 
2280 /*
2281  * Check to see if a filesystem is mounted on a block device.
2282  */
2283 int
2284 vfs_mountedon(vp)
2285 	struct vnode *vp;
2286 {
2287 
2288 	if (vp->v_rdev->si_mountpoint != NULL)
2289 		return (EBUSY);
2290 	return (0);
2291 }
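
/*
 * Illustrative sketch (not part of the original file): a disk filesystem's
 * mount code typically rejects a device vnode that is already mounted or
 * still open elsewhere, combining vfs_mountedon() with vcount() roughly as
 * below.  The helper name is hypothetical and the exact policy (e.g.
 * special-casing the root device) varies per filesystem.
 */
#if 0	/* example only, never compiled */
static int
examplefs_check_devvp(struct vnode *devvp)
{
	int error;

	error = vfs_mountedon(devvp);
	if (error)
		return (error);			/* EBUSY */
	if (vcount(devvp) > 1)
		return (EBUSY);			/* opened by someone else */
	return (0);
}
#endif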
2292 
2293 /*
2294  * Unmount all filesystems. The list is traversed in reverse order
2295  * of mounting to avoid dependencies.
2296  */
2297 void
2298 vfs_unmountall()
2299 {
2300 	struct mount *mp;
2301 	struct proc *p;
2302 	int error;
2303 
2304 	if (curproc != NULL)
2305 		p = curproc;
2306 	else
2307 		p = initproc;	/* XXX XXX should this be proc0? */
2308 	/*
2309 	 * Since this only runs when rebooting, it is not interlocked.
2310 	 */
2311 	while(!TAILQ_EMPTY(&mountlist)) {
2312 		mp = TAILQ_LAST(&mountlist, mntlist);
2313 		error = dounmount(mp, MNT_FORCE, p);
2314 		if (error) {
2315 			TAILQ_REMOVE(&mountlist, mp, mnt_list);
2316 			printf("unmount of %s failed (",
2317 			    mp->mnt_stat.f_mntonname);
2318 			if (error == EBUSY)
2319 				printf("BUSY)\n");
2320 			else
2321 				printf("%d)\n", error);
2322 		} else {
2323 			/* The unmount has removed mp from the mountlist */
2324 		}
2325 	}
2326 }
2327 
2328 /*
2329  * Build hash lists of net addresses and hang them off the mount point.
2330  * Called by vfs_export() to set up the lists of export addresses.
2331  */
2332 static int
2333 vfs_hang_addrlist(mp, nep, argp)
2334 	struct mount *mp;
2335 	struct netexport *nep;
2336 	struct export_args *argp;
2337 {
2338 	register struct netcred *np;
2339 	register struct radix_node_head *rnh;
2340 	register int i;
2341 	struct radix_node *rn;
2342 	struct sockaddr *saddr, *smask = 0;
2343 	struct domain *dom;
2344 	int error;
2345 
2346 	if (argp->ex_addrlen == 0) {
2347 		if (mp->mnt_flag & MNT_DEFEXPORTED)
2348 			return (EPERM);
2349 		np = &nep->ne_defexported;
2350 		np->netc_exflags = argp->ex_flags;
2351 		bzero(&np->netc_anon, sizeof(np->netc_anon));
2352 		np->netc_anon.cr_uid = argp->ex_anon.cr_uid;
2353 		np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups;
2354 		bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups,
2355 		    sizeof(np->netc_anon.cr_groups));
2356 		np->netc_anon.cr_ref = 1;
2357 		mp->mnt_flag |= MNT_DEFEXPORTED;
2358 		return (0);
2359 	}
2360 	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2361 	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO);
2362 	saddr = (struct sockaddr *) (np + 1);
2363 	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2364 		goto out;
2365 	if (saddr->sa_len > argp->ex_addrlen)
2366 		saddr->sa_len = argp->ex_addrlen;
2367 	if (argp->ex_masklen) {
2368 		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2369 		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2370 		if (error)
2371 			goto out;
2372 		if (smask->sa_len > argp->ex_masklen)
2373 			smask->sa_len = argp->ex_masklen;
2374 	}
2375 	i = saddr->sa_family;
2376 	if ((rnh = nep->ne_rtable[i]) == 0) {
2377 		/*
2378		 * Seems silly to initialize every AF when most are not used;
2379		 * do so on demand here.
2380 		 */
2381 		for (dom = domains; dom; dom = dom->dom_next)
2382 			if (dom->dom_family == i && dom->dom_rtattach) {
2383 				dom->dom_rtattach((void **) &nep->ne_rtable[i],
2384 				    dom->dom_rtoffset);
2385 				break;
2386 			}
2387 		if ((rnh = nep->ne_rtable[i]) == 0) {
2388 			error = ENOBUFS;
2389 			goto out;
2390 		}
2391 	}
2392 	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2393 	    np->netc_rnodes);
2394 	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
2395 		error = EPERM;
2396 		goto out;
2397 	}
2398 	np->netc_exflags = argp->ex_flags;
2399 	bzero(&np->netc_anon, sizeof(np->netc_anon));
2400 	np->netc_anon.cr_uid = argp->ex_anon.cr_uid;
2401 	np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups;
2402 	bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups,
2403 	    sizeof(np->netc_anon.cr_groups));
2404 	np->netc_anon.cr_ref = 1;
2405 	return (0);
2406 out:
2407 	free(np, M_NETADDR);
2408 	return (error);
2409 }
2410 
2411 /* Helper for vfs_free_addrlist. */
2412 /* ARGSUSED */
2413 static int
2414 vfs_free_netcred(rn, w)
2415 	struct radix_node *rn;
2416 	void *w;
2417 {
2418 	register struct radix_node_head *rnh = (struct radix_node_head *) w;
2419 
2420 	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2421 	free((caddr_t) rn, M_NETADDR);
2422 	return (0);
2423 }
2424 
2425 /*
2426  * Free the net address hash lists that are hanging off the mount points.
2427  */
2428 static void
2429 vfs_free_addrlist(nep)
2430 	struct netexport *nep;
2431 {
2432 	register int i;
2433 	register struct radix_node_head *rnh;
2434 
2435 	for (i = 0; i <= AF_MAX; i++)
2436 		if ((rnh = nep->ne_rtable[i])) {
2437 			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2438 			    (caddr_t) rnh);
2439 			free((caddr_t) rnh, M_RTABLE);
2440 			nep->ne_rtable[i] = 0;
2441 		}
2442 }
2443 
2444 /*
2445  * High level function to manipulate export options on a mount point
2446  * and the passed in netexport.
2447  * Struct export_args *argp carries the export options to apply;
2448  * the structure is described in sys/mount.h.
2449  */
2450 int
2451 vfs_export(mp, nep, argp)
2452 	struct mount *mp;
2453 	struct netexport *nep;
2454 	struct export_args *argp;
2455 {
2456 	int error;
2457 
2458 	if (argp->ex_flags & MNT_DELEXPORT) {
2459 		if (mp->mnt_flag & MNT_EXPUBLIC) {
2460 			vfs_setpublicfs(NULL, NULL, NULL);
2461 			mp->mnt_flag &= ~MNT_EXPUBLIC;
2462 		}
2463 		vfs_free_addrlist(nep);
2464 		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2465 	}
2466 	if (argp->ex_flags & MNT_EXPORTED) {
2467 		if (argp->ex_flags & MNT_EXPUBLIC) {
2468 			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2469 				return (error);
2470 			mp->mnt_flag |= MNT_EXPUBLIC;
2471 		}
2472 		if ((error = vfs_hang_addrlist(mp, nep, argp)))
2473 			return (error);
2474 		mp->mnt_flag |= MNT_EXPORTED;
2475 	}
2476 	return (0);
2477 }
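
/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * VFS_MOUNT handling of an MNT_UPDATE request with export information
 * usually just hands the export_args to vfs_export() along with the
 * netexport kept in its per-mount data.  The examplefs_mount structure
 * and its ex_netexport field are hypothetical.
 */
#if 0	/* example only, never compiled */
static int
examplefs_update_exports(struct mount *mp, struct export_args *argp)
{
	struct examplefs_mount *xmp;

	xmp = (struct examplefs_mount *)mp->mnt_data;	/* hypothetical */
	return (vfs_export(mp, &xmp->ex_netexport, argp));
}
#endif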
2478 
2479 /*
2480  * Set the publicly exported filesystem (WebNFS). Currently, only
2481  * one public filesystem is possible in the spec (RFC 2054 and 2055)
2482  */
2483 int
2484 vfs_setpublicfs(mp, nep, argp)
2485 	struct mount *mp;
2486 	struct netexport *nep;
2487 	struct export_args *argp;
2488 {
2489 	int error;
2490 	struct vnode *rvp;
2491 	char *cp;
2492 
2493 	/*
2494 	 * mp == NULL -> invalidate the current info, the FS is
2495 	 * no longer exported. May be called from either vfs_export
2496 	 * or unmount, so check if it hasn't already been done.
2497 	 */
2498 	if (mp == NULL) {
2499 		if (nfs_pub.np_valid) {
2500 			nfs_pub.np_valid = 0;
2501 			if (nfs_pub.np_index != NULL) {
2502 				FREE(nfs_pub.np_index, M_TEMP);
2503 				nfs_pub.np_index = NULL;
2504 			}
2505 		}
2506 		return (0);
2507 	}
2508 
2509 	/*
2510 	 * Only one allowed at a time.
2511 	 */
2512 	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2513 		return (EBUSY);
2514 
2515 	/*
2516 	 * Get real filehandle for root of exported FS.
2517 	 */
2518 	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2519 	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2520 
2521 	if ((error = VFS_ROOT(mp, &rvp)))
2522 		return (error);
2523 
2524 	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2525 		return (error);
2526 
2527 	vput(rvp);
2528 
2529 	/*
2530 	 * If an indexfile was specified, pull it in.
2531 	 */
2532 	if (argp->ex_indexfile != NULL) {
2533 		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2534 		    M_WAITOK);
2535 		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2536 		    MAXNAMLEN, (size_t *)0);
2537 		if (!error) {
2538 			/*
2539 			 * Check for illegal filenames.
2540 			 */
2541 			for (cp = nfs_pub.np_index; *cp; cp++) {
2542 				if (*cp == '/') {
2543 					error = EINVAL;
2544 					break;
2545 				}
2546 			}
2547 		}
2548 		if (error) {
2549 			FREE(nfs_pub.np_index, M_TEMP);
2550 			return (error);
2551 		}
2552 	}
2553 
2554 	nfs_pub.np_mount = mp;
2555 	nfs_pub.np_valid = 1;
2556 	return (0);
2557 }
2558 
2559 /*
2560  * Used by the filesystems to determine if a given network address
2561  * (passed in 'nam') is present in their exports list; returns a pointer
2562  * to struct netcred so that the filesystem can examine it for
2563  * access rights (read/write/etc).
2564  */
2565 struct netcred *
2566 vfs_export_lookup(mp, nep, nam)
2567 	register struct mount *mp;
2568 	struct netexport *nep;
2569 	struct sockaddr *nam;
2570 {
2571 	register struct netcred *np;
2572 	register struct radix_node_head *rnh;
2573 	struct sockaddr *saddr;
2574 
2575 	np = NULL;
2576 	if (mp->mnt_flag & MNT_EXPORTED) {
2577 		/*
2578 		 * Lookup in the export list first.
2579 		 */
2580 		if (nam != NULL) {
2581 			saddr = nam;
2582 			rnh = nep->ne_rtable[saddr->sa_family];
2583 			if (rnh != NULL) {
2584 				np = (struct netcred *)
2585 					(*rnh->rnh_matchaddr)((caddr_t)saddr,
2586 							      rnh);
2587 				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2588 					np = NULL;
2589 			}
2590 		}
2591 		/*
2592 		 * If no address match, use the default if it exists.
2593 		 */
2594 		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2595 			np = &nep->ne_defexported;
2596 	}
2597 	return (np);
2598 }
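
/*
 * Illustrative sketch (not part of the original file): at NFS request time
 * a filesystem's export check calls vfs_export_lookup() with the client's
 * address and hands back the export flags and anonymous credentials from
 * the returned netcred.  The helper name is hypothetical; netc_exflags and
 * netc_anon are the fields set up by vfs_hang_addrlist() above.
 */
#if 0	/* example only, never compiled */
static int
examplefs_check_export(struct mount *mp, struct netexport *nep,
    struct sockaddr *nam, int *exflagsp, struct ucred **credanonp)
{
	struct netcred *np;

	np = vfs_export_lookup(mp, nep, nam);
	if (np == NULL)
		return (EACCES);	/* client not covered by the exports */
	*exflagsp = np->netc_exflags;
	*credanonp = &np->netc_anon;
	return (0);
}
#endif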
2599 
2600 /*
2601  * Perform msync on all vnodes under a mount point.
2602  * The mount point must be locked.
2603  */
2604 void
2605 vfs_msync(struct mount *mp, int flags) {
2606 	struct vnode *vp, *nvp;
2607 	struct vm_object *obj;
2608 	int anyio, tries;
2609 
2610 	tries = 5;
2611 loop:
2612 	anyio = 0;
2613 	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {
2614 
2615 		nvp = LIST_NEXT(vp, v_mntvnodes);
2616 
2617 		if (vp->v_mount != mp) {
2618 			goto loop;
2619 		}
2620 
2621 		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
2622 			continue;
2623 
2624 		if (flags != MNT_WAIT) {
2625 			if (VOP_GETVOBJECT(vp, &obj) != 0 ||
2626 			    (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2627 				continue;
2628 			if (VOP_ISLOCKED(vp, NULL))
2629 				continue;
2630 		}
2631 
2632 		mtx_lock(&vp->v_interlock);
2633 		if (VOP_GETVOBJECT(vp, &obj) == 0 &&
2634 		    (obj->flags & OBJ_MIGHTBEDIRTY)) {
2635 			if (!vget(vp,
2636 				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2637 				if (VOP_GETVOBJECT(vp, &obj) == 0) {
2638 					vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
2639 					anyio = 1;
2640 				}
2641 				vput(vp);
2642 			}
2643 		} else {
2644 			mtx_unlock(&vp->v_interlock);
2645 		}
2646 	}
2647 	if (anyio && (--tries > 0))
2648 		goto loop;
2649 }
2650 
2651 /*
2652  * Create the VM object needed for VMIO and mmap support.  This
2653  * is done for all VREG files in the system.  Some filesystems might
2654  * afford the additional metadata buffering capability of the
2655  * VMIO code by making the device node be VMIO mode also.
2656  *
2657  * vp must be locked when vfs_object_create is called.
2658  */
2659 int
2660 vfs_object_create(vp, p, cred)
2661 	struct vnode *vp;
2662 	struct proc *p;
2663 	struct ucred *cred;
2664 {
2665 	return (VOP_CREATEVOBJECT(vp, cred, p));
2666 }
2667 
2668 /*
2669  * Mark a vnode as free, putting it up for recycling.
2670  */
2671 void
2672 vfree(vp)
2673 	struct vnode *vp;
2674 {
2675 	int s;
2676 
2677 	s = splbio();
2678 	mtx_lock(&vnode_free_list_mtx);
2679 	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
2680 	if (vp->v_flag & VAGE) {
2681 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2682 	} else {
2683 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2684 	}
2685 	freevnodes++;
2686 	mtx_unlock(&vnode_free_list_mtx);
2687 	vp->v_flag &= ~VAGE;
2688 	vp->v_flag |= VFREE;
2689 	splx(s);
2690 }
2691 
2692 /*
2693  * Opposite of vfree() - mark a vnode as in use.
2694  */
2695 void
2696 vbusy(vp)
2697 	struct vnode *vp;
2698 {
2699 	int s;
2700 
2701 	s = splbio();
2702 	mtx_lock(&vnode_free_list_mtx);
2703 	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
2704 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2705 	freevnodes--;
2706 	mtx_unlock(&vnode_free_list_mtx);
2707 	vp->v_flag &= ~(VFREE|VAGE);
2708 	splx(s);
2709 }
2710 
2711 /*
2712  * Record a process's interest in events which might happen to
2713  * a vnode.  Because poll uses the historic select-style interface
2714  * internally, this routine serves as both the ``check for any
2715  * pending events'' and the ``record my interest in future events''
2716  * functions.  (These are done together, while the lock is held,
2717  * to avoid race conditions.)
2718  */
2719 int
2720 vn_pollrecord(vp, p, events)
2721 	struct vnode *vp;
2722 	struct proc *p;
2723 	short events;
2724 {
2725 	mtx_lock(&vp->v_pollinfo.vpi_lock);
2726 	if (vp->v_pollinfo.vpi_revents & events) {
2727 		/*
2728 		 * This leaves events we are not interested
2729 		 * in available for the other process which
2730		 * presumably had requested them
2731 		 * (otherwise they would never have been
2732 		 * recorded).
2733 		 */
2734 		events &= vp->v_pollinfo.vpi_revents;
2735 		vp->v_pollinfo.vpi_revents &= ~events;
2736 
2737 		mtx_unlock(&vp->v_pollinfo.vpi_lock);
2738 		return events;
2739 	}
2740 	vp->v_pollinfo.vpi_events |= events;
2741 	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2742 	mtx_unlock(&vp->v_pollinfo.vpi_lock);
2743 	return 0;
2744 }
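
/*
 * Illustrative sketch (not part of the original file): a filesystem's poll
 * routine normally just forwards to vn_pollrecord().  The helper below
 * takes the vnode, events and proc directly rather than the real
 * vop_poll_args structure, purely to keep the sketch short.
 */
#if 0	/* example only, never compiled */
static int
examplefs_poll(struct vnode *vp, short events, struct proc *p)
{

	/* Report pending events and record interest in future ones. */
	return (vn_pollrecord(vp, p, events));
}
#endif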
2745 
2746 /*
2747  * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
2748  * it is possible for us to miss an event due to race conditions, but
2749  * that condition is expected to be rare, so for the moment it is the
2750  * preferred interface.
2751  */
2752 void
2753 vn_pollevent(vp, events)
2754 	struct vnode *vp;
2755 	short events;
2756 {
2757 	mtx_lock(&vp->v_pollinfo.vpi_lock);
2758 	if (vp->v_pollinfo.vpi_events & events) {
2759 		/*
2760 		 * We clear vpi_events so that we don't
2761 		 * call selwakeup() twice if two events are
2762 		 * posted before the polling process(es) is
2763 		 * awakened.  This also ensures that we take at
2764 		 * most one selwakeup() if the polling process
2765 		 * is no longer interested.  However, it does
2766 		 * mean that only one event can be noticed at
2767 		 * a time.  (Perhaps we should only clear those
2768 		 * event bits which we note?) XXX
2769 		 */
2770 		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
2771 		vp->v_pollinfo.vpi_revents |= events;
2772 		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2773 	}
2774 	mtx_unlock(&vp->v_pollinfo.vpi_lock);
2775 }
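
/*
 * Illustrative sketch (not part of the original file): code that changes a
 * vnode's state notifies pollers by passing the poll events that just
 * became true.  The helper name is hypothetical; POLLIN and POLLRDNORM
 * come from <sys/poll.h>.
 */
#if 0	/* example only, never compiled */
static void
examplefs_note_append(struct vnode *vp)
{

	/* Data was appended; wake anyone waiting for readability. */
	vn_pollevent(vp, POLLIN | POLLRDNORM);
}
#endif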
2776 
2777 #define VN_KNOTE(vp, b) \
2778 	KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))
2779 
2780 /*
2781  * Wake up anyone polling on vp because it is being revoked.
2782  * This depends on dead_poll() returning POLLHUP for correct
2783  * behavior.
2784  */
2785 void
2786 vn_pollgone(vp)
2787 	struct vnode *vp;
2788 {
2789 	mtx_lock(&vp->v_pollinfo.vpi_lock);
2790	VN_KNOTE(vp, NOTE_REVOKE);
2791 	if (vp->v_pollinfo.vpi_events) {
2792 		vp->v_pollinfo.vpi_events = 0;
2793 		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2794 	}
2795 	mtx_unlock(&vp->v_pollinfo.vpi_lock);
2796 }
2797 
2798 
2799 
2800 /*
2801  * Routine to create and manage a filesystem syncer vnode.
2802  */
2803 #define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
2804 static int	sync_fsync __P((struct  vop_fsync_args *));
2805 static int	sync_inactive __P((struct  vop_inactive_args *));
2806 static int	sync_reclaim  __P((struct  vop_reclaim_args *));
2807 #define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
2808 #define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
2809 static int	sync_print __P((struct vop_print_args *));
2810 #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2811 
2812 static vop_t **sync_vnodeop_p;
2813 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2814 	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
2815 	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
2816 	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
2817 	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
2818 	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
2819 	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
2820 	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
2821 	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
2822 	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
2823 	{ NULL, NULL }
2824 };
2825 static struct vnodeopv_desc sync_vnodeop_opv_desc =
2826 	{ &sync_vnodeop_p, sync_vnodeop_entries };
2827 
2828 VNODEOP_SET(sync_vnodeop_opv_desc);
2829 
2830 /*
2831  * Create a new filesystem syncer vnode for the specified mount point.
2832  */
2833 int
2834 vfs_allocate_syncvnode(mp)
2835 	struct mount *mp;
2836 {
2837 	struct vnode *vp;
2838 	static long start, incr, next;
2839 	int error;
2840 
2841 	/* Allocate a new vnode */
2842 	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2843 		mp->mnt_syncer = NULL;
2844 		return (error);
2845 	}
2846 	vp->v_type = VNON;
2847 	/*
2848 	 * Place the vnode onto the syncer worklist. We attempt to
2849 	 * scatter them about on the list so that they will go off
2850 	 * at evenly distributed times even if all the filesystems
2851 	 * are mounted at once.
2852 	 */
2853 	next += incr;
2854 	if (next == 0 || next > syncer_maxdelay) {
2855 		start /= 2;
2856 		incr /= 2;
2857 		if (start == 0) {
2858 			start = syncer_maxdelay / 2;
2859 			incr = syncer_maxdelay;
2860 		}
2861 		next = start;
2862 	}
2863 	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2864 	mp->mnt_syncer = vp;
2865 	return (0);
2866 }
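
/*
 * Illustrative sketch (not part of the original file): the mount path gives
 * each writable filesystem a syncer vnode so that sync_fsync() below is run
 * for it periodically.  The helper name and the MNT_RDONLY policy shown
 * here are hypothetical simplifications.
 */
#if 0	/* example only, never compiled */
static void
example_attach_syncer(struct mount *mp)
{

	/* Read-only filesystems have nothing to flush periodically. */
	if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
		(void) vfs_allocate_syncvnode(mp);
}
#endif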
2867 
2868 /*
2869  * Do a lazy sync of the filesystem.
2870  */
2871 static int
2872 sync_fsync(ap)
2873 	struct vop_fsync_args /* {
2874 		struct vnode *a_vp;
2875 		struct ucred *a_cred;
2876 		int a_waitfor;
2877 		struct proc *a_p;
2878 	} */ *ap;
2879 {
2880 	struct vnode *syncvp = ap->a_vp;
2881 	struct mount *mp = syncvp->v_mount;
2882 	struct proc *p = ap->a_p;
2883 	int asyncflag;
2884 
2885 	/*
2886 	 * We only need to do something if this is a lazy evaluation.
2887 	 */
2888 	if (ap->a_waitfor != MNT_LAZY)
2889 		return (0);
2890 
2891 	/*
2892 	 * Move ourselves to the back of the sync list.
2893 	 */
2894 	vn_syncer_add_to_worklist(syncvp, syncdelay);
2895 
2896 	/*
2897 	 * Walk the list of vnodes pushing all that are dirty and
2898 	 * not already on the sync list.
2899 	 */
2900 	mtx_lock(&mountlist_mtx);
2901 	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) {
2902 		mtx_unlock(&mountlist_mtx);
2903 		return (0);
2904 	}
2905 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
2906 		vfs_unbusy(mp, p);
2907 		return (0);
2908 	}
2909 	asyncflag = mp->mnt_flag & MNT_ASYNC;
2910 	mp->mnt_flag &= ~MNT_ASYNC;
2911 	vfs_msync(mp, MNT_NOWAIT);
2912 	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2913 	if (asyncflag)
2914 		mp->mnt_flag |= MNT_ASYNC;
2915 	vn_finished_write(mp);
2916 	vfs_unbusy(mp, p);
2917 	return (0);
2918 }
2919 
2920 /*
2921  * The syncer vnode is no longer referenced.
2922  */
2923 static int
2924 sync_inactive(ap)
2925 	struct vop_inactive_args /* {
2926 		struct vnode *a_vp;
2927 		struct proc *a_p;
2928 	} */ *ap;
2929 {
2930 
2931 	vgone(ap->a_vp);
2932 	return (0);
2933 }
2934 
2935 /*
2936  * The syncer vnode is no longer needed and is being decommissioned.
2937  *
2938  * Modifications to the worklist must be protected at splbio().
2939  */
2940 static int
2941 sync_reclaim(ap)
2942 	struct vop_reclaim_args /* {
2943 		struct vnode *a_vp;
2944 	} */ *ap;
2945 {
2946 	struct vnode *vp = ap->a_vp;
2947 	int s;
2948 
2949 	s = splbio();
2950 	vp->v_mount->mnt_syncer = NULL;
2951 	if (vp->v_flag & VONWORKLST) {
2952 		LIST_REMOVE(vp, v_synclist);
2953 		vp->v_flag &= ~VONWORKLST;
2954 	}
2955 	splx(s);
2956 
2957 	return (0);
2958 }
2959 
2960 /*
2961  * Print out a syncer vnode.
2962  */
2963 static int
2964 sync_print(ap)
2965 	struct vop_print_args /* {
2966 		struct vnode *a_vp;
2967 	} */ *ap;
2968 {
2969 	struct vnode *vp = ap->a_vp;
2970 
2971 	printf("syncer vnode");
2972 	if (vp->v_vnlock != NULL)
2973 		lockmgr_printinfo(vp->v_vnlock);
2974 	printf("\n");
2975 	return (0);
2976 }
2977 
2978 /*
2979  * extract the dev_t from a VCHR
2980  */
2981 dev_t
2982 vn_todev(vp)
2983 	struct vnode *vp;
2984 {
2985 	if (vp->v_type != VCHR)
2986 		return (NODEV);
2987 	return (vp->v_rdev);
2988 }
2989 
2990 /*
2991  * Check if vnode represents a disk device
2992  */
2993 int
2994 vn_isdisk(vp, errp)
2995 	struct vnode *vp;
2996 	int *errp;
2997 {
2998 	struct cdevsw *cdevsw;
2999 
3000 	if (vp->v_type != VCHR) {
3001 		if (errp != NULL)
3002 			*errp = ENOTBLK;
3003 		return (0);
3004 	}
3005 	if (vp->v_rdev == NULL) {
3006 		if (errp != NULL)
3007 			*errp = ENXIO;
3008 		return (0);
3009 	}
3010 	cdevsw = devsw(vp->v_rdev);
3011 	if (cdevsw == NULL) {
3012 		if (errp != NULL)
3013 			*errp = ENXIO;
3014 		return (0);
3015 	}
3016 	if (!(cdevsw->d_flags & D_DISK)) {
3017 		if (errp != NULL)
3018 			*errp = ENOTBLK;
3019 		return (0);
3020 	}
3021 	if (errp != NULL)
3022 		*errp = 0;
3023 	return (1);
3024 }
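
/*
 * Illustrative sketch (not part of the original file): mount and swap code
 * uses vn_isdisk() to reject vnodes that do not name a disk device,
 * propagating the errno it reports.  The helper name is hypothetical.
 */
#if 0	/* example only, never compiled */
static int
example_require_disk(struct vnode *devvp)
{
	int error;

	if (!vn_isdisk(devvp, &error))
		return (error);		/* ENOTBLK or ENXIO */
	return (0);
}
#endif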
3025 
3026 /*
3027  * Free data allocated by namei(); see namei(9) for details.
3028  */
3029 void
3030 NDFREE(ndp, flags)
3031      struct nameidata *ndp;
3032      const uint flags;
3033 {
3034 	if (!(flags & NDF_NO_FREE_PNBUF) &&
3035 	    (ndp->ni_cnd.cn_flags & HASBUF)) {
3036 		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
3037 		ndp->ni_cnd.cn_flags &= ~HASBUF;
3038 	}
3039 	if (!(flags & NDF_NO_DVP_UNLOCK) &&
3040 	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
3041 	    ndp->ni_dvp != ndp->ni_vp)
3042 		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
3043 	if (!(flags & NDF_NO_DVP_RELE) &&
3044 	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
3045 		vrele(ndp->ni_dvp);
3046 		ndp->ni_dvp = NULL;
3047 	}
3048 	if (!(flags & NDF_NO_VP_UNLOCK) &&
3049 	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
3050 		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
3051 	if (!(flags & NDF_NO_VP_RELE) &&
3052 	    ndp->ni_vp) {
3053 		vrele(ndp->ni_vp);
3054 		ndp->ni_vp = NULL;
3055 	}
3056 	if (!(flags & NDF_NO_STARTDIR_RELE) &&
3057 	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
3058 		vrele(ndp->ni_startdir);
3059 		ndp->ni_startdir = NULL;
3060 	}
3061 }
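
/*
 * Illustrative sketch (not part of the original file): a namei() consumer
 * that keeps the resulting vnode releases only the pathname buffer with
 * NDF_ONLY_PNBUF.  The helper name is hypothetical; NDINIT, namei() and
 * the NDF_* flags come from <sys/namei.h>.
 */
#if 0	/* example only, never compiled */
static int
example_lookup_path(char *path, struct proc *p, struct vnode **vpp)
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	/* Keep the locked, referenced vnode; free just the path buffer. */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	*vpp = nd.ni_vp;
	return (0);
}
#endif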
3062 
3063 /*
3064  * Common file system object access control check routine.  Accepts a
3065  * vnode's type, "mode", uid and gid, requested access mode, credentials,
3066  * and optional call-by-reference privused argument allowing vaccess()
3067  * to indicate to the caller whether privilege was used to satisfy the
3068  * request.  Returns 0 on success, or an errno on failure.
3069  */
3070 int
3071 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3072 	enum vtype type;
3073 	mode_t file_mode;
3074 	uid_t file_uid;
3075 	gid_t file_gid;
3076 	mode_t acc_mode;
3077 	struct ucred *cred;
3078 	int *privused;
3079 {
3080 	mode_t dac_granted;
3081 #ifdef CAPABILITIES
3082 	mode_t cap_granted;
3083 #endif
3084 
3085 	/*
3086 	 * Look for a normal, non-privileged way to access the file/directory
3087 	 * as requested.  If it exists, go with that.
3088 	 */
3089 
3090 	if (privused != NULL)
3091 		*privused = 0;
3092 
3093 	dac_granted = 0;
3094 
3095 	/* Check the owner. */
3096 	if (cred->cr_uid == file_uid) {
3097 		dac_granted |= VADMIN;
3098 		if (file_mode & S_IXUSR)
3099 			dac_granted |= VEXEC;
3100 		if (file_mode & S_IRUSR)
3101 			dac_granted |= VREAD;
3102 		if (file_mode & S_IWUSR)
3103 			dac_granted |= VWRITE;
3104 
3105 		if ((acc_mode & dac_granted) == acc_mode)
3106 			return (0);
3107 
3108 		goto privcheck;
3109 	}
3110 
3111 	/* Otherwise, check the groups (first match) */
3112 	if (groupmember(file_gid, cred)) {
3113 		if (file_mode & S_IXGRP)
3114 			dac_granted |= VEXEC;
3115 		if (file_mode & S_IRGRP)
3116 			dac_granted |= VREAD;
3117 		if (file_mode & S_IWGRP)
3118 			dac_granted |= VWRITE;
3119 
3120 		if ((acc_mode & dac_granted) == acc_mode)
3121 			return (0);
3122 
3123 		goto privcheck;
3124 	}
3125 
3126 	/* Otherwise, check everyone else. */
3127 	if (file_mode & S_IXOTH)
3128 		dac_granted |= VEXEC;
3129 	if (file_mode & S_IROTH)
3130 		dac_granted |= VREAD;
3131 	if (file_mode & S_IWOTH)
3132 		dac_granted |= VWRITE;
3133 	if ((acc_mode & dac_granted) == acc_mode)
3134 		return (0);
3135 
3136 privcheck:
3137 	if (!suser_xxx(cred, NULL, PRISON_ROOT)) {
3138 		/* XXX audit: privilege used */
3139 		if (privused != NULL)
3140 			*privused = 1;
3141 		return (0);
3142 	}
3143 
3144 #ifdef CAPABILITIES
3145 	/*
3146 	 * Build a capability mask to determine if the set of capabilities
3147 	 * satisfies the requirements when combined with the granted mask
3148 	 * from above.
3149 	 * For each capability, if the capability is required, bitwise
3150 	 * or the request type onto the cap_granted mask.
3151 	 */
3152 	cap_granted = 0;
3153 	if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3154 	    !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3155 	    cap_granted |= VEXEC;
3156 
3157 	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3158 	    !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3159 		cap_granted |= VREAD;
3160 
3161 	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3162 	    !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3163 		cap_granted |= VWRITE;
3164 
3165 	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3166 	    !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3167 		cap_granted |= VADMIN;
3168 
3169 	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3170 		/* XXX audit: privilege used */
3171 		if (privused != NULL)
3172 			*privused = 1;
3173 		return (0);
3174 	}
3175 #endif
3176 
3177 	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3178 }
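
/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * access routine collects its notion of type, mode, uid and gid and lets
 * vaccess() make the policy decision.  The examplefs_node structure and
 * its xn_* fields are hypothetical stand-ins for per-filesystem metadata.
 */
#if 0	/* example only, never compiled */
static int
examplefs_access(struct vnode *vp, mode_t acc_mode, struct ucred *cred,
    struct proc *p)
{
	struct examplefs_node *xp;

	xp = (struct examplefs_node *)vp->v_data;	/* hypothetical */
	return (vaccess(vp->v_type, xp->xn_mode, xp->xn_uid, xp->xn_gid,
	    acc_mode, cred, NULL));
}
#endif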
3179