/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $Id$
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"
#include "opt_devfs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <sys/sysctl.h>

#include <miscfs/specfs/specdev.h>

#ifdef DDB
extern void	printlockedvnodes __P((void));
#endif
static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
extern void	vgonel __P((struct vnode *vp, struct proc *p));
unsigned long	numvnodes;
extern void	vfs_unmountroot __P((struct mount *rootfs));

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}
TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
static u_long freevnodes = 0;

struct mntlist mountlist;	/* mounted filesystem list */
struct simplelock mountlist_slock;
static struct simplelock mntid_slock;
struct simplelock mntvnode_slock;
struct simplelock vnode_free_list_slock;
static struct simplelock spechash_slock;

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	desiredvnodes = maxproc + vm_object_cache_max;
	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_free_list);
	simple_lock_init(&vnode_free_list_slock);
	CIRCLEQ_INIT(&mountlist);
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_flag & MNT_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_flag |= MNT_MWAIT;
		if (interlkp) {
			simple_unlock(interlkp);
		}
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
		if (interlkp) {
			simple_lock(interlkp);
		}
		return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

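/*
 * Example: the usual pattern for walking the mount list brackets each
 * mount point with vfs_busy()/vfs_unbusy(), as sysctl_vnode() does
 * later in this file.  Illustrative sketch only ("examine_mount" is a
 * hypothetical helper, not part of this file):
 *
 *	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
 *		nmp = mp->mnt_list.cqe_next;
 *		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p))
 *			continue;
 *		examine_mount(mp);
 *		vfs_unbusy(mp, p);
 *	}
 */
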
187  * Lookup a filesystem type, and if found allocate and initialize
188  * a mount structure for it.
189  *
190  * Devname is usually updated by mount(8) after booting.
191  */
192 int
193 vfs_rootmountalloc(fstypename, devname, mpp)
194 	char *fstypename;
195 	char *devname;
196 	struct mount **mpp;
197 {
198 	struct proc *p = curproc;	/* XXX */
199 	struct vfsconf *vfsp;
200 	struct mount *mp;
201 
202 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
203 		if (!strcmp(vfsp->vfc_name, fstypename))
204 			break;
205 	if (vfsp == NULL)
206 		return (ENODEV);
207 	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
208 	bzero((char *)mp, (u_long)sizeof(struct mount));
209 	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
210 	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
211 	LIST_INIT(&mp->mnt_vnodelist);
212 	mp->mnt_vfc = vfsp;
213 	mp->mnt_op = vfsp->vfc_vfsops;
214 	mp->mnt_flag = MNT_RDONLY;
215 	mp->mnt_vnodecovered = NULLVP;
216 	vfsp->vfc_refcount++;
217 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
218 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
219 	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
220 	mp->mnt_stat.f_mntonname[0] = '/';
221 	mp->mnt_stat.f_mntonname[1] = 0;
222 	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
223 	*mpp = mp;
224 	return (0);
225 }
226 
227 /*
228  * Find an appropriate filesystem to use for the root. If a filesystem
229  * has not been preselected, walk through the list of known filesystems
230  * trying those that have mountroot routines, and try them until one
231  * works or we have tried them all.
232  */
233 #ifdef notdef	/* XXX JH */
234 int
235 lite2_vfs_mountroot(void)
236 {
237 	struct vfsconf *vfsp;
238 	extern int (*lite2_mountroot)(void);
239 	int error;
240 
241 	if (lite2_mountroot != NULL)
242 		return ((*lite2_mountroot)());
243 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
244 		if (vfsp->vfc_mountroot == NULL)
245 			continue;
246 		if ((error = (*vfsp->vfc_mountroot)()) == 0)
247 			return (0);
248 		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
249 	}
250 	return (ENODEV);
251 }
252 #endif
253 
254 /*
255  * Lookup a mount point by filesystem identifier.
256  */
257 struct mount *
258 vfs_getvfs(fsid)
259 	fsid_t *fsid;
260 {
261 	register struct mount *mp;
262 
263 	simple_lock(&mountlist_slock);
264 	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
265 	    mp = mp->mnt_list.cqe_next) {
266 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
267 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
268 			simple_unlock(&mountlist_slock);
269 			return (mp);
270 	    }
271 	}
272 	simple_unlock(&mountlist_slock);
273 	return ((struct mount *) 0);
274 }
275 
276 /*
277  * Get a new unique fsid
278  */
279 void
280 vfs_getnewfsid(mp)
281 	struct mount *mp;
282 {
283 	static u_short xxxfs_mntid;
284 
285 	fsid_t tfsid;
286 	int mtype;
287 
288 	simple_lock(&mntid_slock);
289 	mtype = mp->mnt_vfc->vfc_typenum;
290 	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
291 	mp->mnt_stat.f_fsid.val[1] = mtype;
292 	if (xxxfs_mntid == 0)
293 		++xxxfs_mntid;
294 	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
295 	tfsid.val[1] = mtype;
296 	if (mountlist.cqh_first != (void *)&mountlist) {
297 		while (vfs_getvfs(&tfsid)) {
298 			tfsid.val[0]++;
299 			xxxfs_mntid++;
300 		}
301 	}
302 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
303 	simple_unlock(&mntid_slock);
304 }
305 
306 /*
307  * Set vnode attributes to VNOVAL
308  */
309 void
310 vattr_null(vap)
311 	register struct vattr *vap;
312 {
313 
314 	vap->va_type = VNON;
315 	vap->va_size = VNOVAL;
316 	vap->va_bytes = VNOVAL;
317 	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
318 	    vap->va_fsid = vap->va_fileid =
319 	    vap->va_blocksize = vap->va_rdev =
320 	    vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
321 	    vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
322 	    vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
323 	    vap->va_flags = vap->va_gen = VNOVAL;
324 	vap->va_vaflags = 0;
325 }
326 
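/*
 * Example: callers preparing a VOP_SETATTR() start from a vattr with
 * every field "unset" so that only the fields they explicitly assign
 * take effect.  Illustrative sketch only (vp, cred and p are assumed
 * to be in scope):
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);
 *	va.va_size = 0;
 *	error = VOP_SETATTR(vp, &va, cred, p);
 */
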
/*
 * Forcibly unmount the root filesystem.  Called from vfs_unmountall()
 * at shutdown.
 */
void
vfs_unmountroot(struct mount *rootfs)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp = rootfs;
	int error;

	if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
		printf("failed to unmount root\n");
		return;
	}
	mp->mnt_flag |= MNT_UNMOUNT;
	vnode_pager_umount(mp);	/* release cached vnodes */
	cache_purgevfs(mp);	/* remove cache entries for this file sys */

	if ((error = VFS_SYNC(mp, MNT_WAIT, initproc->p_ucred, initproc)))
		printf("sync of root filesystem failed (%d)\n", error);

	if ((error = VFS_UNMOUNT(mp, MNT_FORCE, initproc))) {
		printf("unmount of root filesystem failed (");
		if (error == EBUSY)
			printf("BUSY)\n");
		else
			printf("%d)\n", error);
	}
	mp->mnt_flag &= ~MNT_UNMOUNT;
	vfs_unbusy(mp, p);
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;

	simple_lock(&vnode_free_list_slock);
retry:
	/*
	 * We allocate a new vnode if
	 *	1. we don't have any free
	 *		Pretty obvious, we actually used to panic, but that
	 *		is a silly thing to do.
	 *	2. we haven't filled our pool yet
	 *		We don't want to trash the incore (VM-)vnodecache.
	 *	3. less than 1/4th of our vnodes are free.
	 *		We don't want to trash the namei cache either.
	 */
	if (freevnodes < (numvnodes >> 2) ||
	    numvnodes < desiredvnodes ||
	    vnode_free_list.tqh_first == NULL) {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) malloc((u_long) sizeof *vp,
		    M_VNODE, M_WAITOK);
		bzero((char *) vp, sizeof *vp);
		numvnodes++;
	} else {
		for (vp = vnode_free_list.tqh_first;
				vp != NULLVP; vp = vp->v_freelist.tqe_next) {
			if (simple_lock_try(&vp->v_interlock))
				break;
		}
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			simple_unlock(&vnode_free_list_slock);
			tablefull("vnode");
			*vpp = 0;
			return (ENFILE);
		}
		if (vp->v_usecount)
			panic("free vnode isn't");
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		if (vp->v_usage > 0) {
			--vp->v_usage;
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			goto retry;
		}
		freevnodes--;

		/* see comment on why 0xdeadb is set at end of vgone (below) */
		vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb;
		simple_unlock(&vnode_free_list_slock);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD)
			vgonel(vp, p);
		else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef DIAGNOSTIC
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
		vp->v_usage = 0;
	}
	vp->v_type = VNON;
	cache_purge(vp);
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	return (0);
}

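/*
 * Example: a filesystem's routine for materializing an in-core file
 * typically obtains its vnode here.  Illustrative sketch only; the
 * vnode operations vector "myfs_vnodeop_p" and private data "ip" are
 * hypothetical stand-ins for a real filesystem's equivalents:
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	if ((error = getnewvnode(VT_UFS, mp, myfs_vnodeop_p, &vp)) != 0)
 *		return (error);
 *	vp->v_data = ip;
 */
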
/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
			return (error);
		if (vp->v_dirtyblkhd.lh_first != NULL)
			panic("vinvalbuf: dirty bufs");
	}

	s = splbio();
	for (;;) {
		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = bp->b_vnbufs.le_next;
			if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
				continue;
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				error = tsleep((caddr_t) bp,
				    slpflag | (PRIBIO + 1), "vinvalbuf",
				    slptimeo);
				splx(s);
				if (error)
					return (error);
				break;
			}
			bremfree(bp);
			bp->b_flags |= B_BUSY;
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.
			 */
			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
				(void) VOP_BWRITE(bp);
				break;
			}
			bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF);
			brelse(bp);
		}
	}
	splx(s);

	s = splbio();
	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}
	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, object->size,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	if (!(flags & V_SAVEMETA) &&
	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
		panic("vinvalbuf: flush failed");
	return (0);
}

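/*
 * Example: vclean() below uses the V_SAVE flavor to write dirty
 * buffers back before invalidating, while a truncate-to-zero path
 * would simply discard them.  Illustrative sketch only:
 *
 *	vinvalbuf(vp, V_SAVE, cred, p, 0, 0);	flush dirty bufs, then toss
 *	vinvalbuf(vp, 0, cred, p, 0, 0);	throw everything away
 */
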
/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	if (bp->b_vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bufinsvn(bp, &vp->v_cleanblkhd);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	int s;

	if (bp->b_vp == (struct vnode *) 0)
		panic("brelvp: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	s = splbio();
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	splx(s);

	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Associate a p-buffer with a vnode.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	if (bp->b_vp)
		panic("pbgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;

	if (bp->b_vp == (struct vnode *) 0)
		panic("pbrelvp: NULL");

	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL\n");
		return;
	}

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		tbp = newvp->v_dirtyblkhd.lh_first;
		if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) {
			bufinsvn(bp, &newvp->v_dirtyblkhd);
		} else {
			while (tbp->b_vnbufs.le_next &&
				(tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
				tbp = tbp->b_vnbufs.le_next;
			}
			LIST_INSERT_AFTER(tbp, bp, b_vnbufs);
		}
	} else {
		bufinsvn(bp, &newvp->v_cleanblkhd);
	}
	splx(s);
}

#ifndef DEVFS_ROOT
/*
 * Create a vnode for a block device.
 * Used for root filesystem, argdev, and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV)
		return (0);
	error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = 0;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	if ((nvp = checkalias(vp, dev, (struct mount *) 0))) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}
#endif /* !DEVFS_ROOT */

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		simple_unlock(&spechash_slock);
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			goto loop;
		}
		simple_lock(&spechash_slock);
		break;
	}

	if (vp == NULL || vp->v_tag != VT_NON) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specflags = 0;
		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set if the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		freevnodes--;
	}
	vp->v_usecount++;
	/*
	 * Create the VM object, if needed
	 */
	if ((vp->v_type == VREG) &&
		((vp->v_object == NULL) ||
			(vp->v_object->flags & OBJ_VFS_REF) == 0)) {
		/*
		 * XXX vfs_object_create probably needs the interlock.
		 */
		simple_unlock(&vp->v_interlock);
		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
		simple_lock(&vp->v_interlock);
	}
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0)
			vrele(vp);
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

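/*
 * Example: checkalias() above shows the canonical calling pattern;
 * because vget() can fail while the vnode is being recycled, callers
 * retry their lookup from scratch.  Illustrative sketch only:
 *
 *	if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p))
 *		goto loop;
 */
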
/*
 * Stubs to use when there is no locking to be done on the underlying object.
 * A minimal shared lock is necessary to ensure that the underlying object
 * is not revoked while an operation is in progress. So, an active shared
 * count is maintained in an auxiliary vnode lock structure.
 */
int
vop_nolock(ap)
	struct vop_lock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
#ifdef notyet
	/*
	 * This code cannot be used until all the non-locking filesystems
	 * (notably NFS) are converted to properly lock and release nodes.
	 * Also, certain vnode operations change the locking state within
	 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
	 * and symlink). Ideally these operations should not change the
	 * lock state, but should be changed to let the caller of the
	 * function unlock them. Otherwise all intermediate vnode layers
	 * (such as union, umapfs, etc) must catch these functions to do
	 * the necessary locking at their layer. Note that the inactive
	 * and lookup operations also change their lock state, but this
	 * cannot be avoided, so these two operations will always need
	 * to be handled in intermediate layers.
	 */
	struct vnode *vp = ap->a_vp;
	int vnflags, flags = ap->a_flags;

	if (vp->v_vnlock == NULL) {
		if ((flags & LK_TYPE_MASK) == LK_DRAIN)
			return (0);
		MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
		    M_VNODE, M_WAITOK);
		lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	}
	switch (flags & LK_TYPE_MASK) {
	case LK_DRAIN:
		vnflags = LK_DRAIN;
		break;
	case LK_EXCLUSIVE:
	case LK_SHARED:
		vnflags = LK_SHARED;
		break;
	case LK_UPGRADE:
	case LK_EXCLUPGRADE:
	case LK_DOWNGRADE:
		return (0);
	case LK_RELEASE:
	default:
		panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
	}
	if (flags & LK_INTERLOCK)
		vnflags |= LK_INTERLOCK;
	return (lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
#else /* for now */
	/*
	 * Since we are not using the lock manager, we must clear
	 * the interlock here.
	 */
	if (ap->a_flags & LK_INTERLOCK) {
		simple_unlock(&ap->a_vp->v_interlock);
	}
	return (0);
#endif
}

/*
 * Decrement the active use count.
 */
int
vop_nounlock(ap)
	struct vop_unlock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	if (vp->v_vnlock == NULL)
		return (0);
	return (lockmgr(vp->v_vnlock, LK_RELEASE, NULL, ap->a_p));
}

/*
 * Return whether or not the node is in use.
 */
int
vop_noislocked(ap)
	struct vop_islocked_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	if (vp->v_vnlock == NULL)
		return (0);
	return (lockstatus(vp->v_vnlock));
}

/* #ifdef DIAGNOSTIC */
/*
 * Vnode reference, just increment the count
 */
void
vref(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required");

	vp->v_usecount++;

	if ((vp->v_type == VREG) &&
		((vp->v_object == NULL) ||
			((vp->v_object->flags & OBJ_VFS_REF) == 0)) ) {
		/*
		 * We need to lock the vnode during the time that
		 * the object is created.  This is necessary to
		 * keep the system from re-entrantly doing it
		 * multiple times.
		 * XXX vfs_object_create probably needs the interlock?
		 */
		simple_unlock(&vp->v_interlock);
		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
		simple_lock(&vp->v_interlock);
	}
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(vp)
	struct vnode *vp;
{
	VOP_UNLOCK(vp, 0, curproc);
	vrele(vp);
}

/*
 * Vnode release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;

	if ((vp->v_usecount == 1) &&
		vp->v_object &&
		(vp->v_object->flags & OBJ_VFS_REF)) {
		vp->v_object->flags &= ~OBJ_VFS_REF;
		simple_unlock(&vp->v_interlock);
		vm_object_deallocate(vp->v_object);
		return;
	}

	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}

	if (vp->v_usecount < 0) {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
#endif
		panic("vrele: negative ref cnt");
	}
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VAGE) {
		if (vp->v_tag != VT_TFS)
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		vp->v_flag &= ~VAGE;
		vp->v_usage = 0;
	} else {
		if (vp->v_tag != VT_TFS)
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	simple_unlock(&vnode_free_list_slock);

	freevnodes++;

	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0)
		VOP_INACTIVE(vp, p);
}

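/*
 * Example: vref()/vrele() bracket any use of a vnode that is already
 * referenced, while vget() must be used to take a vnode that may be
 * sitting on the free list.  Illustrative sketch only:
 *
 *	vref(vp);
 *	... use vp ...
 *	vrele(vp);	(may return vp to the free list)
 */
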
#ifdef DIAGNOSTIC
/*
 * Page or buffer structure gets a reference.
 */
void
vhold(vp)
	register struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	vp->v_holdcnt++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrele(vp)
	register struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_holdcnt <= 0)
		panic("holdrele: holdcnt");
	vp->v_holdcnt--;
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones;
 * an error is returned if any are found (nb: this is a user error,
 * not a system error). If MNT_FORCE is specified, detach any active
 * vnodes that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		if (vp->v_object && (vp->v_object->flags & OBJ_VFS_REF)) {
			simple_unlock(&vp->v_interlock);
			simple_unlock(&mntvnode_slock);
			vm_object_reference(vp->v_object);
			pager_cache(vp->v_object, FALSE);
			vp->v_object->flags &= ~OBJ_VFS_REF;
			vm_object_deallocate(vp->v_object);
			simple_lock(&mntvnode_slock);
			simple_lock(&vp->v_interlock);
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

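/*
 * Example: an unmount routine flushes every vnode on the mount except
 * any it still needs itself.  Illustrative sketch only, loosely after
 * the way the UFS unmount path drives vflush(); "rootvp" stands in
 * for that filesystem's device/root vnode:
 *
 *	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
 *	if ((error = vflush(mp, rootvp, flags)) != 0)
 *		return (error);
 */
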
/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(struct vnode *vp, int flags, struct proc *p)
{
	int active;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;
	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
	/*
	 * Clean out any buffers associated with the vnode.
	 */
	if (flags & DOCLOSE)
		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");
	if (active)
		vrele(vp);
	cache_purge(vp);
	if (vp->v_vnlock) {
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if ((ap->a_flags & REVOKEALL) == 0)
		panic("vop_revoke");
#endif

	vp = ap->a_vp;
	simple_lock(&vp->v_interlock);

	if (vp->v_flag & VALIASED) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * wait until it is done and return.
		 */
		if (vp->v_flag & VXLOCK) {
			vp->v_flag |= VXWANT;
			simple_unlock(&vp->v_interlock);
			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
			return (0);
		}
		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		vp->v_flag |= VXLOCK;
		simple_unlock(&vp->v_interlock);
		while (vp->v_flag & VALIASED) {
			simple_lock(&spechash_slock);
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type || vp == vq)
					continue;
				simple_unlock(&spechash_slock);
				vgone(vq);
				break;
			}
			if (vq == NULLVP) {
				simple_unlock(&spechash_slock);
			}
		}
		/*
		 * Remove the lock so that vgone below will
		 * really eliminate the vnode after which time
		 * vgone will awaken any sleepers.
		 */
		simple_lock(&vp->v_interlock);
		vp->v_flag &= ~VXLOCK;
	}
	vgonel(vp, p);
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	if (vp->v_object) {
		vp->v_object->flags |= OBJ_VNODE_GONE;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
			vnode_free_list.tqh_first != vp) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		}
		simple_unlock(&vnode_free_list_slock);
	}

	vp->v_type = VBAD;
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;

	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		return (1);
	}
	return (0);
}

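/*
 * Example: a driver can map a device number back to an existing vnode.
 * Illustrative sketch only:
 *
 *	struct vnode *vp;
 *
 *	if (vfinddev(dev, VCHR, &vp))
 *		vprint("found device vnode", vp);
 */
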
/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	register struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	return (count);
}

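/*
 * Example: device close routines use vcount() to distinguish the last
 * close from an intermediate one when aliases exist.  Illustrative
 * sketch only:
 *
 *	if (vp->v_type == VBLK && vcount(vp) > 1)
 *		return (0);	(another alias still has the device open)
 */
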
/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[64];

	if (label != NULL)
		printf("%s: ", label);
	printf("type %s, usecount %d, writecount %d, refcount %ld,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes(void)
{
	register struct mount *mp;
	register struct vnode *vp;

	printf("Locked vnodes\n");
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		for (vp = mp->mnt_vnodelist.lh_first;
		    vp != NULL;
		    vp = vp->v_mntvnodes.le_next)
			if (VOP_ISLOCKED(vp))
				vprint((char *) 0, vp);
	}
}
#endif

int kinfo_vdebug = 1;
int kinfo_vgetfailed;

#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
static int
sysctl_vnode SYSCTL_HANDLER_ARGS
{
	struct proc *p = curproc;	/* XXX */
	register struct mount *mp, *nmp;
	struct vnode *vp;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)

	req->lock = 0;
	if (!req->oldptr) /* Make an estimate */
		return (SYSCTL_OUT(req, 0,
			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_next;
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p))
			continue;
again:
		for (vp = mp->mnt_vnodelist.lh_first;
		    vp != NULL;
		    vp = vp->v_mntvnodes.le_next) {
			/*
			 * Check that the vp is still associated with this
			 * filesystem.  RACE: could have been recycled onto
			 * the same filesystem.
			 */
			if (vp->v_mount != mp) {
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				goto again;
			}
			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
			    (error = SYSCTL_OUT(req, vp, VNODESZ))) {
				vfs_unbusy(mp, p);
				return (error);
			}
		}
		vfs_unbusy(mp, p);
	}

	return (0);
}

SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,vnode", "");

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specflags & SI_MOUNTEDON)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specflags & SI_MOUNTEDON) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Unmount all filesystems.  The list is traversed in reverse order
 * of mounting to avoid dependencies.  Should only be called by halt().
 */
void
vfs_unmountall()
{
	struct mount *mp, *nmp, *rootfs = NULL;
	int error;

	/* unmount all but rootfs */
	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;

		if (mp->mnt_flag & MNT_ROOTFS) {
			rootfs = mp;
			continue;
		}
		error = dounmount(mp, MNT_FORCE, initproc);
		if (error) {
			printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		}
	}

	/* and finally... */
	if (rootfs) {
		vfs_unmountroot(rootfs);
	} else {
		printf("no root filesystem\n");
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
	struct export_args *argp)
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
		error = copyin(argp->ex_addr, (caddr_t) smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not used;
		 * do so on demand here.
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
static int
vfs_free_netcred(struct radix_node *rn, void *w)
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(struct netexport *nep)
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

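/*
 * Example: a filesystem's mount entry point forwards its export
 * arguments here, roughly the way ufs_mount() does.  Illustrative
 * sketch only; "ump->um_export" and "args" stand in for that
 * filesystem's private netexport storage and mount argument struct:
 *
 *	if (args.export.ex_flags & (MNT_EXPORTED | MNT_DELEXPORT))
 *		return (vfs_export(mp, &ump->um_export, &args.export));
 */
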
struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct mbuf *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = mtod(nam, struct sockaddr *);
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
							      rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags)
{
	struct vnode *vp, *nvp;
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {

		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT))
			continue;
		if (vp->v_object &&
		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE);
		}
	}
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 */
int
vfs_object_create(vp, p, cred, waslocked)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	int waslocked;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG) {
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			(void) vnode_pager_alloc(vp,
				OFF_TO_IDX(round_page(vat.va_size)), 0, 0);
		} else {
			/*
			 * This simply allocates the biggest object possible
			 * for a VBLK vnode.  This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			(void) vnode_pager_alloc(vp, INT_MAX, 0, 0);
		}
		vp->v_object->flags |= OBJ_VFS_REF;
	} else {
		if (object->flags & OBJ_DEAD) {
			if (waslocked)
				VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			if (waslocked)
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
		if ((object->flags & OBJ_VFS_REF) == 0) {
			object->flags |= OBJ_VFS_REF;
			vm_object_reference(object);
		}
	}
	if (vp->v_object)
		vp->v_flag |= VVMIO;

retn:
	return (error);
}
1985