xref: /freebsd/sys/kern/vfs_subr.c (revision 9ee40678bbdcedc6a3ac1e311abe740018911cf1)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
39  * $Id: vfs_subr.c,v 1.62 1996/10/17 02:49:24 dyson Exp $
40  */
41 
42 /*
43  * External virtual filesystem routines
44  */
45 #include "opt_ddb.h"
46 
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/kernel.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/mount.h>
53 #include <sys/time.h>
54 #include <sys/vnode.h>
55 #include <sys/stat.h>
56 #include <sys/namei.h>
57 #include <sys/ucred.h>
58 #include <sys/buf.h>
59 #include <sys/errno.h>
60 #include <sys/malloc.h>
61 #include <sys/domain.h>
62 #include <sys/mbuf.h>
63 
64 #include <vm/vm.h>
65 #include <vm/vm_param.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_extern.h>
68 #include <vm/vm_pager.h>
69 #include <vm/vnode_pager.h>
70 #include <sys/sysctl.h>
71 
72 #include <miscfs/specfs/specdev.h>
73 
74 #ifdef DDB
75 extern void	printlockedvnodes __P((void));
76 #endif
77 extern void	vclean __P((struct vnode *vp, int flags));
78 extern void	vfs_unmountroot __P((struct mount *rootfs));
79 
80 enum vtype iftovt_tab[16] = {
81 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
82 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
83 };
84 int vttoif_tab[9] = {
85 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
86 	S_IFSOCK, S_IFIFO, S_IFMT,
87 };
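
/*
 * iftovt_tab is indexed by the file-type bits of a mode word shifted
 * down by 12, as the IFTOVT() macro does; vttoif_tab maps an
 * enum vtype back to the corresponding S_IF* bits for VTTOIF().
 */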
88 
89 /*
90  * Insq/Remq for the vnode usage lists.
91  */
92 #define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
93 #define	bufremvn(bp) {  \
94 	LIST_REMOVE(bp, b_vnbufs); \
95 	(bp)->b_vnbufs.le_next = NOLIST; \
96 }
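
/*
 * bufremvn() tags the removed buffer with NOLIST in b_vnbufs.le_next;
 * brelvp() and reassignbuf() test for that sentinel to tell whether a
 * buffer is still on a vnode's buffer list.
 */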
97 
98 TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
99 static u_long freevnodes = 0;
100 
101 struct mntlist mountlist;	/* mounted filesystem list */
102 
103 int desiredvnodes;
104 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RD, &desiredvnodes, 0, "");
105 
106 static void	vfs_free_addrlist __P((struct netexport *nep));
107 static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
108 static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
109 				       struct export_args *argp));
110 
111 /*
112  * Initialize the vnode management data structures.
113  */
114 void
115 vntblinit()
116 {
117 	desiredvnodes = maxproc + vm_object_cache_max + extravnodes;
118 
119 	TAILQ_INIT(&vnode_free_list);
120 	CIRCLEQ_INIT(&mountlist);
121 }
122 
123 /*
124  * Lock a filesystem.
125  * Used to prevent access to it while mounting and unmounting.
126  */
127 int
128 vfs_lock(mp)
129 	register struct mount *mp;
130 {
131 
132 	while (mp->mnt_flag & MNT_MLOCK) {
133 		mp->mnt_flag |= MNT_MWAIT;
134 		(void) tsleep((caddr_t) mp, PVFS, "vfslck", 0);
135 	}
136 	mp->mnt_flag |= MNT_MLOCK;
137 	return (0);
138 }
139 
140 /*
141  * Unlock a locked filesystem.
142  * Panic if filesystem is not locked.
143  */
144 void
145 vfs_unlock(mp)
146 	register struct mount *mp;
147 {
148 
149 	if ((mp->mnt_flag & MNT_MLOCK) == 0)
150 		panic("vfs_unlock: not locked");
151 	mp->mnt_flag &= ~MNT_MLOCK;
152 	if (mp->mnt_flag & MNT_MWAIT) {
153 		mp->mnt_flag &= ~MNT_MWAIT;
154 		wakeup((caddr_t) mp);
155 	}
156 }
157 
158 /*
159  * Mark a mount point as busy.
160  * Used to synchronize access and to delay unmounting.
161  */
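/*
 * Returns 0 on success and 1 if the mount point is being unmounted;
 * callers treat a nonzero return as "leave this mount point alone".
 */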
162 int
163 vfs_busy(mp)
164 	register struct mount *mp;
165 {
166 
167 	while (mp->mnt_flag & MNT_MPBUSY) {
168 		mp->mnt_flag |= MNT_MPWANT;
169 		(void) tsleep((caddr_t) &mp->mnt_flag, PVFS, "vfsbsy", 0);
170 	}
171 	if (mp->mnt_flag & MNT_UNMOUNT)
172 		return (1);
173 	mp->mnt_flag |= MNT_MPBUSY;
174 	return (0);
175 }
176 
177 /*
178  * Free a busy filesystem.
179  * Panic if filesystem is not busy.
180  */
181 void
182 vfs_unbusy(mp)
183 	register struct mount *mp;
184 {
185 
186 	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
187 		panic("vfs_unbusy: not busy");
188 	mp->mnt_flag &= ~MNT_MPBUSY;
189 	if (mp->mnt_flag & MNT_MPWANT) {
190 		mp->mnt_flag &= ~MNT_MPWANT;
191 		wakeup((caddr_t) &mp->mnt_flag);
192 	}
193 }
194 
195 void
196 vfs_unmountroot(struct mount *rootfs)
197 {
198 	struct mount *mp = rootfs;
199 	int error;
200 
201 	if (vfs_busy(mp)) {
202 		printf("failed to unmount root\n");
203 		return;
204 	}
205 	mp->mnt_flag |= MNT_UNMOUNT;
206 	if ((error = vfs_lock(mp))) {
207 		printf("lock of root filesystem failed (%d)\n", error);
208 		return;
209 	}
210 	vnode_pager_umount(mp);	/* release cached vnodes */
211 	cache_purgevfs(mp);	/* remove cache entries for this file sys */
212 
213 	if ((error = VFS_SYNC(mp, MNT_WAIT, initproc->p_ucred, initproc)))
214 		printf("sync of root filesystem failed (%d)\n", error);
215 
216 	if ((error = VFS_UNMOUNT(mp, MNT_FORCE, initproc))) {
217 		printf("unmount of root filesystem failed (");
218 		if (error == EBUSY)
219 			printf("BUSY)\n");
220 		else
221 			printf("%d)\n", error);
222 	}
223 	mp->mnt_flag &= ~MNT_UNMOUNT;
224 	vfs_unbusy(mp);
225 }
226 
227 /*
228  * Unmount all filesystems.  Should only be called by halt().
229  */
230 void
231 vfs_unmountall()
232 {
233 	struct mount *mp, *nmp, *rootfs = NULL;
234 	int error;
235 
236 	/* unmount all but rootfs */
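	/*
	 * Walk the mount list from the tail so that filesystems mounted
	 * on top of other filesystems are taken down first.
	 */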
237 	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
238 		nmp = mp->mnt_list.cqe_prev;
239 
240 		if (mp->mnt_flag & MNT_ROOTFS) {
241 			rootfs = mp;
242 			continue;
243 		}
244 		error = dounmount(mp, MNT_FORCE, initproc);
245 		if (error) {
246 			printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
247 			if (error == EBUSY)
248 				printf("BUSY)\n");
249 			else
250 				printf("%d)\n", error);
251 		}
252 	}
253 
254 	/* and finally... */
255 	if (rootfs) {
256 		vfs_unmountroot(rootfs);
257 	} else {
258 		printf("no root filesystem\n");
259 	}
260 }
261 
262 /*
263  * Lookup a mount point by filesystem identifier.
264  */
265 struct mount *
266 getvfs(fsid)
267 	fsid_t *fsid;
268 {
269 	register struct mount *mp;
270 
271 	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
272 	    mp = mp->mnt_list.cqe_next) {
273 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
274 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
275 			return (mp);
276 	}
277 	return ((struct mount *) 0);
278 }
279 
280 /*
281  * Get a new unique fsid
282  */
283 void
284 getnewfsid(mp, mtype)
285 	struct mount *mp;
286 	int mtype;
287 {
288 	static u_short xxxfs_mntid;
289 
290 	fsid_t tfsid;
291 
292 	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
293 	mp->mnt_stat.f_fsid.val[1] = mtype;
294 	if (xxxfs_mntid == 0)
295 		++xxxfs_mntid;
296 	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
297 	tfsid.val[1] = mtype;
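	/*
	 * Probe with getvfs() until we find an fsid that no currently
	 * mounted filesystem is using.
	 */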
298 	if (mountlist.cqh_first != (void *)&mountlist) {
299 		while (getvfs(&tfsid)) {
300 			tfsid.val[0]++;
301 			xxxfs_mntid++;
302 		}
303 	}
304 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
305 }
306 
307 /*
308  * Set vnode attributes to VNOVAL
309  */
310 void
311 vattr_null(vap)
312 	register struct vattr *vap;
313 {
314 
315 	vap->va_type = VNON;
316 	vap->va_size = VNOVAL;
317 	vap->va_bytes = VNOVAL;
318 	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
319 	    vap->va_fsid = vap->va_fileid =
320 	    vap->va_blocksize = vap->va_rdev =
321 	    vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
322 	    vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
323 	    vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
324 	    vap->va_flags = vap->va_gen = VNOVAL;
325 	vap->va_vaflags = 0;
326 }
327 
328 /*
329  * Routines having to do with the management of the vnode table.
330  */
331 extern vop_t **dead_vnodeop_p;
332 
333 /*
334  * Return the next vnode from the free list.
335  */
336 int
337 getnewvnode(tag, mp, vops, vpp)
338 	enum vtagtype tag;
339 	struct mount *mp;
340 	vop_t **vops;
341 	struct vnode **vpp;
342 {
343 	register struct vnode *vp;
344 
345 retry:
346 	vp = vnode_free_list.tqh_first;
347 	/*
348 	 * We allocate a new vnode if
349 	 * 	1. we don't have any free
350 	 *		Pretty obvious; we actually used to panic, but that
351 	 *		is a silly thing to do.
352 	 *	2. we haven't filled our pool yet
353 	 *		We don't want to trash the incore (VM-)vnodecache.
354 	 *	3. less than 1/4th of our vnodes are free.
355 	 *		We don't want to trash the namei cache either.
356 	 */
357 	if (freevnodes < (numvnodes >> 2) ||
358 	    numvnodes < desiredvnodes ||
359 	    vp == NULL) {
360 		vp = (struct vnode *) malloc((u_long) sizeof *vp,
361 		    M_VNODE, M_WAITOK);
362 		bzero((char *) vp, sizeof *vp);
363 		numvnodes++;
364 	} else {
365 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
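		/*
		 * Recently referenced vnodes get a second chance: age
		 * v_usage down and rotate them to the tail of the free
		 * list instead of recycling them right away.
		 */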
366 		if (vp->v_usage > 0) {
367 			--vp->v_usage;
368 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
369 			goto retry;
370 		}
371 		freevnodes--;
372 		if (vp->v_usecount)
373 			panic("free vnode isn't");
374 
375 		/* see comment on why 0xdeadb is set at end of vgone (below) */
376 		vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb;
377 		vp->v_lease = NULL;
378 		if (vp->v_type != VBAD)
379 			vgone(vp);
380 
381 #ifdef DIAGNOSTIC
382 		{
383 			int s;
384 
385 			if (vp->v_data)
386 				panic("cleaned vnode isn't");
387 			s = splbio();
388 			if (vp->v_numoutput)
389 				panic("Clean vnode has pending I/O's");
390 			splx(s);
391 		}
392 #endif
393 		vp->v_flag = 0;
394 		vp->v_lastr = 0;
395 		vp->v_ralen = 0;
396 		vp->v_maxra = 0;
397 		vp->v_lastw = 0;
398 		vp->v_lasta = 0;
399 		vp->v_cstart = 0;
400 		vp->v_clen = 0;
401 		vp->v_socket = 0;
402 		vp->v_writecount = 0;	/* XXX */
403 		vp->v_usage = 0;
404 	}
405 	vp->v_type = VNON;
406 	cache_purge(vp);
407 	vp->v_tag = tag;
408 	vp->v_op = vops;
409 	insmntque(vp, mp);
410 	*vpp = vp;
411 	vp->v_usecount = 1;
412 	vp->v_data = 0;
413 	return (0);
414 }
415 
416 /*
417  * Move a vnode from one mount queue to another.
418  */
419 void
420 insmntque(vp, mp)
421 	register struct vnode *vp;
422 	register struct mount *mp;
423 {
424 
425 	/*
426 	 * Delete from old mount point vnode list, if on one.
427 	 */
428 	if (vp->v_mount != NULL)
429 		LIST_REMOVE(vp, v_mntvnodes);
430 	/*
431 	 * Insert into list of vnodes for the new mount point, if available.
432 	 */
433 	if ((vp->v_mount = mp) == NULL)
434 		return;
435 	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
436 }
437 
438 /*
439  * Update outstanding I/O count and do wakeup if requested.
440  */
441 void
442 vwakeup(bp)
443 	register struct buf *bp;
444 {
445 	register struct vnode *vp;
446 
447 	bp->b_flags &= ~B_WRITEINPROG;
448 	if ((vp = bp->b_vp)) {
449 		vp->v_numoutput--;
450 		if (vp->v_numoutput < 0)
451 			panic("vwakeup: neg numoutput");
452 		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
453 			vp->v_flag &= ~VBWAIT;
454 			wakeup((caddr_t) &vp->v_numoutput);
455 		}
456 	}
457 }
458 
459 /*
460  * Flush out and invalidate all buffers associated with a vnode.
461  * Called with the underlying object locked.
462  */
463 int
464 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
465 	register struct vnode *vp;
466 	int flags;
467 	struct ucred *cred;
468 	struct proc *p;
469 	int slpflag, slptimeo;
470 {
471 	register struct buf *bp;
472 	struct buf *nbp, *blist;
473 	int s, error;
474 	vm_object_t object;
475 
476 	if (flags & V_SAVE) {
477 		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
478 			return (error);
479 		if (vp->v_dirtyblkhd.lh_first != NULL)
480 			panic("vinvalbuf: dirty bufs");
481 	}
482 
483 	s = splbio();
484 	for (;;) {
485 		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
486 			while (blist && blist->b_lblkno < 0)
487 				blist = blist->b_vnbufs.le_next;
488 		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
489 		    (flags & V_SAVEMETA))
490 			while (blist && blist->b_lblkno < 0)
491 				blist = blist->b_vnbufs.le_next;
492 		if (!blist)
493 			break;
494 
495 		for (bp = blist; bp; bp = nbp) {
496 			nbp = bp->b_vnbufs.le_next;
497 			if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
498 				continue;
499 			if (bp->b_flags & B_BUSY) {
500 				bp->b_flags |= B_WANTED;
501 				error = tsleep((caddr_t) bp,
502 				    slpflag | (PRIBIO + 1), "vinvalbuf",
503 				    slptimeo);
504 				splx(s);
505 				if (error)
506 					return (error);
507 				break;
508 			}
509 			bremfree(bp);
510 			bp->b_flags |= B_BUSY;
511 			/*
512 			 * XXX Since there are no node locks for NFS, I
513 			 * believe there is a slight chance that a delayed
514 			 * write will occur while sleeping just above, so
515 			 * check for it.
516 			 */
517 			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
518 				(void) VOP_BWRITE(bp);
519 				break;
520 			}
521 			bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF);
522 			brelse(bp);
523 		}
524 	}
525 	splx(s);
526 
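	/*
	 * Wait for any writes still in progress to drain; vwakeup()
	 * clears VBWAIT and wakes us once v_numoutput reaches zero.
	 */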
527 	s = splbio();
528 	while (vp->v_numoutput > 0) {
529 		vp->v_flag |= VBWAIT;
530 		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
531 	}
532 	splx(s);
533 
534 	/*
535 	 * Destroy the copy in the VM cache, too.
536 	 */
537 	object = vp->v_object;
538 	if (object != NULL) {
539 		vm_object_page_remove(object, 0, object->size,
540 		    (flags & V_SAVE) ? TRUE : FALSE);
541 	}
542 	if (!(flags & V_SAVEMETA) &&
543 	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
544 		panic("vinvalbuf: flush failed");
545 	return (0);
546 }
547 
548 /*
549  * Associate a buffer with a vnode.
550  */
551 void
552 bgetvp(vp, bp)
553 	register struct vnode *vp;
554 	register struct buf *bp;
555 {
556 	int s;
557 
558 	if (bp->b_vp)
559 		panic("bgetvp: not free");
560 	VHOLD(vp);
561 	bp->b_vp = vp;
562 	if (vp->v_type == VBLK || vp->v_type == VCHR)
563 		bp->b_dev = vp->v_rdev;
564 	else
565 		bp->b_dev = NODEV;
566 	/*
567 	 * Insert onto list for new vnode.
568 	 */
569 	s = splbio();
570 	bufinsvn(bp, &vp->v_cleanblkhd);
571 	splx(s);
572 }
573 
574 /*
575  * Disassociate a buffer from a vnode.
576  */
577 void
578 brelvp(bp)
579 	register struct buf *bp;
580 {
581 	struct vnode *vp;
582 	int s;
583 
584 	if (bp->b_vp == (struct vnode *) 0)
585 		panic("brelvp: NULL");
586 	/*
587 	 * Delete from old vnode list, if on one.
588 	 */
589 	s = splbio();
590 	if (bp->b_vnbufs.le_next != NOLIST)
591 		bufremvn(bp);
592 	splx(s);
593 
594 	vp = bp->b_vp;
595 	bp->b_vp = (struct vnode *) 0;
596 	HOLDRELE(vp);
597 }
598 
599 /*
600  * Associate a p-buffer with a vnode.
601  */
602 void
603 pbgetvp(vp, bp)
604 	register struct vnode *vp;
605 	register struct buf *bp;
606 {
607 	if (bp->b_vp)
608 		panic("pbgetvp: not free");
609 	VHOLD(vp);
610 	bp->b_vp = vp;
611 	if (vp->v_type == VBLK || vp->v_type == VCHR)
612 		bp->b_dev = vp->v_rdev;
613 	else
614 		bp->b_dev = NODEV;
615 }
616 
617 /*
618  * Disassociate a p-buffer from a vnode.
619  */
620 void
621 pbrelvp(bp)
622 	register struct buf *bp;
623 {
624 	struct vnode *vp;
625 
626 	if (bp->b_vp == (struct vnode *) 0)
627 		panic("brelvp: NULL");
628 
629 	vp = bp->b_vp;
630 	bp->b_vp = (struct vnode *) 0;
631 	HOLDRELE(vp);
632 }
633 
634 /*
635  * Reassign a buffer from one vnode to another.
636  * Used to assign file specific control information
637  * (indirect blocks) to the vnode to which they belong.
638  */
639 void
640 reassignbuf(bp, newvp)
641 	register struct buf *bp;
642 	register struct vnode *newvp;
643 {
644 	int s;
645 
646 	if (newvp == NULL) {
647 		printf("reassignbuf: NULL\n");
648 		return;
649 	}
650 
651 	s = splbio();
652 	/*
653 	 * Delete from old vnode list, if on one.
654 	 */
655 	if (bp->b_vnbufs.le_next != NOLIST)
656 		bufremvn(bp);
657 	/*
658 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
659 	 * of clean buffers.
660 	 */
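	/*
	 * The per-vnode dirty list is kept sorted by ascending logical
	 * block number, hence the ordered insertion below.
	 */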
661 	if (bp->b_flags & B_DELWRI) {
662 		struct buf *tbp;
663 
664 		tbp = newvp->v_dirtyblkhd.lh_first;
665 		if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) {
666 			bufinsvn(bp, &newvp->v_dirtyblkhd);
667 		} else {
668 			while (tbp->b_vnbufs.le_next &&
669 				(tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
670 				tbp = tbp->b_vnbufs.le_next;
671 			}
672 			LIST_INSERT_AFTER(tbp, bp, b_vnbufs);
673 		}
674 	} else {
675 		bufinsvn(bp, &newvp->v_cleanblkhd);
676 	}
677 	splx(s);
678 }
679 
680 #ifndef DEVFS_ROOT
681 /*
682  * Create a vnode for a block device.
683  * Used for root filesystem, argdev, and swap areas.
684  * Also used for memory file system special devices.
685  */
686 int
687 bdevvp(dev, vpp)
688 	dev_t dev;
689 	struct vnode **vpp;
690 {
691 	register struct vnode *vp;
692 	struct vnode *nvp;
693 	int error;
694 
695 	if (dev == NODEV)
696 		return (0);
697 	error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp);
698 	if (error) {
699 		*vpp = 0;
700 		return (error);
701 	}
702 	vp = nvp;
703 	vp->v_type = VBLK;
704 	if ((nvp = checkalias(vp, dev, (struct mount *) 0))) {
705 		vput(vp);
706 		vp = nvp;
707 	}
708 	*vpp = vp;
709 	return (0);
710 }
711 #endif /* !DEVFS_ROOT */
712 
713 /*
714  * Check to see if the new vnode represents a special device
715  * for which we already have a vnode (either because of
716  * bdevvp() or because of a different vnode representing
717  * the same block device). If such an alias exists, deallocate
718  * the existing contents and return the aliased vnode. The
719  * caller is responsible for filling it with its new contents.
720  */
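/*
 * Device vnodes are chained through v_specnext on the speclisth[]
 * hash table, keyed by device number; v_hashchain points back at the
 * bucket so vgone() can unlink the vnode later.
 */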
721 struct vnode *
722 checkalias(nvp, nvp_rdev, mp)
723 	register struct vnode *nvp;
724 	dev_t nvp_rdev;
725 	struct mount *mp;
726 {
727 	register struct vnode *vp;
728 	struct vnode **vpp;
729 
730 	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
731 		return (NULLVP);
732 
733 	vpp = &speclisth[SPECHASH(nvp_rdev)];
734 loop:
735 	for (vp = *vpp; vp; vp = vp->v_specnext) {
736 		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
737 			continue;
738 		/*
739 		 * Alias, but not in use, so flush it out.
740 		 */
741 		if (vp->v_usecount == 0) {
742 			vgone(vp);
743 			goto loop;
744 		}
745 		if (vget(vp, 1))
746 			goto loop;
747 		break;
748 	}
749 
750 	if (vp == NULL || vp->v_tag != VT_NON) {
751 		MALLOC(nvp->v_specinfo, struct specinfo *,
752 		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
753 		nvp->v_rdev = nvp_rdev;
754 		nvp->v_hashchain = vpp;
755 		nvp->v_specnext = *vpp;
756 		nvp->v_specflags = 0;
757 		*vpp = nvp;
758 		if (vp != NULL) {
759 			nvp->v_flag |= VALIASED;
760 			vp->v_flag |= VALIASED;
761 			vput(vp);
762 		}
763 		return (NULLVP);
764 	}
765 	VOP_UNLOCK(vp);
766 	vclean(vp, 0);
767 	vp->v_op = nvp->v_op;
768 	vp->v_tag = nvp->v_tag;
769 	nvp->v_type = VNON;
770 	insmntque(vp, mp);
771 	return (vp);
772 }
773 
774 /*
775  * Grab a particular vnode from the free list, increment its
776  * reference count and lock it. The vnode lock bit is set while the
777  * vnode is being eliminated in vgone. The process is awakened
778  * when the transition is completed, and an error returned to
779  * indicate that the vnode is no longer usable (possibly having
780  * been changed to a new file system type).
781  */
782 int
783 vget(vp, lockflag)
784 	register struct vnode *vp;
785 	int lockflag;
786 {
787 
788 	/*
789 	 * If the vnode is in the process of being cleaned out for another
790 	 * use, we wait for the cleaning to finish and then return failure.
791 	 * Cleaning is determined either by checking that the VXLOCK flag is
792 	 * set, or that the use count is zero with the back pointer set to
793 	 * show that it has been removed from the free list by getnewvnode.
794 	 * The VXLOCK flag may not have been set yet because vclean is blocked
795 	 * in the VOP_LOCK call waiting for the VOP_INACTIVE to complete.
796 	 */
797 	if ((vp->v_flag & VXLOCK) ||
798 	    (vp->v_usecount == 0 &&
799 		vp->v_freelist.tqe_prev == (struct vnode **) 0xdeadb)) {
800 		vp->v_flag |= VXWANT;
801 		(void) tsleep((caddr_t) vp, PINOD, "vget", 0);
802 		return (1);
803 	}
804 	if (vp->v_usecount == 0) {
805 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
806 		freevnodes--;
807 	}
808 	vp->v_usecount++;
809 
810 	/*
811 	 * Create the VM object, if needed
812 	 */
813 	if ((vp->v_type == VREG) &&
814 		((vp->v_object == NULL) ||
815 			(vp->v_object->flags & OBJ_VFS_REF) == 0)) {
816 		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
817 	}
818 	if (lockflag)
819 		VOP_LOCK(vp);
820 
821 	return (0);
822 }
823 
824 /*
825  * Vnode reference, just increment the count
826  */
827 void
828 vref(vp)
829 	struct vnode *vp;
830 {
831 	if (vp->v_usecount <= 0)
832 		panic("vref used where vget required");
833 
834 	vp->v_usecount++;
835 
836 	if ((vp->v_type == VREG) &&
837 		((vp->v_object == NULL) ||
838 			((vp->v_object->flags & OBJ_VFS_REF) == 0)) ) {
839 		/*
840 		 * We need to lock the vnode while the object
841 		 * is being created.  This is necessary to keep
842 		 * the system from re-entrantly creating it
843 		 * multiple times.
844 		 */
845 		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
846 	}
847 }
848 
849 /*
850  * vput(), just unlock and vrele()
851  */
852 void
853 vput(vp)
854 	register struct vnode *vp;
855 {
856 	VOP_UNLOCK(vp);
857 	vrele(vp);
858 }
859 
860 /*
861  * Vnode release.
862  * If count drops to zero, call inactive routine and return to freelist.
863  */
864 void
865 vrele(vp)
866 	register struct vnode *vp;
867 {
868 
869 #ifdef DIAGNOSTIC
870 	if (vp == NULL)
871 		panic("vrele: null vp");
872 #endif
873 
874 	vp->v_usecount--;
875 
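	/*
	 * If the last real reference is the one held on behalf of the
	 * VM object (OBJ_VFS_REF), hand it over to the object code;
	 * releasing the object will in turn drop the final use count.
	 */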
876 	if ((vp->v_usecount == 1) &&
877 		vp->v_object &&
878 		(vp->v_object->flags & OBJ_VFS_REF)) {
879 		vp->v_object->flags &= ~OBJ_VFS_REF;
880 		vm_object_deallocate(vp->v_object);
881 		return;
882 	}
883 
884 	if (vp->v_usecount > 0)
885 		return;
886 
887 	if (vp->v_usecount < 0) {
888 #ifdef DIAGNOSTIC
889 		vprint("vrele: negative ref count", vp);
890 #endif
891 		panic("vrele: negative reference cnt");
892 	}
893 	if (vp->v_flag & VAGE) {
894 		if(vp->v_tag != VT_TFS)
895 			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
896 		vp->v_flag &= ~VAGE;
897 		vp->v_usage = 0;
898 	} else {
899 		if(vp->v_tag != VT_TFS)
900 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
901 	}
902 	freevnodes++;
903 
904 	VOP_INACTIVE(vp);
905 }
906 
907 #ifdef DIAGNOSTIC
908 /*
909  * Page or buffer structure gets a reference.
910  */
911 void
912 vhold(vp)
913 	register struct vnode *vp;
914 {
915 
916 	vp->v_holdcnt++;
917 }
918 
919 /*
920  * Page or buffer structure frees a reference.
921  */
922 void
923 holdrele(vp)
924 	register struct vnode *vp;
925 {
926 
927 	if (vp->v_holdcnt <= 0)
928 		panic("holdrele: holdcnt");
929 	vp->v_holdcnt--;
930 }
931 #endif /* DIAGNOSTIC */
932 
933 /*
934  * Remove any vnodes in the vnode table belonging to mount point mp.
935  *
936  * If MNT_NOFORCE is specified, there should not be any active ones,
937  * return error if any are found (nb: this is a user error, not a
938  * system error). If MNT_FORCE is specified, detach any active vnodes
939  * that are found.
940  */
941 #ifdef DIAGNOSTIC
942 static int busyprt = 0;		/* print out busy vnodes */
943 SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, "");
944 #endif
945 
946 int
947 vflush(mp, skipvp, flags)
948 	struct mount *mp;
949 	struct vnode *skipvp;
950 	int flags;
951 {
952 	register struct vnode *vp, *nvp;
953 	int busy = 0;
954 
955 	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
956 		panic("vflush: not busy");
957 loop:
958 	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
959 		/*
960 		 * Make sure this vnode wasn't reclaimed in getnewvnode().
961 		 * Start over if it has (it won't be on the list anymore).
962 		 */
963 		if (vp->v_mount != mp)
964 			goto loop;
965 		nvp = vp->v_mntvnodes.le_next;
966 		/*
967 		 * Skip over a selected vnode.
968 		 */
969 		if (vp == skipvp)
970 			continue;
971 		/*
972 		 * Skip over vnodes marked VSYSTEM.
973 		 */
974 		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM))
975 			continue;
976 		/*
977 		 * If WRITECLOSE is set, only flush out regular file vnodes
978 		 * open for writing.
979 		 */
980 		if ((flags & WRITECLOSE) &&
981 		    (vp->v_writecount == 0 || vp->v_type != VREG))
982 			continue;
983 
984 		if (vp->v_object && (vp->v_object->flags & OBJ_VFS_REF)) {
985 			vm_object_reference(vp->v_object);
986 			pager_cache(vp->v_object, FALSE);
987 			vp->v_object->flags &= ~OBJ_VFS_REF;
988 			vm_object_deallocate(vp->v_object);
989 		}
990 
991 		/*
992 		 * With v_usecount == 0, all we need to do is clear out the
993 		 * vnode data structures and we are done.
994 		 */
995 		if (vp->v_usecount == 0) {
996 			vgone(vp);
997 			continue;
998 		}
999 
1000 		/*
1001 		 * If FORCECLOSE is set, forcibly close the vnode. For block
1002 		 * or character devices, revert to an anonymous device. For
1003 		 * all other files, just kill them.
1004 		 */
1005 		if (flags & FORCECLOSE) {
1006 			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1007 				vgone(vp);
1008 			} else {
1009 				vclean(vp, 0);
1010 				vp->v_op = spec_vnodeop_p;
1011 				insmntque(vp, (struct mount *) 0);
1012 			}
1013 			continue;
1014 		}
1015 #ifdef DIAGNOSTIC
1016 		if (busyprt)
1017 			vprint("vflush: busy vnode", vp);
1018 #endif
1019 		busy++;
1020 	}
1021 	if (busy)
1022 		return (EBUSY);
1023 	return (0);
1024 }
1025 
1026 /*
1027  * Disassociate the underlying file system from a vnode.
1028  */
1029 void
1030 vclean(struct vnode *vp, int flags)
1031 {
1032 	int active;
1033 
1034 	/*
1035 	 * Check to see if the vnode is in use. If so, we have to reference it
1036 	 * before we clean it out so that its count cannot fall to zero and
1037 	 * generate a race against ourselves to recycle it.
1038 	 */
1039 	if ((active = vp->v_usecount))
1040 		VREF(vp);
1041 	/*
1042 	 * Even if the count is zero, the VOP_INACTIVE routine may still have
1043 	 * the object locked while it cleans it out. The VOP_LOCK ensures that
1044 	 * the VOP_INACTIVE routine is done with its work. For active vnodes,
1045 	 * it ensures that no other activity can occur while the underlying
1046 	 * object is being cleaned out.
1047 	 */
1048 	VOP_LOCK(vp);
1049 	/*
1050 	 * Prevent the vnode from being recycled or brought into use while we
1051 	 * clean it out.
1052 	 */
1053 	if (vp->v_flag & VXLOCK)
1054 		panic("vclean: deadlock");
1055 	vp->v_flag |= VXLOCK;
1056 	/*
1057 	 * Clean out any buffers associated with the vnode.
1058 	 */
1059 	if (flags & DOCLOSE)
1060 		vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0);
1061 	/*
1062 	 * Any other processes trying to obtain this lock must first wait for
1063 	 * VXLOCK to clear, then call the new lock operation.
1064 	 */
1065 	VOP_UNLOCK(vp);
1066 	/*
1067 	 * If purging an active vnode, it must be closed and deactivated
1068 	 * before being reclaimed.
1069 	 */
1070 	if (active) {
1071 		if (flags & DOCLOSE)
1072 			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);
1073 		VOP_INACTIVE(vp);
1074 	}
1075 	/*
1076 	 * Reclaim the vnode.
1077 	 */
1078 	if (VOP_RECLAIM(vp))
1079 		panic("vclean: cannot reclaim");
1080 	if (active)
1081 		vrele(vp);
1082 
1083 	/*
1084 	 * Done with purge, notify sleepers of the grim news.
1085 	 */
1086 	vp->v_op = dead_vnodeop_p;
1087 	vp->v_tag = VT_NON;
1088 	vp->v_flag &= ~VXLOCK;
1089 	if (vp->v_flag & VXWANT) {
1090 		vp->v_flag &= ~VXWANT;
1091 		wakeup((caddr_t) vp);
1092 	}
1093 }
1094 
1095 /*
1096  * Eliminate all activity associated with the requested vnode
1097  * and with all vnodes aliased to the requested vnode.
1098  */
1099 void
1100 vgoneall(vp)
1101 	register struct vnode *vp;
1102 {
1103 	register struct vnode *vq;
1104 
1105 	if (vp->v_flag & VALIASED) {
1106 		/*
1107 		 * If a vgone (or vclean) is already in progress, wait until
1108 		 * it is done and return.
1109 		 */
1110 		if (vp->v_flag & VXLOCK) {
1111 			vp->v_flag |= VXWANT;
1112 			(void) tsleep((caddr_t) vp, PINOD, "vgall", 0);
1113 			return;
1114 		}
1115 		/*
1116 		 * Ensure that vp will not be vgone'd while we are eliminating
1117 		 * its aliases.
1118 		 */
1119 		vp->v_flag |= VXLOCK;
1120 		while (vp->v_flag & VALIASED) {
1121 			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1122 				if (vq->v_rdev != vp->v_rdev ||
1123 				    vq->v_type != vp->v_type || vp == vq)
1124 					continue;
1125 				vgone(vq);
1126 				break;
1127 			}
1128 		}
1129 		/*
1130 		 * Remove the lock so that vgone below will really eliminate
1131 		 * the vnode after which time vgone will awaken any sleepers.
1132 		 */
1133 		vp->v_flag &= ~VXLOCK;
1134 	}
1135 	vgone(vp);
1136 }
1137 
1138 /*
1139  * Eliminate all activity associated with a vnode
1140  * in preparation for reuse.
1141  */
1142 void
1143 vgone(vp)
1144 	register struct vnode *vp;
1145 {
1146 	register struct vnode *vq;
1147 	struct vnode *vx;
1148 
1149 	/*
1150 	 * If a vgone (or vclean) is already in progress, wait until it is
1151 	 * done and return.
1152 	 */
1153 	if (vp->v_flag & VXLOCK) {
1154 		vp->v_flag |= VXWANT;
1155 		(void) tsleep((caddr_t) vp, PINOD, "vgone", 0);
1156 		return;
1157 	}
1158 
1159 	if (vp->v_object) {
1160 		vp->v_object->flags |= OBJ_VNODE_GONE;
1161 	}
1162 
1163 	/*
1164 	 * Clean out the filesystem specific data.
1165 	 */
1166 	vclean(vp, DOCLOSE);
1167 	/*
1168 	 * Delete from old mount point vnode list, if on one.
1169 	 */
1170 	if (vp->v_mount != NULL) {
1171 		LIST_REMOVE(vp, v_mntvnodes);
1172 		vp->v_mount = NULL;
1173 	}
1174 	/*
1175 	 * If special device, remove it from special device alias list.
1176 	 */
1177 	if (vp->v_type == VBLK || vp->v_type == VCHR) {
1178 		if (*vp->v_hashchain == vp) {
1179 			*vp->v_hashchain = vp->v_specnext;
1180 		} else {
1181 			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1182 				if (vq->v_specnext != vp)
1183 					continue;
1184 				vq->v_specnext = vp->v_specnext;
1185 				break;
1186 			}
1187 			if (vq == NULL)
1188 				panic("missing bdev");
1189 		}
1190 		if (vp->v_flag & VALIASED) {
1191 			vx = NULL;
1192 			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1193 				if (vq->v_rdev != vp->v_rdev ||
1194 				    vq->v_type != vp->v_type)
1195 					continue;
1196 				if (vx)
1197 					break;
1198 				vx = vq;
1199 			}
1200 			if (vx == NULL)
1201 				panic("missing alias");
1202 			if (vq == NULL)
1203 				vx->v_flag &= ~VALIASED;
1204 			vp->v_flag &= ~VALIASED;
1205 		}
1206 		FREE(vp->v_specinfo, M_VNODE);
1207 		vp->v_specinfo = NULL;
1208 	}
1209 	/*
1210 	 * If it is on the freelist and not already at the head, move it to
1211 	 * the head of the list. The test of the back pointer and the
1212 	 * reference count of zero is because it will be removed from the free
1213 	 * list by getnewvnode, but will not have its reference count
1214 	 * incremented until after calling vgone. If the reference count were
1215 	 * incremented first, vgone would (incorrectly) try to close the
1216 	 * previous instance of the underlying object. So, the back pointer is
1217 	 * explicitly set to `0xdeadb' in getnewvnode after removing it from
1218 	 * the freelist to ensure that we do not try to move it here.
1219 	 */
1220 	if (vp->v_usecount == 0 &&
1221 	    vp->v_freelist.tqe_prev != (struct vnode **) 0xdeadb &&
1222 	    vnode_free_list.tqh_first != vp) {
1223 		if(vp->v_tag != VT_TFS) {
1224 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1225 			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1226 		}
1227 	}
1228 	vp->v_type = VBAD;
1229 }
1230 
1231 /*
1232  * Lookup a vnode by device number.
1233  */
1234 int
1235 vfinddev(dev, type, vpp)
1236 	dev_t dev;
1237 	enum vtype type;
1238 	struct vnode **vpp;
1239 {
1240 	register struct vnode *vp;
1241 
1242 	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1243 		if (dev != vp->v_rdev || type != vp->v_type)
1244 			continue;
1245 		*vpp = vp;
1246 		return (1);
1247 	}
1248 	return (0);
1249 }
1250 
1251 /*
1252  * Calculate the total number of references to a special device.
1253  */
1254 int
1255 vcount(vp)
1256 	register struct vnode *vp;
1257 {
1258 	register struct vnode *vq, *vnext;
1259 	int count;
1260 
1261 loop:
1262 	if ((vp->v_flag & VALIASED) == 0)
1263 		return (vp->v_usecount);
1264 	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1265 		vnext = vq->v_specnext;
1266 		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1267 			continue;
1268 		/*
1269 		 * Alias, but not in use, so flush it out.
1270 		 */
1271 		if (vq->v_usecount == 0 && vq != vp) {
1272 			vgone(vq);
1273 			goto loop;
1274 		}
1275 		count += vq->v_usecount;
1276 	}
1277 	return (count);
1278 }
1279 
1280 /*
1281  * Print out a description of a vnode.
1282  */
1283 static char *typename[] =
1284 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1285 
1286 void
1287 vprint(label, vp)
1288 	char *label;
1289 	register struct vnode *vp;
1290 {
1291 	char buf[64];
1292 
1293 	if (label != NULL)
1294 		printf("%s: ", label);
1295 	printf("type %s, usecount %d, writecount %d, refcount %ld,",
1296 	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1297 	    vp->v_holdcnt);
1298 	buf[0] = '\0';
1299 	if (vp->v_flag & VROOT)
1300 		strcat(buf, "|VROOT");
1301 	if (vp->v_flag & VTEXT)
1302 		strcat(buf, "|VTEXT");
1303 	if (vp->v_flag & VSYSTEM)
1304 		strcat(buf, "|VSYSTEM");
1305 	if (vp->v_flag & VXLOCK)
1306 		strcat(buf, "|VXLOCK");
1307 	if (vp->v_flag & VXWANT)
1308 		strcat(buf, "|VXWANT");
1309 	if (vp->v_flag & VBWAIT)
1310 		strcat(buf, "|VBWAIT");
1311 	if (vp->v_flag & VALIASED)
1312 		strcat(buf, "|VALIASED");
1313 	if (buf[0] != '\0')
1314 		printf(" flags (%s)", &buf[1]);
1315 	if (vp->v_data == NULL) {
1316 		printf("\n");
1317 	} else {
1318 		printf("\n\t");
1319 		VOP_PRINT(vp);
1320 	}
1321 }
1322 
1323 #ifdef DDB
1324 /*
1325  * List all of the locked vnodes in the system.
1326  * Called when debugging the kernel.
1327  */
1328 void
1329 printlockedvnodes(void)
1330 {
1331 	register struct mount *mp;
1332 	register struct vnode *vp;
1333 
1334 	printf("Locked vnodes\n");
1335 	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
1336 	    mp = mp->mnt_list.cqe_next) {
1337 		for (vp = mp->mnt_vnodelist.lh_first;
1338 		    vp != NULL;
1339 		    vp = vp->v_mntvnodes.le_next)
1340 			if (VOP_ISLOCKED(vp))
1341 				vprint((char *) 0, vp);
1342 	}
1343 }
1344 #endif
1345 
1346 int kinfo_vdebug = 1;
1347 int kinfo_vgetfailed;
1348 
1349 #define KINFO_VNODESLOP	10
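/* slop to allow for vnodes created while the copyout is in progress */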
1350 /*
1351  * Dump vnode list (via sysctl).
1352  * Copyout address of vnode followed by vnode.
1353  */
1354 /* ARGSUSED */
1355 static int
1356 sysctl_vnode SYSCTL_HANDLER_ARGS
1357 {
1358 	register struct mount *mp, *nmp;
1359 	struct vnode *vp;
1360 	int error;
1361 
1362 #define VPTRSZ	sizeof (struct vnode *)
1363 #define VNODESZ	sizeof (struct vnode)
1364 
1365 	req->lock = 0;
1366 	if (!req->oldptr) /* Make an estimate */
1367 		return (SYSCTL_OUT(req, 0,
1368 			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
1369 
1370 	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1371 		nmp = mp->mnt_list.cqe_next;
1372 		if (vfs_busy(mp))
1373 			continue;
1374 again:
1375 		for (vp = mp->mnt_vnodelist.lh_first;
1376 		    vp != NULL;
1377 		    vp = vp->v_mntvnodes.le_next) {
1378 			/*
1379 			 * Check that the vp is still associated with this
1380 			 * filesystem.  RACE: could have been recycled onto
1381 			 * the same filesystem.
1382 			 */
1383 			if (vp->v_mount != mp) {
1384 				if (kinfo_vdebug)
1385 					printf("kinfo: vp changed\n");
1386 				goto again;
1387 			}
1388 			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
1389 			    (error = SYSCTL_OUT(req, vp, VNODESZ))) {
1390 				vfs_unbusy(mp);
1391 				return (error);
1392 			}
1393 		}
1394 		vfs_unbusy(mp);
1395 	}
1396 
1397 	return (0);
1398 }
1399 
1400 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
1401 	0, 0, sysctl_vnode, "S,vnode", "");
1402 
1403 /*
1404  * Check to see if a filesystem is mounted on a block device.
1405  */
1406 int
1407 vfs_mountedon(vp)
1408 	register struct vnode *vp;
1409 {
1410 	register struct vnode *vq;
1411 
1412 	if (vp->v_specflags & SI_MOUNTEDON)
1413 		return (EBUSY);
1414 	if (vp->v_flag & VALIASED) {
1415 		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1416 			if (vq->v_rdev != vp->v_rdev ||
1417 			    vq->v_type != vp->v_type)
1418 				continue;
1419 			if (vq->v_specflags & SI_MOUNTEDON)
1420 				return (EBUSY);
1421 		}
1422 	}
1423 	return (0);
1424 }
1425 
1426 /*
1427  * Build hash lists of net addresses and hang them off the mount point.
1428  * Called by ufs_mount() to set up the lists of export addresses.
1429  */
1430 static int
1431 vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
1432 	struct export_args *argp)
1433 {
1434 	register struct netcred *np;
1435 	register struct radix_node_head *rnh;
1436 	register int i;
1437 	struct radix_node *rn;
1438 	struct sockaddr *saddr, *smask = 0;
1439 	struct domain *dom;
1440 	int error;
1441 
1442 	if (argp->ex_addrlen == 0) {
1443 		if (mp->mnt_flag & MNT_DEFEXPORTED)
1444 			return (EPERM);
1445 		np = &nep->ne_defexported;
1446 		np->netc_exflags = argp->ex_flags;
1447 		np->netc_anon = argp->ex_anon;
1448 		np->netc_anon.cr_ref = 1;
1449 		mp->mnt_flag |= MNT_DEFEXPORTED;
1450 		return (0);
1451 	}
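	/*
	 * The netcred and the copied-in address and mask live in one
	 * contiguous allocation; saddr and smask point just past the
	 * netcred header.
	 */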
1452 	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
1453 	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
1454 	bzero((caddr_t) np, i);
1455 	saddr = (struct sockaddr *) (np + 1);
1456 	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
1457 		goto out;
1458 	if (saddr->sa_len > argp->ex_addrlen)
1459 		saddr->sa_len = argp->ex_addrlen;
1460 	if (argp->ex_masklen) {
1461 		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
1462 		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
1463 		if (error)
1464 			goto out;
1465 		if (smask->sa_len > argp->ex_masklen)
1466 			smask->sa_len = argp->ex_masklen;
1467 	}
1468 	i = saddr->sa_family;
1469 	if ((rnh = nep->ne_rtable[i]) == 0) {
1470 		/*
1471 		 * It seems silly to initialize every AF when most are not
1472 		 * used, so do it on demand here.
1473 		 */
1474 		for (dom = domains; dom; dom = dom->dom_next)
1475 			if (dom->dom_family == i && dom->dom_rtattach) {
1476 				dom->dom_rtattach((void **) &nep->ne_rtable[i],
1477 				    dom->dom_rtoffset);
1478 				break;
1479 			}
1480 		if ((rnh = nep->ne_rtable[i]) == 0) {
1481 			error = ENOBUFS;
1482 			goto out;
1483 		}
1484 	}
1485 	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
1486 	    np->netc_rnodes);
1487 	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
1488 		error = EPERM;
1489 		goto out;
1490 	}
1491 	np->netc_exflags = argp->ex_flags;
1492 	np->netc_anon = argp->ex_anon;
1493 	np->netc_anon.cr_ref = 1;
1494 	return (0);
1495 out:
1496 	free(np, M_NETADDR);
1497 	return (error);
1498 }
1499 
1500 /* ARGSUSED */
1501 static int
1502 vfs_free_netcred(struct radix_node *rn, void *w)
1503 {
1504 	register struct radix_node_head *rnh = (struct radix_node_head *) w;
1505 
1506 	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
1507 	free((caddr_t) rn, M_NETADDR);
1508 	return (0);
1509 }
1510 
1511 /*
1512  * Free the net address hash lists that are hanging off the mount points.
1513  */
1514 static void
1515 vfs_free_addrlist(struct netexport *nep)
1516 {
1517 	register int i;
1518 	register struct radix_node_head *rnh;
1519 
1520 	for (i = 0; i <= AF_MAX; i++)
1521 		if ((rnh = nep->ne_rtable[i])) {
1522 			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
1523 			    (caddr_t) rnh);
1524 			free((caddr_t) rnh, M_RTABLE);
1525 			nep->ne_rtable[i] = 0;
1526 		}
1527 }
1528 
1529 int
1530 vfs_export(mp, nep, argp)
1531 	struct mount *mp;
1532 	struct netexport *nep;
1533 	struct export_args *argp;
1534 {
1535 	int error;
1536 
1537 	if (argp->ex_flags & MNT_DELEXPORT) {
1538 		vfs_free_addrlist(nep);
1539 		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
1540 	}
1541 	if (argp->ex_flags & MNT_EXPORTED) {
1542 		if ((error = vfs_hang_addrlist(mp, nep, argp)))
1543 			return (error);
1544 		mp->mnt_flag |= MNT_EXPORTED;
1545 	}
1546 	return (0);
1547 }
1548 
1549 struct netcred *
1550 vfs_export_lookup(mp, nep, nam)
1551 	register struct mount *mp;
1552 	struct netexport *nep;
1553 	struct mbuf *nam;
1554 {
1555 	register struct netcred *np;
1556 	register struct radix_node_head *rnh;
1557 	struct sockaddr *saddr;
1558 
1559 	np = NULL;
1560 	if (mp->mnt_flag & MNT_EXPORTED) {
1561 		/*
1562 		 * Lookup in the export list first.
1563 		 */
1564 		if (nam != NULL) {
1565 			saddr = mtod(nam, struct sockaddr *);
1566 			rnh = nep->ne_rtable[saddr->sa_family];
1567 			if (rnh != NULL) {
1568 				np = (struct netcred *)
1569 				    (*rnh->rnh_matchaddr) ((caddr_t) saddr,
1570 				    rnh);
1571 				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
1572 					np = NULL;
1573 			}
1574 		}
1575 		/*
1576 		 * If no address match, use the default if it exists.
1577 		 */
1578 		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
1579 			np = &nep->ne_defexported;
1580 	}
1581 	return (np);
1582 }
1583 
1584 
1585 /*
1586  * Perform msync on all vnodes under a mount point.
1587  * The mount point must be locked.
1588  */
1589 void
1590 vfs_msync(struct mount *mp, int flags)
{
1591 	struct vnode *vp, *nvp;
1592 loop:
1593 	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
1594 
1595 		if (vp->v_mount != mp)
1596 			goto loop;
1597 		nvp = vp->v_mntvnodes.le_next;
1598 		if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT))
1599 			continue;
1600 		if (vp->v_object &&
1601 		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
1602 			vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE);
1603 		}
1604 	}
1605 }
1606 
1607 /*
1608  * Create the VM object needed for VMIO and mmap support.  This
1609  * is done for all VREG files in the system.  Some filesystems might
1610  * take advantage of the additional metadata buffering capability of
1611  * the VMIO code by making the device node be VMIO mode as well.
1612  */
1613 int
1614 vfs_object_create(vp, p, cred, waslocked)
1615 	struct vnode *vp;
1616 	struct proc *p;
1617 	struct ucred *cred;
1618 	int waslocked;
1619 {
1620 	struct vattr vat;
1621 	vm_object_t object;
1622 	int error = 0;
1623 
1624 retry:
1625 	if ((object = vp->v_object) == NULL) {
1626 		if (vp->v_type == VREG) {
1627 			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
1628 				goto retn;
1629 			(void) vnode_pager_alloc(vp,
1630 				OFF_TO_IDX(round_page(vat.va_size)), 0, 0);
1631 		} else {
1632 			/*
1633 			 * This simply allocates the biggest object possible
1634 			 * for a VBLK vnode.  This should be fixed, but doesn't
1635 			 * cause any problems (yet).
1636 			 */
1637 			(void) vnode_pager_alloc(vp, INT_MAX, 0, 0);
1638 		}
1639 		vp->v_object->flags |= OBJ_VFS_REF;
1640 	} else {
1641 		if (object->flags & OBJ_DEAD) {
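			/*
			 * The previous object is still being torn down;
			 * sleep until that finishes, then look again.
			 */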
1642 			if (waslocked)
1643 				VOP_UNLOCK(vp);
1644 			tsleep(object, PVM, "vodead", 0);
1645 			if (waslocked)
1646 				VOP_LOCK(vp);
1647 			goto retry;
1648 		}
1649 		if ((object->flags & OBJ_VFS_REF) == 0) {
1650 			object->flags |= OBJ_VFS_REF;
1651 			vm_object_reference(object);
1652 		}
1653 	}
1654 	if (vp->v_object)
1655 		vp->v_flag |= VVMIO;
1656 
1657 retn:
1658 	return error;
1659 }
1660