xref: /freebsd/sys/kern/vfs_subr.c (revision afe61c15161c324a7af299a9b8457aba5afc92db)
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>

#include <vm/vm.h>
#include <sys/sysctl.h>

#include <miscfs/specfs/specdev.h>

void	insmntque	__P((struct vnode *, struct mount *));

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int	vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};
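
/*
 * A quick illustration (assuming the standard definitions in
 * <sys/vnode.h>): these tables back the mode/type conversion macros,
 *
 *	IFTOVT(mode)	-> iftovt_tab[((mode) & S_IFMT) >> 12]
 *	VTTOIF(indx)	-> vttoif_tab[(int)(indx)]
 *
 * so, for example, S_IFDIR (0040000) indexes slot 4 and yields VDIR.
 */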

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {  \
	LIST_REMOVE(bp, b_vnbufs); \
	(bp)->b_vnbufs.le_next = NOLIST; \
}

TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
struct mntlist mountlist;			/* mounted filesystem list */

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&mountlist);
}

/*
 * Lock a filesystem.
 * Used to prevent access to it while mounting and unmounting.
 */
int
vfs_lock(mp)
	register struct mount *mp;
{

	while(mp->mnt_flag & MNT_MLOCK) {
		mp->mnt_flag |= MNT_MWAIT;
		sleep((caddr_t)mp, PVFS);
	}
	mp->mnt_flag |= MNT_MLOCK;
	return (0);
}

/*
 * Unlock a locked filesystem.
 * Panic if filesystem is not locked.
 */
void
vfs_unlock(mp)
	register struct mount *mp;
{

	if ((mp->mnt_flag & MNT_MLOCK) == 0)
		panic("vfs_unlock: not locked");
	mp->mnt_flag &= ~MNT_MLOCK;
	if (mp->mnt_flag & MNT_MWAIT) {
		mp->mnt_flag &= ~MNT_MWAIT;
		wakeup((caddr_t)mp);
	}
}

/*
 * Mark a mount point as busy.
 * Used to synchronize access and to delay unmounting.
 */
int
vfs_busy(mp)
	register struct mount *mp;
{

	while(mp->mnt_flag & MNT_MPBUSY) {
		mp->mnt_flag |= MNT_MPWANT;
		sleep((caddr_t)&mp->mnt_flag, PVFS);
	}
	if (mp->mnt_flag & MNT_UNMOUNT)
		return (1);
	mp->mnt_flag |= MNT_MPBUSY;
	return (0);
}

/*
 * Free a busy filesystem.
 * Panic if filesystem is not busy.
 */
void
vfs_unbusy(mp)
	register struct mount *mp;
{

	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
		panic("vfs_unbusy: not busy");
	mp->mnt_flag &= ~MNT_MPBUSY;
	if (mp->mnt_flag & MNT_MPWANT) {
		mp->mnt_flag &= ~MNT_MPWANT;
		wakeup((caddr_t)&mp->mnt_flag);
	}
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	for (mp = mountlist.tqh_first; mp != NULL; mp = mp->mnt_list.tqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
			return (mp);
	}
	return ((struct mount *)0);
}

/*
 * Get a new unique fsid
 */
void
getnewfsid(mp, mtype)
	struct mount *mp;
	int mtype;
{
	static u_short xxxfs_mntid;
	fsid_t tfsid;

	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
	if (mountlist.tqh_first != NULL) {
		while (getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
		vap->va_fsid = vap->va_fileid =
		vap->va_blocksize = vap->va_rdev =
		vap->va_atime.ts_sec = vap->va_atime.ts_nsec =
		vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec =
		vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec =
		vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)();
extern void vclean();
long numvnodes;
extern struct vattr va_null;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	int (**vops)();
	struct vnode **vpp;
{
	register struct vnode *vp;
	int s;

	if ((vnode_free_list.tqh_first == NULL &&
	     numvnodes < 2 * desiredvnodes) ||
	    numvnodes < desiredvnodes) {
		vp = (struct vnode *)malloc((u_long)sizeof *vp,
		    M_VNODE, M_WAITOK);
		bzero((char *)vp, sizeof *vp);
		numvnodes++;
	} else {
		if ((vp = vnode_free_list.tqh_first) == NULL) {
			tablefull("vnode");
			*vpp = 0;
			return (ENFILE);
		}
		if (vp->v_usecount)
			panic("free vnode isn't");
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		/* see comment on why 0xdeadb is set at end of vgone (below) */
		vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
		vp->v_lease = NULL;
		if (vp->v_type != VBAD)
			vgone(vp);
#ifdef DIAGNOSTIC
		if (vp->v_data)
			panic("cleaned vnode isn't");
		s = splbio();
		if (vp->v_numoutput)
			panic("Clean vnode has pending I/O's");
		splx(s);
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_ralen = 0;
		vp->v_maxra = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
	}
	vp->v_type = VNON;
	cache_purge(vp);
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	return (0);
}
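
/*
 * A sketch of a typical caller, modeled on a filesystem's vget/create
 * path (illustrative only; "ip" stands for some filesystem-private
 * inode and is hypothetical here):
 *
 *	if (error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &nvp))
 *		return (error);
 *	vp = nvp;
 *	vp->v_data = (caddr_t)ip;
 *
 * The vnode comes back with v_usecount == 1, so the caller must
 * eventually drop it with vrele() or vput().
 */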

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL)
		return;
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if (vp = bp->b_vp) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
			if (vp->v_numoutput < 0)
				panic("vwakeup: neg numoutput");
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;

	if (flags & V_SAVE) {
		if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p))
			return (error);
		if (vp->v_dirtyblkhd.lh_first != NULL)
			panic("vinvalbuf: dirty bufs");
	}
	for (;;) {
		if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA)
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = bp->b_vnbufs.le_next;
			if (flags & V_SAVEMETA && bp->b_lblkno < 0)
				continue;
			s = splbio();
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				error = tsleep((caddr_t)bp,
					slpflag | (PRIBIO + 1), "vinvalbuf",
					slptimeo);
				splx(s);
				if (error)
					return (error);
				break;
			}
			bremfree(bp);
			bp->b_flags |= B_BUSY;
			splx(s);
			/*
			 * XXX Since there are no node locks for NFS, I believe
			 * there is a slight chance that a delayed write will
			 * occur while sleeping just above, so check for it.
			 */
			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
				(void) VOP_BWRITE(bp);
				break;
			}
			bp->b_flags |= B_INVAL;
			brelse(bp);
		}
	}
	if (!(flags & V_SAVEMETA) &&
	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
		panic("vinvalbuf: flush failed");
	return (0);
}
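
/*
 * In short: V_SAVE fsyncs the vnode first so no dirty data is lost,
 * V_SAVEMETA spares the indirect-block buffers (those with a negative
 * b_lblkno), and every other buffer is marked B_INVAL and released.
 */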

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	if (bp->b_vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;

	if (bp->b_vp == (struct vnode *) 0)
		panic("brelvp: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	register struct buflists *listheadp;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI)
		listheadp = &newvp->v_dirtyblkhd;
	else
		listheadp = &newvp->v_cleanblkhd;
	bufinsvn(bp, listheadp);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem, argdev, and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV)
		return (0);
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = 0;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	if (nvp = checkalias(vp, dev, (struct mount *)0)) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	register struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vp->v_usecount == 0) {
			vgone(vp);
			goto loop;
		}
		if (vget(vp, 1))
			goto loop;
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
			sizeof(struct specinfo), M_VNODE, M_WAITOK);
		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specflags = 0;
		*vpp = nvp;
		if (vp != NULL) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	VOP_UNLOCK(vp);
	vclean(vp, 0);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}
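
/*
 * Note the return convention above: NULLVP means the caller's new
 * vnode is now the one to use (it has been entered on the alias
 * chain), while a non-NULL return hands back the pre-existing vnode,
 * already cleaned and retargeted to the new vnode operations; the
 * caller then discards its fresh vnode, as bdevvp() does with vput().
 */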

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set if the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, lockflag)
	register struct vnode *vp;
	int lockflag;
{

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined either by checking
	 * that the VXLOCK flag is set, or that the use count is
	 * zero with the back pointer set to show that it has been
	 * removed from the free list by getnewvnode. The VXLOCK
	 * flag may not have been set yet because vclean is blocked in
	 * the VOP_LOCK call waiting for the VOP_INACTIVE to complete.
	 */
	if ((vp->v_flag & VXLOCK) ||
	    (vp->v_usecount == 0 &&
	     vp->v_freelist.tqe_prev == (struct vnode **)0xdeadb)) {
		vp->v_flag |= VXWANT;
		sleep((caddr_t)vp, PINOD);
		return (1);
	}
	if (vp->v_usecount == 0)
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	vp->v_usecount++;
	if (lockflag)
		VOP_LOCK(vp);
	return (0);
}

/*
 * Vnode reference, just increment the count
 */
void
vref(vp)
	struct vnode *vp;
{

	if (vp->v_usecount <= 0)
		panic("vref used where vget required");
	vp->v_usecount++;
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(vp)
	register struct vnode *vp;
{

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Vnode release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	register struct vnode *vp;
{

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	vp->v_usecount--;
	if (vp->v_usecount > 0)
		return;
#ifdef DIAGNOSTIC
	if (vp->v_usecount != 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt");
	}
#endif
	/*
	 * insert at tail of LRU list
	 */
	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	VOP_INACTIVE(vp);
}
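
/*
 * To summarize the reference protocol used throughout this file:
 * vget() activates a vnode, taking it off the free list; vref() adds
 * a reference to an already-active vnode; and each reference is
 * dropped with vrele(), or with vput() when the vnode is also locked.
 */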

/*
 * Page or buffer structure gets a reference.
 */
void
vhold(vp)
	register struct vnode *vp;
{

	vp->v_holdcnt++;
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrele(vp)
	register struct vnode *vp;
{

	if (vp->v_holdcnt <= 0)
		panic("holdrele: holdcnt");
	vp->v_holdcnt--;
}
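
/*
 * vhold() and holdrele() are normally reached through the VHOLD() and
 * HOLDRELE() macros, as in bgetvp() and brelvp() above: a buffer
 * attached to a vnode pins it with a hold reference rather than a
 * full use reference.
 */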

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	register struct vnode *vp, *nvp;
	int busy = 0;

	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
		panic("vflush: not busy");
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM))
			continue;
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG))
			continue;
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			vgone(vp);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgone(vp);
			} else {
				vclean(vp, 0);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		busy++;
	}
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
void
vclean(vp, flags)
	register struct vnode *vp;
	int flags;
{
	int active;

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */
	if (active = vp->v_usecount)
		VREF(vp);
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp);
	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Clean out any buffers associated with the vnode.
	 */
	if (flags & DOCLOSE)
		vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0);
	/*
	 * Any other processes trying to obtain this lock must first
	 * wait for VXLOCK to clear, then call the new lock operation.
	 */
	VOP_UNLOCK(vp);
	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, IO_NDELAY, NOCRED, NULL);
		VOP_INACTIVE(vp);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp))
		panic("vclean: cannot reclaim");
	if (active)
		vrele(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t)vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vgoneall(vp)
	register struct vnode *vp;
{
	register struct vnode *vq;

	if (vp->v_flag & VALIASED) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * wait until it is done and return.
		 */
		if (vp->v_flag & VXLOCK) {
			vp->v_flag |= VXWANT;
			sleep((caddr_t)vp, PINOD);
			return;
		}
		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		vp->v_flag |= VXLOCK;
		while (vp->v_flag & VALIASED) {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type || vp == vq)
					continue;
				vgone(vq);
				break;
			}
		}
		/*
		 * Remove the lock so that vgone below will
		 * really eliminate the vnode after which time
		 * vgone will awaken any sleepers.
		 */
		vp->v_flag &= ~VXLOCK;
	}
	vgone(vp);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	register struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		sleep((caddr_t)vp, PINOD);
		return;
	}
	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL) {
		LIST_REMOVE(vp, v_mntvnodes);
		vp->v_mount = NULL;
	}
	/*
	 * If special device, remove it from special device alias list.
	 */
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}
	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */
	if (vp->v_usecount == 0 &&
	    vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb &&
	    vnode_free_list.tqh_first != vp) {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	}
	vp->v_type = VBAD;
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;

	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		return (1);
	}
	return (0);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	register struct vnode *vp;
{
	register struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	return (count);
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
   { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };

void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[64];

	if (label != NULL)
		printf("%s: ", label);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
		typename[vp->v_type], vp->v_usecount, vp->v_writecount,
		vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DEBUG
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes()
{
	register struct mount *mp;
	register struct vnode *vp;

	printf("Locked vnodes\n");
	for (mp = mountlist.tqh_first; mp != NULL; mp = mp->mnt_list.tqe_next) {
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = vp->v_mntvnodes.le_next)
			if (VOP_ISLOCKED(vp))
				vprint((char *)0, vp);
	}
}
#endif

int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_vnode(where, sizep)
	char *where;
	size_t *sizep;
{
	register struct mount *mp, *nmp;
	struct vnode *vp;
	register char *bp = where, *savebp;
	char *ewhere;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	for (mp = mountlist.tqh_first; mp != NULL; mp = nmp) {
		nmp = mp->mnt_list.tqe_next;
		if (vfs_busy(mp))
			continue;
		savebp = bp;
again:
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = vp->v_mntvnodes.le_next) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				bp = savebp;
				goto again;
			}
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				*sizep = bp - where;
				return (ENOMEM);
			}
			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
			   (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
				return (error);
			bp += VPTRSZ + VNODESZ;
		}
		vfs_unbusy(mp);
	}

	*sizep = bp - where;
	return (0);
}
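
/*
 * The records written above (a vnode pointer followed by the vnode
 * itself) back the kern.vnode sysctl; utilities along the lines of
 * pstat(8) are the expected consumers.
 */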

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	register struct vnode *vp;
{
	register struct vnode *vq;

	if (vp->v_specflags & SI_MOUNTEDON)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specflags & SI_MOUNTEDON)
				return (EBUSY);
		}
	}
	return (0);
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t)np, i);
	saddr = (struct sockaddr *)(np + 1);
	if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not
		 * used; do so on demand here.
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **)&nep->ne_rtable[i],
					dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh,
		np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *)rn) { /* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	caddr_t w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *)w;

	(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t)rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if (rnh = nep->ne_rtable[i]) {
			(*rnh->rnh_walktree)(rnh, vfs_free_netcred,
			    (caddr_t)rnh);
			free((caddr_t)rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (error = vfs_hang_addrlist(mp, nep, argp))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}
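
/*
 * An illustrative call from a filesystem's mount routine, modeled on
 * the ufs_mount() update path, where a null fspec means only the
 * export information is being changed:
 *
 *	if (args.fspec == 0)
 *		return (vfs_export(mp, &ump->um_export, &args.export));
 */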

struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct mbuf *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = mtod(nam, struct sockaddr *);
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
							      rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}
1352