xref: /freebsd/sys/kern/vfs_default.c (revision 7d0d268b8a67f28ccefdd0b8ce6fb38acac78d80)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed
6  * to Berkeley by John Heidemann of the UCLA Ficus project.
7  *
8  * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/bio.h>
41 #include <sys/buf.h>
42 #include <sys/conf.h>
43 #include <sys/event.h>
44 #include <sys/kernel.h>
45 #include <sys/limits.h>
46 #include <sys/lock.h>
47 #include <sys/lockf.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/mutex.h>
51 #include <sys/unistd.h>
52 #include <sys/vnode.h>
53 #include <sys/poll.h>
54 
55 #include <vm/vm.h>
56 #include <vm/vm_object.h>
57 #include <vm/vm_extern.h>
58 #include <vm/pmap.h>
59 #include <vm/vm_map.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pager.h>
62 #include <vm/vnode_pager.h>
63 
/*
 * Forward declarations of the file-local default VOPs; both are
 * referenced by the default_vnodeops table before they are defined.
 */
static int	vop_nolookup(struct vop_lookup_args *);
static int	vop_nostrategy(struct vop_strategy_args *);
66 
/*
 * This vnode table stores what we want to do if the filesystem doesn't
 * implement a particular VOP.
 *
 * If there is no specific entry here, we will return EOPNOTSUPP
 * (vop_bypass is set to VOP_EOPNOTSUPP below).
 *
 * NOTE(review): the VOP_NULL / VOP_EINVAL / VOP_ENOENT / VOP_ENOTTY /
 * VOP_EOPNOTSUPP / VOP_PANIC names presumably resolve to the vop_null(),
 * vop_einval(), ... placeholder functions defined later in this file;
 * confirm against the header that defines them.
 */

struct vop_vector default_vnodeops = {
	.vop_default =		NULL,
	.vop_bypass =		VOP_EOPNOTSUPP,

	.vop_advlock =		vop_stdadvlock,
	.vop_advlockasync =	vop_stdadvlockasync,
	.vop_bmap =		vop_stdbmap,
	.vop_close =		VOP_NULL,
	.vop_fsync =		VOP_NULL,
	.vop_getpages =		vop_stdgetpages,
	.vop_getwritemount = 	vop_stdgetwritemount,
	.vop_inactive =		VOP_NULL,
	.vop_ioctl =		VOP_ENOTTY,
	.vop_kqfilter =		vop_stdkqfilter,
	.vop_islocked =		vop_stdislocked,
	.vop_lease =		VOP_NULL,
	.vop_lock1 =		vop_stdlock,
	.vop_lookup =		vop_nolookup,
	.vop_open =		VOP_NULL,
	.vop_pathconf =		VOP_EINVAL,
	.vop_poll =		vop_nopoll,
	.vop_putpages =		vop_stdputpages,
	.vop_readlink =		VOP_EINVAL,
	.vop_revoke =		VOP_PANIC,
	.vop_strategy =		vop_nostrategy,
	.vop_unlock =		vop_stdunlock,
	.vop_vptocnp =		VOP_ENOENT,
	.vop_vptofh =		vop_stdvptofh,
};
104 
105 /*
106  * Series of placeholder functions for various error returns for
107  * VOPs.
108  */
109 
/*
 * Generic VOP placeholder: reject the operation as unsupported.
 * The argument block is not examined; the result is always EOPNOTSUPP.
 */
int
vop_eopnotsupp(struct vop_generic_args *ap)
{

	return (EOPNOTSUPP);
}
119 
/*
 * Generic VOP placeholder that always fails with EBADF.
 * The argument block is not examined.
 */
int
vop_ebadf(struct vop_generic_args *ap)
{

	return (EBADF);
}
126 
/*
 * Generic VOP placeholder that always fails with ENOTTY.
 * The argument block is not examined.
 */
int
vop_enotty(struct vop_generic_args *ap)
{

	return (ENOTTY);
}
133 
/*
 * Generic VOP placeholder that always fails with EINVAL.
 * The argument block is not examined.
 */
int
vop_einval(struct vop_generic_args *ap)
{

	return (EINVAL);
}
140 
/*
 * Generic VOP placeholder that always fails with ENOENT.
 * The argument block is not examined.
 */
int
vop_enoent(struct vop_generic_args *ap)
{

	return (ENOENT);
}
147 
/*
 * Generic VOP placeholder that does nothing and reports success (0).
 * The argument block is not examined.
 */
int
vop_null(struct vop_generic_args *ap)
{

	return (0);
}
154 
155 /*
156  * Helper function to panic on some bad VOPs in some filesystems.
157  */
158 int
159 vop_panic(struct vop_generic_args *ap)
160 {
161 
162 	panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name);
163 }
164 
165 /*
166  * vop_std<something> and vop_no<something> are default functions for use by
167  * filesystems that need the "default reasonable" implementation for a
168  * particular operation.
169  *
 * Where documentation for the operations they implement exists, it is
 * found in the VOP_<SOMETHING>(9) manpage (all uppercase).
172  */
173 
174 /*
175  * Default vop for filesystems that do not support name lookup
176  */
177 static int
178 vop_nolookup(ap)
179 	struct vop_lookup_args /* {
180 		struct vnode *a_dvp;
181 		struct vnode **a_vpp;
182 		struct componentname *a_cnp;
183 	} */ *ap;
184 {
185 
186 	*ap->a_vpp = NULL;
187 	return (ENOTDIR);
188 }
189 
190 /*
191  *	vop_nostrategy:
192  *
193  *	Strategy routine for VFS devices that have none.
194  *
195  *	BIO_ERROR and B_INVAL must be cleared prior to calling any strategy
196  *	routine.  Typically this is done for a BIO_READ strategy call.
197  *	Typically B_INVAL is assumed to already be clear prior to a write
198  *	and should not be cleared manually unless you just made the buffer
199  *	invalid.  BIO_ERROR should be cleared either way.
200  */
201 
202 static int
203 vop_nostrategy (struct vop_strategy_args *ap)
204 {
205 	printf("No strategy for buffer at %p\n", ap->a_bp);
206 	vprint("vnode", ap->a_vp);
207 	ap->a_bp->b_ioflags |= BIO_ERROR;
208 	ap->a_bp->b_error = EOPNOTSUPP;
209 	bufdone(ap->a_bp);
210 	return (EOPNOTSUPP);
211 }
212 
213 /*
214  * Advisory record locking support
215  */
216 int
217 vop_stdadvlock(struct vop_advlock_args *ap)
218 {
219 	struct vnode *vp;
220 	struct ucred *cred;
221 	struct vattr vattr;
222 	int error;
223 
224 	vp = ap->a_vp;
225 	cred = curthread->td_ucred;
226 	vn_lock(vp, LK_SHARED | LK_RETRY);
227 	error = VOP_GETATTR(vp, &vattr, cred);
228 	VOP_UNLOCK(vp, 0);
229 	if (error)
230 		return (error);
231 
232 	return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size));
233 }
234 
235 int
236 vop_stdadvlockasync(struct vop_advlockasync_args *ap)
237 {
238 	struct vnode *vp;
239 	struct ucred *cred;
240 	struct vattr vattr;
241 	int error;
242 
243 	vp = ap->a_vp;
244 	cred = curthread->td_ucred;
245 	vn_lock(vp, LK_SHARED | LK_RETRY);
246 	error = VOP_GETATTR(vp, &vattr, cred);
247 	VOP_UNLOCK(vp, 0);
248 	if (error)
249 		return (error);
250 
251 	return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size));
252 }
253 
254 /*
255  * vop_stdpathconf:
256  *
257  * Standard implementation of POSIX pathconf, to get information about limits
258  * for a filesystem.
259  * Override per filesystem for the case where the filesystem has smaller
260  * limits.
261  */
262 int
263 vop_stdpathconf(ap)
264 	struct vop_pathconf_args /* {
265 	struct vnode *a_vp;
266 	int a_name;
267 	int *a_retval;
268 	} */ *ap;
269 {
270 
271 	switch (ap->a_name) {
272 		case _PC_NAME_MAX:
273 			*ap->a_retval = NAME_MAX;
274 			return (0);
275 		case _PC_PATH_MAX:
276 			*ap->a_retval = PATH_MAX;
277 			return (0);
278 		case _PC_LINK_MAX:
279 			*ap->a_retval = LINK_MAX;
280 			return (0);
281 		case _PC_MAX_CANON:
282 			*ap->a_retval = MAX_CANON;
283 			return (0);
284 		case _PC_MAX_INPUT:
285 			*ap->a_retval = MAX_INPUT;
286 			return (0);
287 		case _PC_PIPE_BUF:
288 			*ap->a_retval = PIPE_BUF;
289 			return (0);
290 		case _PC_CHOWN_RESTRICTED:
291 			*ap->a_retval = 1;
292 			return (0);
293 		case _PC_VDISABLE:
294 			*ap->a_retval = _POSIX_VDISABLE;
295 			return (0);
296 		default:
297 			return (EINVAL);
298 	}
299 	/* NOTREACHED */
300 }
301 
302 /*
303  * Standard lock, unlock and islocked functions.
304  */
305 int
306 vop_stdlock(ap)
307 	struct vop_lock1_args /* {
308 		struct vnode *a_vp;
309 		int a_flags;
310 		char *file;
311 		int line;
312 	} */ *ap;
313 {
314 	struct vnode *vp = ap->a_vp;
315 
316 	return (_lockmgr_args(vp->v_vnlock, ap->a_flags, VI_MTX(vp),
317 	    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file,
318 	    ap->a_line));
319 }
320 
321 /* See above. */
322 int
323 vop_stdunlock(ap)
324 	struct vop_unlock_args /* {
325 		struct vnode *a_vp;
326 		int a_flags;
327 	} */ *ap;
328 {
329 	struct vnode *vp = ap->a_vp;
330 
331 	return (lockmgr(vp->v_vnlock, ap->a_flags | LK_RELEASE, VI_MTX(vp)));
332 }
333 
334 /* See above. */
335 int
336 vop_stdislocked(ap)
337 	struct vop_islocked_args /* {
338 		struct vnode *a_vp;
339 	} */ *ap;
340 {
341 
342 	return (lockstatus(ap->a_vp->v_vnlock));
343 }
344 
345 /*
346  * Return true for select/poll.
347  */
348 int
349 vop_nopoll(ap)
350 	struct vop_poll_args /* {
351 		struct vnode *a_vp;
352 		int  a_events;
353 		struct ucred *a_cred;
354 		struct thread *a_td;
355 	} */ *ap;
356 {
357 	/*
358 	 * Return true for read/write.  If the user asked for something
359 	 * special, return POLLNVAL, so that clients have a way of
360 	 * determining reliably whether or not the extended
361 	 * functionality is present without hard-coding knowledge
362 	 * of specific filesystem implementations.
363 	 * Stay in sync with kern_conf.c::no_poll().
364 	 */
365 	if (ap->a_events & ~POLLSTANDARD)
366 		return (POLLNVAL);
367 
368 	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
369 }
370 
371 /*
372  * Implement poll for local filesystems that support it.
373  */
374 int
375 vop_stdpoll(ap)
376 	struct vop_poll_args /* {
377 		struct vnode *a_vp;
378 		int  a_events;
379 		struct ucred *a_cred;
380 		struct thread *a_td;
381 	} */ *ap;
382 {
383 	if (ap->a_events & ~POLLSTANDARD)
384 		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
385 	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
386 }
387 
388 /*
389  * Return our mount point, as we will take charge of the writes.
390  */
391 int
392 vop_stdgetwritemount(ap)
393 	struct vop_getwritemount_args /* {
394 		struct vnode *a_vp;
395 		struct mount **a_mpp;
396 	} */ *ap;
397 {
398 	struct mount *mp;
399 
400 	/*
401 	 * XXX Since this is called unlocked we may be recycled while
402 	 * attempting to ref the mount.  If this is the case or mountpoint
403 	 * will be set to NULL.  We only have to prevent this call from
404 	 * returning with a ref to an incorrect mountpoint.  It is not
405 	 * harmful to return with a ref to our previous mountpoint.
406 	 */
407 	mp = ap->a_vp->v_mount;
408 	if (mp != NULL) {
409 		vfs_ref(mp);
410 		if (mp != ap->a_vp->v_mount) {
411 			vfs_rel(mp);
412 			mp = NULL;
413 		}
414 	}
415 	*(ap->a_mpp) = mp;
416 	return (0);
417 }
418 
419 /* XXX Needs good comment and VOP_BMAP(9) manpage */
420 int
421 vop_stdbmap(ap)
422 	struct vop_bmap_args /* {
423 		struct vnode *a_vp;
424 		daddr_t  a_bn;
425 		struct bufobj **a_bop;
426 		daddr_t *a_bnp;
427 		int *a_runp;
428 		int *a_runb;
429 	} */ *ap;
430 {
431 
432 	if (ap->a_bop != NULL)
433 		*ap->a_bop = &ap->a_vp->v_bufobj;
434 	if (ap->a_bnp != NULL)
435 		*ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize);
436 	if (ap->a_runp != NULL)
437 		*ap->a_runp = 0;
438 	if (ap->a_runb != NULL)
439 		*ap->a_runb = 0;
440 	return (0);
441 }
442 
443 int
444 vop_stdfsync(ap)
445 	struct vop_fsync_args /* {
446 		struct vnode *a_vp;
447 		struct ucred *a_cred;
448 		int a_waitfor;
449 		struct thread *a_td;
450 	} */ *ap;
451 {
452 	struct vnode *vp = ap->a_vp;
453 	struct buf *bp;
454 	struct bufobj *bo;
455 	struct buf *nbp;
456 	int error = 0;
457 	int maxretry = 1000;     /* large, arbitrarily chosen */
458 
459 	bo = &vp->v_bufobj;
460 	BO_LOCK(bo);
461 loop1:
462 	/*
463 	 * MARK/SCAN initialization to avoid infinite loops.
464 	 */
465         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
466                 bp->b_vflags &= ~BV_SCANNED;
467 		bp->b_error = 0;
468 	}
469 
470 	/*
471 	 * Flush all dirty buffers associated with a vnode.
472 	 */
473 loop2:
474 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
475 		if ((bp->b_vflags & BV_SCANNED) != 0)
476 			continue;
477 		bp->b_vflags |= BV_SCANNED;
478 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
479 			continue;
480 		BO_UNLOCK(bo);
481 		KASSERT(bp->b_bufobj == bo,
482 		    ("bp %p wrong b_bufobj %p should be %p",
483 		    bp, bp->b_bufobj, bo));
484 		if ((bp->b_flags & B_DELWRI) == 0)
485 			panic("fsync: not dirty");
486 		if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
487 			vfs_bio_awrite(bp);
488 		} else {
489 			bremfree(bp);
490 			bawrite(bp);
491 		}
492 		BO_LOCK(bo);
493 		goto loop2;
494 	}
495 
496 	/*
497 	 * If synchronous the caller expects us to completely resolve all
498 	 * dirty buffers in the system.  Wait for in-progress I/O to
499 	 * complete (which could include background bitmap writes), then
500 	 * retry if dirty blocks still exist.
501 	 */
502 	if (ap->a_waitfor == MNT_WAIT) {
503 		bufobj_wwait(bo, 0, 0);
504 		if (bo->bo_dirty.bv_cnt > 0) {
505 			/*
506 			 * If we are unable to write any of these buffers
507 			 * then we fail now rather than trying endlessly
508 			 * to write them out.
509 			 */
510 			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
511 				if ((error = bp->b_error) == 0)
512 					continue;
513 			if (error == 0 && --maxretry >= 0)
514 				goto loop1;
515 			error = EAGAIN;
516 		}
517 	}
518 	BO_UNLOCK(bo);
519 	if (error == EAGAIN)
520 		vprint("fsync: giving up on dirty", vp);
521 
522 	return (error);
523 }
524 
525 /* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */
526 int
527 vop_stdgetpages(ap)
528 	struct vop_getpages_args /* {
529 		struct vnode *a_vp;
530 		vm_page_t *a_m;
531 		int a_count;
532 		int a_reqpage;
533 		vm_ooffset_t a_offset;
534 	} */ *ap;
535 {
536 
537 	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
538 	    ap->a_count, ap->a_reqpage);
539 }
540 
/*
 * Default kqfilter VOP: hands the unmodified argument block to
 * vfs_kqfilter() and returns its result.
 */
int
vop_stdkqfilter(struct vop_kqfilter_args *ap)
{

	return (vfs_kqfilter(ap));
}
546 
547 /* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */
548 int
549 vop_stdputpages(ap)
550 	struct vop_putpages_args /* {
551 		struct vnode *a_vp;
552 		vm_page_t *a_m;
553 		int a_count;
554 		int a_sync;
555 		int *a_rtvals;
556 		vm_ooffset_t a_offset;
557 	} */ *ap;
558 {
559 
560 	return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
561 	     ap->a_sync, ap->a_rtvals);
562 }
563 
/*
 * Default vptofh VOP: always fails with EOPNOTSUPP.
 * The argument block is not examined.
 */
int
vop_stdvptofh(struct vop_vptofh_args *ap)
{

	return (EOPNOTSUPP);
}
569 
570 /*
571  * vfs default ops
572  * used to fill the vfs function table to get reasonable default return values.
573  */
/*
 * Default VFS root operation: not supported.
 * No argument is examined and *vpp is not written; returns EOPNOTSUPP.
 */
int
vfs_stdroot(struct mount *mp, int flags, struct vnode **vpp,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
584 
/*
 * Default VFS statfs operation: not supported.
 * No argument is examined and *sbp is not written; returns EOPNOTSUPP.
 */
int
vfs_stdstatfs(struct mount *mp, struct statfs *sbp, struct thread *td)
{

	return (EOPNOTSUPP);
}
594 
/*
 * Default VFS quotactl operation: not supported.
 * No argument is examined; returns EOPNOTSUPP.
 */
int
vfs_stdquotactl(struct mount *mp, int cmds, uid_t uid, void *arg,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
606 
607 int
608 vfs_stdsync(mp, waitfor, td)
609 	struct mount *mp;
610 	int waitfor;
611 	struct thread *td;
612 {
613 	struct vnode *vp, *mvp;
614 	int error, lockreq, allerror = 0;
615 
616 	lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
617 	if (waitfor != MNT_WAIT)
618 		lockreq |= LK_NOWAIT;
619 	/*
620 	 * Force stale buffer cache information to be flushed.
621 	 */
622 	MNT_ILOCK(mp);
623 loop:
624 	MNT_VNODE_FOREACH(vp, mp, mvp) {
625 		/* bv_cnt is an acceptable race here. */
626 		if (vp->v_bufobj.bo_dirty.bv_cnt == 0)
627 			continue;
628 		VI_LOCK(vp);
629 		MNT_IUNLOCK(mp);
630 		if ((error = vget(vp, lockreq, td)) != 0) {
631 			MNT_ILOCK(mp);
632 			if (error == ENOENT) {
633 				MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
634 				goto loop;
635 			}
636 			continue;
637 		}
638 		error = VOP_FSYNC(vp, waitfor, td);
639 		if (error)
640 			allerror = error;
641 
642 		/* Do not turn this into vput.  td is not always curthread. */
643 		VOP_UNLOCK(vp, 0);
644 		vrele(vp);
645 		MNT_ILOCK(mp);
646 	}
647 	MNT_IUNLOCK(mp);
648 	return (allerror);
649 }
650 
/*
 * VFS sync operation for filesystems that have nothing to sync:
 * does nothing and returns 0.  No argument is examined.
 */
int
vfs_stdnosync(struct mount *mp, int waitfor, struct thread *td)
{

	return (0);
}
660 
/*
 * Default VFS vget operation: not supported.
 * No argument is examined and *vpp is not written; returns EOPNOTSUPP.
 */
int
vfs_stdvget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
{

	return (EOPNOTSUPP);
}
671 
/*
 * Default VFS fhtovp operation: not supported.
 * No argument is examined and *vpp is not written; returns EOPNOTSUPP.
 */
int
vfs_stdfhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
{

	return (EOPNOTSUPP);
}
681 
/*
 * Default VFS init operation: nothing to set up; returns 0.
 * The vfsconf argument is not examined.
 */
int
vfs_stdinit(struct vfsconf *vfsp)
{

	return (0);
}
689 
/*
 * Default VFS uninit operation: nothing to tear down; returns 0.
 * The vfsconf argument is not examined.
 */
int
vfs_stduninit(struct vfsconf *vfsp)
{

	return (0);
}
697 
/*
 * Default VFS extattrctl operation: not supported.
 *
 * If a filename vnode was passed in it is unlocked with VOP_UNLOCK()
 * before returning.  The remaining arguments are not examined.
 * Always returns EOPNOTSUPP.
 */
int
vfs_stdextattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
    int attrnamespace, const char *attrname, struct thread *td)
{

	if (filename_vp != NULL)
		VOP_UNLOCK(filename_vp, 0);
	return (EOPNOTSUPP);
}
712 
713 int
714 vfs_stdsysctl(mp, op, req)
715 	struct mount *mp;
716 	fsctlop_t op;
717 	struct sysctl_req *req;
718 {
719 
720 	return (EOPNOTSUPP);
721 }
722 
723 /* end of vfs default ops */
724