xref: /titanic_50/usr/src/uts/common/fs/lofs/lofs_vfsops.c (revision c64027834c5ffc60c557c2b12555e0cd4d30320c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/param.h>
29 #include <sys/errno.h>
30 #include <sys/vfs.h>
31 #include <sys/vnode.h>
32 #include <sys/uio.h>
33 #include <sys/pathname.h>
34 #include <sys/kmem.h>
35 #include <sys/cred.h>
36 #include <sys/statvfs.h>
37 #include <sys/fs/lofs_info.h>
38 #include <sys/fs/lofs_node.h>
39 #include <sys/mount.h>
40 #include <sys/mntent.h>
41 #include <sys/mkdev.h>
42 #include <sys/priv.h>
43 #include <sys/sysmacros.h>
44 #include <sys/systm.h>
45 #include <sys/cmn_err.h>
46 #include <sys/policy.h>
47 #include <sys/tsol/label.h>
48 #include "fs/fs_subr.h"
49 
50 /*
51  * This is the loadable module wrapper.
52  */
53 #include <sys/modctl.h>
54 
55 static mntopts_t lofs_mntopts;
56 
57 static int lofsinit(int, char *);
58 
59 static vfsdef_t vfw = {
60 	VFSDEF_VERSION,
61 	"lofs",
62 	lofsinit,
63 	VSW_HASPROTO|VSW_STATS,
64 	&lofs_mntopts
65 };
66 
67 /*
68  * LOFS mount options table
69  */
70 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
71 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
72 static char *sub_cancel[] = { MNTOPT_LOFS_NOSUB, NULL };
73 static char *nosub_cancel[] = { MNTOPT_LOFS_SUB, NULL };
74 
75 static mntopt_t mntopts[] = {
76 /*
77  *	option name		cancel option	default arg	flags
78  *		private data
79  */
80 	{ MNTOPT_XATTR,		xattr_cancel,	NULL,		0,
81 		(void *)0 },
82 	{ MNTOPT_NOXATTR,	noxattr_cancel,	NULL,		0,
83 		(void *)0 },
84 	{ MNTOPT_LOFS_SUB,	sub_cancel,	NULL,		0,
85 		(void *)0 },
86 	{ MNTOPT_LOFS_NOSUB,	nosub_cancel,	NULL,		0,
87 		(void *)0 },
88 };
89 
90 static mntopts_t lofs_mntopts = {
91 	sizeof (mntopts) / sizeof (mntopt_t),
92 	mntopts
93 };
94 
95 /*
96  * Module linkage information for the kernel.
97  */
98 
99 static struct modlfs modlfs = {
100 	&mod_fsops, "filesystem for lofs", &vfw
101 };
102 
103 static struct modlinkage modlinkage = {
104 	MODREV_1, (void *)&modlfs, NULL
105 };
106 
107 /*
108  * This is the module initialization routine.
109  */
110 
111 int
112 _init(void)
113 {
114 	int status;
115 
116 	lofs_subrinit();
117 	status = mod_install(&modlinkage);
118 	if (status != 0) {
119 		/*
120 		 * Cleanup previously initialized work.
121 		 */
122 		lofs_subrfini();
123 	}
124 
125 	return (status);
126 }
127 
/*
 * Module unload routine.
 *
 * Unloading lofs is refused for now: there is a memory leak if the module
 * gets unloaded, so EBUSY is returned unconditionally.
 */
int
_fini(void)
{
	return (EBUSY);
}
138 
139 int
140 _info(struct modinfo *modinfop)
141 {
142 	return (mod_info(&modlinkage, modinfop));
143 }
144 
145 
146 static int lofsfstype;
147 vfsops_t *lo_vfsops;
148 
149 /*
150  * lo mount vfsop
151  * Set up mount info record and attach it to vfs struct.
152  */
153 /*ARGSUSED*/
154 static int
155 lo_mount(struct vfs *vfsp,
156 	struct vnode *vp,
157 	struct mounta *uap,
158 	struct cred *cr)
159 {
160 	int error;
161 	struct vnode *srootvp = NULL;	/* the server's root */
162 	struct vnode *realrootvp;
163 	struct loinfo *li;
164 	int nodev;
165 
166 	nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL);
167 
168 	if ((error = secpolicy_fs_mount(cr, vp, vfsp)) != 0)
169 		return (EPERM);
170 
171 	/*
172 	 * Loopback devices which get "nodevices" added can be done without
173 	 * "nodevices" set because we cannot import devices into a zone
174 	 * with loopback.  Note that we have all zone privileges when
175 	 * this happens; if not, we'd have gotten "nosuid".
176 	 */
177 	if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
178 		vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY);
179 
180 	mutex_enter(&vp->v_lock);
181 	if (!(uap->flags & MS_OVERLAY) &&
182 	    (vp->v_count != 1 || (vp->v_flag & VROOT))) {
183 		mutex_exit(&vp->v_lock);
184 		return (EBUSY);
185 	}
186 	mutex_exit(&vp->v_lock);
187 
188 	/*
189 	 * Find real root, and make vfs point to real vfs
190 	 */
191 	if (error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ?
192 		UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP,
193 	    &realrootvp))
194 		return (error);
195 
196 	/*
197 	 * Enforce MAC policy if needed.
198 	 *
199 	 * Loopback mounts must not allow writing up. The dominance test
200 	 * is intended to prevent a global zone caller from accidentally
201 	 * creating write-up conditions between two labeled zones.
202 	 * Local zones can't violate MAC on their own without help from
203 	 * the global zone because they can't name a pathname that
204 	 * they don't already have.
205 	 *
206 	 * The special case check for the NET_MAC_AWARE process flag is
207 	 * to support the case of the automounter in the global zone. We
208 	 * permit automounting of local zone directories such as home
209 	 * directories, into the global zone as required by setlabel,
210 	 * zonecopy, and saving of desktop sessions. Such mounts are
211 	 * trusted not to expose the contents of one zone's directories
212 	 * to another by leaking them through the global zone.
213 	 */
214 	if (is_system_labeled() && crgetzoneid(cr) == GLOBAL_ZONEID) {
215 		char	specname[MAXPATHLEN];
216 		zone_t	*from_zptr;
217 		zone_t	*to_zptr;
218 
219 		if (vnodetopath(NULL, realrootvp, specname,
220 		    sizeof (specname), CRED()) != 0)
221 			return (EACCES);
222 
223 		from_zptr = zone_find_by_path(specname);
224 		to_zptr = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
225 
226 		/*
227 		 * Special case for zone devfs: the zone for /dev will
228 		 * incorrectly appear as the global zone since it's not
229 		 * under the zone rootpath.  So for zone devfs check allow
230 		 * read-write mounts.
231 		 *
232 		 * Second special case for scratch zones used for Live Upgrade:
233 		 * this is used to mount the zone's root from /root to /a in
234 		 * the scratch zone.  As with the other special case, this
235 		 * appears to be outside of the zone because it's not under
236 		 * the zone rootpath, which is $ZONEPATH/lu in the scratch
237 		 * zone case.
238 		 */
239 
240 		if (from_zptr != to_zptr &&
241 		    !(to_zptr->zone_flags & ZF_IS_SCRATCH)) {
242 			/*
243 			 * We know at this point that the labels aren't equal
244 			 * because the zone pointers aren't equal, and zones
245 			 * can't share a label.
246 			 *
247 			 * If the source is the global zone then making
248 			 * it available to a local zone must be done in
249 			 * read-only mode as the label will become admin_low.
250 			 *
251 			 * If it is a mount between local zones then if
252 			 * the current process is in the global zone and has
253 			 * the NET_MAC_AWARE flag, then regular read-write
254 			 * access is allowed.  If it's in some other zone, but
255 			 * the label on the mount point dominates the original
256 			 * source, then allow the mount as read-only
257 			 * ("read-down").
258 			 */
259 			if (from_zptr->zone_id == GLOBAL_ZONEID) {
260 				/* make the mount read-only */
261 				vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
262 			} else { /* cross-zone mount */
263 				if (to_zptr->zone_id == GLOBAL_ZONEID &&
264 				    /* LINTED: no consequent */
265 				    getpflags(NET_MAC_AWARE, cr) != 0) {
266 					/* Allow the mount as read-write */
267 				} else if (bldominates(
268 				    label2bslabel(to_zptr->zone_slabel),
269 				    label2bslabel(from_zptr->zone_slabel))) {
270 					/* make the mount read-only */
271 					vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
272 				} else {
273 					zone_rele(to_zptr);
274 					zone_rele(from_zptr);
275 					return (EACCES);
276 				}
277 			}
278 		}
279 		zone_rele(to_zptr);
280 		zone_rele(from_zptr);
281 	}
282 
283 	/*
284 	 * realrootvp may be an AUTOFS node, in which case we
285 	 * perform a VOP_ACCESS() to trigger the mount of the
286 	 * intended filesystem, so we loopback mount the intended
287 	 * filesystem instead of the AUTOFS filesystem.
288 	 */
289 	(void) VOP_ACCESS(realrootvp, 0, 0, cr);
290 
291 	/*
292 	 * We're interested in the top most filesystem.
293 	 * This is specially important when uap->spec is a trigger
294 	 * AUTOFS node, since we're really interested in mounting the
295 	 * filesystem AUTOFS mounted as result of the VOP_ACCESS()
296 	 * call not the AUTOFS node itself.
297 	 */
298 	if (vn_mountedvfs(realrootvp) != NULL) {
299 		if (error = traverse(&realrootvp)) {
300 			VN_RELE(realrootvp);
301 			return (error);
302 		}
303 	}
304 
305 	/*
306 	 * Allocate a vfs info struct and attach it
307 	 */
308 	li = kmem_zalloc(sizeof (struct loinfo), KM_SLEEP);
309 	li->li_realvfs = realrootvp->v_vfsp;
310 	li->li_mountvfs = vfsp;
311 
312 	/*
313 	 * Set mount flags to be inherited by loopback vfs's
314 	 */
315 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
316 		li->li_mflag |= VFS_RDONLY;
317 	}
318 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
319 		li->li_mflag |= (VFS_NOSETUID|VFS_NODEVICES);
320 	}
321 	if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
322 		li->li_mflag |= VFS_NODEVICES;
323 	}
324 	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
325 		li->li_mflag |= VFS_NOSETUID;
326 	}
327 	/*
328 	 * Permissive flags are added to the "deny" bitmap.
329 	 */
330 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
331 		li->li_dflag |= VFS_XATTR;
332 	}
333 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
334 		li->li_dflag |= VFS_NBMAND;
335 	}
336 
337 	/*
338 	 * Propagate inheritable mount flags from the real vfs.
339 	 */
340 	if ((li->li_realvfs->vfs_flag & VFS_RDONLY) &&
341 	    !vfs_optionisset(vfsp, MNTOPT_RO, NULL))
342 		vfs_setmntopt(vfsp, MNTOPT_RO, NULL,
343 		    VFS_NODISPLAY);
344 	if ((li->li_realvfs->vfs_flag & VFS_NOSETUID) &&
345 	    !vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
346 		vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL,
347 		    VFS_NODISPLAY);
348 	if ((li->li_realvfs->vfs_flag & VFS_NODEVICES) &&
349 	    !vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
350 		vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL,
351 		    VFS_NODISPLAY);
352 	/*
353 	 * Permissive flags such as VFS_XATTR, as opposed to restrictive flags
354 	 * such as VFS_RDONLY, are handled differently.  An explicit
355 	 * MNTOPT_NOXATTR should override the underlying filesystem's VFS_XATTR.
356 	 */
357 	if ((li->li_realvfs->vfs_flag & VFS_XATTR) &&
358 	    !vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL) &&
359 	    !vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
360 		vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL,
361 		    VFS_NODISPLAY);
362 	if ((li->li_realvfs->vfs_flag & VFS_NBMAND) &&
363 	    !vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL) &&
364 	    !vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
365 		vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL,
366 		    VFS_NODISPLAY);
367 
368 	li->li_refct = 0;
369 	vfsp->vfs_data = (caddr_t)li;
370 	vfsp->vfs_bcount = 0;
371 	vfsp->vfs_fstype = lofsfstype;
372 	vfsp->vfs_bsize = li->li_realvfs->vfs_bsize;
373 
374 	vfsp->vfs_dev = li->li_realvfs->vfs_dev;
375 	vfsp->vfs_fsid.val[0] = li->li_realvfs->vfs_fsid.val[0];
376 	vfsp->vfs_fsid.val[1] = li->li_realvfs->vfs_fsid.val[1];
377 
378 	if (vfs_optionisset(vfsp, MNTOPT_LOFS_NOSUB, NULL)) {
379 		li->li_flag |= LO_NOSUB;
380 	}
381 
382 	/*
383 	 * Setup the hashtable. If the root of this mount isn't a directory,
384 	 * there's no point in allocating a large hashtable. A table with one
385 	 * bucket is sufficient.
386 	 */
387 	if (realrootvp->v_type != VDIR)
388 		lsetup(li, 1);
389 	else
390 		lsetup(li, 0);
391 
392 	/*
393 	 * Make the root vnode
394 	 */
395 	srootvp = makelonode(realrootvp, li, 0);
396 	srootvp->v_flag |= VROOT;
397 	li->li_rootvp = srootvp;
398 
399 #ifdef LODEBUG
400 	lo_dprint(4, "lo_mount: vfs %p realvfs %p root %p realroot %p li %p\n",
401 	    vfsp, li->li_realvfs, srootvp, realrootvp, li);
402 #endif
403 	return (0);
404 }
405 
406 /*
407  * Undo loopback mount
408  */
409 static int
410 lo_unmount(struct vfs *vfsp, int flag, struct cred *cr)
411 {
412 	struct loinfo *li;
413 
414 	if (secpolicy_fs_unmount(cr, vfsp) != 0)
415 		return (EPERM);
416 
417 	/*
418 	 * Forced unmount is not supported by this file system
419 	 * and thus, ENOTSUP, is being returned.
420 	 */
421 	if (flag & MS_FORCE)
422 		return (ENOTSUP);
423 
424 	li = vtoli(vfsp);
425 #ifdef LODEBUG
426 	lo_dprint(4, "lo_unmount(%p) li %p\n", vfsp, li);
427 #endif
428 	if (li->li_refct != 1 || li->li_rootvp->v_count != 1) {
429 #ifdef LODEBUG
430 		lo_dprint(4, "refct %d v_ct %d\n", li->li_refct,
431 		    li->li_rootvp->v_count);
432 #endif
433 		return (EBUSY);
434 	}
435 	VN_RELE(li->li_rootvp);
436 	return (0);
437 }
438 
439 /*
440  * Find root of lofs mount.
441  */
442 static int
443 lo_root(struct vfs *vfsp, struct vnode **vpp)
444 {
445 	*vpp = vtoli(vfsp)->li_rootvp;
446 #ifdef LODEBUG
447 	lo_dprint(4, "lo_root(0x%p) = %p\n", vfsp, *vpp);
448 #endif
449 	/*
450 	 * If the root of the filesystem is a special file, return the specvp
451 	 * version of the vnode. We don't save the specvp vnode in our
452 	 * hashtable since that's exclusively for lnodes.
453 	 */
454 	if (IS_DEVVP(*vpp)) {
455 		struct vnode *svp;
456 
457 		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, kcred);
458 		if (svp == NULL)
459 			return (ENOSYS);
460 		*vpp = svp;
461 	} else {
462 		VN_HOLD(*vpp);
463 	}
464 
465 	return (0);
466 }
467 
468 /*
469  * Get file system statistics.
470  */
471 static int
472 lo_statvfs(register struct vfs *vfsp, struct statvfs64 *sbp)
473 {
474 	vnode_t *realrootvp;
475 
476 #ifdef LODEBUG
477 	lo_dprint(4, "lostatvfs %p\n", vfsp);
478 #endif
479 	/*
480 	 * Using realrootvp->v_vfsp (instead of the realvfsp that was
481 	 * cached) is necessary to make lofs work woth forced UFS unmounts.
482 	 * In the case of a forced unmount, UFS stores a set of dummy vfsops
483 	 * in all the (i)vnodes in the filesystem. The dummy ops simply
484 	 * returns back EIO.
485 	 */
486 	(void) lo_realvfs(vfsp, &realrootvp);
487 	if (realrootvp != NULL)
488 		return (VFS_STATVFS(realrootvp->v_vfsp, sbp));
489 	else
490 		return (EIO);
491 }
492 
/*
 * Sync vfsop.
 *
 * LOFS has no data or metadata of its own to flush; pending I/O on the
 * underlying filesystem is flushed when that filesystem is synched.  This
 * routine therefore does nothing and always returns 0.
 */
/* ARGSUSED */
static int
lo_sync(struct vfs *vfsp, short flag, struct cred *cr)
{
#ifdef LODEBUG
	lo_dprint(4, "lo_sync: %p\n", vfsp);
#endif
	return (0);
}
508 
509 /*
510  * Obtain the vnode from the underlying filesystem.
511  */
512 static int
513 lo_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
514 {
515 	vnode_t *realrootvp;
516 
517 #ifdef LODEBUG
518 	lo_dprint(4, "lo_vget: %p\n", vfsp);
519 #endif
520 	(void) lo_realvfs(vfsp, &realrootvp);
521 	if (realrootvp != NULL)
522 		return (VFS_VGET(realrootvp->v_vfsp, vpp, fidp));
523 	else
524 		return (EIO);
525 }
526 
527 /*
528  * Free mount-specific data.
529  */
530 static void
531 lo_freevfs(struct vfs *vfsp)
532 {
533 	struct loinfo *li = vtoli(vfsp);
534 
535 	ldestroy(li);
536 	kmem_free(li, sizeof (struct loinfo));
537 }
538 
539 static int
540 lofsinit(int fstyp, char *name)
541 {
542 	static const fs_operation_def_t lo_vfsops_template[] = {
543 		VFSNAME_MOUNT, lo_mount,
544 		VFSNAME_UNMOUNT, lo_unmount,
545 		VFSNAME_ROOT, lo_root,
546 		VFSNAME_STATVFS, lo_statvfs,
547 		VFSNAME_SYNC, (fs_generic_func_p) lo_sync,
548 		VFSNAME_VGET, lo_vget,
549 		VFSNAME_FREEVFS, (fs_generic_func_p) lo_freevfs,
550 		NULL, NULL
551 	};
552 	int error;
553 
554 	error = vfs_setfsops(fstyp, lo_vfsops_template, &lo_vfsops);
555 	if (error != 0) {
556 		cmn_err(CE_WARN, "lofsinit: bad vfs ops template");
557 		return (error);
558 	}
559 
560 	error = vn_make_ops(name, lo_vnodeops_template, &lo_vnodeops);
561 	if (error != 0) {
562 		(void) vfs_freevfsops_by_type(fstyp);
563 		cmn_err(CE_WARN, "lofsinit: bad vnode ops template");
564 		return (error);
565 	}
566 
567 	lofsfstype = fstyp;
568 
569 	return (0);
570 }
571