xref: /illumos-gate/usr/src/uts/common/fs/lofs/lofs_vfsops.c (revision 35a5a3587fd94b666239c157d3722745250ccbd7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/param.h>
29 #include <sys/errno.h>
30 #include <sys/vfs.h>
31 #include <sys/vfs_opreg.h>
32 #include <sys/vnode.h>
33 #include <sys/uio.h>
34 #include <sys/pathname.h>
35 #include <sys/kmem.h>
36 #include <sys/cred.h>
37 #include <sys/statvfs.h>
38 #include <sys/fs/lofs_info.h>
39 #include <sys/fs/lofs_node.h>
40 #include <sys/mount.h>
41 #include <sys/mntent.h>
42 #include <sys/mkdev.h>
43 #include <sys/priv.h>
44 #include <sys/sysmacros.h>
45 #include <sys/systm.h>
46 #include <sys/cmn_err.h>
47 #include <sys/policy.h>
48 #include <sys/tsol/label.h>
49 #include "fs/fs_subr.h"
50 
51 /*
52  * This is the loadable module wrapper.
53  */
54 #include <sys/modctl.h>
55 
56 static mntopts_t lofs_mntopts;
57 
58 static int lofsinit(int, char *);
59 
60 static vfsdef_t vfw = {
61 	VFSDEF_VERSION,
62 	"lofs",
63 	lofsinit,
64 	VSW_HASPROTO|VSW_STATS,
65 	&lofs_mntopts
66 };
67 
68 /*
69  * LOFS mount options table
70  */
71 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
72 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
73 static char *sub_cancel[] = { MNTOPT_LOFS_NOSUB, NULL };
74 static char *nosub_cancel[] = { MNTOPT_LOFS_SUB, NULL };
75 
76 static mntopt_t mntopts[] = {
77 /*
78  *	option name		cancel option	default arg	flags
79  *		private data
80  */
81 	{ MNTOPT_XATTR,		xattr_cancel,	NULL,		0,
82 		(void *)0 },
83 	{ MNTOPT_NOXATTR,	noxattr_cancel,	NULL,		0,
84 		(void *)0 },
85 	{ MNTOPT_LOFS_SUB,	sub_cancel,	NULL,		0,
86 		(void *)0 },
87 	{ MNTOPT_LOFS_NOSUB,	nosub_cancel,	NULL,		0,
88 		(void *)0 },
89 };
90 
91 static mntopts_t lofs_mntopts = {
92 	sizeof (mntopts) / sizeof (mntopt_t),
93 	mntopts
94 };
95 
96 /*
97  * Module linkage information for the kernel.
98  */
99 
100 static struct modlfs modlfs = {
101 	&mod_fsops, "filesystem for lofs", &vfw
102 };
103 
104 static struct modlinkage modlinkage = {
105 	MODREV_1, (void *)&modlfs, NULL
106 };
107 
108 /*
109  * This is the module initialization routine.
110  */
111 
112 int
113 _init(void)
114 {
115 	int status;
116 
117 	lofs_subrinit();
118 	status = mod_install(&modlinkage);
119 	if (status != 0) {
120 		/*
121 		 * Cleanup previously initialized work.
122 		 */
123 		lofs_subrfini();
124 	}
125 
126 	return (status);
127 }
128 
/*
 * Module unload routine.
 *
 * Unloading the lofs module is refused for now because there is a
 * memory leak if it gets unloaded; EBUSY is always returned.
 */

int
_fini(void)
{
	return (EBUSY);
}
139 
140 int
141 _info(struct modinfo *modinfop)
142 {
143 	return (mod_info(&modlinkage, modinfop));
144 }
145 
146 
147 static int lofsfstype;
148 vfsops_t *lo_vfsops;
149 
150 /*
151  * lo mount vfsop
152  * Set up mount info record and attach it to vfs struct.
153  */
154 /*ARGSUSED*/
155 static int
156 lo_mount(struct vfs *vfsp,
157 	struct vnode *vp,
158 	struct mounta *uap,
159 	struct cred *cr)
160 {
161 	int error;
162 	struct vnode *srootvp = NULL;	/* the server's root */
163 	struct vnode *realrootvp;
164 	struct loinfo *li;
165 	int nodev;
166 
167 	nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL);
168 
169 	if ((error = secpolicy_fs_mount(cr, vp, vfsp)) != 0)
170 		return (EPERM);
171 
172 	/*
173 	 * Loopback devices which get "nodevices" added can be done without
174 	 * "nodevices" set because we cannot import devices into a zone
175 	 * with loopback.  Note that we have all zone privileges when
176 	 * this happens; if not, we'd have gotten "nosuid".
177 	 */
178 	if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
179 		vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY);
180 
181 	mutex_enter(&vp->v_lock);
182 	if (!(uap->flags & MS_OVERLAY) &&
183 	    (vp->v_count != 1 || (vp->v_flag & VROOT))) {
184 		mutex_exit(&vp->v_lock);
185 		return (EBUSY);
186 	}
187 	mutex_exit(&vp->v_lock);
188 
189 	/*
190 	 * Find real root, and make vfs point to real vfs
191 	 */
192 
193 	if (error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ?
194 	    UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, &realrootvp))
195 		return (error);
196 
197 	/*
198 	 * Enforce MAC policy if needed.
199 	 *
200 	 * Loopback mounts must not allow writing up. The dominance test
201 	 * is intended to prevent a global zone caller from accidentally
202 	 * creating write-up conditions between two labeled zones.
203 	 * Local zones can't violate MAC on their own without help from
204 	 * the global zone because they can't name a pathname that
205 	 * they don't already have.
206 	 *
207 	 * The special case check for the NET_MAC_AWARE process flag is
208 	 * to support the case of the automounter in the global zone. We
209 	 * permit automounting of local zone directories such as home
210 	 * directories, into the global zone as required by setlabel,
211 	 * zonecopy, and saving of desktop sessions. Such mounts are
212 	 * trusted not to expose the contents of one zone's directories
213 	 * to another by leaking them through the global zone.
214 	 */
215 	if (is_system_labeled() && crgetzoneid(cr) == GLOBAL_ZONEID) {
216 		char	specname[MAXPATHLEN];
217 		zone_t	*from_zptr;
218 		zone_t	*to_zptr;
219 
220 		if (vnodetopath(NULL, realrootvp, specname,
221 		    sizeof (specname), CRED()) != 0) {
222 			VN_RELE(realrootvp);
223 			return (EACCES);
224 		}
225 
226 		from_zptr = zone_find_by_path(specname);
227 		to_zptr = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
228 
229 		/*
230 		 * Special case for zone devfs: the zone for /dev will
231 		 * incorrectly appear as the global zone since it's not
232 		 * under the zone rootpath.  So for zone devfs check allow
233 		 * read-write mounts.
234 		 *
235 		 * Second special case for scratch zones used for Live Upgrade:
236 		 * this is used to mount the zone's root from /root to /a in
237 		 * the scratch zone.  As with the other special case, this
238 		 * appears to be outside of the zone because it's not under
239 		 * the zone rootpath, which is $ZONEPATH/lu in the scratch
240 		 * zone case.
241 		 */
242 
243 		if (from_zptr != to_zptr &&
244 		    !(to_zptr->zone_flags & ZF_IS_SCRATCH)) {
245 			/*
246 			 * We know at this point that the labels aren't equal
247 			 * because the zone pointers aren't equal, and zones
248 			 * can't share a label.
249 			 *
250 			 * If the source is the global zone then making
251 			 * it available to a local zone must be done in
252 			 * read-only mode as the label will become admin_low.
253 			 *
254 			 * If it is a mount between local zones then if
255 			 * the current process is in the global zone and has
256 			 * the NET_MAC_AWARE flag, then regular read-write
257 			 * access is allowed.  If it's in some other zone, but
258 			 * the label on the mount point dominates the original
259 			 * source, then allow the mount as read-only
260 			 * ("read-down").
261 			 */
262 			if (from_zptr->zone_id == GLOBAL_ZONEID) {
263 				/* make the mount read-only */
264 				vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
265 			} else { /* cross-zone mount */
266 				if (to_zptr->zone_id == GLOBAL_ZONEID &&
267 				    /* LINTED: no consequent */
268 				    getpflags(NET_MAC_AWARE, cr) != 0) {
269 					/* Allow the mount as read-write */
270 				} else if (bldominates(
271 				    label2bslabel(to_zptr->zone_slabel),
272 				    label2bslabel(from_zptr->zone_slabel))) {
273 					/* make the mount read-only */
274 					vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
275 				} else {
276 					VN_RELE(realrootvp);
277 					zone_rele(to_zptr);
278 					zone_rele(from_zptr);
279 					return (EACCES);
280 				}
281 			}
282 		}
283 		zone_rele(to_zptr);
284 		zone_rele(from_zptr);
285 	}
286 
287 	/*
288 	 * realrootvp may be an AUTOFS node, in which case we
289 	 * perform a VOP_ACCESS() to trigger the mount of the
290 	 * intended filesystem, so we loopback mount the intended
291 	 * filesystem instead of the AUTOFS filesystem.
292 	 */
293 	(void) VOP_ACCESS(realrootvp, 0, 0, cr, NULL);
294 
295 	/*
296 	 * We're interested in the top most filesystem.
297 	 * This is specially important when uap->spec is a trigger
298 	 * AUTOFS node, since we're really interested in mounting the
299 	 * filesystem AUTOFS mounted as result of the VOP_ACCESS()
300 	 * call not the AUTOFS node itself.
301 	 */
302 	if (vn_mountedvfs(realrootvp) != NULL) {
303 		if (error = traverse(&realrootvp)) {
304 			VN_RELE(realrootvp);
305 			return (error);
306 		}
307 	}
308 
309 	/*
310 	 * Allocate a vfs info struct and attach it
311 	 */
312 	li = kmem_zalloc(sizeof (struct loinfo), KM_SLEEP);
313 	li->li_realvfs = realrootvp->v_vfsp;
314 	li->li_mountvfs = vfsp;
315 
316 	/*
317 	 * Set mount flags to be inherited by loopback vfs's
318 	 */
319 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
320 		li->li_mflag |= VFS_RDONLY;
321 	}
322 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
323 		li->li_mflag |= (VFS_NOSETUID|VFS_NODEVICES);
324 	}
325 	if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
326 		li->li_mflag |= VFS_NODEVICES;
327 	}
328 	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
329 		li->li_mflag |= VFS_NOSETUID;
330 	}
331 	/*
332 	 * Permissive flags are added to the "deny" bitmap.
333 	 */
334 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
335 		li->li_dflag |= VFS_XATTR;
336 	}
337 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
338 		li->li_dflag |= VFS_NBMAND;
339 	}
340 
341 	/*
342 	 * Propagate inheritable mount flags from the real vfs.
343 	 */
344 	if ((li->li_realvfs->vfs_flag & VFS_RDONLY) &&
345 	    !vfs_optionisset(vfsp, MNTOPT_RO, NULL))
346 		vfs_setmntopt(vfsp, MNTOPT_RO, NULL,
347 		    VFS_NODISPLAY);
348 	if ((li->li_realvfs->vfs_flag & VFS_NOSETUID) &&
349 	    !vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
350 		vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL,
351 		    VFS_NODISPLAY);
352 	if ((li->li_realvfs->vfs_flag & VFS_NODEVICES) &&
353 	    !vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
354 		vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL,
355 		    VFS_NODISPLAY);
356 	/*
357 	 * Permissive flags such as VFS_XATTR, as opposed to restrictive flags
358 	 * such as VFS_RDONLY, are handled differently.  An explicit
359 	 * MNTOPT_NOXATTR should override the underlying filesystem's VFS_XATTR.
360 	 */
361 	if ((li->li_realvfs->vfs_flag & VFS_XATTR) &&
362 	    !vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL) &&
363 	    !vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
364 		vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL,
365 		    VFS_NODISPLAY);
366 	if ((li->li_realvfs->vfs_flag & VFS_NBMAND) &&
367 	    !vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL) &&
368 	    !vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
369 		vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL,
370 		    VFS_NODISPLAY);
371 
372 	li->li_refct = 0;
373 	vfsp->vfs_data = (caddr_t)li;
374 	vfsp->vfs_bcount = 0;
375 	vfsp->vfs_fstype = lofsfstype;
376 	vfsp->vfs_bsize = li->li_realvfs->vfs_bsize;
377 
378 	vfsp->vfs_dev = li->li_realvfs->vfs_dev;
379 	vfsp->vfs_fsid.val[0] = li->li_realvfs->vfs_fsid.val[0];
380 	vfsp->vfs_fsid.val[1] = li->li_realvfs->vfs_fsid.val[1];
381 
382 	if (vfs_optionisset(vfsp, MNTOPT_LOFS_NOSUB, NULL)) {
383 		li->li_flag |= LO_NOSUB;
384 	}
385 
386 	/*
387 	 * Propagate any VFS features
388 	 */
389 
390 	vfs_propagate_features(li->li_realvfs, vfsp);
391 
392 	/*
393 	 * Setup the hashtable. If the root of this mount isn't a directory,
394 	 * there's no point in allocating a large hashtable. A table with one
395 	 * bucket is sufficient.
396 	 */
397 	if (realrootvp->v_type != VDIR)
398 		lsetup(li, 1);
399 	else
400 		lsetup(li, 0);
401 
402 	/*
403 	 * Make the root vnode
404 	 */
405 	srootvp = makelonode(realrootvp, li, 0);
406 	srootvp->v_flag |= VROOT;
407 	li->li_rootvp = srootvp;
408 
409 #ifdef LODEBUG
410 	lo_dprint(4, "lo_mount: vfs %p realvfs %p root %p realroot %p li %p\n",
411 	    vfsp, li->li_realvfs, srootvp, realrootvp, li);
412 #endif
413 	return (0);
414 }
415 
416 /*
417  * Undo loopback mount
418  */
419 static int
420 lo_unmount(struct vfs *vfsp, int flag, struct cred *cr)
421 {
422 	struct loinfo *li;
423 
424 	if (secpolicy_fs_unmount(cr, vfsp) != 0)
425 		return (EPERM);
426 
427 	/*
428 	 * Forced unmount is not supported by this file system
429 	 * and thus, ENOTSUP, is being returned.
430 	 */
431 	if (flag & MS_FORCE)
432 		return (ENOTSUP);
433 
434 	li = vtoli(vfsp);
435 #ifdef LODEBUG
436 	lo_dprint(4, "lo_unmount(%p) li %p\n", vfsp, li);
437 #endif
438 	if (li->li_refct != 1 || li->li_rootvp->v_count != 1) {
439 #ifdef LODEBUG
440 		lo_dprint(4, "refct %d v_ct %d\n", li->li_refct,
441 		    li->li_rootvp->v_count);
442 #endif
443 		return (EBUSY);
444 	}
445 	VN_RELE(li->li_rootvp);
446 	return (0);
447 }
448 
449 /*
450  * Find root of lofs mount.
451  */
452 static int
453 lo_root(struct vfs *vfsp, struct vnode **vpp)
454 {
455 	*vpp = vtoli(vfsp)->li_rootvp;
456 #ifdef LODEBUG
457 	lo_dprint(4, "lo_root(0x%p) = %p\n", vfsp, *vpp);
458 #endif
459 	/*
460 	 * If the root of the filesystem is a special file, return the specvp
461 	 * version of the vnode. We don't save the specvp vnode in our
462 	 * hashtable since that's exclusively for lnodes.
463 	 */
464 	if (IS_DEVVP(*vpp)) {
465 		struct vnode *svp;
466 
467 		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, kcred);
468 		if (svp == NULL)
469 			return (ENOSYS);
470 		*vpp = svp;
471 	} else {
472 		VN_HOLD(*vpp);
473 	}
474 
475 	return (0);
476 }
477 
478 /*
479  * Get file system statistics.
480  */
481 static int
482 lo_statvfs(register struct vfs *vfsp, struct statvfs64 *sbp)
483 {
484 	vnode_t *realrootvp;
485 
486 #ifdef LODEBUG
487 	lo_dprint(4, "lostatvfs %p\n", vfsp);
488 #endif
489 	/*
490 	 * Using realrootvp->v_vfsp (instead of the realvfsp that was
491 	 * cached) is necessary to make lofs work woth forced UFS unmounts.
492 	 * In the case of a forced unmount, UFS stores a set of dummy vfsops
493 	 * in all the (i)vnodes in the filesystem. The dummy ops simply
494 	 * returns back EIO.
495 	 */
496 	(void) lo_realvfs(vfsp, &realrootvp);
497 	if (realrootvp != NULL)
498 		return (VFS_STATVFS(realrootvp->v_vfsp, sbp));
499 	else
500 		return (EIO);
501 }
502 
/*
 * Sync vfsop.
 *
 * LOFS doesn't have any data or metadata to flush; pending I/O on the
 * underlying filesystem will be flushed when that filesystem is synched.
 * Always returns 0.
 */
/* ARGSUSED */
static int
lo_sync(struct vfs *vfsp,
	short flag,
	struct cred *cr)
{
	int rv = 0;

#ifdef LODEBUG
	lo_dprint(4, "lo_sync: %p\n", vfsp);
#endif
	return (rv);
}
518 
519 /*
520  * Obtain the vnode from the underlying filesystem.
521  */
522 static int
523 lo_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
524 {
525 	vnode_t *realrootvp;
526 
527 #ifdef LODEBUG
528 	lo_dprint(4, "lo_vget: %p\n", vfsp);
529 #endif
530 	(void) lo_realvfs(vfsp, &realrootvp);
531 	if (realrootvp != NULL)
532 		return (VFS_VGET(realrootvp->v_vfsp, vpp, fidp));
533 	else
534 		return (EIO);
535 }
536 
537 /*
538  * Free mount-specific data.
539  */
540 static void
541 lo_freevfs(struct vfs *vfsp)
542 {
543 	struct loinfo *li = vtoli(vfsp);
544 
545 	ldestroy(li);
546 	kmem_free(li, sizeof (struct loinfo));
547 }
548 
549 static int
550 lofsinit(int fstyp, char *name)
551 {
552 	static const fs_operation_def_t lo_vfsops_template[] = {
553 		VFSNAME_MOUNT,		{ .vfs_mount = lo_mount },
554 		VFSNAME_UNMOUNT,	{ .vfs_unmount = lo_unmount },
555 		VFSNAME_ROOT,		{ .vfs_root = lo_root },
556 		VFSNAME_STATVFS,	{ .vfs_statvfs = lo_statvfs },
557 		VFSNAME_SYNC,		{ .vfs_sync = lo_sync },
558 		VFSNAME_VGET,		{ .vfs_vget = lo_vget },
559 		VFSNAME_FREEVFS,	{ .vfs_freevfs = lo_freevfs },
560 		NULL,			NULL
561 	};
562 	int error;
563 
564 	error = vfs_setfsops(fstyp, lo_vfsops_template, &lo_vfsops);
565 	if (error != 0) {
566 		cmn_err(CE_WARN, "lofsinit: bad vfs ops template");
567 		return (error);
568 	}
569 
570 	error = vn_make_ops(name, lo_vnodeops_template, &lo_vnodeops);
571 	if (error != 0) {
572 		(void) vfs_freevfsops_by_type(fstyp);
573 		cmn_err(CE_WARN, "lofsinit: bad vnode ops template");
574 		return (error);
575 	}
576 
577 	lofsfstype = fstyp;
578 
579 	return (0);
580 }
581