xref: /illumos-gate/usr/src/uts/common/fs/lofs/lofs_vfsops.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2024 Oxide Computer Company
24  */
25 
26 #include <sys/param.h>
27 #include <sys/errno.h>
28 #include <sys/vfs.h>
29 #include <sys/vfs_opreg.h>
30 #include <sys/vnode.h>
31 #include <sys/uio.h>
32 #include <sys/pathname.h>
33 #include <sys/kmem.h>
34 #include <sys/cred.h>
35 #include <sys/statvfs.h>
36 #include <sys/fs/lofs_info.h>
37 #include <sys/fs/lofs_node.h>
38 #include <sys/mount.h>
39 #include <sys/mntent.h>
40 #include <sys/mkdev.h>
41 #include <sys/priv.h>
42 #include <sys/sysmacros.h>
43 #include <sys/systm.h>
44 #include <sys/cmn_err.h>
45 #include <sys/policy.h>
46 #include <sys/tsol/label.h>
47 #include "fs/fs_subr.h"
48 
49 /*
50  * This is the loadable module wrapper.
51  */
52 #include <sys/modctl.h>
53 
/* Mount option table for lofs; tentatively declared here, defined below. */
static mntopts_t lofs_mntopts;

/* Filesystem initialization routine; defined at the end of this file. */
static int lofsinit(int, char *);

/*
 * Filesystem definition for lofs.  It is handed to the module framework
 * through modlfs below and ties together the filesystem name, its
 * initialization routine, its VSW_* switch flags and its mount option table.
 */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"lofs",
	lofsinit,
	VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT,
	&lofs_mntopts
};
65 
66 /*
67  * LOFS mount options table
68  */
/*
 * NULL-terminated cancel lists, one per option in mntopts[].  Each list
 * names the option that is the opposite of the one it is attached to
 * (xattr <-> noxattr, sub <-> nosub).
 */
static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
static char *sub_cancel[] = { MNTOPT_LOFS_NOSUB, NULL };
static char *nosub_cancel[] = { MNTOPT_LOFS_SUB, NULL };
73 
74 static mntopt_t mntopts[] = {
75 /*
76  *	option name		cancel option	default arg	flags
77  *		private data
78  */
79 	{ MNTOPT_XATTR,		xattr_cancel,	NULL,		0,
80 		(void *)0 },
81 	{ MNTOPT_NOXATTR,	noxattr_cancel,	NULL,		0,
82 		(void *)0 },
83 	{ MNTOPT_LOFS_SUB,	sub_cancel,	NULL,		0,
84 		(void *)0 },
85 	{ MNTOPT_LOFS_NOSUB,	nosub_cancel,	NULL,		0,
86 		(void *)0 },
87 };
88 
89 static mntopts_t lofs_mntopts = {
90 	sizeof (mntopts) / sizeof (mntopt_t),
91 	mntopts
92 };
93 
94 /*
95  * Module linkage information for the kernel.
96  */
97 
/*
 * Filesystem linkage record: the filesystem module operations (mod_fsops),
 * a descriptive string for the module, and the lofs filesystem definition.
 */
static struct modlfs modlfs = {
	&mod_fsops, "filesystem for lofs", &vfw
};
101 
/*
 * Top-level linkage structure passed to mod_install() and mod_info().
 * Its linkage list holds the single modlfs record and is NULL-terminated.
 */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};
105 
106 /*
107  * This is the module initialization routine.
108  */
109 
110 int
111 _init(void)
112 {
113 	int status;
114 
115 	lofs_subrinit();
116 	status = mod_install(&modlinkage);
117 	if (status != 0) {
118 		/*
119 		 * Cleanup previously initialized work.
120 		 */
121 		lofs_subrfini();
122 	}
123 
124 	return (status);
125 }
126 
/*
 * Module unload entry point.
 *
 * Don't allow the lofs module to be unloaded for now.
 * There is a memory leak if it gets unloaded, so every request is
 * refused with EBUSY.
 */

int
_fini(void)
{
	return (EBUSY);
}
137 
138 int
139 _info(struct modinfo *modinfop)
140 {
141 	return (mod_info(&modlinkage, modinfop));
142 }
143 
144 
/* Filesystem type index for lofs; recorded by lofsinit(). */
static int lofsfstype;
/* lofs vfs operations vector; filled in by vfs_setfsops() in lofsinit(). */
vfsops_t *lo_vfsops;
147 
/*
 * lo mount vfsop
 * Set up mount info record and attach it to vfs struct.
 *
 * vfsp is the vfs being mounted, vp is the vnode being mounted on, uap
 * holds the mount arguments (uap->spec is the path that is looked up to
 * find the object being looped back) and cr is the caller's credential.
 *
 * Returns 0 on success.  Failures are:
 *	EPERM	secpolicy_fs_mount() refused the request
 *	EBUSY	MS_OVERLAY was not given and vp is either held by someone
 *		else (v_count != 1) or is itself a filesystem root
 *	EACCES	the labeled-system (MAC) checks refused the mount, or the
 *		path of the looked-up vnode could not be determined
 *	other	the error returned by lookupname(), VOP_ACCESS() or
 *		traverse()
 */
/*ARGSUSED*/
static int
lo_mount(struct vfs *vfsp, struct vnode *vp, struct mounta *uap,
    struct cred *cr)
{
	int error;
	struct vnode *srootvp = NULL;	/* the server's root */
	struct vnode *realrootvp;
	struct loinfo *li;
	int nodev;

	/*
	 * Record whether "nodevices" was already set before the policy
	 * check runs, so that a later appearance of the option can be
	 * detected below.
	 */
	nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL);

	/* Any policy failure is reported to the caller as EPERM. */
	if ((error = secpolicy_fs_mount(cr, vp, vfsp)) != 0)
		return (EPERM);

	/*
	 * Loopback devices which get "nodevices" added can be done without
	 * "nodevices" set because we cannot import devices into a zone
	 * with loopback.  Note that we have all zone privileges when
	 * this happens; if not, we'd have gotten "nosuid".
	 */
	if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
		vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY);

	/*
	 * Unless this is an overlay mount, the mount point must not be
	 * referenced by anyone else and must not be the root of a
	 * filesystem.  v_lock is held while v_count and v_flag are examined.
	 */
	mutex_enter(&vp->v_lock);
	if (!(uap->flags & MS_OVERLAY) &&
	    (vp->v_count != 1 || (vp->v_flag & VROOT))) {
		mutex_exit(&vp->v_lock);
		return (EBUSY);
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Find real root, and make vfs point to real vfs.  The path is
	 * read from kernel or user space depending on MS_SYSSPACE.  From
	 * here on, every failure path must VN_RELE(realrootvp).
	 */

	if (error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ?
	    UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, &realrootvp))
		return (error);

	/*
	 * Enforce MAC policy if needed.
	 *
	 * Loopback mounts must not allow writing up. The dominance test
	 * is intended to prevent a global zone caller from accidentally
	 * creating write-up conditions between two labeled zones.
	 * Local zones can't violate MAC on their own without help from
	 * the global zone because they can't name a pathname that
	 * they don't already have.
	 *
	 * The special case check for the NET_MAC_AWARE process flag is
	 * to support the case of the automounter in the global zone. We
	 * permit automounting of local zone directories such as home
	 * directories, into the global zone as required by setlabel,
	 * zonecopy, and saving of desktop sessions. Such mounts are
	 * trusted not to expose the contents of one zone's directories
	 * to another by leaking them through the global zone.
	 */
	if (is_system_labeled() && crgetzoneid(cr) == GLOBAL_ZONEID) {
		char	specname[MAXPATHLEN];
		zone_t	*from_zptr;
		zone_t	*to_zptr;

		/* Resolve the source vnode back to a path for the zone lookup. */
		if (vnodetopath(NULL, realrootvp, specname,
		    sizeof (specname), CRED()) != 0) {
			VN_RELE(realrootvp);
			return (EACCES);
		}

		/*
		 * Both zone pointers are released with zone_rele() on every
		 * path out of this block.
		 */
		from_zptr = zone_find_by_path(specname);
		to_zptr = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));

		/*
		 * Special case for scratch zones used for Live Upgrade:
		 * this is used to mount the zone's root from /root to /a in
		 * the scratch zone.  As with the other special case, this
		 * appears to be outside of the zone because it's not under
		 * the zone rootpath, which is $ZONEPATH/lu in the scratch
		 * zone case.
		 */

		if (from_zptr != to_zptr &&
		    !(to_zptr->zone_flags & ZF_IS_SCRATCH)) {
			/*
			 * We know at this point that the labels aren't equal
			 * because the zone pointers aren't equal, and zones
			 * can't share a label.
			 *
			 * If the source is the global zone then making
			 * it available to a local zone must be done in
			 * read-only mode as the label will become admin_low.
			 *
			 * If it is a mount between local zones then if
			 * the current process is in the global zone and has
			 * the NET_MAC_AWARE flag, then regular read-write
			 * access is allowed.  If it's in some other zone, but
			 * the label on the mount point dominates the original
			 * source, then allow the mount as read-only
			 * ("read-down").
			 */
			if (from_zptr->zone_id == GLOBAL_ZONEID) {
				/* make the mount read-only */
				vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
			} else { /* cross-zone mount */
				if (to_zptr->zone_id == GLOBAL_ZONEID &&
				    /* LINTED: no consequent */
				    getpflags(NET_MAC_AWARE, cr) != 0) {
					/* Allow the mount as read-write */
				} else if (bldominates(
				    label2bslabel(to_zptr->zone_slabel),
				    label2bslabel(from_zptr->zone_slabel))) {
					/* make the mount read-only */
					vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
				} else {
					/* Neither rule applies: refuse. */
					VN_RELE(realrootvp);
					zone_rele(to_zptr);
					zone_rele(from_zptr);
					return (EACCES);
				}
			}
		}
		zone_rele(to_zptr);
		zone_rele(from_zptr);
	}

	/*
	 * realrootvp may be an AUTOFS node, in which case we perform a
	 * VOP_ACCESS() to trigger the mount of the intended filesystem.
	 * This causes a loopback mount of the intended filesystem instead
	 * of the AUTOFS filesystem.
	 *
	 * If a lofs mount creates a mount loop (such that a lofs vfs is
	 * mounted on an autofs node and that lofs vfs points back to the
	 * autofs node which it is mounted on) then a VOP_ACCESS call will
	 * create a deadlock. Once this deadlock is released, VOP_ACCESS will
	 * return EINTR. In such a case we don't want the lofs vfs to be
	 * created as the loop could panic the system.
	 */
	if ((error = VOP_ACCESS(realrootvp, 0, 0, cr, NULL)) != 0) {
		VN_RELE(realrootvp);
		return (error);
	}

	/*
	 * We're interested in the topmost filesystem.
	 * This is especially important when uap->spec is a trigger
	 * AUTOFS node, since we're really interested in mounting the
	 * filesystem AUTOFS mounted as a result of the VOP_ACCESS()
	 * call, not the AUTOFS node itself.
	 */
	if (vn_mountedvfs(realrootvp) != NULL) {
		if (error = traverse(&realrootvp)) {
			VN_RELE(realrootvp);
			return (error);
		}
	}

	/*
	 * Allocate a vfs info struct and attach it.  The allocation is
	 * zero-filled, so the flag words start out clear.
	 */
	li = kmem_zalloc(sizeof (struct loinfo), KM_SLEEP);
	li->li_realvfs = realrootvp->v_vfsp;
	li->li_mountvfs = vfsp;

	/*
	 * Set mount flags to be inherited by loopback vfs's.
	 * Note that "nosuid" implies both no-setuid and no-devices.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
		li->li_mflag |= VFS_RDONLY;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
		li->li_mflag |= (VFS_NOSETUID|VFS_NODEVICES);
	}
	if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
		li->li_mflag |= VFS_NODEVICES;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
		li->li_mflag |= VFS_NOSETUID;
	}
	/*
	 * Permissive flags are added to the "deny" bitmap.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		li->li_dflag |= VFS_XATTR;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		li->li_dflag |= VFS_NBMAND;
	}

	/*
	 * Propagate inheritable mount flags from the real vfs.  Options
	 * added here are set with VFS_NODISPLAY.
	 */
	if ((li->li_realvfs->vfs_flag & VFS_RDONLY) &&
	    !vfs_optionisset(vfsp, MNTOPT_RO, NULL))
		vfs_setmntopt(vfsp, MNTOPT_RO, NULL,
		    VFS_NODISPLAY);
	if ((li->li_realvfs->vfs_flag & VFS_NOSETUID) &&
	    !vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
		vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL,
		    VFS_NODISPLAY);
	if ((li->li_realvfs->vfs_flag & VFS_NODEVICES) &&
	    !vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
		vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL,
		    VFS_NODISPLAY);
	/*
	 * Permissive flags such as VFS_XATTR, as opposed to restrictive flags
	 * such as VFS_RDONLY, are handled differently.  An explicit
	 * MNTOPT_NOXATTR should override the underlying filesystem's VFS_XATTR.
	 */
	if ((li->li_realvfs->vfs_flag & VFS_XATTR) &&
	    !vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL) &&
	    !vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
		vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL,
		    VFS_NODISPLAY);
	if ((li->li_realvfs->vfs_flag & VFS_NBMAND) &&
	    !vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL) &&
	    !vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
		vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL,
		    VFS_NODISPLAY);

	/*
	 * Attach the info record to the vfs.  The block size, device and
	 * fsid are copied from the underlying (real) vfs.
	 */
	li->li_refct = 0;
	vfsp->vfs_data = (caddr_t)li;
	vfsp->vfs_bcount = 0;
	vfsp->vfs_fstype = lofsfstype;
	vfsp->vfs_bsize = li->li_realvfs->vfs_bsize;

	vfsp->vfs_dev = li->li_realvfs->vfs_dev;
	vfsp->vfs_fsid.val[0] = li->li_realvfs->vfs_fsid.val[0];
	vfsp->vfs_fsid.val[1] = li->li_realvfs->vfs_fsid.val[1];

	if (vfs_optionisset(vfsp, MNTOPT_LOFS_NOSUB, NULL)) {
		li->li_flag |= LO_NOSUB;
	}

	/*
	 * Propagate any VFS features
	 */

	vfs_propagate_features(li->li_realvfs, vfsp);

	/*
	 * Setup the hashtable. If the root of this mount isn't a directory,
	 * there's no point in allocating a large hashtable. A table with one
	 * bucket is sufficient.
	 */
	if (realrootvp->v_type != VDIR)
		lsetup(li, 1);
	else
		lsetup(li, 0);

	/*
	 * Make the root vnode.
	 *
	 * NOTE(review): the hold on realrootvp is not released on this
	 * path; presumably makelonode() takes it over -- confirm.
	 */
	srootvp = makelonode(realrootvp, li, 0);
	srootvp->v_flag |= VROOT;
	li->li_rootvp = srootvp;

#ifdef LODEBUG
	lo_dprint(4, "lo_mount: vfs %p realvfs %p root %p realroot %p li %p\n",
	    vfsp, li->li_realvfs, srootvp, realrootvp, li);
#endif
	return (0);
}
416 
417 /*
418  * Undo loopback mount
419  */
420 static int
421 lo_unmount(struct vfs *vfsp, int flag, struct cred *cr)
422 {
423 	struct loinfo *li;
424 
425 	if (secpolicy_fs_unmount(cr, vfsp) != 0)
426 		return (EPERM);
427 
428 	/*
429 	 * Forced unmount is not supported by this file system
430 	 * and thus, ENOTSUP, is being returned.
431 	 */
432 	if (flag & MS_FORCE)
433 		return (ENOTSUP);
434 
435 	li = vtoli(vfsp);
436 #ifdef LODEBUG
437 	lo_dprint(4, "lo_unmount(%p) li %p\n", vfsp, li);
438 #endif
439 	if (li->li_refct != 1 || li->li_rootvp->v_count != 1) {
440 #ifdef LODEBUG
441 		lo_dprint(4, "refct %d v_ct %d\n", li->li_refct,
442 		    li->li_rootvp->v_count);
443 #endif
444 		return (EBUSY);
445 	}
446 	VN_RELE(li->li_rootvp);
447 	return (0);
448 }
449 
450 /*
451  * Find root of lofs mount.
452  */
453 static int
454 lo_root(struct vfs *vfsp, struct vnode **vpp)
455 {
456 	*vpp = vtoli(vfsp)->li_rootvp;
457 #ifdef LODEBUG
458 	lo_dprint(4, "lo_root(0x%p) = %p\n", vfsp, *vpp);
459 #endif
460 	/*
461 	 * If the root of the filesystem is a special file, return the specvp
462 	 * version of the vnode. We don't save the specvp vnode in our
463 	 * hashtable since that's exclusively for lnodes.
464 	 */
465 	if (IS_DEVVP(*vpp)) {
466 		struct vnode *svp;
467 
468 		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, kcred);
469 		if (svp == NULL)
470 			return (ENOSYS);
471 		*vpp = svp;
472 	} else {
473 		VN_HOLD(*vpp);
474 	}
475 
476 	return (0);
477 }
478 
479 /*
480  * Get file system statistics.
481  */
482 static int
483 lo_statvfs(register struct vfs *vfsp, struct statvfs64 *sbp)
484 {
485 	vnode_t *realrootvp;
486 
487 #ifdef LODEBUG
488 	lo_dprint(4, "lostatvfs %p\n", vfsp);
489 #endif
490 	/*
491 	 * Using realrootvp->v_vfsp (instead of the realvfsp that was
492 	 * cached) is necessary to make lofs work woth forced UFS unmounts.
493 	 * In the case of a forced unmount, UFS stores a set of dummy vfsops
494 	 * in all the (i)vnodes in the filesystem. The dummy ops simply
495 	 * returns back EIO.
496 	 */
497 	(void) lo_realvfs(vfsp, &realrootvp);
498 	if (realrootvp != NULL)
499 		return (VFS_STATVFS(realrootvp->v_vfsp, sbp));
500 	else
501 		return (EIO);
502 }
503 
/*
 * sync vfsop.
 *
 * LOFS doesn't have any data or metadata to flush; pending I/O on the
 * underlying filesystem will be flushed when that filesystem is synced.
 * Apart from optional debug output this is therefore a no-op that always
 * returns 0.
 */
/* ARGSUSED */
static int
lo_sync(struct vfs *vfsp, short flag, struct cred *cr)
{
#ifdef LODEBUG
	lo_dprint(4, "lo_sync: %p\n", vfsp);
#endif
	return (0);
}
517 
518 /*
519  * While the general sync(2) entry point above assumes that the underlying fs
520  * will be synced, we treat this as a directed blocking sync on the file system
521  * which means we should attempt the underlying file system.
522  */
523 static int
524 lo_syncfs(vfs_t *vfsp, uint64_t flags, cred_t *cr)
525 {
526 	vfs_t *realvfs;
527 
528 #ifdef LODEBUG
529 	lo_dprint(4, "lo_syncfs: %p\n", vfsp);
530 #endif
531 	realvfs = lo_realvfs(vfsp, NULL);
532 	if (realvfs != NULL) {
533 		return (VFS_SYNCFS(realvfs, flags, cr));
534 	} else {
535 		return (EIO);
536 	}
537 
538 }
539 
540 /*
541  * Obtain the vnode from the underlying filesystem.
542  */
543 static int
544 lo_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
545 {
546 	vnode_t *realrootvp;
547 
548 #ifdef LODEBUG
549 	lo_dprint(4, "lo_vget: %p\n", vfsp);
550 #endif
551 	(void) lo_realvfs(vfsp, &realrootvp);
552 	if (realrootvp != NULL)
553 		return (VFS_VGET(realrootvp->v_vfsp, vpp, fidp));
554 	else
555 		return (EIO);
556 }
557 
558 /*
559  * Free mount-specific data.
560  */
561 static void
562 lo_freevfs(struct vfs *vfsp)
563 {
564 	struct loinfo *li = vtoli(vfsp);
565 
566 	ldestroy(li);
567 	kmem_free(li, sizeof (struct loinfo));
568 }
569 
570 static int
571 lofsinit(int fstyp, char *name)
572 {
573 	static const fs_operation_def_t lo_vfsops_template[] = {
574 		VFSNAME_MOUNT,		{ .vfs_mount = lo_mount },
575 		VFSNAME_UNMOUNT,	{ .vfs_unmount = lo_unmount },
576 		VFSNAME_ROOT,		{ .vfs_root = lo_root },
577 		VFSNAME_STATVFS,	{ .vfs_statvfs = lo_statvfs },
578 		VFSNAME_SYNC,		{ .vfs_sync = lo_sync },
579 		VFSNAME_VGET,		{ .vfs_vget = lo_vget },
580 		VFSNAME_FREEVFS,	{ .vfs_freevfs = lo_freevfs },
581 		VFSNAME_SYNCFS,		{ .vfs_syncfs = lo_syncfs },
582 		NULL,			NULL
583 	};
584 	int error;
585 
586 	error = vfs_setfsops(fstyp, lo_vfsops_template, &lo_vfsops);
587 	if (error != 0) {
588 		cmn_err(CE_WARN, "lofsinit: bad vfs ops template");
589 		return (error);
590 	}
591 
592 	error = vn_make_ops(name, lo_vnodeops_template, &lo_vnodeops);
593 	if (error != 0) {
594 		(void) vfs_freevfsops_by_type(fstyp);
595 		cmn_err(CE_WARN, "lofsinit: bad vnode ops template");
596 		return (error);
597 	}
598 
599 	lofsfstype = fstyp;
600 
601 	return (0);
602 }
603