/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/pathname.h>
#include <sys/kmem.h>
#include <sys/cred.h>
#include <sys/statvfs.h>
#include <sys/fs/lofs_info.h>
#include <sys/fs/lofs_node.h>
#include <sys/mount.h>
#include <sys/mntent.h>
#include <sys/mkdev.h>
#include <sys/priv.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>
#include <sys/tsol/label.h>
#include "fs/fs_subr.h"

/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

static mntopts_t lofs_mntopts;

static int lofsinit(int, char *);

static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"lofs",
	lofsinit,
	VSW_HASPROTO|VSW_STATS,
	&lofs_mntopts
};

/*
 * Stuff needed to support "zonedevfs" mode.
 */
static major_t lofs_major;
static minor_t lofs_minor;
static kmutex_t lofs_minor_lock;

/*
 * LOFS mount options table
 */
static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
static char *zonedevfs_cancel[] = { MNTOPT_LOFS_NOZONEDEVFS, NULL };
static char *nozonedevfs_cancel[] = { MNTOPT_LOFS_ZONEDEVFS, NULL };
static char *sub_cancel[] = { MNTOPT_LOFS_NOSUB, NULL };
static char *nosub_cancel[] = { MNTOPT_LOFS_SUB, NULL };

static mntopt_t mntopts[] = {
/*
 *	option name		cancel option	default arg	flags
 *		private data
 */
	{ MNTOPT_XATTR,		xattr_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_NOXATTR,	noxattr_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_LOFS_ZONEDEVFS,	zonedevfs_cancel,	NULL,	0,
		(void *)0 },
	{ MNTOPT_LOFS_NOZONEDEVFS,	nozonedevfs_cancel,	NULL,	0,
		(void *)0 },
	{ MNTOPT_LOFS_SUB,	sub_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_LOFS_NOSUB,	nosub_cancel,	NULL,		0,
		(void *)0 },
};

static mntopts_t lofs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	mntopts
};

/*
 * Module linkage information for the kernel.
 */

static struct modlfs modlfs = {
	&mod_fsops, "filesystem for lofs", &vfw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};

/*
 * This is the module initialization routine.
 */

int
_init(void)
{
	int status;

	lofs_subrinit();
	status = mod_install(&modlinkage);
	if (status != 0) {
		/*
		 * Cleanup previously initialized work.
		 */
		lofs_subrfini();
	}

	return (status);
}

/*
 * Don't allow the lofs module to be unloaded for now.
 * There is a memory leak if it gets unloaded.
 */

int
_fini(void)
{
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


static int lofsfstype;
vfsops_t *lo_vfsops;

/*
 * lo mount vfsop
 * Set up mount info record and attach it to vfs struct.
 */
/*ARGSUSED*/
static int
lo_mount(struct vfs *vfsp,
	struct vnode *vp,
	struct mounta *uap,
	struct cred *cr)
{
	int error;
	struct vnode *srootvp = NULL;	/* the server's root */
	struct vnode *realrootvp;
	struct loinfo *li;
	int is_zonedevfs = 0;
	int nodev;

	nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL);

	if ((error = secpolicy_fs_mount(cr, vp, vfsp)) != 0)
		return (EPERM);

	/*
	 * Loopback devices which get "nodevices" added can be done without
	 * "nodevices" set because we cannot import devices into a zone
	 * with loopback.  Note that we have all zone privileges when
	 * this happens; if not, we'd have gotten "nosuid".
	 */
	if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
		vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY);

	/*
	 * We must ensure that only the global zone applies the 'zonedevfs'
	 * option; we don't want non-global zones to be able to establish
	 * lofs mounts using the special dev_t we use to ensure that the
	 * contents of a zone's /dev cannot be victim to link(2) or rename(2).
	 * See below, where we set all of this up.
	 *
	 * Since this is more like a privilege check, we use crgetzoneid(cr)
	 * instead of getzoneid().
	 */
	is_zonedevfs = vfs_optionisset(vfsp, MNTOPT_LOFS_ZONEDEVFS, NULL);
	if (crgetzoneid(cr) != GLOBAL_ZONEID && is_zonedevfs)
		return (EPERM);

	mutex_enter(&vp->v_lock);
	if (!(uap->flags & MS_OVERLAY) &&
	    (vp->v_count != 1 || (vp->v_flag & VROOT))) {
		mutex_exit(&vp->v_lock);
		return (EBUSY);
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Find real root, and make vfs point to real vfs
	 */
	if (error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ?
		UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP,
	    &realrootvp))
		return (error);

	/*
	 * Enforce MAC policy if needed.
	 *
	 * Loopback mounts must not allow writing up. The dominance test
	 * is intended to prevent a global zone caller from accidentally
	 * creating write-up conditions between two labeled zones.
	 * Local zones can't violate MAC on their own without help from
	 * the global zone because they can't name a pathname that
	 * they don't already have.
	 *
	 * The special case check for the NET_MAC_AWARE process flag is
	 * to support the case of the automounter in the global zone. We
	 * permit automounting of local zone directories such as home
	 * directories, into the global zone as required by setlabel,
	 * zonecopy, and saving of desktop sessions. Such mounts are
	 * trusted not to expose the contents of one zone's directories
	 * to another by leaking them through the global zone.
	 */
	if (is_system_labeled() && crgetzoneid(cr) == GLOBAL_ZONEID) {
		void *specname;
		zone_t *from_zptr;
		zone_t *to_zptr;

		if (uap->flags & MS_SYSSPACE) {
			specname = uap->spec;
		} else {
			specname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
			error = copyinstr(uap->spec, specname, MAXPATHLEN,
			    NULL);
			if (error) {
				kmem_free(specname, MAXPATHLEN);
				return (error);
			}
		}
		from_zptr = zone_find_by_path(specname);
		if (!(uap->flags & MS_SYSSPACE))
			kmem_free(specname, MAXPATHLEN);

		to_zptr = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));

		/*
		 * Special case for zone devfs: the zone for /dev will
		 * incorrectly appear as the global zone since it's not
		 * under the zone rootpath.  So for zone devfs check allow
		 * read-write mounts.
		 */

		if (from_zptr != to_zptr && !is_zonedevfs) {
			/*
			 * We know at this point that the labels aren't equal
			 * because the zone pointers aren't equal, and zones
			 * can't share a label.
			 *
			 * If the source is the global zone then making
			 * it available to a local zone must be done in
			 * read-only mode as the label will become admin_low.
			 *
			 * If it is a mount between local zones then if
			 * the current process is in the global zone and has
			 * the NET_MAC_AWARE flag, then regular read-write
			 * access is allowed.  If it's in some other zone, but
			 * the label on the mount point dominates the original
			 * source, then allow the mount as read-only
			 * ("read-down").
			 */
			if (from_zptr->zone_id == GLOBAL_ZONEID) {
				/* make the mount read-only */
				vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
			} else { /* cross-zone mount */
				if (to_zptr->zone_id == GLOBAL_ZONEID &&
				    /* LINTED: no consequent */
				    getpflags(NET_MAC_AWARE, cr) != 0) {
					/* Allow the mount as read-write */
				} else if (bldominates(
				    label2bslabel(to_zptr->zone_slabel),
				    label2bslabel(from_zptr->zone_slabel))) {
					/* make the mount read-only */
					vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
				} else {
					zone_rele(to_zptr);
					zone_rele(from_zptr);
					return (EACCES);
				}
			}
		}
		zone_rele(to_zptr);
		zone_rele(from_zptr);
	}

	/*
	 * realrootvp may be an AUTOFS node, in which case we
	 * perform a VOP_ACCESS() to trigger the mount of the
	 * intended filesystem, so we loopback mount the intended
	 * filesystem instead of the AUTOFS filesystem.
	 */
	(void) VOP_ACCESS(realrootvp, 0, 0, cr);

	/*
	 * We're interested in the top most filesystem.
	 * This is specially important when uap->spec is a trigger
	 * AUTOFS node, since we're really interested in mounting the
	 * filesystem AUTOFS mounted as result of the VOP_ACCESS()
	 * call not the AUTOFS node itself.
	 */
	if (vn_mountedvfs(realrootvp) != NULL) {
		if (error = traverse(&realrootvp)) {
			VN_RELE(realrootvp);
			return (error);
		}
	}

	/*
	 * Allocate a vfs info struct and attach it
	 */
	li = kmem_zalloc(sizeof (struct loinfo), KM_SLEEP);
	li->li_realvfs = realrootvp->v_vfsp;
	li->li_mountvfs = vfsp;

	/*
	 * Set mount flags to be inherited by loopback vfs's
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
		li->li_mflag |= VFS_RDONLY;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
		li->li_mflag |= (VFS_NOSETUID|VFS_NODEVICES);
	}
	if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
		li->li_mflag |= VFS_NODEVICES;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
		li->li_mflag |= VFS_NOSETUID;
	}
	/*
	 * Permissive flags are added to the "deny" bitmap.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		li->li_dflag |= VFS_XATTR;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		li->li_dflag |= VFS_NBMAND;
	}

	/*
	 * Propagate inheritable mount flags from the real vfs.
	 */
	if ((li->li_realvfs->vfs_flag & VFS_RDONLY) &&
	    !vfs_optionisset(vfsp, MNTOPT_RO, NULL))
		vfs_setmntopt(vfsp, MNTOPT_RO, NULL,
		    VFS_NODISPLAY);
	if ((li->li_realvfs->vfs_flag & VFS_NOSETUID) &&
	    !vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
		vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL,
		    VFS_NODISPLAY);
	if ((li->li_realvfs->vfs_flag & VFS_NODEVICES) &&
	    !vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
		vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL,
		    VFS_NODISPLAY);
	/*
	 * Permissive flags such as VFS_XATTR, as opposed to restrictive flags
	 * such as VFS_RDONLY, are handled differently.  An explicit
	 * MNTOPT_NOXATTR should override the underlying filesystem's VFS_XATTR.
	 */
	if ((li->li_realvfs->vfs_flag & VFS_XATTR) &&
	    !vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL) &&
	    !vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
		vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL,
		    VFS_NODISPLAY);
	if ((li->li_realvfs->vfs_flag & VFS_NBMAND) &&
	    !vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL) &&
	    !vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
		vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL,
		    VFS_NODISPLAY);

	li->li_refct = 0;
	vfsp->vfs_data = (caddr_t)li;
	vfsp->vfs_bcount = 0;
	vfsp->vfs_fstype = lofsfstype;
	vfsp->vfs_bsize = li->li_realvfs->vfs_bsize;

	/*
	 * Test to see if we need to be in "zone /dev" mode.  In zonedevfs
	 * mode, we pull a nasty trick; we make sure that the lofs dev_t does
	 * *not* reflect the underlying device, so that no renames or links
	 * can occur to or from the /dev hierarchy.
	 */
	if (is_zonedevfs) {
		dev_t dev;

		mutex_enter(&lofs_minor_lock);
		do {
			lofs_minor = (lofs_minor + 1) & MAXMIN32;
			dev = makedevice(lofs_major, lofs_minor);
		} while (vfs_devismounted(dev));
		mutex_exit(&lofs_minor_lock);

		vfsp->vfs_dev = dev;
		vfs_make_fsid(&vfsp->vfs_fsid, dev, lofsfstype);

		li->li_flag |= LO_ZONEDEVFS;
	} else {
		vfsp->vfs_dev = li->li_realvfs->vfs_dev;
		vfsp->vfs_fsid.val[0] = li->li_realvfs->vfs_fsid.val[0];
		vfsp->vfs_fsid.val[1] = li->li_realvfs->vfs_fsid.val[1];
	}

	if (vfs_optionisset(vfsp, MNTOPT_LOFS_NOSUB, NULL)) {
		li->li_flag |= LO_NOSUB;
	}

	/*
	 * Setup the hashtable. If the root of this mount isn't a directory,
	 * there's no point in allocating a large hashtable. A table with one
	 * bucket is sufficient.
	 */
	if (realrootvp->v_type != VDIR)
		lsetup(li, 1);
	else
		lsetup(li, 0);

	/*
	 * Make the root vnode
	 */
	srootvp = makelonode(realrootvp, li, 0);
	srootvp->v_flag |= VROOT;
	li->li_rootvp = srootvp;

#ifdef LODEBUG
	lo_dprint(4, "lo_mount: vfs %p realvfs %p root %p realroot %p li %p\n",
	    vfsp, li->li_realvfs, srootvp, realrootvp, li);
#endif
	return (0);
}

/*
 * Undo loopback mount
 */
static int
lo_unmount(struct vfs *vfsp, int flag, struct cred *cr)
{
	struct loinfo *li;

	if (secpolicy_fs_unmount(cr, vfsp) != 0)
		return (EPERM);

	/*
	 * Forced unmount is not supported by this file system
	 * and thus, ENOTSUP, is being returned.
	 */
	if (flag & MS_FORCE)
		return (ENOTSUP);

	li = vtoli(vfsp);
#ifdef LODEBUG
	lo_dprint(4, "lo_unmount(%p) li %p\n", vfsp, li);
#endif
	if (li->li_refct != 1 || li->li_rootvp->v_count != 1) {
#ifdef LODEBUG
		lo_dprint(4, "refct %d v_ct %d\n", li->li_refct,
		    li->li_rootvp->v_count);
#endif
		return (EBUSY);
	}
	VN_RELE(li->li_rootvp);
	return (0);
}

/*
 * Find root of lofs mount.
 */
static int
lo_root(struct vfs *vfsp, struct vnode **vpp)
{
	*vpp = vtoli(vfsp)->li_rootvp;
#ifdef LODEBUG
	lo_dprint(4, "lo_root(0x%p) = %p\n", vfsp, *vpp);
#endif
	/*
	 * If the root of the filesystem is a special file, return the specvp
	 * version of the vnode. We don't save the specvp vnode in our
	 * hashtable since that's exclusively for lnodes.
	 */
	if (IS_DEVVP(*vpp)) {
		struct vnode *svp;

		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, kcred);
		if (svp == NULL)
			return (ENOSYS);
		*vpp = svp;
	} else {
		VN_HOLD(*vpp);
	}

	return (0);
}

/*
 * Get file system statistics.
 */
static int
lo_statvfs(register struct vfs *vfsp, struct statvfs64 *sbp)
{
	vnode_t *realrootvp;

#ifdef LODEBUG
	lo_dprint(4, "lostatvfs %p\n", vfsp);
#endif
	/*
	 * Using realrootvp->v_vfsp (instead of the realvfsp that was
	 * cached) is necessary to make lofs work woth forced UFS unmounts.
	 * In the case of a forced unmount, UFS stores a set of dummy vfsops
	 * in all the (i)vnodes in the filesystem. The dummy ops simply
	 * returns back EIO.
	 */
	(void) lo_realvfs(vfsp, &realrootvp);
	if (realrootvp != NULL)
		return (VFS_STATVFS(realrootvp->v_vfsp, sbp));
	else
		return (EIO);
}

/*
 * LOFS doesn't have any data or metadata to flush, pending I/O on the
 * underlying filesystem will be flushed when such filesystem is synched.
 */
/* ARGSUSED */
static int
lo_sync(struct vfs *vfsp,
	short flag,
	struct cred *cr)
{
#ifdef LODEBUG
	lo_dprint(4, "lo_sync: %p\n", vfsp);
#endif
	return (0);
}

/*
 * Obtain the vnode from the underlying filesystem.
 */
static int
lo_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
{
	vnode_t *realrootvp;

#ifdef LODEBUG
	lo_dprint(4, "lo_vget: %p\n", vfsp);
#endif
	(void) lo_realvfs(vfsp, &realrootvp);
	if (realrootvp != NULL)
		return (VFS_VGET(realrootvp->v_vfsp, vpp, fidp));
	else
		return (EIO);
}

/*
 * Free mount-specific data.
 */
static void
lo_freevfs(struct vfs *vfsp)
{
	struct loinfo *li = vtoli(vfsp);

	ldestroy(li);
	kmem_free(li, sizeof (struct loinfo));
}

static int
lofsinit(int fstyp, char *name)
{
	static const fs_operation_def_t lo_vfsops_template[] = {
		VFSNAME_MOUNT, lo_mount,
		VFSNAME_UNMOUNT, lo_unmount,
		VFSNAME_ROOT, lo_root,
		VFSNAME_STATVFS, lo_statvfs,
		VFSNAME_SYNC, (fs_generic_func_p) lo_sync,
		VFSNAME_VGET, lo_vget,
		VFSNAME_FREEVFS, (fs_generic_func_p) lo_freevfs,
		NULL, NULL
	};
	int error;

	error = vfs_setfsops(fstyp, lo_vfsops_template, &lo_vfsops);
	if (error != 0) {
		cmn_err(CE_WARN, "lofsinit: bad vfs ops template");
		return (error);
	}

	error = vn_make_ops(name, lo_vnodeops_template, &lo_vnodeops);
	if (error != 0) {
		(void) vfs_freevfsops_by_type(fstyp);
		cmn_err(CE_WARN, "lofsinit: bad vnode ops template");
		return (error);
	}

	lofsfstype = fstyp;

	if ((lofs_major = getudev()) == (major_t)-1) {
		(void) vfs_freevfsops_by_type(fstyp);
		cmn_err(CE_WARN, "lofsinit: Can't get unique device number.");
		return (ENXIO);
	}

	lofs_minor = 0;
	mutex_init(&lofs_minor_lock, NULL, MUTEX_DEFAULT, NULL);

	return (0);
}