/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * Dynamic directory plugin interface for sdev.
 *
 * The sdev plugin interfaces provides a means for a dynamic directory based on
 * in-kernel state to be simply created. Traditionally, dynamic directories were
 * built into sdev itself. While these legacy plugins are useful, it makes more
 * sense for these pieces of functionality to live with the individual drivers.
 *
 * The plugin interface requires folks to implement three interfaces and
 * provides a series of callbacks that can be made in the context of those
 * interfaces to interrogate the sdev_node_t without having to leak
 * implementation details of the sdev_node_t. These interfaces are:
 *
 *   o spo_validate
 *
 *   Given a particular node, answer the question as to whether or not this
 *   entry is still valid. Here, plugins should use the name and the dev_t
 *   associated with the node to verify that it matches something that still
 *   exists.
 *
 *   o spo_filldir
 *
 *   Fill all the entries inside of a directory. Note that some of these entries
 *   may already exist.
 *
 *   o spo_inactive
 *
 *   The given node is no longer being used. This allows the consumer to
 *   potentially tear down anything that was being held open related to this.
 *   Note that this only fires when the given sdev_node_t becomes a zombie.
 *
 * During these callbacks a consumer is not allowed to register or unregister a
 * plugin, especially their own. They may call the sdev_ctx style functions. All
 * callbacks fire in a context where blocking is allowed (eg. the spl is below
 * LOCK_LEVEL).
 *
 * When a plugin is added, we create its directory in the global zone. By doing
 * that, we ensure that something isn't already there and that nothing else can
 * come along and try and create something without our knowledge. We only have
 * to create it in the GZ and not for all other instances of sdev because an
 * instance of sdev that isn't at /dev does not have dynamic directories, and
 * second, any instance of sdev present in a non-global zone cannot create
 * anything, therefore we know that by it not being in the global zone's
 * instance of sdev that we're good to go.
 *
 * Lock Ordering
 * -------------
 *
 * The global sdev_plugin_lock must be held before any of the individual
 * sdev_plugin_t`sp_lock. Further, once any plugin related lock has been held,
 * it is not legal to take any holds on any sdev_node_t or to grab the
 * sdev_node_t`contents_lock in any way.
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/fs/sdev_impl.h>
#include <sys/fs/sdev_plugin.h>
#include <fs/fs_subr.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/sysmacros.h>
#include <sys/list.h>
#include <sys/ctype.h>

kmutex_t sdev_plugin_lock;
list_t sdev_plugin_list;
kmem_cache_t *sdev_plugin_cache;
struct vnodeops *sdev_plugin_vnops;

#define	SDEV_PLUGIN_NAMELEN	64

typedef struct sdev_plugin {
	list_node_t sp_link;
	char sp_name[SDEV_PLUGIN_NAMELEN];	/* E */
	int sp_nflags;				/* E */
	struct vnodeops *sp_vnops;		/* E */
	sdev_plugin_ops_t *sp_pops;		/* E */
	boolean_t sp_islegacy;			/* E */
	int (*sp_lvtor)(sdev_node_t *);		/* E */
	kmutex_t sp_lock;			/* Protects everything below */
	kcondvar_t sp_nodecv;
	size_t sp_nnodes;
} sdev_plugin_t;

/* ARGSUSED */
static int
sdev_plugin_cache_constructor(void *buf, void *arg, int tags)
{
	sdev_plugin_t *spp = buf;
	mutex_init(&spp->sp_lock, NULL, MUTEX_DRIVER, 0);
	cv_init(&spp->sp_nodecv, NULL, CV_DRIVER, NULL);
	return (0);
}

/* ARGSUSED */
static void
sdev_plugin_cache_destructor(void *buf, void *arg)
{
	sdev_plugin_t *spp = buf;
	cv_destroy(&spp->sp_nodecv);
	mutex_destroy(&spp->sp_lock);
}

enum vtype
sdev_ctx_vtype(sdev_ctx_t ctx)
{
	sdev_node_t *sdp = (sdev_node_t *)ctx;

	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
	return (sdp->sdev_vnode->v_type);
}

const char *
sdev_ctx_path(sdev_ctx_t ctx)
{
	sdev_node_t *sdp = (sdev_node_t *)ctx;

	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
	return (sdp->sdev_path);
}

const char *
sdev_ctx_name(sdev_ctx_t ctx)
{
	sdev_node_t *sdp = (sdev_node_t *)ctx;

	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
	return (sdp->sdev_name);
}

int
sdev_ctx_minor(sdev_ctx_t ctx, minor_t *minorp)
{
	sdev_node_t *sdp = (sdev_node_t *)ctx;

	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
	ASSERT(minorp != NULL);
	if (sdp->sdev_vnode->v_type == VCHR ||
	    sdp->sdev_vnode->v_type == VBLK) {
		*minorp = getminor(sdp->sdev_vnode->v_rdev);
		return (0);
	}

	return (ENODEV);
}

/*
 * Currently we only support psasing through a single flag -- SDEV_IS_GLOBAL.
 */
sdev_ctx_flags_t
sdev_ctx_flags(sdev_ctx_t ctx)
{
	sdev_node_t *sdp = (sdev_node_t *)ctx;

	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
	return (sdp->sdev_flags & SDEV_GLOBAL);
}

/*
 * Use the same rules as zones for a name. isalphanum + '-', '_', and '.'.
 */
static int
sdev_plugin_name_isvalid(const char *c, int buflen)
{
	int i;

	for (i = 0; i < buflen; i++, c++) {
		if (*c == '\0')
			return (1);

		if (!isalnum(*c) && *c != '-' && *c != '_' && *c != '.')
			return (0);
	}
	/* Never found a null terminator */
	return (0);
}

static int
sdev_plugin_mknode(sdev_plugin_t *spp, sdev_node_t *sdvp, char *name,
    vattr_t *vap)
{
	int ret;
	sdev_node_t *svp;

	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
	ASSERT(spp != NULL);
	svp = sdev_cache_lookup(sdvp, name);
	if (svp != NULL) {
		SDEV_SIMPLE_RELE(svp);
		return (EEXIST);
	}

	ret = sdev_mknode(sdvp, name, &svp, vap, NULL, NULL, kcred,
	    SDEV_READY);
	if (ret != 0)
		return (ret);
	SDEV_SIMPLE_RELE(svp);

	return (0);
}

/*
 * Plugin node creation callbacks
 */
int
sdev_plugin_mkdir(sdev_ctx_t ctx, char *name)
{
	sdev_node_t *sdvp;
	timestruc_t now;
	struct vattr vap;

	if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
		return (EINVAL);

	sdvp = (sdev_node_t *)ctx;
	ASSERT(sdvp->sdev_private != NULL);
	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));

	vap = *sdev_getdefault_attr(VDIR);
	gethrestime(&now);
	vap.va_atime = now;
	vap.va_mtime = now;
	vap.va_ctime = now;

	return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap));
}

int
sdev_plugin_mknod(sdev_ctx_t ctx, char *name, mode_t mode, dev_t dev)
{
	sdev_node_t *sdvp;
	timestruc_t now;
	struct vattr vap;
	mode_t type = mode & S_IFMT;
	mode_t access = mode & S_IAMB;

	if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
		return (EINVAL);

	sdvp = (sdev_node_t *)ctx;
	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));

	/*
	 * Ensure only type and user/group/other permission bits are present.
	 * Do not allow setuid, setgid, etc.
	 */
	if ((mode & ~(S_IFMT | S_IAMB)) != 0)
		return (EINVAL);

	/* Disallow types other than character and block devices */
	if (type != S_IFCHR && type != S_IFBLK)
		return (EINVAL);

	/* Disallow execute bits */
	if ((access & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0)
		return (EINVAL);

	/* No bits other than 0666 in access */
	ASSERT((access &
	    ~(S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) == 0);

	/* Default to relatively safe access bits if none specified. */
	if (access == 0)
		access = 0600;

	ASSERT(sdvp->sdev_private != NULL);

	vap = *sdev_getdefault_attr(type == S_IFCHR ? VCHR : VBLK);
	gethrestime(&now);
	vap.va_atime = now;
	vap.va_mtime = now;
	vap.va_ctime = now;
	vap.va_rdev = dev;
	vap.va_mode = type | access;

	/* Despite the similar name, this is in fact a different function */
	return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap));
}

static int
sdev_plugin_validate(sdev_node_t *sdp)
{
	int ret;
	sdev_plugin_t *spp;

	ASSERT(sdp->sdev_private != NULL);
	spp = sdp->sdev_private;
	ASSERT(spp->sp_islegacy == B_FALSE);
	ASSERT(spp->sp_pops != NULL);
	rw_enter(&sdp->sdev_contents, RW_READER);
	ret = spp->sp_pops->spo_validate((uintptr_t)sdp);
	rw_exit(&sdp->sdev_contents);
	return (ret);
}

static void
sdev_plugin_validate_dir(sdev_node_t *sdvp)
{
	int ret;
	sdev_node_t *svp, *next;

	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));

	for (svp = SDEV_FIRST_ENTRY(sdvp); svp != NULL; svp = next) {

		next = SDEV_NEXT_ENTRY(sdvp, svp);
		ASSERT(svp->sdev_state != SDEV_ZOMBIE);
		/* skip nodes that aren't ready */
		if (svp->sdev_state == SDEV_INIT)
			continue;

		switch (sdev_plugin_validate(svp)) {
		case SDEV_VTOR_VALID:
		case SDEV_VTOR_SKIP:
			continue;
		case SDEV_VTOR_INVALID:
		case SDEV_VTOR_STALE:
			break;
		}

		SDEV_HOLD(svp);

		/*
		 * Clean out everything underneath this node before we
		 * remove it.
		 */
		if (svp->sdev_vnode->v_type == VDIR) {
			ret = sdev_cleandir(svp, NULL, 0);
			ASSERT(ret == 0);
		}
		/* remove the cache node */
		(void) sdev_cache_update(sdvp, &svp, svp->sdev_name,
		    SDEV_CACHE_DELETE);
		SDEV_RELE(svp);
	}
}

/* ARGSUSED */
static int
sdev_plugin_vop_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
    int *eofp, caller_context_t *ct_unused, int flags_unused)
{
	int ret;
	sdev_node_t *sdvp = VTOSDEV(dvp);
	sdev_plugin_t *spp;

	ASSERT(RW_READ_HELD(&sdvp->sdev_contents));

	/* Sanity check we're not a zombie before we do anyting else */
	if (sdvp->sdev_state == SDEV_ZOMBIE)
		return (ENOENT);

	spp = sdvp->sdev_private;
	ASSERT(spp != NULL);
	ASSERT(spp->sp_islegacy == B_FALSE);
	ASSERT(spp->sp_pops != NULL);

	if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
		return (EPERM);

	if (uiop->uio_offset == 0) {
		/*
		 * We upgrade to a write lock and grab the plugin's lock along
		 * the way. We're almost certainly going to get creation
		 * callbacks, so this is the only safe way to go.
		 */
		if (rw_tryupgrade(&sdvp->sdev_contents) == 0) {
			rw_exit(&sdvp->sdev_contents);
			rw_enter(&sdvp->sdev_contents, RW_WRITER);
			if (sdvp->sdev_state == SDEV_ZOMBIE) {
				rw_downgrade(&sdvp->sdev_contents);
				return (ENOENT);
			}
		}

		sdev_plugin_validate_dir(sdvp);
		ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
		rw_downgrade(&sdvp->sdev_contents);
		if (ret != 0)
			return (ret);
	}

	return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
}

/*
 * If we don't have a callback function that returns a failure, then sdev will
 * try to create a node for us which violates all of our basic assertions. To
 * work around that we create our own callback for devname_lookup_func which
 * always returns ENOENT as at this point either it was created with the filldir
 * callback or it was not.
 */
/*ARGSUSED*/
static int
sdev_plugin_vop_lookup_cb(sdev_node_t *ddv, char *nm, void **arg, cred_t *cred,
    void *unused, char *unused2)
{
	return (ENOENT);
}

/* ARGSUSED */
static int
sdev_plugin_vop_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
    struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
    caller_context_t *ct, int *direntflags, pathname_t *realpnp)
{
	int ret;
	sdev_node_t *sdvp;
	sdev_plugin_t *spp;

	/* execute access is required to search the directory */
	if ((ret = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
		return (ret);

	sdvp = VTOSDEV(dvp);
	spp = sdvp->sdev_private;
	ASSERT(spp != NULL);
	ASSERT(spp->sp_islegacy == B_FALSE);
	ASSERT(spp->sp_pops != NULL);

	if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
		return (EPERM);

	/*
	 * Go straight for the write lock.
	 */
	rw_enter(&sdvp->sdev_contents, RW_WRITER);
	if (sdvp->sdev_state == SDEV_ZOMBIE) {
		rw_exit(&sdvp->sdev_contents);
		return (ENOENT);
	}
	sdev_plugin_validate_dir(sdvp);
	ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
	rw_exit(&sdvp->sdev_contents);
	if (ret != 0)
		return (ret);

	return (devname_lookup_func(sdvp, nm, vpp, cred,
	    sdev_plugin_vop_lookup_cb, SDEV_VATTR));
}

/*
 * sdev is not a good citizen. We get inactive callbacks whenever a vnode goes
 * to zero, but isn't necessairily a zombie yet. As such, to make things easier
 * for users, we only fire the inactive callback when the node becomes a zombie
 * and thus will be torn down here.
 */
static void
sdev_plugin_vop_inactive_cb(struct vnode *dvp)
{
	sdev_node_t *sdp = VTOSDEV(dvp);
	sdev_plugin_t *spp = sdp->sdev_private;

	rw_enter(&sdp->sdev_contents, RW_READER);
	if (sdp->sdev_state != SDEV_ZOMBIE) {
		rw_exit(&sdp->sdev_contents);
		return;
	}
	spp->sp_pops->spo_inactive((uintptr_t)sdp);
	mutex_enter(&spp->sp_lock);
	VERIFY(spp->sp_nnodes > 0);
	spp->sp_nnodes--;
	cv_signal(&spp->sp_nodecv);
	mutex_exit(&spp->sp_lock);
	rw_exit(&sdp->sdev_contents);
}

/*ARGSUSED*/
static void
sdev_plugin_vop_inactive(struct vnode *dvp, struct cred *cred,
    caller_context_t *ct)
{
	sdev_node_t *sdp = VTOSDEV(dvp);
	sdev_plugin_t *spp = sdp->sdev_private;
	ASSERT(sdp->sdev_private != NULL);
	ASSERT(spp->sp_islegacy == B_FALSE);
	devname_inactive_func(dvp, cred, sdev_plugin_vop_inactive_cb);
}

const fs_operation_def_t sdev_plugin_vnodeops_tbl[] = {
	VOPNAME_READDIR,	{ .vop_readdir = sdev_plugin_vop_readdir },
	VOPNAME_LOOKUP,		{ .vop_lookup = sdev_plugin_vop_lookup },
	VOPNAME_INACTIVE,	{ .vop_inactive = sdev_plugin_vop_inactive },
	VOPNAME_CREATE,		{ .error = fs_nosys },
	VOPNAME_REMOVE,		{ .error = fs_nosys },
	VOPNAME_MKDIR,		{ .error = fs_nosys },
	VOPNAME_RMDIR,		{ .error = fs_nosys },
	VOPNAME_SYMLINK,	{ .error = fs_nosys },
	VOPNAME_SETSECATTR,	{ .error = fs_nosys },
	NULL,			NULL
};

/*
 * construct a new template with overrides from vtab
 */
static fs_operation_def_t *
sdev_merge_vtab(const fs_operation_def_t tab[])
{
	fs_operation_def_t *new;
	const fs_operation_def_t *tab_entry;

	/* make a copy of standard vnode ops table */
	new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
	bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);

	/* replace the overrides from tab */
	for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
		fs_operation_def_t *std_entry = new;
		while (std_entry->name) {
			if (strcmp(tab_entry->name, std_entry->name) == 0) {
				std_entry->func = tab_entry->func;
				break;
			}
			std_entry++;
		}
	}

	return (new);
}

/* free memory allocated by sdev_merge_vtab */
static void
sdev_free_vtab(fs_operation_def_t *new)
{
	kmem_free(new, sdev_vnodeops_tbl_size);
}

/*
 * Register a new plugin.
 */
sdev_plugin_hdl_t
sdev_plugin_register(const char *name, sdev_plugin_ops_t *ops, int *errp)
{
	char buf[sizeof ("dev")] = "";
	struct pathname pn = { 0 };
	sdev_plugin_t *spp, *iter;
	vnode_t *vp, *nvp;
	sdev_node_t *sdp, *slp;
	timestruc_t now;
	struct vattr vap;
	int ret, err;

	/*
	 * Some consumers don't care about why they failed. To keep the code
	 * simple, we'll just pretend they gave us something.
	 */
	if (errp == NULL)
		errp = &err;

	if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) {
		*errp = EINVAL;
		return ((sdev_plugin_hdl_t)NULL);
	}

	if (ops->spo_version != 1) {
		*errp = EINVAL;
		return ((sdev_plugin_hdl_t)NULL);
	}

	if (ops->spo_validate == NULL || ops->spo_filldir == NULL ||
	    ops->spo_inactive == NULL) {
		*errp = EINVAL;
		return ((sdev_plugin_hdl_t)NULL);
	}

	if ((ops->spo_flags & ~SDEV_PLUGIN_FLAGS_MASK) != 0) {
		*errp = EINVAL;
		return ((sdev_plugin_hdl_t)NULL);
	}

	spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
	(void) strlcpy(spp->sp_name, name, SDEV_PLUGIN_NAMELEN);

	spp->sp_pops = ops;
	spp->sp_nflags = SDEV_DYNAMIC | SDEV_VTOR;
	if (ops->spo_flags & SDEV_PLUGIN_NO_NCACHE)
		spp->sp_nflags |= SDEV_NO_NCACHE;
	if (ops->spo_flags & SDEV_PLUGIN_SUBDIR)
		spp->sp_nflags |= SDEV_SUBDIR;
	spp->sp_vnops = sdev_plugin_vnops;
	spp->sp_islegacy = B_FALSE;
	spp->sp_lvtor = NULL;
	spp->sp_nnodes = 0;

	/*
	 * Make sure our /dev entry is unique and install it.  We also need to
	 * go through and grab the sdev root node as we cannot grab any sdev
	 * node locks once we've grabbed the sdev_plugin_lock. We effectively
	 * assert that if a directory is not present in the GZ's /dev, then it
	 * doesn't exist in any of the local zones.
	 *
	 * Note that we may be in NGZ context: during a prof_filldir(".../dev/")
	 * enumeration, for example. So we have to dig as deep as lookuppnvp()
	 * to make sure we really get to the global /dev (i.e.  escape both
	 * CRED() and ->u_rdir).
	 */
	(void) pn_get_buf("dev", UIO_SYSSPACE, &pn, buf, sizeof (buf));
	VN_HOLD(rootdir);
	ret = lookuppnvp(&pn, NULL, NO_FOLLOW, NULLVPP,
	    &vp, rootdir, rootdir, kcred);

	if (ret != 0) {
		*errp = ret;
		kmem_cache_free(sdev_plugin_cache, spp);
		return ((sdev_plugin_hdl_t)NULL);
	}
	/* Make sure we have the real vnode */
	if (VOP_REALVP(vp, &nvp, NULL) == 0) {
		VN_HOLD(nvp);
		VN_RELE(vp);
		vp = nvp;
		nvp = NULL;
	}
	VERIFY(vp->v_op == sdev_vnodeops);
	sdp = VTOSDEV(vp);
	rw_enter(&sdp->sdev_contents, RW_WRITER);
	slp = sdev_cache_lookup(sdp, spp->sp_name);
	if (slp != NULL) {
		SDEV_RELE(slp);
		rw_exit(&sdp->sdev_contents);
		VN_RELE(vp);
		*errp = EEXIST;
		kmem_cache_free(sdev_plugin_cache, spp);
		return ((sdev_plugin_hdl_t)NULL);
	}

	mutex_enter(&sdev_plugin_lock);
	for (iter = list_head(&sdev_plugin_list); iter != NULL;
	    iter = list_next(&sdev_plugin_list, iter)) {
		if (strcmp(spp->sp_name, iter->sp_name) == 0) {
			mutex_exit(&sdev_plugin_lock);
			rw_exit(&sdp->sdev_contents);
			VN_RELE(vp);
			*errp = EEXIST;
			kmem_cache_free(sdev_plugin_cache, spp);
			return ((sdev_plugin_hdl_t)NULL);
		}
	}

	list_insert_tail(&sdev_plugin_list, spp);
	mutex_exit(&sdev_plugin_lock);

	/*
	 * Now go ahead and create the top level directory for the global zone.
	 */
	vap = *sdev_getdefault_attr(VDIR);
	gethrestime(&now);
	vap.va_atime = now;
	vap.va_mtime = now;
	vap.va_ctime = now;

	(void) sdev_plugin_mknode(spp, sdp, spp->sp_name, &vap);

	rw_exit(&sdp->sdev_contents);
	VN_RELE(vp);

	*errp = 0;

	return ((sdev_plugin_hdl_t)spp);
}

static void
sdev_plugin_unregister_cb(sdev_node_t *rdp, void *arg)
{
	sdev_plugin_t *spp = arg;
	sdev_node_t *sdp;

	rw_enter(&rdp->sdev_contents, RW_WRITER);
	sdp = sdev_cache_lookup(rdp, spp->sp_name);
	/* If it doesn't exist, we're done here */
	if (sdp == NULL) {
		rw_exit(&rdp->sdev_contents);
		return;
	}

	/*
	 * We first delete the directory before recursively marking everything
	 * else stale. This ordering should ensure that we don't accidentally
	 * miss anything.
	 */
	sdev_cache_update(rdp, &sdp, spp->sp_name, SDEV_CACHE_DELETE);
	sdev_stale(sdp);
	SDEV_RELE(sdp);
	rw_exit(&rdp->sdev_contents);
}

int sdev_plugin_unregister_allowed;

/*
 * Remove a plugin. This will block until everything has become a zombie, thus
 * guaranteeing the caller that nothing will call into them again once this call
 * returns. While the call is ongoing, it could be called into. Note that while
 * this is ongoing, it will block other mounts.
 *
 * NB: this is not safe when used from detach() context - we will be DEVI_BUSY,
 * and other sdev threads may be waiting for this.  Only use the over-ride if
 * willing to risk it.
 */
int
sdev_plugin_unregister(sdev_plugin_hdl_t hdl)
{
	sdev_plugin_t *spp = (sdev_plugin_t *)hdl;
	if (spp->sp_islegacy)
		return (EINVAL);

	if (!sdev_plugin_unregister_allowed)
		return (EBUSY);

	mutex_enter(&sdev_plugin_lock);
	list_remove(&sdev_plugin_list, spp);
	mutex_exit(&sdev_plugin_lock);

	sdev_mnt_walk(sdev_plugin_unregister_cb, spp);
	mutex_enter(&spp->sp_lock);
	while (spp->sp_nnodes > 0)
		cv_wait(&spp->sp_nodecv, &spp->sp_lock);
	mutex_exit(&spp->sp_lock);
	kmem_cache_free(sdev_plugin_cache, spp);
	return (0);
}

/*
 * Register an old sdev style plugin to deal with what used to be in the vtab.
 */
static int
sdev_plugin_register_legacy(struct sdev_vop_table *vtp)
{
	sdev_plugin_t *spp;

	spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
	(void) strlcpy(spp->sp_name, vtp->vt_name, SDEV_PLUGIN_NAMELEN);
	spp->sp_islegacy = B_TRUE;
	spp->sp_pops = NULL;
	spp->sp_nflags = vtp->vt_flags;
	spp->sp_lvtor = vtp->vt_vtor;
	spp->sp_nnodes = 0;

	if (vtp->vt_service != NULL) {
		fs_operation_def_t *templ;
		templ = sdev_merge_vtab(vtp->vt_service);
		if (vn_make_ops(vtp->vt_name,
		    (const fs_operation_def_t *)templ,
		    &spp->sp_vnops) != 0) {
			cmn_err(CE_WARN, "%s: malformed vnode ops\n",
			    vtp->vt_name);
			sdev_free_vtab(templ);
			kmem_cache_free(sdev_plugin_cache, spp);
			return (1);
		}

		if (vtp->vt_global_vops) {
			*(vtp->vt_global_vops) = spp->sp_vnops;
		}

		sdev_free_vtab(templ);
	} else {
		spp->sp_vnops = sdev_vnodeops;
	}

	/*
	 * No need to check for EEXIST here. These are loaded as a part of the
	 * sdev's initialization function. Further, we don't have to create them
	 * as that's taken care of in sdev's mount for the GZ.
	 */
	mutex_enter(&sdev_plugin_lock);
	list_insert_tail(&sdev_plugin_list, spp);
	mutex_exit(&sdev_plugin_lock);

	return (0);
}

/*
 * We need to match off of the sdev_path, not the sdev_name. We are only allowed
 * to exist directly under /dev.
 */
static sdev_plugin_t *
sdev_match(sdev_node_t *dv)
{
	int vlen;
	const char *path;
	sdev_plugin_t *spp;

	if (strlen(dv->sdev_path) <= 5)
		return (NULL);

	if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
		return (NULL);
	path = dv->sdev_path + 5;

	mutex_enter(&sdev_plugin_lock);

	for (spp = list_head(&sdev_plugin_list); spp != NULL;
	    spp = list_next(&sdev_plugin_list, spp)) {
		if (strcmp(spp->sp_name, path) == 0) {
			mutex_exit(&sdev_plugin_lock);
			return (spp);
		}

		if (spp->sp_nflags & SDEV_SUBDIR) {
			vlen = strlen(spp->sp_name);
			if ((strncmp(spp->sp_name, path,
			    vlen - 1) == 0) && path[vlen] == '/') {
				mutex_exit(&sdev_plugin_lock);
				return (spp);
			}

		}
	}

	mutex_exit(&sdev_plugin_lock);
	return (NULL);
}

void
sdev_set_no_negcache(sdev_node_t *dv)
{
	char *path;
	sdev_plugin_t *spp;

	ASSERT(dv->sdev_path);
	path = dv->sdev_path + strlen("/dev/");

	mutex_enter(&sdev_plugin_lock);
	for (spp = list_head(&sdev_plugin_list); spp != NULL;
	    spp = list_next(&sdev_plugin_list, spp)) {
		if (strcmp(spp->sp_name, path) == 0) {
			if (spp->sp_nflags & SDEV_NO_NCACHE)
				dv->sdev_flags |= SDEV_NO_NCACHE;
			break;
		}
	}
	mutex_exit(&sdev_plugin_lock);
}

struct vnodeops *
sdev_get_vop(sdev_node_t *dv)
{
	char *path;
	sdev_plugin_t *spp;

	path = dv->sdev_path;
	ASSERT(path);

	/* gets the relative path to /dev/ */
	path += 5;

	if ((spp = sdev_match(dv)) != NULL) {
		dv->sdev_flags |= spp->sp_nflags;
		if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
		    (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
			dv->sdev_flags |= SDEV_PERSIST;
		return (spp->sp_vnops);
	}

	/* child inherits the persistence of the parent */
	if (SDEV_IS_PERSIST(dv->sdev_dotdot))
		dv->sdev_flags |= SDEV_PERSIST;
	return (sdev_vnodeops);
}

void *
sdev_get_vtor(sdev_node_t *dv)
{
	sdev_plugin_t *spp;

	if (dv->sdev_private == NULL) {
		spp = sdev_match(dv);
		if (spp == NULL)
			return (NULL);
	} else {
		spp = dv->sdev_private;
	}

	if (spp->sp_islegacy)
		return ((void *)spp->sp_lvtor);
	else
		return ((void *)sdev_plugin_validate);
}

void
sdev_plugin_nodeready(sdev_node_t *sdp)
{
	sdev_plugin_t *spp;

	ASSERT(RW_WRITE_HELD(&sdp->sdev_contents));
	ASSERT(sdp->sdev_private == NULL);

	spp = sdev_match(sdp);
	if (spp == NULL)
		return;
	if (spp->sp_islegacy)
		return;
	sdp->sdev_private = spp;
	mutex_enter(&spp->sp_lock);
	spp->sp_nnodes++;
	mutex_exit(&spp->sp_lock);
}

int
sdev_plugin_init(void)
{
	sdev_vop_table_t *vtp;
	fs_operation_def_t *templ;

	sdev_plugin_cache = kmem_cache_create("sdev_plugin",
	    sizeof (sdev_plugin_t), 0, sdev_plugin_cache_constructor,
	    sdev_plugin_cache_destructor, NULL, NULL, NULL, 0);
	if (sdev_plugin_cache == NULL)
		return (1);
	mutex_init(&sdev_plugin_lock, NULL, MUTEX_DRIVER, NULL);
	list_create(&sdev_plugin_list, sizeof (sdev_plugin_t),
	    offsetof(sdev_plugin_t, sp_link));

	/*
	 * Register all of the legacy vnops
	 */
	for (vtp = &vtab[0]; vtp->vt_name != NULL; vtp++)
		if (sdev_plugin_register_legacy(vtp) != 0)
			return (1);

	templ = sdev_merge_vtab(sdev_plugin_vnodeops_tbl);
	if (vn_make_ops("sdev_plugin",
	    (const fs_operation_def_t *)templ,
	    &sdev_plugin_vnops) != 0) {
		sdev_free_vtab(templ);
		return (1);
	}

	sdev_free_vtab(templ);
	return (0);
}