/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains the functions used to support the ZFS integration
 * with zones.  This includes validation (e.g. zonecfg dataset), cloning,
 * file system creation and destruction.
 */

#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <locale.h>
#include <libintl.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <libgen.h>
#include <libzonecfg.h>
#include <sys/mnttab.h>
#include <libzfs.h>
#include <sys/mntent.h>
#include <values.h>

#include "zoneadm.h"

/*
 * libzfs library handle used by the routines in this file.  It is set by
 * init_zfs() and must be valid before any other function here is called.
 */
libzfs_handle_t *g_zfs;

/*
 * Call-back data for match_mountpoint(): the mount point being searched for
 * and, once found, the still-open handle of the file system mounted there.
 */
typedef struct zfs_mount_data {
	char		*match_name;	/* mount point path to look for */
	zfs_handle_t	*match_handle;	/* matching fs handle, else NULL */
} zfs_mount_data_t;

/*
 * Call-back data shared by get_snap_max() and rename_snap() while scanning
 * and renaming the automatically generated zone snapshots of a dataset.
 */
typedef struct zfs_snapshot_data {
	char	*match_name;	/* zonename@SUNWzone */
	int	len;		/* strlen of match_name */
	int	max;		/* highest number appended to snap name */
	int	num;		/* number of snapshots to rename */
	int	cntr;		/* counter for renaming snapshots */
} zfs_snapshot_data_t;

/*
 * Call-back data used by find_clone() and get_direct_clone() to locate the
 * clone that should be promoted.
 */
typedef struct clone_data {
	zfs_handle_t	*clone_zhp;	/* clone dataset to promote */
	time_t		origin_creation; /* snapshot creation time of clone */
	/*
	 * Name of the snapshot currently being examined.  It points at the
	 * name held by that snapshot's handle, so it is only meaningful while
	 * find_clone() still has the handle open.
	 */
	const char	*snapshot;	/* snapshot of dataset being demoted */
} clone_data_t;

/*
 * zfs_iter_children() call-back used while validating the datasets that are
 * delegated to a zone.  A volume found anywhere in the hierarchy is reported
 * and rejected with -1; any other dataset has its own children checked
 * recursively.  The handle passed in is always closed before returning.
 */
/* ARGSUSED */
static int
check_zvol(zfs_handle_t *zhp, void *unused)
{
	int rv;

	if (zfs_get_type(zhp) != ZFS_TYPE_VOLUME) {
		/* Not a volume; descend into the children. */
		rv = zfs_iter_children(zhp, check_zvol, NULL);
		zfs_close(zhp);
		return (rv);
	}

	/*
	 * TRANSLATION_NOTE
	 * zfs and dataset are literals that should not be translated.
	 */
	(void) fprintf(stderr, gettext("cannot verify zfs dataset %s: "
	    "volumes cannot be specified as a zone dataset resource\n"),
	    zfs_get_name(zhp));
	zfs_close(zhp);

	return (-1);
}

/*
 * ZFS file system iterator call-back which looks for the mounted file system
 * whose mount point equals the match_name in the call-back data.
 *
 * On a match the handle is stored in match_handle, left open for the caller
 * to close, and 1 is returned to stop the iteration.  Otherwise the nested
 * file systems are searched, the handle is closed and the result of that
 * nested search is returned (0 when the dataset is skipped outright).
 */
static int
match_mountpoint(zfs_handle_t *zhp, void *data)
{
	zfs_mount_data_t	*cbp = (zfs_mount_data_t *)data;
	char			prop[ZFS_MAXPROPLEN];
	boolean_t		found = B_FALSE;
	int			rv;

	/*
	 * Skip anything that is not a mounted file system or whose mount
	 * point property cannot be read.
	 */
	if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM ||
	    zfs_prop_get(zhp, ZFS_PROP_MOUNTED, prop, sizeof (prop), NULL,
	    NULL, 0, B_FALSE) != 0 ||
	    strcmp(prop, "no") == 0 ||
	    zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, prop, sizeof (prop), NULL,
	    NULL, 0, B_FALSE) != 0) {
		zfs_close(zhp);
		return (0);
	}

	if (strcmp(prop, "legacy") != 0) {
		if (strcmp(prop, cbp->match_name) == 0)
			found = B_TRUE;
	} else {
		/*
		 * A legacy mount has no usable mountpoint property, so the
		 * real mount point has to be looked up in mnttab.
		 */
		struct mnttab	ent;
		const char	*dsname = zfs_get_name(zhp);
		FILE		*mnttab;

		if ((mnttab = fopen(MNTTAB, "r")) == NULL) {
			zfs_close(zhp);
			return (0);
		}

		while (getmntent(mnttab, &ent) == 0) {
			if (strcmp(dsname, ent.mnt_special) != 0)
				continue;

			/* Only the first entry for this dataset counts. */
			if (strcmp(ent.mnt_mountp, cbp->match_name) == 0)
				found = B_TRUE;
			break;
		}
		(void) fclose(mnttab);
	}

	if (found) {
		cbp->match_handle = zhp;
		return (1);
	}

	/* No match here; keep looking in any nested datasets. */
	rv = zfs_iter_filesystems(zhp, match_mountpoint, data);
	zfs_close(zhp);
	return (rv);
}

/*
 * Search every pool for the ZFS file system mounted on mountpoint.  Returns
 * an open handle which the caller must zfs_close(), or NULL if there is no
 * such file system.
 */
static zfs_handle_t *
mount2zhandle(char *mountpoint)
{
	zfs_mount_data_t	search;

	search.match_handle = NULL;
	search.match_name = mountpoint;

	/* Only the handle matters; the iterator's own result is ignored. */
	(void) zfs_iter_root(g_zfs, match_mountpoint, &search);

	return (search.match_handle);
}

/*
 * Check if there is already a file system (zfs or any other type) mounted on
 * path.
 */
static boolean_t
is_mountpnt(char *path)
{
	FILE		*fp;
	struct mnttab	entry;

	if ((fp = fopen(MNTTAB, "r")) == NULL)
		return (B_FALSE);

	while (getmntent(fp, &entry) == 0) {
		if (strcmp(path, entry.mnt_mountp) == 0) {
			(void) fclose(fp);
			return (B_TRUE);
		}
	}

	(void) fclose(fp);
	return (B_FALSE);
}

/*
 * Run the brand's pre-snapshot hook, if it has one, before a ZFS snapshot of
 * the zone is taken.  presnapbuf holds the hook command line; an empty string
 * means the brand supplies no hook.  Returns Z_OK or Z_ERR.
 */
static int
pre_snapshot(char *presnapbuf)
{
	int rc;

	/* Nothing to run for this brand. */
	if (*presnapbuf == '\0')
		return (Z_OK);

	/* Run the hook and interpret its exit status. */
	rc = do_subproc(presnapbuf);
	rc = subproc_status(gettext("brand-specific presnapshot"), rc,
	    B_FALSE);

	return (rc == ZONE_SUBPROC_OK ? Z_OK : Z_ERR);
}

/*
 * Run the brand's post-snapshot hook, if it has one, after a ZFS snapshot of
 * the zone has been attempted.  postsnapbuf holds the hook command line; an
 * empty string means the brand supplies no hook.  Returns Z_OK or Z_ERR.
 */
static int
post_snapshot(char *postsnapbuf)
{
	int rc;

	/* Nothing to run for this brand. */
	if (*postsnapbuf == '\0')
		return (Z_OK);

	/* Run the hook and interpret its exit status. */
	rc = do_subproc(postsnapbuf);
	rc = subproc_status(gettext("brand-specific postsnapshot"), rc,
	    B_FALSE);

	return (rc == ZONE_SUBPROC_OK ? Z_OK : Z_ERR);
}

/*
 * ZFS snapshot iterator call-back which gathers statistics about the
 * automatically generated SUNWzone snapshots.  For every snapshot whose name
 * starts with match_name, the count in the call-back data is incremented and
 * max is raised to the number that follows the prefix if that is larger.
 * The handle is always closed before returning.
 */
static int
get_snap_max(zfs_handle_t *zhp, void *data)
{
	zfs_snapshot_data_t	*sdp = (zfs_snapshot_data_t *)data;
	const char		*name;
	int			rv;

	if (zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT) {
		zfs_close(zhp);
		return (0);
	}

	name = zfs_get_name(zhp);
	if (strncmp(name, sdp->match_name, sdp->len) == 0) {
		/* Whatever follows the prefix is the snapshot's number. */
		int suffix = atoi(name + sdp->len);

		sdp->num++;
		if (suffix > sdp->max)
			sdp->max = suffix;
	}

	rv = zfs_iter_snapshots(zhp, get_snap_max, data);
	zfs_close(zhp);
	return (rv);
}

/*
 * Take a ZFS snapshot to be used for cloning the zone.
 *
 * The snapshot is named <dataset>@SUNWzone<N>, where N is one more than the
 * highest number already used by a SUNWzone snapshot of zhp.  The full name
 * is written to snapshot_name (a buffer of snap_size bytes).  The brand's
 * pre-snapshot hook runs before the snapshot is taken; the post-snapshot hook
 * runs afterwards whether or not the snapshot itself succeeded.
 *
 * Returns Z_OK on success.  Z_ERR is returned if a name does not fit in its
 * buffer, if the existing snapshots cannot be enumerated, or if either hook
 * or the snapshot itself fails.
 */
static int
take_snapshot(zfs_handle_t *zhp, char *snapshot_name, int snap_size,
    char *presnapbuf, char *postsnapbuf)
{
	int			res;
	char			template[ZFS_MAXNAMELEN];
	zfs_snapshot_data_t	cb;

	/*
	 * First we need to figure out the next available name for the
	 * zone snapshot.  Look through the list of zone snapshots for
	 * this file system to determine the maximum snapshot name.
	 */
	if (snprintf(template, sizeof (template), "%s@SUNWzone",
	    zfs_get_name(zhp)) >=  sizeof (template))
		return (Z_ERR);

	cb.match_name = template;
	cb.len = strlen(template);
	cb.max = 0;
	/*
	 * get_snap_max() increments num for every match, so it has to start
	 * from a defined value even though only max is used here.
	 */
	cb.num = 0;
	cb.cntr = 0;

	if (zfs_iter_snapshots(zhp, get_snap_max, &cb) != 0)
		return (Z_ERR);

	cb.max++;

	if (snprintf(snapshot_name, snap_size, "%s@SUNWzone%d",
	    zfs_get_name(zhp), cb.max) >= snap_size)
		return (Z_ERR);

	if (pre_snapshot(presnapbuf) != Z_OK)
		return (Z_ERR);
	res = zfs_snapshot(g_zfs, snapshot_name, B_FALSE, NULL);
	/* The post hook must run even if the snapshot failed. */
	if (post_snapshot(postsnapbuf) != Z_OK)
		return (Z_ERR);

	if (res != 0)
		return (Z_ERR);
	return (Z_OK);
}

/*
 * An explicit snapshot from some earlier point in time is being used, so let
 * the brand validate it.  The brand's hook (validsnapbuf) is run with the
 * snapshot name and the snapshot path appended as arguments.  An empty
 * validsnapbuf means there is no hook and the snapshot is accepted as is.
 * Returns Z_OK or Z_ERR.
 */
static int
validate_snapshot(char *snapshot_name, char *snap_path, char *validsnapbuf)
{
	char	cmdline[MAXPATHLEN];
	int	rc;

	/* Nothing to run for this brand. */
	if (*validsnapbuf == '\0')
		return (Z_OK);

	/* Build "<hook> <snapshot_name> <snap_path>". */
	rc = snprintf(cmdline, sizeof (cmdline), "%s %s %s", validsnapbuf,
	    snapshot_name, snap_path);
	if (rc >= sizeof (cmdline)) {
		zerror("Command line too long");
		return (Z_ERR);
	}

	/* Run the hook and interpret its exit status. */
	rc = do_subproc(cmdline);
	rc = subproc_status(gettext("brand-specific validatesnapshot"), rc,
	    B_FALSE);

	return (rc == ZONE_SUBPROC_OK ? Z_OK : Z_ERR);
}

/*
 * Remove the sw inventory file that came along with the snapshot from inside
 * the target zone's zonepath.  Returns Z_OK, or Z_ERR if the target zone's
 * configuration handle cannot be obtained.
 */
static int
clean_out_clone()
{
	zone_dochandle_t	zh;
	int			rc;

	zh = zonecfg_init_handle();
	if (zh == NULL) {
		zperror(cmd_to_str(CMD_CLONE), B_TRUE);
		return (Z_ERR);
	}

	rc = zonecfg_get_handle(target_zone, zh);
	if (rc == Z_OK) {
		zonecfg_rm_detached(zh, B_FALSE);
	} else {
		/* zperror() reports the failure through errno. */
		errno = rc;
		zperror(cmd_to_str(CMD_CLONE), B_TRUE);
	}

	zonecfg_fini_handle(zh);
	return (rc == Z_OK ? Z_OK : Z_ERR);
}

/*
 * Make a ZFS clone from snapshot_name.  Despite the parameter's name,
 * zonepath is used as the name of the new clone dataset, not as a file
 * system path.  The clone is created with sharenfs turned off, is then
 * mounted, and finally has the software inventory removed from it.
 *
 * Returns:
 *	Z_OK		the clone was created, mounted and cleaned out
 *	Z_NO_ENTRY	the snapshot could not be opened
 *	Z_ERR		any other failure; a diagnostic is printed here when
 *			we run out of memory or when a step fails after the
 *			clone has already been created
 */
static int
clone_snap(char *snapshot_name, char *zonepath)
{
	int		res = Z_OK;
	int		err;
	zfs_handle_t	*zhp;
	zfs_handle_t	*clone;
	nvlist_t	*props = NULL;

	if ((zhp = zfs_open(g_zfs, snapshot_name, ZFS_TYPE_SNAPSHOT)) == NULL)
		return (Z_NO_ENTRY);

	(void) printf(gettext("Cloning snapshot %s\n"), snapshot_name);

	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0 ||
	    nvlist_add_string(props, zfs_prop_to_name(ZFS_PROP_SHARENFS),
	    "off") != 0) {
		if (props != NULL)
			nvlist_free(props);
		(void) fprintf(stderr, gettext("could not create ZFS clone "
		    "%s: out of memory\n"), zonepath);
		/* Don't leak the snapshot handle on this early exit. */
		zfs_close(zhp);
		return (Z_ERR);
	}

	err = zfs_clone(zhp, zonepath, props);
	zfs_close(zhp);

	nvlist_free(props);

	if (err != 0)
		return (Z_ERR);

	/* create the mountpoint if necessary */
	if ((clone = zfs_open(g_zfs, zonepath, ZFS_TYPE_DATASET)) == NULL)
		return (Z_ERR);

	/*
	 * The clone has been created so we need to print a diagnostic
	 * message if one of the following steps fails for some reason.
	 */
	if (zfs_mount(clone, NULL, 0) != 0) {
		(void) fprintf(stderr, gettext("could not mount ZFS clone "
		    "%s\n"), zfs_get_name(clone));
		res = Z_ERR;

	} else if (clean_out_clone() != Z_OK) {
		(void) fprintf(stderr, gettext("could not remove the "
		    "software inventory from ZFS clone %s\n"),
		    zfs_get_name(clone));
		res = Z_ERR;
	}

	zfs_close(clone);
	return (res);
}

/*
 * Work out what the ZFS file system name (not mountpoint) should be for
 * zonepath and store it in zfs_name (a buffer of len bytes).  The zonepath
 * need not exist yet, since this is used while creating a new ZFS fs or
 * clone.
 *
 * Only the layout that ZFS itself produces when it automatically mounts a
 * newly created fs is handled: the new fs lives in an existing parent fs and
 * is mounted directly beneath the parent's mountpoint, using the last
 * component of its name as the directory.  So the parent directory of
 * zonepath must be the mountpoint of a ZFS fs, and the result is that fs's
 * name followed by "/" and the last component of zonepath.  Arbitrary
 * user-configured mounts are not handled, because there is no way to derive
 * a ZFS name from a random path.
 *
 * For example:
 *    Name			Mountpoint
 *    space/eng/dev/test/zone1	/project1/eng/dev/test/zone1
 *
 * Return Z_OK if the path mapped to a ZFS file system name, otherwise return
 * Z_ERR.
 */
static int
path2name(char *zonepath, char *zfs_name, int len)
{
	int		rv = Z_ERR;
	char		*base_copy, *dir_copy;
	char		*leaf, *parent;
	zfs_handle_t	*parent_zhp;
	struct stat	sb;

	/*
	 * basename() and dirname() each get a private copy; for a path
	 * directly in / (e.g. /foo) dirname overwrites the first char after
	 * the "/".
	 */
	if ((base_copy = strdup(zonepath)) == NULL)
		return (Z_ERR);

	if ((dir_copy = strdup(zonepath)) == NULL) {
		free(base_copy);
		return (Z_ERR);
	}

	leaf = basename(base_copy);
	parent = dirname(dir_copy);

	/*
	 * Cheap check first, to avoid walking every dataset on the system
	 * (which can be a lot) when the parent dir is not on ZFS at all.
	 */
	if (stat(parent, &sb) != 0 || !S_ISDIR(sb.st_mode) ||
	    strcmp(sb.st_fstype, MNTTYPE_ZFS) != 0)
		goto out;

	/*
	 * The parent has to be the mountpoint of its own dataset, otherwise
	 * a dataset can't automatically be created on the given path.
	 */
	if ((parent_zhp = mount2zhandle(parent)) == NULL)
		goto out;

	if (snprintf(zfs_name, len, "%s/%s", zfs_get_name(parent_zhp),
	    leaf) < len)
		rv = Z_OK;

	zfs_close(parent_zhp);

out:
	free(base_copy);
	free(dir_copy);
	return (rv);
}

/*
 * Dependents iterator call-back used purely as an existence test.  Being
 * invoked at all means a dependent (snapshot or clone) exists, so the handle
 * is closed and 1 is returned to stop the iteration immediately.
 */
/* ARGSUSED */
static int
has_dependent(zfs_handle_t *zhp, void *data)
{
	/* The dependent itself is of no interest; just release it. */
	zfs_close(zhp);

	return (1);
}

/*
 * Translate a snapshot name into the file system path under which the
 * snapshot's contents can be reached.  A snapshot name has the form
 * fs_name@snap_name; the path is <mountpoint of fs_name>/.zfs/snapshot/
 * followed by snap_name.  For example, snapshot pl/zones/z1@SUNWzone1 would
 * have a path of /pl/zones/z1/.zfs/snapshot/SUNWzone1.
 *
 * snap_name is modified temporarily but restored before returning.  Returns
 * Z_OK with the result in path (a buffer of len bytes), otherwise Z_ERR.
 */
static int
snap2path(char *snap_name, char *path, int len)
{
	char		mntpt[ZFS_MAXPROPLEN];
	char		*at;
	zfs_handle_t	*fs_zhp;
	int		rc;

	at = strrchr(snap_name, '@');
	if (at == NULL)
		return (Z_ERR);

	/* Cut the name at the '@' just long enough to open the fs. */
	*at = '\0';
	fs_zhp = zfs_open(g_zfs, snap_name, ZFS_TYPE_DATASET);
	*at = '@';
	if (fs_zhp == NULL)
		return (Z_ERR);

	/* The mount point is all that is needed from the file system. */
	rc = zfs_prop_get(fs_zhp, ZFS_PROP_MOUNTPOINT, mntpt, sizeof (mntpt),
	    NULL, NULL, 0, B_FALSE);
	zfs_close(fs_zhp);
	if (rc != 0)
		return (Z_ERR);

	/* at + 1 is the snap_name component following the '@'. */
	if (snprintf(path, len, "%s/.zfs/snapshot/%s", mntpt, at + 1) >= len)
		return (Z_ERR);

	return (Z_OK);
}

/*
 * This callback function is used to iterate through a snapshot's dependencies
 * to find a filesystem that is a direct clone of the snapshot being iterated
 * (i.e. whose origin property equals cd->snapshot).
 *
 * When one is found, any clone handle recorded earlier is closed, the new
 * handle is stored in cd->clone_zhp (left open) and 1 is returned to stop the
 * iteration.  Every other dataset is closed and 0 is returned.
 */
static int
get_direct_clone(zfs_handle_t *zhp, void *data)
{
	clone_data_t	*cd = data;
	char		origin[ZFS_MAXNAMELEN];

	if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
		zfs_close(zhp);
		return (0);
	}

	/* Make sure this is a direct clone of the snapshot we're iterating. */
	if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), NULL,
	    NULL, 0, B_FALSE) != 0 || strcmp(origin, cd->snapshot) != 0) {
		zfs_close(zhp);
		return (0);
	}

	/* A newer candidate replaces whatever was recorded before. */
	if (cd->clone_zhp != NULL)
		zfs_close(cd->clone_zhp);

	cd->clone_zhp = zhp;
	return (1);
}

/*
 * ZFS snapshot iterator call-back used to pick the clone to promote: the
 * clone of the youngest (i.e. last one taken) snapshot that has a clone.
 * Snapshots without clones are skipped.  A snapshot at least as recent as the
 * youngest one recorded so far has its dependents searched for a direct
 * clone; when one is found it is left in the call-back data and the recorded
 * creation time is advanced.  Returns 1 (stopping the iteration) only if the
 * dependents iteration reports -1, otherwise 0.
 */
static int
find_clone(zfs_handle_t *zhp, void *data)
{
	clone_data_t	*cd = data;
	time_t		created;
	int		rv = 0;

	/* If snapshot has no clones, skip it */
	if (zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES) == 0) {
		zfs_close(zhp);
		return (0);
	}

	/* get_direct_clone() compares each origin against this name. */
	cd->snapshot = zfs_get_name(zhp);

	created = (time_t)zfs_prop_get_int(zhp, ZFS_PROP_CREATION);

	if (created >= cd->origin_creation) {
		/*
		 * Iterate the dependents of this snapshot to find a clone
		 * that's a direct dependent.
		 */
		switch (zfs_iter_dependents(zhp, B_FALSE, get_direct_clone,
		    cd)) {
		case -1:
			rv = 1;
			break;
		case 1:
			/* Found a clone; remember how young its origin is. */
			cd->origin_creation = created;
			break;
		default:
			break;
		}
	}

	zfs_close(zhp);
	return (rv);
}

/*
 * ZFS snapshot iterator call-back used to remove standalone snapshots.  A
 * snapshot that still has clones is unexpected at this point and stops the
 * iteration with 1.  Otherwise the snapshot is unmounted and, if that works,
 * destroyed on a best-effort basis.  The handle is always closed.
 */
/* ARGSUSED */
static int
rm_snap(zfs_handle_t *zhp, void *data)
{
	int rv = 0;

	if (zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES) != 0) {
		/* If snapshot has clones, something is wrong */
		rv = 1;
	} else if (zfs_unmount(zhp, NULL, 0) == 0) {
		(void) zfs_destroy(zhp, B_FALSE);
	}

	zfs_close(zhp);
	return (rv);
}

/*
 * ZFS snapshot iterator call-back which renames the automatically generated
 * zone snapshots.  Each snapshot whose name starts with match_name is renamed
 * to match_name followed by the current value of max, which is then
 * incremented.  Returns nonzero if a rename fails, after printing a
 * diagnostic.  The handle is always closed.
 */
static int
rename_snap(zfs_handle_t *zhp, void *data)
{
	zfs_snapshot_data_t	*sdp = (zfs_snapshot_data_t *)data;
	char			newname[ZFS_MAXNAMELEN];
	int			failed;

	/*
	 * The iterator can hand back a snapshot again after it has been
	 * renamed further up the namespace, so stop once the expected number
	 * of snapshots has been processed.  Beyond that, only snapshots that
	 * we generate automatically when we clone are renamed.
	 */
	if (sdp->cntr >= sdp->num ||
	    zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT ||
	    strncmp(zfs_get_name(zhp), sdp->match_name, sdp->len) != 0) {
		zfs_close(zhp);
		return (0);
	}

	(void) snprintf(newname, sizeof (newname), "%s%d", sdp->match_name,
	    sdp->max);
	sdp->max++;

	failed = (zfs_rename(zhp, newname, B_FALSE) != 0);
	if (failed) {
		(void) fprintf(stderr, gettext("failed to rename snapshot %s "
		    "to %s: %s\n"), zfs_get_name(zhp), newname,
		    libzfs_error_description(g_zfs));
	}

	/* Failed attempts count towards the limit too. */
	sdp->cntr++;

	zfs_close(zhp);
	return (failed);
}

/*
 * Rename the source dataset's snapshots that are automatically generated when
 * we clone a zone so that there won't be a name collision when we promote the
 * cloned dataset.  Once the snapshots have been renamed, then promote the
 * clone.
 *
 * The snapshot rename process gets the highest number on the snapshot names
 * (the format is zonename@SUNWzoneXX where XX are digits) on both the source
 * and clone datasets, then renames the source dataset snapshots starting at
 * the next number.
 *
 * Handle ownership: on success (Z_OK) cln_zhp has been closed by this
 * function.  On failure (Z_ERR) cln_zhp is left open, so the caller can (and
 * must) close it exactly once.  src_zhp is never closed here.
 */
static int
promote_clone(zfs_handle_t *src_zhp, zfs_handle_t *cln_zhp)
{
	zfs_snapshot_data_t	sd;
	zfs_handle_t		*fresh_zhp;
	char			nm[ZFS_MAXNAMELEN];
	char			template[ZFS_MAXNAMELEN];

	(void) strlcpy(nm, zfs_get_name(cln_zhp), sizeof (nm));
	/*
	 * Start by getting the clone's snapshot max which we use
	 * during the rename of the original dataset's snapshots.
	 */
	(void) snprintf(template, sizeof (template), "%s@SUNWzone", nm);
	sd.match_name = template;
	sd.len = strlen(template);
	sd.max = 0;
	/* get_snap_max() increments num, so give it a defined start. */
	sd.num = 0;
	sd.cntr = 0;

	if (zfs_iter_snapshots(cln_zhp, get_snap_max, &sd) != 0)
		return (Z_ERR);

	/*
	 * Now make sure the source's snapshot max is at least as high as
	 * the clone's snapshot max.  Only the source's snapshots are to be
	 * renamed, so the count starts over.
	 */
	(void) snprintf(template, sizeof (template), "%s@SUNWzone",
	    zfs_get_name(src_zhp));
	sd.match_name = template;
	sd.len = strlen(template);
	sd.num = 0;

	if (zfs_iter_snapshots(src_zhp, get_snap_max, &sd) != 0)
		return (Z_ERR);

	/*
	 * Now rename the source dataset's snapshots so there's no
	 * conflict when we promote the clone.
	 */
	sd.max++;
	sd.cntr = 0;
	if (zfs_iter_snapshots(src_zhp, rename_snap, &sd) != 0)
		return (Z_ERR);

	/*
	 * Open a second handle on the clone to get the latest info.  The
	 * caller's handle is deliberately kept open until we know we have
	 * succeeded, so that the caller's error path doesn't close a handle
	 * that has already been closed.
	 */
	if ((fresh_zhp = zfs_open(g_zfs, nm, ZFS_TYPE_FILESYSTEM)) == NULL)
		return (Z_ERR);

	if (zfs_promote(fresh_zhp) != 0) {
		(void) fprintf(stderr, gettext("failed to promote %s: %s\n"),
		    nm, libzfs_error_description(g_zfs));
		zfs_close(fresh_zhp);
		return (Z_ERR);
	}

	zfs_close(fresh_zhp);
	zfs_close(cln_zhp);
	return (Z_OK);
}

/*
 * Promote the youngest clone.  That clone will then become the origin of all
 * of the other clones that were hanging off of the source dataset.
 *
 * Handle ownership: on failure (Z_ERR) zhp has been closed and must not be
 * used again by the caller.  On success (Z_OK) zhp remains open and valid;
 * if a clone was promoted, the handle's cached properties are refreshed so
 * that they reflect the dataset's state after the promotion.
 */
int
promote_all_clones(zfs_handle_t *zhp)
{
	clone_data_t	cd;

	cd.clone_zhp = NULL;
	cd.origin_creation = 0;
	cd.snapshot = NULL;

	if (zfs_iter_snapshots(zhp, find_clone, &cd) != 0) {
		/* A clone found before the failure must not be leaked. */
		if (cd.clone_zhp != NULL)
			zfs_close(cd.clone_zhp);
		zfs_close(zhp);
		return (Z_ERR);
	}

	/* Nothing to promote. */
	if (cd.clone_zhp == NULL)
		return (Z_OK);

	/* Found the youngest clone to promote.  Promote it. */
	if (promote_clone(zhp, cd.clone_zhp) != 0) {
		zfs_close(cd.clone_zhp);
		zfs_close(zhp);
		return (Z_ERR);
	}

	/*
	 * Get the latest info for the main dataset.  The handle is refreshed
	 * in place rather than closed and reopened: zhp is passed by value,
	 * so reopening it here would leave the caller holding a closed
	 * handle and leak the new one.
	 */
	zfs_refresh_properties(zhp);

	return (Z_OK);
}

/*
 * Install the zone from a pre-existing ZFS snapshot, either by making a
 * direct ZFS clone, if possible, or by copying the data from the snapshot to
 * the zonepath.
 *
 * The snapshot is first validated with the brand's validatesnap hook; a
 * rejected snapshot yields Z_NO_ENTRY.  If the zonepath maps to a ZFS dataset
 * name, a clone is attempted.  Should the snapshot turn out to be unusable
 * for cloning (Z_NO_ENTRY), the zone is put back into the configured state
 * and Z_NO_ENTRY is returned.  In every other case where no clone was made,
 * the contents of the snapshot are copied to the zonepath and the result of
 * that copy is returned.
 */
int
clone_snapshot_zfs(char *snap_name, char *zonepath, char *validatesnap)
{
	int	err = Z_OK;
	char	clone_name[MAXPATHLEN];
	char	snap_path[MAXPATHLEN];

	if (snap2path(snap_name, snap_path, sizeof (snap_path)) != Z_OK) {
		(void) fprintf(stderr, gettext("unable to find path for %s.\n"),
		    snap_name);
		return (Z_ERR);
	}

	if (validate_snapshot(snap_name, snap_path, validatesnap) != Z_OK)
		return (Z_NO_ENTRY);

	/* A ZFS clone is only possible if the zonepath maps to a dataset. */
	if (path2name(zonepath, clone_name, sizeof (clone_name)) == Z_OK) {
		err = clone_snap(snap_name, clone_name);
		if (err == Z_OK)
			return (Z_OK);

		if (err == Z_NO_ENTRY) {
			/*
			 * The snapshot is unusable for some reason so restore
			 * the zone state to configured since we were unable to
			 * actually do anything about getting the zone
			 * installed.
			 */
			int tmp;

			tmp = zone_set_state(target_zone,
			    ZONE_STATE_CONFIGURED);
			if (tmp != Z_OK) {
				errno = tmp;
				zperror2(target_zone,
				    gettext("could not set state"));
			}
			return (err);
		}
	}

	/*
	 * Either the zonepath cannot be ZFS cloned or cloning the snapshot
	 * failed.  Fall back to installing the zone by copying the data from
	 * within the snapshot to the zonepath.
	 */
	err = clone_copy(snap_path, zonepath);
	if (err == Z_OK && clean_out_clone() != Z_OK) {
		(void) fprintf(stderr, gettext("could not remove the "
		    "software inventory from %s\n"), zonepath);
	}

	return (err);
}

/*
 * Attempt to clone a source zone to a target zonepath by using a ZFS clone.
 * A new snapshot of the source zonepath's dataset is taken (running the
 * brand's pre- and post-snapshot hooks) and then cloned.  Returns Z_OK if
 * the clone was made and Z_ERR otherwise; on Z_ERR the caller is expected to
 * fall back to copying.
 */
int
clone_zfs(char *source_zonepath, char *zonepath, char *presnapbuf,
    char *postsnapbuf)
{
	zfs_handle_t	*src_zhp;
	zfs_handle_t	*snap_zhp;
	char		clone_name[MAXPATHLEN];
	char		snap_name[MAXPATHLEN];

	/*
	 * If no handle can be found for the source_zonepath then it is not
	 * ZFS, so there is nothing to clone.
	 */
	src_zhp = mount2zhandle(source_zonepath);
	if (src_zhp == NULL)
		return (Z_ERR);

	/*
	 * Check if there is a file system already mounted on zonepath.  If so,
	 * we can't clone to the path so we should fall back to copying.
	 */
	if (is_mountpnt(zonepath)) {
		zfs_close(src_zhp);
		(void) fprintf(stderr,
		    gettext("A file system is already mounted on %s,\n"
		    "preventing use of a ZFS clone.\n"), zonepath);
		return (Z_ERR);
	}

	/*
	 * The clone name is derived from the target zonepath with path2name
	 * rather than from the source zone's ZFS name.  Otherwise the clone
	 * would be created under the ZFS fs of the source instead of where
	 * the zonepath says.  For example,
	 *
	 * source_zonepath		zonepath
	 * /pl/zones/dev/z1		/pl/zones/deploy/z2
	 *
	 * We don't want the clone to be under "dev", we want it under
	 * "deploy", so that we can leverage the normal attribute inheritance
	 * that ZFS provides in the fs hierarchy.
	 */
	if (path2name(zonepath, clone_name, sizeof (clone_name)) != Z_OK ||
	    take_snapshot(src_zhp, snap_name, sizeof (snap_name), presnapbuf,
	    postsnapbuf) != Z_OK) {
		zfs_close(src_zhp);
		return (Z_ERR);
	}
	zfs_close(src_zhp);

	if (clone_snap(snap_name, clone_name) == Z_OK) {
		(void) printf(gettext("Instead of copying, a ZFS clone has "
		    "been created for this zone.\n"));
		return (Z_OK);
	}

	/* Cloning failed; clean up the snapshot we just took. */
	snap_zhp = zfs_open(g_zfs, snap_name, ZFS_TYPE_SNAPSHOT);
	if (snap_zhp != NULL) {
		if (zfs_unmount(snap_zhp, NULL, 0) == 0)
			(void) zfs_destroy(snap_zhp, B_FALSE);
		zfs_close(snap_zhp);
	}

	return (Z_ERR);
}

/*
 * Attempt to create a ZFS file system for the specified zonepath.
 * We either will successfully create a ZFS file system and get it mounted
 * on the zonepath or we don't.  The caller doesn't care since a regular
 * directory is used for the zonepath if no ZFS file system is mounted there.
 *
 * Nothing is done if the zonepath does not map to a ZFS dataset name or if
 * the dataset already exists.  The new file system is created with sharenfs
 * turned off and its mount point is restricted to owner-only permissions.
 * If mounting or the chmod fails, the new file system is destroyed again.
 */
void
create_zfs_zonepath(char *zonepath)
{
	zfs_handle_t	*zhp;
	char		zfs_name[MAXPATHLEN];
	nvlist_t	*props = NULL;

	if (path2name(zonepath, zfs_name, sizeof (zfs_name)) != Z_OK)
		return;

	/* Check if the dataset already exists. */
	if ((zhp = zfs_open(g_zfs, zfs_name, ZFS_TYPE_DATASET)) != NULL) {
		zfs_close(zhp);
		return;
	}

	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0 ||
	    nvlist_add_string(props, zfs_prop_to_name(ZFS_PROP_SHARENFS),
	    "off") != 0) {
		if (props != NULL)
			nvlist_free(props);
		(void) fprintf(stderr, gettext("cannot create ZFS dataset %s: "
		    "out of memory\n"), zfs_name);
		/*
		 * Give up here.  Carrying on would hand the freed property
		 * list to zfs_create() and then free it a second time.
		 */
		return;
	}

	if (zfs_create(g_zfs, zfs_name, ZFS_TYPE_FILESYSTEM, props) != 0 ||
	    (zhp = zfs_open(g_zfs, zfs_name, ZFS_TYPE_DATASET)) == NULL) {
		(void) fprintf(stderr, gettext("cannot create ZFS dataset %s: "
		    "%s\n"), zfs_name, libzfs_error_description(g_zfs));
		nvlist_free(props);
		return;
	}

	nvlist_free(props);

	if (zfs_mount(zhp, NULL, 0) != 0) {
		(void) fprintf(stderr, gettext("cannot mount ZFS dataset %s: "
		    "%s\n"), zfs_name, libzfs_error_description(g_zfs));
		(void) zfs_destroy(zhp, B_FALSE);
	} else {
		if (chmod(zonepath, S_IRWXU) != 0) {
			(void) fprintf(stderr, gettext("file system %s "
			    "successfully created, but chmod %o failed: %s\n"),
			    zfs_name, S_IRWXU, strerror(errno));
			(void) destroy_zfs(zonepath);
		} else {
			(void) printf(gettext("A ZFS file system has been "
			    "created for this zone.\n"));
		}
	}

	zfs_close(zhp);
}

/*
 * If the zonepath is a ZFS file system, attempt to destroy it.  We return Z_OK
 * if we were able to zfs_destroy the zonepath, otherwise we return Z_ERR
 * which means the caller should clean up the zonepath in the traditional
 * way.
 */
int
destroy_zfs(char *zonepath)
{
	zfs_handle_t	*zhp;
	boolean_t	is_clone = B_FALSE;
	char		origin[ZFS_MAXPROPLEN];

	if ((zhp = mount2zhandle(zonepath)) == NULL)
		return (Z_ERR);

	if (promote_all_clones(zhp) != 0)
		return (Z_ERR);

	/* Now cleanup any snapshots remaining. */
	if (zfs_iter_snapshots(zhp, rm_snap, NULL) != 0) {
		zfs_close(zhp);
		return (Z_ERR);
	}

	/*
	 * We can't destroy the file system if it has still has dependents.
	 * There shouldn't be any at this point, but we'll double check.
	 */
	if (zfs_iter_dependents(zhp, B_TRUE, has_dependent, NULL) != 0) {
		(void) fprintf(stderr, gettext("zfs destroy %s failed: the "
		    "dataset still has dependents\n"), zfs_get_name(zhp));
		zfs_close(zhp);
		return (Z_ERR);
	}

	/*
	 * This might be a clone.  Try to get the snapshot so we can attempt
	 * to destroy that as well.
	 */
	if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), NULL,
	    NULL, 0, B_FALSE) == 0)
		is_clone = B_TRUE;

	if (zfs_unmount(zhp, NULL, 0) != 0) {
		(void) fprintf(stderr, gettext("zfs unmount %s failed: %s\n"),
		    zfs_get_name(zhp), libzfs_error_description(g_zfs));
		zfs_close(zhp);
		return (Z_ERR);
	}

	if (zfs_destroy(zhp, B_FALSE) != 0) {
		/*
		 * If the destroy fails for some reason, try to remount
		 * the file system so that we can use "rm -rf" to clean up
		 * instead.
		 */
		(void) fprintf(stderr, gettext("zfs destroy %s failed: %s\n"),
		    zfs_get_name(zhp), libzfs_error_description(g_zfs));
		(void) zfs_mount(zhp, NULL, 0);
		zfs_close(zhp);
		return (Z_ERR);
	}

	/*
	 * If the zone has ever been moved then the mountpoint dir will not be
	 * cleaned up by the zfs_destroy().  To handle this case try to clean
	 * it up now but don't worry if it fails, that will be normal.
	 */
	(void) rmdir(zonepath);

	(void) printf(gettext("The ZFS file system for this zone has been "
	    "destroyed.\n"));

	if (is_clone) {
		zfs_handle_t	*ohp;

		/*
		 * Try to clean up the snapshot that the clone was taken from.
		 */
		if ((ohp = zfs_open(g_zfs, origin,
		    ZFS_TYPE_SNAPSHOT)) != NULL) {
			if (zfs_iter_dependents(ohp, B_TRUE, has_dependent,
			    NULL) == 0 && zfs_unmount(ohp, NULL, 0) == 0)
				(void) zfs_destroy(ohp, B_FALSE);
			zfs_close(ohp);
		}
	}

	zfs_close(zhp);
	return (Z_OK);
}

/*
 * Return true if the path is its own zfs file system.  We determine this
 * by stat-ing the path to see if it is zfs and stat-ing the parent to see
 * if it is a different fs.
 */
boolean_t
is_zonepath_zfs(char *zonepath)
{
	int res;
	char *path;
	char *parent;
	struct statvfs64 buf1, buf2;

	if (statvfs64(zonepath, &buf1) != 0)
		return (B_FALSE);

	if (strcmp(buf1.f_basetype, "zfs") != 0)
		return (B_FALSE);

	if ((path = strdup(zonepath)) == NULL)
		return (B_FALSE);

	parent = dirname(path);
	res = statvfs64(parent, &buf2);
	free(path);

	if (res != 0)
		return (B_FALSE);

	if (buf1.f_fsid == buf2.f_fsid)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Implement the fast move of a ZFS file system by simply updating the
 * mountpoint property.  Since the zonepath is a file system already, there
 * is no cross-file system copying to worry about.  Returns Z_OK if the
 * property was set, otherwise Z_ERR.
 */
int
move_zfs(char *zonepath, char *new_zonepath)
{
	zfs_handle_t	*zhp;
	int		rc;

	zhp = mount2zhandle(zonepath);
	if (zhp == NULL)
		return (Z_ERR);

	rc = zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
	    new_zonepath);
	if (rc == 0) {
		/*
		 * Clean up the old mount point.  We ignore any failure since
		 * the zone is already successfully mounted on the new path.
		 */
		(void) rmdir(zonepath);
	}

	zfs_close(zhp);

	return (rc == 0 ? Z_OK : Z_ERR);
}

/*
 * Validate every dataset resource in the zone configuration: the dataset must
 * exist on the system, its mountpoint must not be inherited, and neither it
 * nor its children may be zvols.  All datasets are checked even after a
 * failure, so that every problem is reported.  Returns Z_OK if all of them
 * pass, otherwise Z_ERR.
 *
 * Note that we don't do anything with the 'zoned' property here.  All
 * management is done in zoneadmd when the zone is actually rebooted.  This
 * allows us to automatically set the zoned property even when a zone is
 * rebooted by the administrator.
 */
int
verify_datasets(zone_dochandle_t handle)
{
	int		rv = Z_OK;
	struct zone_dstab dstab;
	zfs_handle_t	*zhp;
	char		propbuf[ZFS_MAXPROPLEN];
	char		source[ZFS_MAXNAMELEN];
	zprop_source_t	srctype;

	if (zonecfg_setdsent(handle) != Z_OK) {
		/*
		 * TRANSLATION_NOTE
		 * zfs and dataset are literals that should not be translated.
		 */
		(void) fprintf(stderr, gettext("could not verify zfs datasets: "
		    "unable to enumerate datasets\n"));
		return (Z_ERR);
	}

	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
		char *dsname = dstab.zone_dataset_name;

		zhp = zfs_open(g_zfs, dsname,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
		if (zhp == NULL) {
			(void) fprintf(stderr, gettext("could not verify zfs "
			    "dataset %s: %s\n"), dsname,
			    libzfs_error_description(g_zfs));
			rv = Z_ERR;
			continue;
		}

		if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, propbuf,
		    sizeof (propbuf), &srctype, source,
		    sizeof (source), 0) == 0 &&
		    srctype == ZPROP_SRC_INHERITED) {
			/* No further checks are made on this dataset. */
			(void) fprintf(stderr, gettext("could not verify zfs "
			    "dataset %s: mountpoint cannot be inherited\n"),
			    dsname);
			rv = Z_ERR;
		} else {
			if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) {
				(void) fprintf(stderr, gettext("cannot verify "
				    "zfs dataset %s: volumes cannot be "
				    "specified as a zone dataset resource\n"),
				    dsname);
				rv = Z_ERR;
			}

			/* check_zvol() reports any volume further down. */
			if (zfs_iter_children(zhp, check_zvol, NULL) != 0)
				rv = Z_ERR;
		}

		zfs_close(zhp);
	}
	(void) zonecfg_enddsent(handle);

	return (rv);
}

/*
 * Verify an fs resource of type zfs: the dataset named by zone_fs_special
 * must exist, must be a file system, and must have its mountpoint property
 * set to "legacy".  Returns Z_OK if so, otherwise prints a diagnostic and
 * returns Z_ERR.
 */
int
verify_fs_zfs(struct zone_fstab *fstab)
{
	zfs_handle_t	*zhp;
	char		mntpt[ZFS_MAXPROPLEN];
	int		rv = Z_OK;

	zhp = zfs_open(g_zfs, fstab->zone_fs_special, ZFS_TYPE_DATASET);
	if (zhp == NULL) {
		(void) fprintf(stderr, gettext("could not verify fs %s: "
		    "could not access zfs dataset '%s'\n"),
		    fstab->zone_fs_dir, fstab->zone_fs_special);
		return (Z_ERR);
	}

	if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
		(void) fprintf(stderr, gettext("cannot verify fs %s: "
		    "'%s' is not a file system\n"),
		    fstab->zone_fs_dir, fstab->zone_fs_special);
		rv = Z_ERR;
	} else if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mntpt,
	    sizeof (mntpt), NULL, NULL, 0, 0) != 0 ||
	    strcmp(mntpt, "legacy") != 0) {
		(void) fprintf(stderr, gettext("could not verify fs %s: "
		    "zfs '%s' mountpoint is not \"legacy\"\n"),
		    fstab->zone_fs_dir, fstab->zone_fs_special);
		rv = Z_ERR;
	}

	zfs_close(zhp);
	return (rv);
}

/*
 * Initialize the libzfs library handle (g_zfs) used by the rest of this
 * file.  Returns Z_OK on success; otherwise prints a diagnostic and returns
 * Z_ERR.
 */
int
init_zfs(void)
{
	g_zfs = libzfs_init();
	if (g_zfs != NULL)
		return (Z_OK);

	(void) fprintf(stderr, gettext("failed to initialize ZFS "
	    "library\n"));
	return (Z_ERR);
}