lvm/util/metaclust.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <meta.h>
#include <sdssc.h>
#include <signal.h>
#include <syslog.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/lvm/md_mirror.h>
#include <metad.h>

#define	MY_VERSION		"1.0"	/* the highest supported version */
#define	MAX_DEBUG_LEVEL		5	/* maximum verbosity level */

/*
 * Action bits for the mode argument of reset_state().  Each bit is tested
 * independently, so they may be OR'ed together.
 */
#define	RESET_OWNER		0x0001	/* unown mirrors whose owner left */
#define	CHOOSE_OWNER		0x0002	/* choose/re-establish mirror owner */
#define	RESET_ABR		0x0004	/* open+close device to reset ABR */
#define	UPDATE_ABR		0x0008	/* copy ABR state from the master */
#define	GET_MIRROR_STATE	0x0010	/* issue MD_MN_GET_MIRROR_STATE */

/*
 * Per-set information flags.
 * NOTE(review): not referenced in the visible part of this file; presumably
 * stored in main()'s set_info[] array - confirm against the later steps.
 */
#define	SET_INFO_NO_WR	0x0002
#define	SET_INFO_MN	0x0004

/*
 * Reconfiguration steps recognised by metaclust.  MC_UNK (zero) is the
 * value stepnum holds until a step name has been matched in step_table.
 */
typedef enum stpnum {
	MC_UNK		= 0,
	MC_START	= 1,
	MC_STOP		= 2,
	MC_ABORT	= 3,
	MC_RETURN	= 4,
	MC_STEP1	= 5,
	MC_STEP2	= 6,
	MC_STEP3	= 7,
	MC_STEP4	= 8
} stepnum_t;

/*
 * Structure for step_name -> step_number mapping
 */
struct step_t {
	char		*step_nam;	/* step name from the command line */
	stepnum_t	step_num;	/* matching stepnum_t value */
};

/*
 * Step name to step number mapping table
 * This table MUST be sorted alphabetically in ascending order of step name
 *
 * main() looks the step argument up with bsearch() and mc_compare(), which
 * compares names with strcmp(); an unsorted table would make that lookup
 * miss valid names.  sigalarmhandler() scans the table linearly by step
 * number to recover the name for its log message.
 */
static struct step_t step_table[] = {
	{ "abort",	MC_ABORT },
	{ "return",	MC_RETURN },
	{ "start",	MC_START },
	{ "step1",	MC_STEP1 },
	{ "step2",	MC_STEP2 },
	{ "step3",	MC_STEP3 },
	{ "step4",	MC_STEP4 },
	{ "stop",	MC_STOP }
};

/*
 * If support for a different version is added, the new version number should
 * be appended to the version_table below. This list will be searched to
 * determine if a version requested via the -V option is supported or not.
 *
 * main() performs that search linearly with strcmp(), and prints every
 * entry when -V is given without an argument.
 */
static char *version_table[] = {
	MY_VERSION
};

/*
 * Global state shared between main() and sigalarmhandler().
 *
 * timeout is parsed from -t and handed to alarm(); version is replaced by a
 * copy of the -V argument; stepnum is set from the step_table lookup; c_pid
 * is the result of fork() and is what the alarm handler cores and kills.
 */
uint_t	timeout = 0;			/* disable timeout by default */
char	*version = MY_VERSION;		/* use latest version by default */
int	stepnum = MC_UNK;		/* reconfiguration step number */
pid_t	c_pid;				/* child process id */

/*
 * bsearch() comparison routine for step_table lookups.
 *
 * stp1 is the search key, a NUL-terminated step name; stp2 is the
 * step_table entry being probed.  Returns the strcmp() ordering of the key
 * against that entry's name.
 */
static int
mc_compare(const void *stp1, const void *stp2)
{
	const char		*wanted = stp1;
	const struct step_t	*entry = stp2;

	return (strcmp(wanted, entry->step_nam));
}

/*
 * Timeout expiry alarm signal handler
 *
 * Installed by main() in the parent process for SIGALRM.  When the alarm
 * fires before the child (c_pid) has finished the current reconfig step,
 * this handler logs the timeout, tries to capture a core of the child with
 * gcore, kills the child and exits.  It never returns.
 *
 * Exit status:
 *	205 - step1-step4 only, and the child is gone: ask for a new
 *	      reconfig cycle
 *	  1 - the child could not be killed, or the step is not one of
 *	      step1-step4
 */
/*ARGSUSED*/
static void
sigalarmhandler(int sig)
{
	int	i, n, ret, stat_loc = 0;
	FILE	*pgcore;
	char	corecmd[256];

	/* find the table entry for the running step so that it can be named */
	n = sizeof (step_table) / sizeof (step_table[0]);
	for (i = 0; i < n; i++) {
		if (stepnum == step_table[i].step_num)
			break;
	}

	assert(i != n);

	meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"),
	    step_table[i].step_nam,
	    meta_print_hrtime(gethrtime() - start_time));

	/*
	 * See what the child was actually doing when the timeout expired.
	 * A core-dump of this would be _really_ good, so let's just
	 * try a 'gcore -g c_pid' and hope
	 */

	(void) memset(corecmd, 0, sizeof (corecmd));
	(void) snprintf(corecmd, sizeof (corecmd),
	    "/bin/gcore -g %d >/dev/null 2>&1", (int)c_pid);

	pgcore = popen(corecmd, "r");

	if (pgcore == NULL) {
		/* c_pid is a pid_t, so it must be printed as an integer */
		meta_mc_log(MC_LOG1, gettext("Could not grab core for pid %d"),
		    (int)c_pid);
	} else {
		(void) pclose(pgcore);
	}

	if ((ret = kill(c_pid, SIGKILL)) == 0) {
		/*
		 * The child will wait forever until the status is retrieved
		 * so get it now. Keep retrying if the call is interrupted.
		 *
		 * The possible results are,
		 *
		 *	- child killed successfully
		 *	- signal sent but child not killed
		 *	- waitpid failed/interrupted
		 */
		(void) sleep(2);
		while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) {
			if (errno != EINTR) {
				break;
			}
		}
		/*
		 * errno is only meaningful when waitpid() failed.  A return
		 * of 0 means the child has not changed state yet, so a stale
		 * ECHILD left by an earlier call must not be mistaken for
		 * "child already gone".
		 */
		if ((ret == c_pid) || ((ret < 0) && (errno == ECHILD))) {
			ret = 0;
		} else {
			ret = 1;
		}
	} else if (errno == ESRCH) {
		/*
		 * If the kill did not catch the child then it means the child
		 * exited immediately after the timeout occurred.
		 */
		ret = 0;
	}

	/*
	 * make sure not to exit with 205 for any steps other than step1-step4.
	 * Suncluster reconfiguration can't handle it otherwise.
	 */
	switch (stepnum) {
	case MC_STEP1:
	case MC_STEP2:
	case MC_STEP3:
	case MC_STEP4:
		/*
		 * If the child was killed successfully return 205 for a
		 * new reconfig cycle otherwise send 1 to panic the node.
		 */
		if (ret != 0) {
			md_eprintf(gettext("Could not kill child\n"));
			exit(1);
		} else {
			exit(205);
		}
		break;
	case MC_START:
	case MC_STOP:
	case MC_ABORT:
	case MC_RETURN:
	default:
		exit(1);
		break;
	}
}

/*
 * Attempt to load local set.
 *
 * On entry:	ep	- error return; holds the failure reason on a NULL
 *			  return
 *
 * Returns:
 *	pointer to mdsetname_t for local set (local_sp) if successful.
 *	NULL on failure:
 *		if the local set name cannot be obtained, or there are no
 *		local set mddbs (MDE_DB_NODB), no error message is printed.
 *		Otherwise, error message is printed so that user
 *		can determine why the local set didn't start.
 */
mdsetname_t *
load_local_set(md_error_t *ep)
{
	mdsetname_t	*local_sp;

	/* Does local set exist? If not, give no error */
	if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) {
		return (NULL);
	}

	/*
	 * snarf local set
	 * If this fails with MDE_DB_NODB, just return NULL without
	 * printing anything.
	 * Otherwise, print the error message and return NULL.
	 */
	if (meta_setup_db_locations(ep) != 0) {
		if (!(mdismddberror(ep, MDE_DB_NODB)))
			mde_perror(ep, "");
		return (NULL);
	}

	/* local set loaded successfully */
	return (local_sp);
}

/*
 * Purpose:	Compose a full path name for a metadevice
 *
 * On entry:	sp	- setname pointer
 *		mnum	- minor number of metadevice
 *		pathname - pointer to array to return path string
 *		pathlen	- max length of pathname array
 *
 * Returns:	 0 - pathname holds the complete raw device name
 *		-1 - mnum does not belong to sp, the name lookup failed, or
 *		     the name is empty or does not fit in pathlen bytes; a
 *		     message has been printed in each case
 */
static int
compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen)
{
	int	rtn;
	mdname_t	*np;
	md_error_t	status = mdnullerror;

	if (MD_MIN2SET(mnum) != sp->setno) {
		md_eprintf(gettext("minor number 0x%x invalid for set %d\n"),
		    mnum, sp->setno);
		return (-1);
	}

	if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) {
		/*
		 * Report why the lookup failed rather than dropping the
		 * error, and release whatever the error structure holds
		 * before it goes out of scope.
		 */
		mde_perror(&status, "");
		mdclrerror(&status);
		return (-1);
	}

	rtn = snprintf(pathname, pathlen, "%s", np->rname);

	/* fail on an output error, a truncated name or an empty name */
	if ((rtn < 0) || (rtn >= pathlen) || (pathname[0] == '\0')) {
		md_eprintf(gettext(
		    "Could not create path for device %s\n"),
		    get_mdname(sp, mnum));
		return (-1);
	}
	return (0);
}

/*
 * Purpose:	Walk through all the devices specified for the given set
 *		and do the action specified in mode
 *
 * On entry:	mode	- bitmask of RESET_OWNER, CHOOSE_OWNER, RESET_ABR,
 *			  UPDATE_ABR and GET_MIRROR_STATE
 *		sp	- set whose metadevices are walked
 *		drivername - MD_MIRROR walks the mirrors of the set; any
 *			  other name walks its soft partitions
 *		ep	- error return
 *
 * Returns:	 0 - the walk completed (devices that could not be opened
 *		     for the ABR actions are reported and skipped)
 *		-1 - an owner action was requested for a non-mirror driver,
 *		     or a lookup, ioctl or owner change failed
 */
static int
reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep)
{
	mdnamelist_t			*devnlp = NULL;
	mdnamelist_t			*p;
	mdname_t			*devnp = NULL;
	md_set_mmown_params_t		ownpar_p;
	md_set_mmown_params_t		*ownpar = &ownpar_p;
	md_unit_t			*mm;
	int				mirror_dev = 0;
	mndiskset_membershiplist_t	*nl = NULL;
	int				cnt;
	int				has_parent;
	md_mn_get_mir_state_t		mir_state_p;
	md_mn_get_mir_state_t		*mir_state = &mir_state_p;
	int				rval = -1;

	/*
	 * if we are choosing or resetting the owners then make sure
	 * we are only doing it for mirror devices
	 */
	mirror_dev = (strcmp(MD_MIRROR, drivername) == 0);
	if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) {
		return (-1);
	}

	/*
	 * Get a list of all the metadevices for the current set: mirrors
	 * when walking mirrors, soft partitions otherwise.  The two lookups
	 * are mutually exclusive so that a mirror walk neither collects the
	 * soft partitions nor fails because of them.
	 */
	if (mirror_dev) {
		if (meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) {
			mde_perror(ep,
			    gettext("Could not get mirrors for set %s"),
			    sp->setname);
			return (-1);
		}
	} else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
		mde_perror(ep, gettext(
		    "Could not get soft partitions for set %s"), sp->setname);
		return (-1);
	}

	/* If resetting the owner, get the known membership list */
	if (mode & RESET_OWNER) {
		if (meta_read_nodelist(&cnt, &nl, ep)) {
			mde_perror(ep, gettext("Could not get nodelist"));
			/* the name list was built successfully; release it */
			metafreenamelist(devnlp);
			return (-1);
		}
	}

	/* for each metadevice */
	for (p = devnlp; (p != NULL); p = p->next) {
		devnp = p->namep;

		/*
		 * Get the current setting for mirror ABR state and all of the
		 * submirror state and flags from the master node. We only
		 * perform this when going through a 'start' cycle.
		 */
		if ((mode & GET_MIRROR_STATE) && mirror_dev) {
			char	*miscname;

			/*
			 * Defensive check: only real mirrors are handed to
			 * the mirror ioctl, whatever the name list holds.
			 */
			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
				goto out;
			if (strcmp(miscname, MD_MIRROR) != 0)
				continue;

			/* start from a clean request, as is done for ownpar */
			(void) memset(mir_state, 0, sizeof (*mir_state));
			mir_state->mnum = meta_getminor(devnp->dev);
			MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno);
			meta_mc_log(MC_LOG4, gettext("Getting mirror state"
			    " for %s: %s"), get_mdname(sp, mir_state->mnum),
			    meta_print_hrtime(gethrtime() - start_time));

			if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep,
			    "MD_MN_GET_MIRROR_STATE") != 0) {
				mde_perror(ep, gettext("Unable to get "
				    "mirror state for %s"),
				    get_mdname(sp, mir_state->mnum));
				goto out;
			}

			/* nothing else is done for a mirror in this mode */
			continue;
		}

		/* check if this is a top level metadevice */
		if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL)
			goto out;
		if (MD_HAS_PARENT(MD_PARENT(mm))) {
			has_parent = 1;
		} else {
			has_parent = 0;
		}
		Free(mm);

		if (mode & (RESET_OWNER | CHOOSE_OWNER)) {
			char	*miscname;

			/*
			 * we can only do these for mirrors so make sure we
			 * really have a mirror device and not a softpartition
			 * imitating one.
			 */
			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
				goto out;
			if (strcmp(miscname, MD_MIRROR) != 0)
				continue;

			(void) memset(ownpar, 0, sizeof (*ownpar));
			ownpar->d.mnum = meta_getminor(devnp->dev);
			MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno);

			meta_mc_log(MC_LOG4, gettext("Setting owner "
			    "for %s: %s"), get_mdname(sp, ownpar->d.mnum),
			    meta_print_hrtime(gethrtime() - start_time));

			/* get the current owner id */
			if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep,
			    "MD_MN_GET_MM_OWNER") != 0) {
				mde_perror(ep, gettext("Unable to get "
				    "mirror owner for %s"),
				    get_mdname(sp, ownpar->d.mnum));
				goto out;
			}
		}

		if (mode & RESET_OWNER) {
			if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) {
				mdclrerror(ep);
				continue;
			}

			/*
			 * reset owner only if the current owner is
			 * not in the membership list
			 * Also kill the resync thread so that when the resync
			 * is started, it will perform an optimized resync
			 * for any resync regions that were dirty when the
			 * current owner left the membership.
			 */
			if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) {
				if (meta_mn_change_owner(&ownpar,
				    sp->setno, ownpar->d.mnum,
				    MD_MN_MIRROR_UNOWNED,
				    MD_MN_MM_ALLOW_CHANGE) == -1) {
					md_eprintf(gettext(
					    "Unable to reset mirror owner "
					    "for %s\n"),
					    get_mdname(sp, ownpar->d.mnum));
					goto out;
				}
				if (meta_mirror_resync(sp, devnp, 0, ep,
				    MD_RESYNC_KILL_NO_WAIT) != 0) {
					md_eprintf(gettext(
					    "Unable to kill resync for"
					    " %s\n"),
					    get_mdname(sp, ownpar->d.mnum));
					goto out;
				}
			}
		}

		if (mode & CHOOSE_OWNER) {
			/*
			 * only orphaned resyncs will have no owner.
			 * if that is the case choose a new owner. Otherwise
			 * re-establish the existing owner. This covers the
			 * case where a node that owned the mirror
			 * reboots/panics and comes back into the cluster before
			 * the reconfig cycle has completed. In this case the
			 * other cluster nodes will have the mirror owner marked
			 * as the rebooted node while it has the owner marked
			 * as 'None'. We have to reestablish the ownership so
			 * that the subsequent resync can continue.
			 */
			if (meta_mn_change_owner(&ownpar, sp->setno,
			    ownpar->d.mnum, ownpar->d.owner,
			    MD_MN_MM_CHOOSE_OWNER) == -1) {
				md_eprintf(gettext("Unable to choose "
				    "mirror owner for %s\n"),
				    get_mdname(sp, ownpar->d.mnum));
				goto out;
			}
		}

		/*
		 * For RESET_ABR and UPDATE_ABR - only handle top
		 * level metadevices.
		 */
		if (has_parent)
			continue;

		if (mode & RESET_ABR) {
			/*
			 * Reset the ABR (application based recovery)
			 * value on all nodes. We are dealing with
			 * the possibility that we have ABR set but the
			 * only node that had the device open with ABR has
			 * left the cluster. We simply open and close the
			 * device and if this is the last close in the
			 * cluster, ABR will be cleared on all nodes.
			 */
			char		*miscname;
			char		name[MAXPATHLEN];
			int		mnum, fd;

			name[0] = '\0';
			mnum = meta_getminor(devnp->dev);

			/*
			 * Defensive check: in the mirror-only call to
			 * RESET_ABR, skip anything that is not a mirror.
			 */
			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
				goto out;
			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
				continue;

			meta_mc_log(MC_LOG4, gettext("Re-setting ABR state "
			    "for %s: %s"), get_mdname(sp, mnum),
			    meta_print_hrtime(gethrtime() - start_time));

			/* compose the absolute device path and open it */
			if (compose_path(sp, mnum, &name[0],
			    sizeof (name)) != 0)
				goto out;
			if ((fd = open(name, O_RDWR, 0)) < 0) {
				/* best effort: report and try the next one */
				md_perror(gettext("Could not open device %s"),
				    name);
				continue;
			}

			(void) close(fd);
		}

		if (mode & UPDATE_ABR) {
			/*
			 * Update the ABR value on this node. We obtain the
			 * current ABR state from the master node.
			 */

			char		*miscname;
			char		name[MAXPATHLEN];
			int		mnum, fd;
			volcap_t	vc;
			uint_t		tstate;

			name[0] = '\0';
			mnum = meta_getminor(devnp->dev);

			/*
			 * Defensive check: in the mirror-only call to
			 * UPDATE_ABR, skip anything that is not a mirror.
			 */
			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
				goto out;
			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
				continue;

			/* Get tstate from Master */
			if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep)
			    != 0)
				continue;
			/* If not set on the master, nothing to do */
			if (!(tstate & MD_ABR_CAP))
				continue;

			meta_mc_log(MC_LOG4, gettext("Updating ABR state "
			    "for %s: %s"), get_mdname(sp, mnum),
			    meta_print_hrtime(gethrtime() - start_time));

			/* compose the absolute device path and open it */
			if (compose_path(sp, mnum, &name[0],
			    sizeof (name)) != 0)
				goto out;
			if ((fd = open(name, O_RDWR, 0)) < 0) {
				/* best effort: report and try the next one */
				md_perror(gettext("Could not open device %s"),
				    name);
				continue;
			}

			/* set ABR state */
			vc.vc_info = 0;
			vc.vc_set = 0;
			if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
				/*
				 * Ignore if device does not support this
				 * ioctl
				 */
				if ((errno != ENOTTY) && (errno != ENOTSUP)) {
					md_perror(gettext("Could not get "
					    "ABR/DMR state for device %s"),
					    name);
				}
				(void) close(fd);
				continue;
			}
			if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) {
				(void) close(fd);
				continue;
			}

			vc.vc_set = DKV_ABR_CAP;
			if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
				md_perror(gettext(
				    "Could not set ABR state for "
				    "device %s"), name);
				(void) close(fd);
				goto out;
			} else {
				md_eprintf(gettext(
				    "Setting ABR state on device %s\n"), name);
			}

			(void) close(fd);
		}
	}

	/* every device was walked without a fatal error */
	rval = 0;

out:
	/* cleanup, shared by the success and failure paths */
	if (mode & RESET_OWNER) {
		meta_free_nodelist(nl);
	}
	metafreenamelist(devnlp);
	return (rval);
}

/*
 * Print the command synopsis on stderr and leave via md_exit(sp, eval).
 *
 * When eval is zero (help was asked for) the synopsis is followed by the
 * valid debug levels, step names and the nodelist format.
 */
static void
usage(mdsetname_t *sp, int eval)
{
	char	*synopsis;
	char	*details;

	synopsis = gettext("usage:"
	    "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n"
	    "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n"
	    "\t%s [-V version] [-t timeout] [-d level] abort | stop\n"
	    "\t%s [-V | -? | -h]\n");
	(void) fprintf(stderr, synopsis, myname, myname, myname, myname);

	if (eval == 0) {
		details = gettext("\n"
		    "\tValid debug (-d) levels are 1-%d for increasing "
		    "verbosity.\n\tDefault is -d 3.\n\n"
		    "\tValid step values are: return | step1 | step2 | "
		    "step3 | step4\n\n"
		    "\tNodelist is a space-separated list of node id's\n\n");
		(void) fprintf(stderr, details, MAX_DEBUG_LEVEL);
	}

	md_exit(sp, eval);
}

/*
 * Input:	Input takes a config step name followed by a list of
 *		possible node id's.
 *
 * Returns:	  0 - Success
 *		  1 - Fail
 *			Node will be removed from cluster membership
 *			by forcing node to panic.
 *		205 - Unsuccessful. Start another reconfig cycle.
 *			Problem was encountered that could be fixed by
 *			running another reconfig cycle.
 *			Problem could be a result of a failure to read
 *			the nodelist file or that all work could not be
 *			accomplished in a reconfig step in the amount of
 *			time given so another reconfig cycle is needed in
 *			order to finish the current step.
 */
int
main(int argc, char **argv)
{
	mdsetname_t		*sp = NULL;
	md_error_t		status = mdnullerror;
	md_error_t		*ep = &status;
	set_t			max_sets, setno;
	int			c, clust = 0;
	struct sigaction	nsa, osa;
	struct step_t		*step_ptr;
	mdsetname_t		*local_sp = NULL;
	md_drive_desc		*dd;
	int			rval = 0;
	md_set_desc		*sd;
	mddb_block_parm_t	mbp;
	uint_t			debug = 3; /* log upto MC_LOG3 by default */
	int			version_table_size;
	mddb_setflags_config_t	sf;
	int			ret_val;
	mddb_config_t		cfg;
	int			set_info[MD_MAXSETS];
	long			commd_timeout = 0;

	/*
	 * Get the locale set up before calling any other routines
	 * with messages to output.  Just in case we're not in a build
	 * environment, make sure that TEXT_DOMAIN gets set to
	 * something.
	 */
#if !defined(TEXT_DOMAIN)
#define	TEXT_DOMAIN "SYS_TEST"
#endif
	(void) setlocale(LC_ALL, "");
	(void) textdomain(TEXT_DOMAIN);

	if ((clust = sdssc_bind_library()) == SDSSC_ERROR) {
		md_eprintf(gettext("Interface error with libsds_sc.so\n"));
		exit(1);
	}

	if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) {
		mde_perror(ep, "");
		md_exit(sp, 1);
	}

	/*
	 * open log and enable libmeta logging. Do it here explicitly
	 * rather than letting md_init() do it because we are not really
	 * a daemon and that is what md_init() opens the log as.
	 */
	openlog("metaclust", LOG_CONS, LOG_USER);

	version_table_size = sizeof (version_table) / sizeof (version_table[0]);

	optind = 1;
	opterr = 0;
	while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) {
		switch (c) {
		case 'h':
			usage(sp, 0);
			break;

		case 'd':
			if (sscanf(optarg, "%u", &debug) != 1) {
				md_eprintf(gettext("Invalid debug level\n"));
				md_exit(sp, 1);
			} else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) {
				debug = min(max(debug, 1), MAX_DEBUG_LEVEL);
				md_eprintf(gettext("Debug level must be "
				    "between 1 and %d inclusive.\n"),
				    MAX_DEBUG_LEVEL);
				md_eprintf(gettext("Debug level set to %d.\n"),
				    debug);
			}
			break;

		case 'V':
			version = Strdup(optarg);
			break;

		case 't':
			if (sscanf(optarg, "%u", &timeout) != 1) {
				md_eprintf(gettext("Invalid timeout value\n"));
				md_exit(sp, 1);
			}
			break;

		case '?':
			if (optopt == '?') {
				usage(sp, 0);
			} else if (optopt == 'V') {
				int	i;

				(void) fprintf(stdout, gettext(
				    "%s: Versions Supported:"), myname);
				for (i = 0; i < version_table_size; i++) {
					(void) fprintf(stdout, " %s",
					    version_table[i]);
				}
				(void) fprintf(stdout, "\n");
				md_exit(sp, 0);
			}
			/*FALLTHROUGH*/

		default:
			usage(sp, 1);
			break;
		}
	}

	/* initialise the debug level and start time */
	setup_mc_log(debug);

	/*
	 * check that the version specified (if any) is supported.
	 */
	if (version != NULL) {
		int	i, found = 0;

		for (i = 0; i < version_table_size; i++) {
			if (strcmp(version, version_table[i]) == 0) {
				found = 1;
				break;
			}
		}
		if (!found) {
			md_eprintf(gettext("Version %s not supported\n"),
			    version);
			md_exit(sp, 1);
		}
	}

	argc -= optind;
	argv += optind;

	/* parse arguments */
	if (argc <= 0) {
		usage(sp, 1);
	}

	/* convert the step name to the corresponding number */
	step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) /
	    sizeof (step_table[0])), sizeof (step_table[0]), mc_compare);
	if (step_ptr != NULL) {
		stepnum = step_ptr->step_num;
	}

	--argc;
	++argv;

	/* set timeout alarm signal, a value of 0 will disable timeout */
	if (timeout > 0) {
		int	stat_loc = 0;
		commd_timeout = (long)(timeout * .75);

		c_pid = fork();

		if (c_pid == (pid_t)-1) {
			md_perror(gettext("Unable to fork"));
			md_exit(sp, 1);
		} else if (c_pid) {
			/* parent */
			nsa.sa_flags = 0;
			if (sigfillset(&nsa.sa_mask) < 0) {
				md_perror(gettext("Unable to set signal mask"));
				md_exit(sp, 1);
			}

			nsa.sa_handler = sigalarmhandler;
			if (sigaction(SIGALRM, &nsa, &osa) == -1) {
				md_perror(gettext("Unable to set alarm "
				    "handler"));
				md_exit(sp, 1);
			}

			(void) alarm(timeout);

			/*
			 * wait for child to exit or timeout to expire.
			 * keep retrying if the call is interrupted
			 */
			while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) {
				if (errno != EINTR) {
					break;
				}
			}
			if (ret_val == c_pid) {
				/* exit with the childs exit value */
				exit(WEXITSTATUS(stat_loc));
			} else if (errno == ECHILD) {
				md_exit(sp, 0);
			} else {
				perror(myname);
				md_exit(sp, 1);
			}
		}
	}

	/*
	 * If a timeout value is given, everything from this point onwards is
	 * executed in the child process.
	 */

	switch (stepnum) {
	case MC_START:
		/*
		 * Start Step
		 *
		 * - Suspend all rpc.mdcommd messages
		 */

		/* expect the local node id to be given only */
		if (argc != 1)
			usage(sp, 1);

		meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"),
		    meta_print_hrtime(0));

		/*
		 * With multinode disksets configured we need to
		 * update all replicas on all cluster nodes to have
		 * the same status. If local replicas on a cluster
		 * node are not accessible we need to panic this
		 * node, otherwise we abort in the reconfig cycle
		 * and failfast/reboot the "good" cluster node too.
		 * To avoid a total cluster outage in the above case
		 * we panic only the failing node via md_exit(.., 1).
		 */
		if ((local_sp = load_local_set(ep)) == NULL) {
			/* panic the node */
			md_exit(local_sp, 1);
		}

		if ((max_sets = get_max_sets(ep)) == 0) {
			mde_perror(ep, "");
			md_exit(sp, 1);
		}

		/* start walking through all possible disksets */
		for (setno = 1; setno < max_sets; setno++) {
			if ((sp = metasetnosetname(setno, ep)) == NULL) {
				if (mdiserror(ep, MDE_NO_SET)) {
					/* No set for this setno - continue */
					mdclrerror(ep);
					continue;
				} else {
					mde_perror(ep, gettext("Unable to "
					    "get set %d information"), setno);
					md_exit(sp, 1);
				}
			}

			/* only check multi-node disksets */
			if (!meta_is_mn_set(sp, ep)) {
				mdclrerror(ep);
				continue;
			}

			meta_mc_log(MC_LOG3, gettext("Start - block parse "
			    "messages for set %s: %s"), sp->setname,
			    meta_print_hrtime(gethrtime() - start_time));

			/*
			 * Mddb parse messages are sent amongst the nodes
			 * in a diskset whenever the locator block or
			 * locator names structure has been changed.
			 * A locator block change could occur as a result
			 * of a disk failure during the reconfig cycle,
			 * so block the mddb parse messages while the
			 * rpc.mdcommd is suspended during the reconfig cycle.
			 */
			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
				(void) memset(&mbp, 0, sizeof (mbp));
				mbp.c_setno = setno;
				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
				    &mbp.c_mde, NULL)) {
					(void) mdstealerror(ep, &mbp.c_mde);
					mde_perror(ep, gettext("Could not "
					    "block set %s"), sp->setname);
					md_exit(sp, 1);
				}
			}

			/* suspend commd and spin waiting for drain */
			while ((ret_val = mdmn_suspend(setno,
			    MD_COMM_ALL_CLASSES, commd_timeout)) ==
			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
				(void) sleep(1);
			}

			if (ret_val) {
				md_eprintf(gettext("Could not suspend "
				    "rpc.mdcommd for set %s\n"), sp->setname);
				md_exit(sp, 1);
			}

			/*
			 * Set start step flag for set. This is set to indicate
			 * that this node entered the reconfig cycle through
			 * the start step.  This is used during the reconfig
			 * cycle to determine whether the node had entered
			 * through the start step or the return step.
			 */
			(void) memset(&sf, 0, sizeof (sf));
			sf.sf_setno = sp->setno;
			sf.sf_setflags = MD_SET_MN_START_RC;
			sf.sf_flags = MDDB_NM_SET;
			/* Use magic to help protect ioctl against attack. */
			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
			    &sf.sf_mde, NULL)) {
				(void) mdstealerror(ep, &sf.sf_mde);
				mde_perror(ep, gettext("Could not set "
				    "start_step flag for set %s"), sp->setname);
				md_exit(sp, 1);
			}

		}

		meta_mc_log(MC_LOG2, gettext("Start step completed: %s"),
		    meta_print_hrtime(gethrtime() - start_time));

		break;

	case MC_STOP:
		/*
		 * Stop Step
		 *
		 * - ???
		 */

		/* don't expect any more arguments to follow the step name */
		if (argc != 0)
			usage(sp, 1);

		break;

	case MC_ABORT:
		/*
		 * Abort Step
		 *
		 * - Abort rpc.mdcommd
		 */

		/* don't expect any more arguments to follow the step name */
		if (argc != 0)
			usage(sp, 1);

		meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"),
		    meta_print_hrtime(0));

		/*
		 * Does local set exist? If not, exit with 0
		 * since there's no reason to have this node panic if
		 * the local set cannot be started.
		 */
		if ((local_sp = load_local_set(ep)) == NULL) {
			md_exit(local_sp, 0);
		}

		/*
		 * abort the rpc.mdcommd.  The abort is only issued on this node
		 * meaning that the abort reconfig step is called on this
		 * node before a panic while the rest of the cluster will
		 * undergo a reconfig cycle.
		 * There is no time relation between this node running a
		 * reconfig abort and the rest of the cluster
		 * running a reconfig cycle meaning that this node may
		 * panic before, during or after the cluster has run
		 * a reconfig cycle.
		 */
		mdmn_abort();

		meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"),
		    meta_print_hrtime(gethrtime() - start_time));

		break;

	case MC_RETURN:
		/*
		 * Return Step
		 *
		 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL
		 *   and release local set lock.  Grabbing the local set
		 *   lock allows any active metaset/metadb commands to
		 *   terminate gracefully and will keep a metaset/metadb
		 *   command from starting until the DRAIN ALL is issued.
		 *   The metaset/metadb commands can issue
		 *   DRAIN ALL/RESUME ALL commands to rpc.mdcommd,
		 *   so the return step must not issue the DRAIN ALL command
		 *   until metaset/metadb have finished or metaset may issue
		 *   a RESUME ALL after this return reconfig step has issued
		 *   the DRAIN ALL command.
		 *   After this reconfig step has issued the DRAIN_ALL and
		 *   released the local set lock, metaset/metadb will fail
		 *   when attempting to contact the rpc.mdcommd and will
		 *   terminate without making any configuration changes.
		 *   The DRAIN ALL command will keep all other meta* commands
		 *   from running during the reconfig cycle (these commands
		 *   will wait until the rpc.mdcommd is resumed) since the
		 *   reconfig cycle may be changing the diskset configuration.
		 */

		/* expect the nodelist to follow the step name */
		if (argc < 1)
			usage(sp, 1);

		meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"),
		    meta_print_hrtime(0));

		/*
		 * Does local set exist? If not, exit with 0
		 * since there's no reason to have this node panic if
		 * the local set cannot be started.
		 */
		if ((local_sp = load_local_set(ep)) == NULL) {
			md_exit(local_sp, 0);
		}

		/*
		 * Suspend any mirror resyncs that are in progress. This
		 * stops unnecessary timeouts.
		 */
		meta_mirror_resync_block_all();

		if (meta_lock(local_sp, TRUE, ep) != 0) {
			mde_perror(ep, "");
			md_exit(local_sp, 1);
		}

		/*
		 * All metaset and metadb commands on this node have now
		 * terminated gracefully.  Now, issue a drain all to
		 * the rpc.mdcommd.  Any meta command issued after the
		 * drain all will either spin sending the command to the
		 * master until after the reconfig cycle has finished OR
		 * will terminate gracefully (metaset/metadb).
		 */
		if ((max_sets = get_max_sets(ep)) == 0) {
			mde_perror(ep, "");
			md_exit(sp, 1);
		}

		/* start walking through all possible disksets */
		for (setno = 1; setno < max_sets; setno++) {
			if ((sp = metasetnosetname(setno, ep)) == NULL) {
				if (mdiserror(ep, MDE_NO_SET)) {
					/* No set for this setno - continue */
					mdclrerror(ep);
					continue;
				} else {
					mde_perror(ep, gettext("Unable to "
					    "get set %d information"), setno);
					md_exit(sp, 1);
				}
			}

			/* only check multi-node disksets */
			if (!meta_is_mn_set(sp, ep)) {
				mdclrerror(ep);
				continue;
			}

			meta_mc_log(MC_LOG3, gettext("Return - block parse "
			    "messages for set %s: %s"), sp->setname,
			    meta_print_hrtime(gethrtime() - start_time));

			/*
			 * Mddb parse messages are sent amongst the nodes
			 * in a diskset whenever the locator block or
			 * locator names structure has been changed.
			 * A locator block change could occur as a result
			 * of a disk failure during the reconfig cycle,
			 * so block the mddb parse messages while the
			 * rpc.mdcommd is suspended during the reconfig cycle.
			 */
			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
				(void) memset(&mbp, 0, sizeof (mbp));
				mbp.c_setno = setno;
				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
				    &mbp.c_mde, NULL)) {
					(void) mdstealerror(ep, &mbp.c_mde);
					mde_perror(ep, gettext("Could not "
					    "block set %s"), sp->setname);
					md_exit(sp, 1);
				}
			}

			/* suspend commd and spin waiting for drain */
			while ((ret_val = mdmn_suspend(setno,
			    MD_COMM_ALL_CLASSES, commd_timeout)) ==
			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
				(void) sleep(1);
			}

			if (ret_val) {
				md_eprintf(gettext("Could not suspend "
				    "rpc.mdcommd for set %s\n"), sp->setname);
				md_exit(sp, 1);
			}
		}
		/*
		 * Resume all I/Os for this node for all MN sets in
		 * case master node had suspended I/Os but panic'd
		 * before resuming I/Os.  In case of failure, exit
		 * with a 1 since unable to resume I/Os on this node.
		 */
		if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) {
			mde_perror(ep, gettext(
			    "Unable to resume I/O on node %s for all sets"),
			    mynode());
			md_exit(sp, 1);
		}


		/*
		 * Can now unlock local set lock.  New metaset/metadb
		 * commands are now held off using drain all.
		 */
		(void) meta_unlock(local_sp, ep);

		meta_mc_log(MC_LOG2, gettext("Return step completed: %s"),
		    meta_print_hrtime(gethrtime() - start_time));

		break;

	case MC_STEP1:
		/*
		 * Step 1
		 *
		 * - Populate nodelist file if we are on clustering
		 *   and pick a master node for each MN diskset.
		 */

		/* expect the nodelist to follow the step name */
		if (argc < 1)
			usage(sp, 1);

		meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"),
		    meta_print_hrtime(0));

		/* Always write nodelist file even if no local set exists */
		if (clust == SDSSC_OKAY) {
			/* skip to the nodelist args */
			if (meta_write_nodelist(argc, argv, ep) != 0) {
				mde_perror(ep, gettext(
				    "Could not populate nodelist file"));
				md_exit(sp, 1);
			}
		}

		/*
		 * Does local set exist? If not, exit with 0
		 * since there's no reason to have this node panic if
		 * the local set cannot be started.
		 */
		if ((local_sp = load_local_set(ep)) == NULL) {
			md_exit(local_sp, 0);
		}

		/*
		 * At this point, all meta* commands are blocked across
		 * all disksets since the master rpc.mdcommd has drained or
		 * the master node has died.
		 * If a metaset or metadb command had been in progress
		 * at the start of the reconfig cycle, this command has
		 * either completed or it has been terminated due to
		 * the death of the master node.
		 *
		 * This means that it is now ok to remove any
		 * outstanding clnt_locks associated with multinode
		 * disksets on this node due to a node panic during
		 * a metaset operation.  This allows the routines that
		 * choose the master to use rpc.metad to determine the
		 * master of the diskset.
		 */
		if (clnt_clr_mnsetlock(mynode(), ep) != 0) {
			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
			    "clear locks failed %s"),
			    meta_print_hrtime(gethrtime() - start_time));
			md_exit(local_sp, 1);
		}

		/*
		 * Call reconfig_choose_master to choose a master for
		 * each MN diskset, update the nodelist for each diskset
		 * given the member information and send a reinit message
		 * to rpc.mdcommd to reload the nodelist.
		 */
		rval = meta_reconfig_choose_master(commd_timeout, ep);
		if (rval == 205) {
			/*
			 * NOTE: Should issue call to reboot remote host that
			 * is causing the RPC failure.  Clustering to
			 * provide interface in the future.  This should
			 * stop a never-ending set of 205 reconfig cycles.
			 * Remote host causing failure is stored in
			 * ep->host if ep is an RPC error.
			 * if (mdanyrpcerror(ep))
			 * 	reboot (ep->host);
			 */
			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
			    "choose master failure of 205 %s"),
			    meta_print_hrtime(gethrtime() - start_time));
			md_exit(local_sp, 205);
		} else if (rval != 0) {
			meta_mc_log(MC_LOG2, gettext("Step1 failure: "
			    "choose master failure %s"),
			    meta_print_hrtime(gethrtime() - start_time));
			md_exit(local_sp, 1);
		}

		meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"),
		    meta_print_hrtime(gethrtime() - start_time));

		md_exit(local_sp, rval);
		break;

	case MC_STEP2:
		/*
		 * Step 2
		 *
		 * In Step 2, each node walks the list of disksets.  If a
		 * node is a master of a MN diskset, it synchronizes
		 * the local set USER records for that diskset.
		 *
		 * If disks exist in the diskset and there is a joined
		 * (owner) node in the diskset, the master will also:
		 *	- synchronize the diskset mddbs to the master
		 *	- play the change log
		 *
		 * The master node will now attempt to join any unjoined
		 * nodes that are currently members in the membership list.
		 */

		/* expect the nodelist to follow the step name */
		if (argc < 1)
			usage(sp, 1);

		meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"),
		    meta_print_hrtime(0));

		/*
		 * Does local set exist? If not, exit with 0
		 * since there's no reason to have this node panic if
		 * the local set cannot be started.
		 */
		if ((local_sp = load_local_set(ep)) == NULL) {
			md_exit(local_sp, 0);
		}

		if ((max_sets = get_max_sets(ep)) == 0) {
			mde_perror(ep, "");
			md_exit(local_sp, 1);
		}

		/* start walking through all possible disksets */
		for (setno = 1; setno < max_sets; setno++) {
			if ((sp = metasetnosetname(setno, ep)) == NULL) {
				if (mdiserror(ep, MDE_NO_SET)) {
					/* No set for this setno - continue */
					mdclrerror(ep);
					continue;
				} else if (mdanyrpcerror(ep)) {
					/* Fail on RPC failure to self */
					mde_perror(ep, gettext(
					    "Unable to get information for "
					    "set number %d"), setno);
					md_exit(local_sp, 1);
				} else {
					mde_perror(ep, gettext(
					    "Unable to get information for "
					    "set number %d"), setno);
					mdclrerror(ep);
					continue;
				}
			}

			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
				if (mdanyrpcerror(ep)) {
					/* Fail on RPC failure to self */
					mde_perror(ep, gettext(
					    "Unable to get information for "
					    "set number %d"), setno);
					md_exit(local_sp, 1);
				}
				mde_perror(ep, gettext("Unable to get set "
				    "%s desc information"), sp->setname);
				mdclrerror(ep);
				continue;
			}

			/* Only check MN disksets */
			if (!(MD_MNSET_DESC(sd))) {
				continue;
			}

			/* All actions in step 2 are driven by master */
			if (!(sd->sd_mn_am_i_master)) {
				continue;
			}

			meta_mc_log(MC_LOG3, gettext("Step2 - begin record "
			    "synchronization for set %s: %s"), sp->setname,
			    meta_print_hrtime(gethrtime() - start_time));

			/*
			 * Synchronize the USER records in the local mddbs
			 * for hosts that are members.  The USER records
			 * contain set, drive and host information.
			 */
			rval = meta_mnsync_user_records(sp, ep);
			if (rval != 0) {
				mde_perror(ep, gettext(
				    "Synchronization of user records "
				    "in set %s failed\n"), sp->setname);
				if (rval == 205) {
					/*
					 * NOTE: Should issue call to reboot
					 * remote host that is causing the RPC
					 * failure.  Clustering to provide
					 * interface in the future.  This
					 * should stop a never-ending set of
					 * 205 reconfig cycles.
					 * Remote host causing failure is
					 * stored in ep->host if ep is an
					 * RPC error.
					 * if (mdanyrpcerror(ep))
					 * 	reboot (ep->host);
					 */
					md_exit(local_sp, 205);
				} else {
					md_exit(local_sp, 1);
				}
			}

			/* Reget sd since sync_user_recs may have flushed it */
			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
				mde_perror(ep, gettext("Unable to get set "
				    "%s desc information"), sp->setname);
				md_exit(local_sp, 1);
			}

			dd = metaget_drivedesc(sp,
			    (MD_BASICNAME_OK | PRINT_FAST), ep);
			if (! mdisok(ep)) {
				mde_perror(ep, gettext("Unable to get set "
				    "%s drive information"), sp->setname);
				md_exit(local_sp, 1);
			}

			/*
			 * No drives in set, continue to next set.
			 */
			if (dd == NULL) {
				/* Done with this set */
				continue;
			}

			meta_mc_log(MC_LOG3, gettext("Step2 - local set user "
			    "records completed for set %s: %s"), sp->setname,
			    meta_print_hrtime(gethrtime() - start_time));

			/*
			 * Synchronize the diskset mddbs for hosts
			 * that are members.  This may involve
			 * playing the changelog and writing out
			 * to the diskset mddbs.
			 */
			rval = meta_mnsync_diskset_mddbs(sp, ep);
			if (rval != 0) {
				mde_perror(ep, gettext(
				    "Synchronization of diskset mddbs "
				    "in set %s failed\n"), sp->setname);
				meta_mc_log(MC_LOG3, gettext("Step2 - diskset "
				    "mddb synchronization failed for "
				    "set %s: %s"), sp->setname,
				    meta_print_hrtime(gethrtime() -
				    start_time));
				if (rval == 205) {
					/*
					 * NOTE: Should issue call to reboot
					 * remote host that is causing the RPC
					 * failure.  Clustering to provide
					 * interface in the future.  This
					 * should stop a never-ending set of
					 * 205 reconfig cycles.
					 * Remote host causing failure is
					 * stored in ep->host if ep is an
					 * RPC error.
					 * if (mdanyrpcerror(ep))
					 * 	reboot (ep->host);
					 */
					md_exit(local_sp, 205);
				} else if (rval == 1) {
					continue;
				} else {
					md_exit(local_sp, 1);
				}
			}

			meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb "
			    "synchronization completed for set %s: %s"),
			    sp->setname,
			    meta_print_hrtime(gethrtime() - start_time));

			/* Join the starting nodes to the diskset */
			rval = meta_mnjoin_all(sp, ep);
			if (rval != 0) {
				mde_perror(ep, gettext(
				    "Join of non-owner (starting) nodes "
				    "in set %s failed\n"), sp->setname);
				meta_mc_log(MC_LOG3, gettext("Step2 - non owner"
				    "nodes joined for set %s: %s"),
				    sp->setname,
				    meta_print_hrtime(gethrtime() -
				    start_time));
				if (rval == 205) {
					/*
					 * NOTE: Should issue call to reboot
					 * remote host that is causing the RPC
					 * failure.  Clustering to provide
					 * interface in the future.  This
					 * should stop a never-ending set of
					 * 205 reconfig cycles.
					 * Remote host causing failure is
					 * stored in ep->host if ep is an
					 * RPC error.
					 * if (mdanyrpcerror(ep))
					 * 	reboot (ep->host);
					 */
					md_exit(local_sp, 205);
				} else {
					md_exit(local_sp, 1);
				}
			}

			meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes "
			    "joined for set %s: %s"), sp->setname,
			    meta_print_hrtime(gethrtime() - start_time));

		}

		meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"),
		    meta_print_hrtime(gethrtime() - start_time));

		break;

	case MC_STEP3:
		/*
		 * Step 3
		 *
		 * For all multinode sets do,
		 * - Reinitialise rpc.mdcommd
		 * - Reset mirror owners to null if the current owner is
		 *   no longer in the membership list
		 */

		/* expect the nodelist to follow the step name */
		if (argc < 1)
			usage(sp, 1);

		meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"),
		    meta_print_hrtime(0));

		/*
		 * Does local set exist? If not, exit with 0
		 * since there's no reason to have this node panic if
		 * the local set cannot be started.
		 */
		if ((local_sp = load_local_set(ep)) == NULL) {
			md_exit(local_sp, 0);
		}

		/*
		 * walk through all sets on this node which could include:
		 *	- MN disksets
		 *	- traditional disksets
		 *	- non-existent disksets
		 * reinitialise rpc.mdcommd and reset mirror owners for
		 * each MN set this node is joined to
		 */
		if ((max_sets = get_max_sets(ep)) == 0) {
			mde_perror(ep, "");
			md_exit(local_sp, 1);
		}

		/* start walking through all possible disksets */
		for (setno = 1; setno < max_sets; setno++) {
			if ((sp = metasetnosetname(setno, ep)) == NULL) {
				if (mdiserror(ep, MDE_NO_SET)) {
					/* No set for this setno - continue */
					mdclrerror(ep);
					continue;
				} else {
					mde_perror(ep, gettext("Unable to "
					    "get set %d information"), setno);
					md_exit(local_sp, 1);
				}
			}

			/* only check multi-node disksets */
			if (!meta_is_mn_set(sp, ep)) {
				mdclrerror(ep);
				continue;
			}

			if (meta_lock(sp, TRUE, ep) != 0) {
				mde_perror(ep, "");
				md_exit(local_sp, 1);
			}

			/* If this node isn't joined to set, do nothing */
			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
				if (!mdisok(ep)) {
					mde_perror(ep, gettext("Could "
					    "not get set %s ownership"),
					    sp->setname);
					md_exit(sp, 1);
				}
				mdclrerror(ep);
				(void) meta_unlock(sp, ep);
				continue;
			}

			meta_mc_log(MC_LOG3, gettext("Step3 - begin "
			    "re-initialising rpc.mdcommd and resetting mirror "
			    "owners for set %s: %s"), sp->setname,
			    meta_print_hrtime(gethrtime() - start_time));

			/* reinitialise rpc.mdcommd with new nodelist */
			if (mdmn_reinit_set(setno, commd_timeout)) {
				md_eprintf(gettext(
				    "Could not re-initialise rpc.mdcommd for "
				    "set %s\n"), sp->setname);
				md_exit(sp, 1);
			}

			(void) memset(&cfg, 0, sizeof (cfg));
			cfg.c_id = 0;
			cfg.c_setno = sp->setno;
			if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
			    NULL) != 0) {
				(void) mdstealerror(ep, &cfg.c_mde);
				mde_perror(ep, gettext("Could "
				    "not get set %s information"),
				    sp->setname);
				md_exit(sp, 1);
			}

			/* Don't do anything else if set is stale */
			if (cfg.c_flags & MDDB_C_STALE) {
				(void) meta_unlock(sp, ep);
				mdclrerror(ep);
				continue;
			}

			/* reset mirror owners */
			if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) {
				md_exit(sp, 1);
			}

			(void) meta_unlock(sp, ep);

			meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd "
			    "re-initialised and mirror owners reset for "
			    "set %s: %s"), sp->setname,
			    meta_print_hrtime(gethrtime() - start_time));
		}

		meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"),
		    meta_print_hrtime(gethrtime() - start_time));

		break;

	case MC_STEP4:
		/*
		 * Step 4
		 *
		 * For all multinode sets do:
		 * - Resume the rpc.mdcommd messages.  Must resume all
		 *	sets before issuing I/O to any set since an error
		 * 	encountered in a commd suspended set could be
		 *	blocked waiting for commd in another set to resume.
		 *	(This happens since the daemon queues service
		 *	all sets).  An open of a soft partition causes
		 *	a read of the watermarks during the open.
		 * - If set is non-writable (not an owner or STALE), then
		 *	continue to next set.
		 *
		 * For all multinode sets do,
		 * - Reset ABR states for all mirrors, ie clear ABR if not
		 *	open on any node.
		 * - Reset ABR states for all soft partitions, ie clear ABR if
		 *	not open on any node.
		 * - For all slave nodes that have entered through the start
		 *	step, update the ABR state to that of the master and
		 *	get the submirror state from the master
		 * - meta_lock set
		 * - Resync all mirrors
		 * - unlock meta_lock for this set.
		 * - Choose a new owner for any orphaned resyncs
		 *
		 * There is one potential issue here when concurrently
		 * resetting and updating the ABR state. If the master has ABR
		 * set, but should no longer have because the only node that
		 * had the metadevice open and had ABR set has panicked, the
		 * master will send a message to all nodes to clear the ABR
		 * state. Meanwhile any node that has come through the
		 * start step will get tstate from the master and will update
		 * ABR if it was set in tstate. So, we appear to have a problem
		 * if the following sequence occurs:-
		 * - The slave gets tstate with ABR set
		 * - The master sends a message to clear ABR
		 * - The slave updates ABR with the value it got from tstate.
		 * We now have the master with ABR clear and the slave with ABR
		 * set. Fortunately, having set ABR, the slave will close the
		 * metadevice after setting ABR and as there are no nodes with
		 * the device open, the close will send a message to clear ABR
		 * on all nodes. So, the nodes will all have ABR unset.
		 */

		/* expect the nodelist to follow the step name */
		if (argc < 1)
			usage(sp, 1);

		meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"),
		    meta_print_hrtime(0));

		/*
		 * Does local set exist? If not, exit with 0
		 * since there's no reason to have this node panic if
		 * the local set cannot be started.
		 */
		if ((local_sp = load_local_set(ep)) == NULL) {
			md_exit(local_sp, 0);
		}

		/*
		 * walk through all sets on this node which could include:
		 *	- MN disksets
		 *	- traditional disksets
		 *	- non-existent disksets
		 * start mirror resync for all MN sets
		 */
		if ((max_sets = get_max_sets(ep)) == 0) {
			mde_perror(ep, "");
			md_exit(local_sp, 1);
		}

		/* Clear set_info structure */
		for (setno = 1; setno < max_sets; setno++) {
			set_info[setno] = 0;
		}

		/* start walking through all possible disksets */
		for (setno = 1; setno < max_sets; setno++) {
			if ((sp = metasetnosetname(setno, ep)) == NULL) {
				if (mdiserror(ep, MDE_NO_SET)) {
					/* No set for this setno - continue */
					mdclrerror(ep);
					continue;
				} else {
					mde_perror(ep, gettext("Unable to "
					    "get set %d information"), setno);
					md_exit(local_sp, 1);
				}
			}

			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
				mde_perror(ep, gettext("Unable to get set "
				    "%s desc information"), sp->setname);
				mdclrerror(ep);
				continue;
			}

			/* only check multi-node disksets */
			if (!meta_is_mn_set(sp, ep)) {
				mdclrerror(ep);
				continue;
			}

			set_info[setno] |= SET_INFO_MN;

			/*
			 * If not an owner (all mddbs failed) or stale
			 * (< 50% mddbs operational), then set is
			 * non-writable so just resume commd and
			 * unblock mddb messages.
			 */
			mdclrerror(ep);
			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
				set_info[setno] |= SET_INFO_NO_WR;
			}
			if (!mdisok(ep)) {
				mde_perror(ep, gettext("Could "
				    "not get set %s ownership"),
				    sp->setname);
				md_exit(local_sp, 1);
			}
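			/*
			 * Set is owned - is it stale?
			 * NOTE(review): '!' binds tighter than '&', so the
			 * test below is (!set_info[setno]) & SET_INFO_NO_WR,
			 * which is always 0 as SET_INFO_NO_WR is 0x0002, and
			 * the stale check never runs.  The intended test is
			 * presumably !(set_info[setno] & SET_INFO_NO_WR).
			 */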
			if (!set_info[setno] & SET_INFO_NO_WR) {
				(void) memset(&cfg, 0, sizeof (cfg));
				cfg.c_id = 0;
				cfg.c_setno = sp->setno;
				if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
				    NULL) != 0) {
					(void) mdstealerror(ep, &cfg.c_mde);
					mde_perror(ep, gettext("Could "
					    "not get set %s information"),
					    sp->setname);
					md_exit(local_sp, 1);
				}
				if (cfg.c_flags & MDDB_C_STALE) {
					set_info[setno] |= SET_INFO_NO_WR;
				}
			}

			/* resume rpc.mdcommd */
			if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0,
			    commd_timeout)) {
				md_eprintf(gettext("Unable to resume "
				    "rpc.mdcommd for set %s\n"), sp->setname);
				md_exit(local_sp, 1);
			}

			/* Unblock mddb parse messages */
			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
				(void) memset(&mbp, 0, sizeof (mbp));
				mbp.c_setno = setno;
				mbp.c_blk_flags = MDDB_UNBLOCK_PARSE;
				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
				    &mbp.c_mde, NULL)) {
					(void) mdstealerror(ep, &mbp.c_mde);
					mde_perror(ep, gettext("Could not "
					    "unblock set %s"), sp->setname);
					md_exit(local_sp, 1);
				}
			}
			meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd "
			    "resumed and messages unblocked for set %s: %s"),
			    sp->setname,
			    meta_print_hrtime(gethrtime() - start_time));
		}

		for (setno = 1; setno < max_sets; setno++) {
			int			start_step;

			/* Skip traditional disksets. */
			if ((set_info[setno] & SET_INFO_MN) == 0)
				continue;

			/*
			 * If already determined that this set is
			 * a non-writable set, then just continue
			 * to next set since there's nothing else
			 * to do for a non-writable set.
			 */
			if (set_info[setno] & SET_INFO_NO_WR)
				continue;

			if ((sp = metasetnosetname(setno, ep)) == NULL) {
				if (mdiserror(ep, MDE_NO_SET)) {
					/* No set for this setno - continue */
					mdclrerror(ep);
					continue;
				} else {
					mde_perror(ep, gettext("Unable to "
					    "get set %d information"), setno);
					md_exit(local_sp, 1);
				}
			}

			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
				mde_perror(ep, gettext("Unable to get set "
				    "%s desc information"), sp->setname);
				mdclrerror(ep);
				continue;
			}

			/* See if this node came through the start step */
			(void) memset(&sf, 0, sizeof (sf));
			sf.sf_setno = sp->setno;
			sf.sf_flags = MDDB_NM_GET;
			/* Use magic to help protect ioctl against attack. */
			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
			if (metaioctl(MD_MN_GET_SETFLAGS, &sf,
			    &sf.sf_mde, NULL)) {
				(void) mdstealerror(ep, &sf.sf_mde);
				mde_perror(ep, gettext("Could not get "
				    "start_step flag for set %s"), sp->setname);
				md_exit(local_sp, 1);
			}
			start_step =
			    (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0;

			/*
			 * We can now reset the start_step flag for the set
			 * if it was already set.
			 */
			if (start_step) {
				(void) memset(&sf, 0, sizeof (sf));
					sf.sf_setno = sp->setno;
				sf.sf_setflags = MD_SET_MN_START_RC;
				sf.sf_flags = MDDB_NM_RESET;
				/*
				 * Use magic to help protect ioctl
				 * against attack.
				 */
				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
				    &sf.sf_mde, NULL)) {
					(void) mdstealerror(ep, &sf.sf_mde);
					mde_perror(ep,
					    gettext("Could not reset "
					    "start_step flag for set %s"),
					    sp->setname);
				}
			}

			meta_mc_log(MC_LOG3, gettext("Step4 - begin setting "
			    "ABR state and restarting io's for "
			    "set %s: %s"), sp->setname,
			    meta_print_hrtime(gethrtime() - start_time));


			/*
			 * If we are not the master and we have come through
			 * the start step, we must update the ABR states
			 * for mirrors and soft partitions. Also the submirror
			 * states need to be synchronised so that we see the
			 * same status as other previously joined members.
			 * This _must_ be done before starting the resync.
			 */
			if (!(sd->sd_mn_am_i_master) && start_step) {
				if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR,
				    ep) == -1) {
					md_exit(local_sp, 1);
				}
				if (reset_state(UPDATE_ABR, sp, MD_SP,
				    ep) == -1) {
					md_exit(local_sp, 1);
				}
				/*
				 * Mark the fact that we've got the mirror
				 * state. This allows the resync thread to
				 * determine if _it_ needs to issue this. This
				 * can happen if a node is added to a set after
				 * a reconfig cycle has completed.
				 */
				(void) memset(&sf, 0, sizeof (sf));
					sf.sf_setno = sp->setno;
				sf.sf_setflags = MD_SET_MN_MIR_STATE_RC;
				sf.sf_flags = MDDB_NM_SET;
				/*
				 * Use magic to help protect ioctl
				 * against attack.
				 */
				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
				    &sf.sf_mde, NULL)) {
					(void) mdstealerror(ep, &sf.sf_mde);
					mde_perror(ep,
					    gettext("Could not set "
					    "submirror state flag for set %s"),
					    sp->setname);
				}
			}

			/*
			 * All remaining actions are only performed by the
			 * master
			 */
			if (!(sd->sd_mn_am_i_master)) {
				if (meta_lock(sp, TRUE, ep) != 0) {
					mde_perror(ep, "");
					md_exit(local_sp, 1);
				}
				meta_mirror_resync_unblock(sp);
				(void) meta_unlock(sp, ep);
				continue;
			}

			/*
			 * If the master came through the start step, this
			 * implies that all of the nodes must have done the
			 * same and hence there can be no applications
			 * running. Hence no need to reset ABR
			 */
			if (!start_step) {
				/* Reset ABR state for mirrors */
				if (reset_state(RESET_ABR, sp, MD_MIRROR,
				    ep) == -1) {
					md_exit(local_sp, 1);
				}
				/* ...and now the same for soft partitions */
				if (reset_state(RESET_ABR, sp, MD_SP,
				    ep) == -1) {
					md_exit(local_sp, 1);
				}
			}

			/*
			 * choose owners for orphaned resyncs and reset
			 * non-orphaned resyncs so that an owner node that
			 * reboots will restart the resync if needed.
			 */
			if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1)
				md_exit(local_sp, 1);

			/*
			 * Must unlock set lock before meta_mirror_resync_all
			 * sends a message to run the metasync command
			 * which also grabs the meta_lock.
			 */
			if (meta_lock(sp, TRUE, ep) != 0) {
				mde_perror(ep, "");
				md_exit(local_sp, 1);
			}
			meta_mirror_resync_unblock(sp);
			(void) meta_unlock(sp, ep);

			/* resync all mirrors in set */
			if (meta_mirror_resync_all(sp, 0, ep) != 0) {
				mde_perror(ep, gettext("Mirror resyncs "
				    "failed for set %s"), sp->setname);
				md_exit(local_sp, 1);
			}

			meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted "
			    "for set %s: %s"), sp->setname,
			    meta_print_hrtime(gethrtime() - start_time));
		}

		meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"),
		    meta_print_hrtime(gethrtime() - start_time));

		break;

	default:
		usage(sp, 1);
		break;
	}

	md_exit(sp, 0);
	/* NOTREACHED */
	return (0);
}