xref: /titanic_41/usr/src/cmd/lvm/util/metaclust.c (revision b54157c1b1bf9673e4da8b526477d59202cd08a6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <meta.h>
30 #include <sdssc.h>
31 #include <signal.h>
32 #include <syslog.h>
33 #include <sys/types.h>
34 #include <sys/wait.h>
35 #include <sys/lvm/md_mirror.h>
36 #include <metad.h>
37 
/*
 * Version and verbosity limits
 */
#define	MY_VERSION		"1.0"	/* the highest supported version */
#define	MAX_DEBUG_LEVEL		5	/* maximum verbosity level */

/*
 * Action bits for the 'mode' argument of reset_state().
 * The bits are tested individually, so several may be combined.
 */
#define	RESET_OWNER		0x0001	/* unown mirrors of departed nodes */
#define	CHOOSE_OWNER		0x0002	/* choose/re-establish mirror owner */
#define	RESET_ABR		0x0004	/* open+close device to clear ABR */
#define	UPDATE_ABR		0x0008	/* copy ABR state from the master */
#define	GET_MIRROR_STATE	0x0010	/* fetch mirror state via ioctl */

/*
 * Per-set information bits.
 * NOTE(review): presumably stored in main()'s set_info[] array; their
 * use is not visible in this part of the file - confirm before relying
 * on the descriptions below.
 */
#define	SET_INFO_NO_WR	0x0002		/* assumed: no write access to set */
#define	SET_INFO_MN	0x0004		/* assumed: set is multi-node */
49 
/*
 * All the metaclust reconfig steps we understand.
 * MC_UNK is the value the global 'stepnum' holds until a step name given
 * on the command line has been matched against step_table.
 */
typedef enum stpnum {
	MC_UNK		= 0,
	MC_START	= 1,
	MC_STOP		= 2,
	MC_ABORT	= 3,
	MC_RETURN	= 4,
	MC_STEP1	= 5,
	MC_STEP2	= 6,
	MC_STEP3	= 7,
	MC_STEP4	= 8
} stepnum_t;
64 
65 /*
66  * Structure for step_name -> step_number mapping
67  */
68 struct step_t {
69 	char		*step_nam;
70 	stepnum_t	step_num;
71 };
72 
73 /*
74  * Step name to step number mapping table
75  * This table MUST be sorted alphabetically in ascending order of step name
76  */
77 static struct step_t step_table[] = {
78 	{ "abort",	MC_ABORT },
79 	{ "return",	MC_RETURN },
80 	{ "start",	MC_START },
81 	{ "step1",	MC_STEP1 },
82 	{ "step2",	MC_STEP2 },
83 	{ "step3",	MC_STEP3 },
84 	{ "step4",	MC_STEP4 },
85 	{ "stop",	MC_STOP }
86 };
87 
88 /*
89  * If support for a different version is added, the new version number should
90  * be appended to the version_table below. This list will be searched to
91  * determine if a version requested via the -V option is supported or not.
92  */
93 static char *version_table[] = {
94 	MY_VERSION
95 };
96 
97 uint_t	timeout = 0;			/* disable timeout by default */
98 char	*version = MY_VERSION;		/* use latest version by default */
99 int	stepnum = MC_UNK;		/* reconfiguration step number */
100 pid_t	c_pid;				/* child process id */
101 
102 /*
103  * Binary search comparison routine
104  */
105 static int
106 mc_compare(const void *stp1, const void *stp2)
107 {
108 	return (strcmp((const char *)stp1,
109 	    ((const struct step_t *)stp2)->step_nam));
110 }
111 
112 /*
113  * Timeout expiry alarm signal handler
114  */
115 /*ARGSUSED*/
116 static void
117 sigalarmhandler(int sig)
118 {
119 	int	i, n, ret, stat_loc = 0;
120 
121 	n = sizeof (step_table) / sizeof (step_table[0]);
122 	for (i = 0; i < n; i++) {
123 		if (stepnum == step_table[i].step_num)
124 			break;
125 	}
126 
127 	assert(i != n);
128 
129 	meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"),
130 	    step_table[i].step_nam,
131 	    meta_print_hrtime(gethrtime() - start_time));
132 
133 	if ((ret = kill(c_pid, SIGKILL)) == 0) {
134 		/*
135 		 * The child will wait forever until the status is retrieved
136 		 * so get it now. Keep retrying if the call is interrupted.
137 		 *
138 		 * The possible results are,
139 		 *
140 		 *	- child killed successfully
141 		 *	- signal sent but child not killed
142 		 *	- waitpid failed/interrupted
143 		 */
144 		sleep(2);
145 		while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) {
146 			if (errno != EINTR) {
147 				break;
148 			}
149 		}
150 		if ((ret == c_pid) || (errno == ECHILD)) {
151 			ret = 0;
152 		} else {
153 			ret = 1;
154 		}
155 	} else if (errno == ESRCH) {
156 		/*
157 		 * If the kill did not catch the child then it means the child
158 		 * exited immediately after the timeout occured.
159 		 */
160 		ret = 0;
161 	}
162 
163 	/*
164 	 * make sure not to exit with 205 for any steps other than step1-step4.
165 	 * Suncluster reconfiguration can't handle it otherwise.
166 	 */
167 	switch (stepnum) {
168 	case MC_STEP1:
169 	case MC_STEP2:
170 	case MC_STEP3:
171 	case MC_STEP4:
172 		/*
173 		 * If the child was killed successfully return 205 for a
174 		 * new reconfig cycle otherwise send 1 to panic the node.
175 		 */
176 		if (ret != 0) {
177 			md_eprintf(gettext("Could not kill child\n"));
178 			exit(1);
179 		} else {
180 			exit(205);
181 		}
182 		break;
183 	case MC_START:
184 	case MC_STOP:
185 	case MC_ABORT:
186 	case MC_RETURN:
187 	default:
188 		exit(1);
189 		break;
190 	}
191 }
192 
193 /*
194  * Attempt to load local set.
195  * Returns:
196  *	pointer to mdsetname_t for local set (local_sp) is successful.
197  *	0 if failure
198  *		if there are no local set mddbs, no error message is printed.
199  *		Otherwise, error message is printed so that user
200  *		can determine why the local set didn't start.
201  */
202 mdsetname_t *
203 load_local_set(md_error_t *ep)
204 {
205 	mdsetname_t	*local_sp = NULL;
206 
207 	/* Does local set exist? If not, give no error */
208 	if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) {
209 		return (0);
210 	}
211 
212 	/*
213 	 * snarf local set
214 	 * If fails with MDE_DB_NODB, then just return 1 printing
215 	 * no failure.
216 	 * Otherwise, print error message, and return 1.
217 	 */
218 	if (meta_setup_db_locations(ep) != 0) {
219 		if (!(mdismddberror(ep, MDE_DB_NODB)))
220 			mde_perror(ep, "");
221 		return (0);
222 	}
223 
224 	/* local set loaded successfully */
225 	return (local_sp);
226 }
227 
228 /*
229  * Purpose:	Compose a full path name for a metadevice
230  *
231  * On entry:	sp	- setname pointer
232  *		mnum	- minor number of metadevice
233  *		pathname - pointer to array to return path string
234  *		pathlen	- max length of pathname array
235  */
236 static int
237 compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen)
238 {
239 	int	rtn;
240 	mdname_t	*np;
241 	md_error_t	status = mdnullerror;
242 
243 	if (MD_MIN2SET(mnum) != sp->setno) {
244 		md_eprintf(gettext("minor number 0x%x invalid for set %d\n"),
245 		    mnum, sp->setno);
246 		return (-1);
247 	}
248 
249 	if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) {
250 		return (-1);
251 	}
252 
253 	rtn = snprintf(pathname, pathlen, "%s", np->rname);
254 
255 	if ((pathname[0] == '\0') || (rtn >= pathlen)) {
256 		md_eprintf(gettext(
257 		    "Could not create path for device %s\n"),
258 		    get_mdname(sp, mnum));
259 		return (-1);
260 	}
261 	return (0);
262 }
263 
264 /*
265  * Purpose:	Walk through all the devices specified for the given set
266  *		and do the action specified in mode
267  */
268 static int
269 reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep)
270 {
271 	mdnamelist_t			*devnlp = NULL;
272 	mdnamelist_t			*p;
273 	mdname_t			*devnp = NULL;
274 	md_set_mmown_params_t		ownpar_p;
275 	md_set_mmown_params_t		*ownpar = &ownpar_p;
276 	md_unit_t			*mm;
277 	int				mirror_dev = 0;
278 	mndiskset_membershiplist_t	*nl;
279 	int				cnt;
280 	int				has_parent;
281 	md_mn_get_mir_state_t		mir_state_p;
282 	md_mn_get_mir_state_t		*mir_state = &mir_state_p;
283 
284 	/*
285 	 * if we are choosing or resetting the owners then make sure
286 	 * we are only doing it for mirror devices
287 	 */
288 	mirror_dev = (strcmp(MD_MIRROR, drivername) == 0);
289 	if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) {
290 		return (-1);
291 	}
292 
293 	/* get a list of all the metadevices for current set */
294 	if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) {
295 		mde_perror(ep, gettext("Could not get mirrors for set %s"),
296 		    sp->setname);
297 		return (-1);
298 	} else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
299 		mde_perror(ep, gettext(
300 		    "Could not get soft partitions for set %s"), sp->setname);
301 		return (-1);
302 	}
303 
304 	/* If resetting the owner, get the known membership list */
305 	if (mode & RESET_OWNER) {
306 		if (meta_read_nodelist(&cnt, &nl, ep)) {
307 			mde_perror(ep, "Could not get nodelist");
308 			return (-1);
309 		}
310 	}
311 
312 	/* for each metadevice */
313 	for (p = devnlp; (p != NULL); p = p->next) {
314 		devnp = p->namep;
315 
316 		/*
317 		 * Get the current setting for mirror ABR state and all of the
318 		 * submirror state and flags from the master node. We only
319 		 * perform this when going through a 'start' cycle.
320 		 */
321 		if ((mode & GET_MIRROR_STATE) && mirror_dev) {
322 			char	*miscname;
323 
324 			/*
325 			 * Ensure that we ignore soft-parts that are returned
326 			 * from the meta_get_mirror_names() call
327 			 */
328 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
329 				goto out;
330 			if (strcmp(miscname, MD_MIRROR) != 0)
331 				continue;
332 
333 			mir_state->mnum = meta_getminor(devnp->dev);
334 			MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno);
335 			meta_mc_log(MC_LOG4, gettext("Getting mirror state"
336 			    " for %s: %s"), get_mdname(sp, mir_state->mnum),
337 			    meta_print_hrtime(gethrtime() - start_time));
338 
339 			if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep,
340 			    "MD_MN_GET_MIRROR_STATE") != 0) {
341 				mde_perror(ep, gettext("Unable to get "
342 				    "mirror state for %s"),
343 				    get_mdname(sp, mir_state->mnum));
344 				goto out;
345 			} else {
346 				continue;
347 			}
348 		}
349 
350 		/* check if this is a top level metadevice */
351 		if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL)
352 			goto out;
353 		if (MD_HAS_PARENT(MD_PARENT(mm))) {
354 			has_parent = 1;
355 		} else {
356 			has_parent = 0;
357 		}
358 		Free(mm);
359 
360 		if (mode & (RESET_OWNER | CHOOSE_OWNER)) {
361 			char	*miscname;
362 
363 			/*
364 			 * we can only do these for mirrors so make sure we
365 			 * really have a mirror device and not a softpartition
366 			 * imitating one. meta_get_mirror_names seems to think
367 			 * softparts on top of a mirror are mirrors!
368 			 */
369 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
370 				goto out;
371 			if (strcmp(miscname, MD_MIRROR) != 0)
372 				continue;
373 
374 			(void) memset(ownpar, 0, sizeof (*ownpar));
375 			ownpar->d.mnum = meta_getminor(devnp->dev);
376 			MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno);
377 
378 			meta_mc_log(MC_LOG4, gettext("Setting owner "
379 			    "for %s: %s"), get_mdname(sp, ownpar->d.mnum),
380 			    meta_print_hrtime(gethrtime() - start_time));
381 
382 			/* get the current owner id */
383 			if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep,
384 			    "MD_MN_GET_MM_OWNER") != 0) {
385 				mde_perror(ep, gettext("Unable to get "
386 				    "mirror owner for %s"),
387 				    get_mdname(sp, ownpar->d.mnum));
388 				goto out;
389 			}
390 		}
391 
392 		if (mode & RESET_OWNER) {
393 			if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) {
394 				mdclrerror(ep);
395 				continue;
396 			}
397 
398 			/*
399 			 * reset owner only if the current owner is
400 			 * not in the membership list
401 			 * Also kill the resync thread so that when the resync
402 			 * is started, it will perform an optimized resync
403 			 * for any resync regions that were dirty when the
404 			 * current owner left the membership.
405 			 */
406 			if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) {
407 				if (meta_mn_change_owner(&ownpar,
408 				    sp->setno, ownpar->d.mnum,
409 				    MD_MN_MIRROR_UNOWNED,
410 				    MD_MN_MM_ALLOW_CHANGE) == -1) {
411 					md_eprintf(gettext(
412 					    "Unable to reset mirror owner "
413 					    "for %s\n"),
414 					    get_mdname(sp, ownpar->d.mnum));
415 					goto out;
416 				}
417 				if (meta_mirror_resync(sp, devnp, 0, ep,
418 				    MD_RESYNC_KILL_NO_WAIT) != 0) {
419 					md_eprintf(gettext(
420 					    "Unable to kill resync for"
421 					    " %s\n"),
422 					    get_mdname(sp, ownpar->d.mnum));
423 					goto out;
424 				}
425 			}
426 		}
427 
428 		if (mode & CHOOSE_OWNER) {
429 			/*
430 			 * only orphaned resyncs will have no owner.
431 			 * if that is the case choose a new owner. Otherwise
432 			 * re-establish the existing owner. This covers the
433 			 * case where a node that owned the mirror
434 			 * reboots/panics and comes back into the cluster before
435 			 * the reconfig cycle has completed. In this case the
436 			 * other cluster nodes will have the mirror owner marked
437 			 * as the rebooted node while it has the owner marked
438 			 * as 'None'. We have to reestablish the ownership so
439 			 * that the subsequent resync can continue.
440 			 */
441 			if (meta_mn_change_owner(&ownpar, sp->setno,
442 			    ownpar->d.mnum, ownpar->d.owner,
443 			    MD_MN_MM_CHOOSE_OWNER) == -1) {
444 				md_eprintf(gettext("Unable to choose "
445 				    "mirror owner for %s\n"),
446 				    get_mdname(sp, ownpar->d.mnum));
447 				goto out;
448 			}
449 		}
450 
451 		/*
452 		 * For RESET_ABR and UPDATE_ABR - only handle top
453 		 * level metadevices.
454 		 */
455 		if (has_parent)
456 			continue;
457 
458 		if (mode & RESET_ABR) {
459 			/*
460 			 * Reset the ABR (application based recovery)
461 			 * value on all nodes. We are dealing with
462 			 * the possibility that we have ABR set but the
463 			 * only node that had the device open with ABR has
464 			 * left the cluster. We simply open and close the
465 			 * device and if this is the last close in the
466 			 * cluster, ABR will be cleared on all nodes.
467 			 */
468 			char		*miscname;
469 			char		name[MAXPATHLEN];
470 			int		mnum, fd;
471 
472 			name[0] = '\0';
473 			mnum = meta_getminor(devnp->dev);
474 
475 			/*
476 			 * Ensure that we don't include soft-parts in the
477 			 * mirror-only call to RESET_ABR. meta_get_mirror_names
478 			 * returns a bogus list that includes all soft-parts
479 			 * built on mirrors.
480 			 */
481 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
482 				goto out;
483 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
484 				continue;
485 
486 			meta_mc_log(MC_LOG4, gettext("Re-setting ABR state "
487 			    "for %s: %s"), get_mdname(sp, mnum),
488 			    meta_print_hrtime(gethrtime() - start_time));
489 
490 			/* compose the absolute device path and open it */
491 			if (compose_path(sp, mnum, &name[0],
492 			    sizeof (name)) != 0)
493 				goto out;
494 			if ((fd = open(name, O_RDWR, 0)) < 0) {
495 				md_perror(gettext("Could not open device %s"),
496 				    name);
497 				continue;
498 			}
499 
500 			(void) close(fd);
501 		}
502 
503 		if (mode & UPDATE_ABR) {
504 			/*
505 			 * Update the ABR value on this node. We obtain the
506 			 * current ABR state from the master node.
507 			 */
508 
509 			char		*miscname;
510 			char		name[MAXPATHLEN];
511 			int		mnum, fd;
512 			volcap_t	vc;
513 			uint_t		tstate;
514 
515 			name[0] = '\0';
516 			mnum = meta_getminor(devnp->dev);
517 
518 			/*
519 			 * Ensure that we don't include soft-parts in the
520 			 * mirror-only call to UPDATE_ABR. meta_get_mirror_names
521 			 * returns a bogus list that includes all soft-parts
522 			 * built on mirrors.
523 			 */
524 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
525 				goto out;
526 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
527 				continue;
528 
529 			/* Get tstate from Master */
530 			if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep)
531 			    != 0)
532 				continue;
533 			/* If not set on the master, nothing to do */
534 			if (!(tstate & MD_ABR_CAP))
535 				continue;
536 
537 			meta_mc_log(MC_LOG4, gettext("Updating ABR state "
538 			    "for %s: %s"), get_mdname(sp, mnum),
539 			    meta_print_hrtime(gethrtime() - start_time));
540 
541 			/* compose the absolute device path and open it */
542 			if (compose_path(sp, mnum, &name[0],
543 			    sizeof (name)) != 0)
544 				goto out;
545 			if ((fd = open(name, O_RDWR, 0)) < 0) {
546 				md_perror(gettext("Could not open device %s"),
547 				    name);
548 				continue;
549 			}
550 
551 			/* set ABR state */
552 			vc.vc_info = 0;
553 			vc.vc_set = 0;
554 			if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
555 				/*
556 				 * Ignore if device does not support this
557 				 * ioctl
558 				 */
559 				if ((errno != ENOTTY) && (errno != ENOTSUP)) {
560 					md_perror(gettext("Could not get "
561 					    "ABR/DMR state for device %s"),
562 					    name);
563 				}
564 				(void) close(fd);
565 				continue;
566 			}
567 			if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) {
568 				(void) close(fd);
569 				continue;
570 			}
571 
572 			vc.vc_set = DKV_ABR_CAP;
573 			if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
574 				md_perror(gettext(
575 				    "Could not set ABR state for "
576 				    "device %s"), name);
577 				(void) close(fd);
578 				goto out;
579 			} else {
580 				md_eprintf(gettext(
581 				    "Setting ABR state on device %s\n"), name);
582 			}
583 
584 			(void) close(fd);
585 		}
586 	}
587 
588 	/* cleanup */
589 	if (mode & RESET_OWNER) {
590 		meta_free_nodelist(nl);
591 	}
592 	metafreenamelist(devnlp);
593 	return (0);
594 
595 out:
596 	/* cleanup */
597 	if (mode & RESET_OWNER) {
598 		meta_free_nodelist(nl);
599 	}
600 	metafreenamelist(devnlp);
601 	return (-1);
602 }
603 
604 /*
605  * Print usage message
606  */
607 static void
608 usage(mdsetname_t *sp, int eval)
609 {
610 	(void) fprintf(stderr, gettext("usage:"
611 	    "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n"
612 	    "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n"
613 	    "\t%s [-V version] [-t timeout] [-d level] abort | stop\n"
614 	    "\t%s [-V | -? | -h]\n"),
615 	    myname, myname, myname, myname);
616 	if (!eval) {
617 		fprintf(stderr, gettext("\n"
618 		    "\tValid debug (-d) levels are 1-%d for increasing "
619 		    "verbosity.\n\tDefault is -d 3.\n\n"
620 		    "\tValid step values are: return | step1 | step2 | "
621 		    "step3 | step4\n\n"
622 		    "\tNodelist is a space-separated list of node id's\n\n"),
623 		    MAX_DEBUG_LEVEL);
624 	}
625 	md_exit(sp, eval);
626 }
627 
628 /*
629  * Input:	Input takes a config step name followed by a list of
630  *		possible node id's.
631  *
632  * Returns:	  0 - Success
633  *		  1 - Fail
634  *			Node will be removed from cluster membership
635  *			by forcing node to panic.
636  *		205 - Unsuccessful. Start another reconfig cycle.
637  *			Problem was encountered that could be fixed by
638  *			running another reconfig cycle.
639  *			Problem could be a result of a failure to read
640  *			the nodelist file or that all work could not be
641  *			accomplished in a reconfig step in the amount of
642  *			time given so another reconfig cycle is needed in
643  *			order to finish the current step.
644  */
645 int
646 main(int argc, char **argv)
647 {
648 	mdsetname_t		*sp = NULL;
649 	md_error_t		status = mdnullerror;
650 	md_error_t		*ep = &status;
651 	set_t			max_sets, setno;
652 	int			c, clust = 0;
653 	struct sigaction	nsa, osa;
654 	struct step_t		*step_ptr;
655 	mdsetname_t		*local_sp = NULL;
656 	md_drive_desc		*dd;
657 	int			rval = 0;
658 	md_set_desc		*sd;
659 	mddb_block_parm_t	mbp;
660 	uint_t			debug = 3; /* log upto MC_LOG3 by default */
661 	int			version_table_size;
662 	mddb_setflags_config_t	sf;
663 	int			ret_val;
664 	mddb_config_t		cfg;
665 	int			set_info[MD_MAXSETS];
666 	long			commd_timeout = 0;
667 
668 	/*
669 	 * Get the locale set up before calling any other routines
670 	 * with messages to output.  Just in case we're not in a build
671 	 * environment, make sure that TEXT_DOMAIN gets set to
672 	 * something.
673 	 */
674 #if !defined(TEXT_DOMAIN)
675 #define	TEXT_DOMAIN "SYS_TEST"
676 #endif
677 	(void) setlocale(LC_ALL, "");
678 	(void) textdomain(TEXT_DOMAIN);
679 
680 	if ((clust = sdssc_bind_library()) == SDSSC_ERROR) {
681 		md_eprintf(gettext("Interface error with libsds_sc.so\n"));
682 		exit(1);
683 	}
684 
685 	if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) {
686 		mde_perror(ep, "");
687 		md_exit(sp, 1);
688 	}
689 
690 	/*
691 	 * open log and enable libmeta logging. Do it here explicitly
692 	 * rather than letting md_init() do it because we are not really
693 	 * a daemon and that is what md_init() opens the log as.
694 	 */
695 	openlog("metaclust", LOG_CONS, LOG_USER);
696 
697 	version_table_size = sizeof (version_table) / sizeof (version_table[0]);
698 
699 	optind = 1;
700 	opterr = 0;
701 	while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) {
702 		switch (c) {
703 		case 'h':
704 			usage(sp, 0);
705 			break;
706 
707 		case 'd':
708 			if (sscanf(optarg, "%u", &debug) != 1) {
709 				md_eprintf(gettext("Invalid debug level\n"));
710 				md_exit(sp, 1);
711 			} else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) {
712 				debug = min(max(debug, 1), MAX_DEBUG_LEVEL);
713 				md_eprintf(gettext("Debug level must be "
714 				    "between 1 and %d inclusive.\n"),
715 				    MAX_DEBUG_LEVEL);
716 				md_eprintf(gettext("Debug level set to %d.\n"),
717 				    debug);
718 			}
719 			break;
720 
721 		case 'V':
722 			version = Strdup(optarg);
723 			break;
724 
725 		case 't':
726 			if (sscanf(optarg, "%u", &timeout) != 1) {
727 				md_eprintf(gettext("Invalid timeout value\n"));
728 				md_exit(sp, 1);
729 			}
730 			break;
731 
732 		case '?':
733 			if (optopt == '?') {
734 				usage(sp, 0);
735 			} else if (optopt == 'V') {
736 				int	i;
737 
738 				fprintf(stdout, gettext(
739 				    "%s: Versions Supported:"), myname);
740 				for (i = 0; i < version_table_size; i++) {
741 					fprintf(stdout, " %s",
742 					    version_table[i]);
743 				}
744 				fprintf(stdout, "\n");
745 				md_exit(sp, 0);
746 			}
747 			/*FALLTHROUGH*/
748 
749 		default:
750 			usage(sp, 1);
751 			break;
752 		}
753 	}
754 
755 	/* initialise the debug level and start time */
756 	setup_mc_log(debug);
757 
758 	/*
759 	 * check that the version specified (if any) is supported.
760 	 */
761 	if (version != NULL) {
762 		int	i, found = 0;
763 
764 		for (i = 0; i < version_table_size; i++) {
765 			if (strcmp(version, version_table[i]) == 0) {
766 				found = 1;
767 				break;
768 			}
769 		}
770 		if (!found) {
771 			md_eprintf(gettext("Version %s not supported\n"),
772 			    version);
773 			md_exit(sp, 1);
774 		}
775 	}
776 
777 	argc -= optind;
778 	argv += optind;
779 
780 	/* parse arguments */
781 	if (argc <= 0) {
782 		usage(sp, 1);
783 	}
784 
785 	/* convert the step name to the corresponding number */
786 	step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) /
787 	    sizeof (step_table[0])), sizeof (step_table[0]), mc_compare);
788 	if (step_ptr != NULL) {
789 		stepnum = step_ptr->step_num;
790 	}
791 
792 	--argc;
793 	++argv;
794 
795 	/* set timeout alarm signal, a value of 0 will disable timeout */
796 	if (timeout > 0) {
797 		int	stat_loc = 0;
798 		commd_timeout = (long)(timeout * .75);
799 
800 		c_pid = fork();
801 
802 		if (c_pid == (pid_t)-1) {
803 			md_perror(gettext("Unable to fork"));
804 			md_exit(sp, 1);
805 		} else if (c_pid) {
806 			/* parent */
807 			nsa.sa_flags = 0;
808 			if (sigfillset(&nsa.sa_mask) < 0) {
809 				md_perror(gettext("Unable to set signal mask"));
810 				md_exit(sp, 1);
811 			}
812 
813 			nsa.sa_handler = sigalarmhandler;
814 			if (sigaction(SIGALRM, &nsa, &osa) == -1) {
815 				md_perror(gettext("Unable to set alarm "
816 				    "handler"));
817 				md_exit(sp, 1);
818 			}
819 
820 			(void) alarm(timeout);
821 
822 			/*
823 			 * wait for child to exit or timeout to expire.
824 			 * keep retrying if the call is interrupted
825 			 */
826 			while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) {
827 				if (errno != EINTR) {
828 					break;
829 				}
830 			}
831 			if (ret_val == c_pid) {
832 				/* exit with the childs exit value */
833 				exit(WEXITSTATUS(stat_loc));
834 			} else if (errno == ECHILD) {
835 				md_exit(sp, 0);
836 			} else {
837 				perror(myname);
838 				md_exit(sp, 1);
839 			}
840 		}
841 	}
842 
843 	/*
844 	 * If a timeout value is given, everything from this point onwards is
845 	 * executed in the child process.
846 	 */
847 
848 	switch (stepnum) {
849 	case MC_START:
850 		/*
851 		 * Start Step
852 		 *
853 		 * - Suspend all rpc.mdcommd messages
854 		 */
855 
856 		/* expect the local node id to be given only */
857 		if (argc != 1)
858 			usage(sp, 1);
859 
860 		meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"),
861 		    meta_print_hrtime(0));
862 
863 		/*
864 		 * Does local set exist? If not, exit with 0
865 		 * since there's no reason to have this node panic if
866 		 * the local set cannot be started.
867 		 */
868 		if ((local_sp = load_local_set(ep)) == NULL) {
869 			md_exit(local_sp, 0);
870 		}
871 
872 		if ((max_sets = get_max_sets(ep)) == 0) {
873 			mde_perror(ep, "");
874 			md_exit(sp, 1);
875 		}
876 
877 		/* start walking through all possible disksets */
878 		for (setno = 1; setno < max_sets; setno++) {
879 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
880 				if (mdiserror(ep, MDE_NO_SET)) {
881 					/* No set for this setno - continue */
882 					mdclrerror(ep);
883 					continue;
884 				} else {
885 					mde_perror(ep, gettext("Unable to "
886 					    "get set %d information"), setno);
887 					md_exit(sp, 1);
888 				}
889 			}
890 
891 			/* only check multi-node disksets */
892 			if (!meta_is_mn_set(sp, ep)) {
893 				mdclrerror(ep);
894 				continue;
895 			}
896 
897 			meta_mc_log(MC_LOG3, gettext("Start - block parse "
898 			    "messages for set %s: %s"), sp->setname,
899 			    meta_print_hrtime(gethrtime() - start_time));
900 
901 			/*
902 			 * Mddb parse messages are sent amongst the nodes
903 			 * in a diskset whenever the locator block or
904 			 * locator names structure has been changed.
905 			 * A locator block change could occur as a result
906 			 * of a disk failure during the reconfig cycle,
907 			 * so block the mddb parse messages while the
908 			 * rpc.mdcommd is suspended during the reconfig cycle.
909 			 */
910 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
911 				(void) memset(&mbp, 0, sizeof (mbp));
912 				mbp.c_setno = setno;
913 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
914 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
915 				    &mbp.c_mde, NULL)) {
916 					mdstealerror(ep, &mbp.c_mde);
917 					mde_perror(ep, gettext("Could not "
918 					    "block set %s"), sp->setname);
919 					md_exit(sp, 1);
920 				}
921 			}
922 
923 			/* suspend commd and spin waiting for drain */
924 			while ((ret_val = mdmn_suspend(setno,
925 			    MD_COMM_ALL_CLASSES, commd_timeout)) ==
926 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
927 				sleep(1);
928 			}
929 
930 			if (ret_val) {
931 				md_eprintf(gettext("Could not suspend "
932 				    "rpc.mdcommd for set %s\n"), sp->setname);
933 				md_exit(sp, 1);
934 			}
935 
936 			/*
937 			 * Set start step flag for set. This is set to indicate
938 			 * that this node entered the reconfig cycle through
939 			 * the start step.  This is used during the reconfig
940 			 * cycle to determine whether the node had entered
941 			 * through the start step or the return step.
942 			 */
943 			(void) memset(&sf, 0, sizeof (sf));
944 			sf.sf_setno = sp->setno;
945 			sf.sf_setflags = MD_SET_MN_START_RC;
946 			sf.sf_flags = MDDB_NM_SET;
947 			/* Use magic to help protect ioctl against attack. */
948 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
949 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
950 			    &sf.sf_mde, NULL)) {
951 				mdstealerror(ep, &sf.sf_mde);
952 				mde_perror(ep, gettext("Could not set "
953 				    "start_step flag for set %s"), sp->setname);
954 				md_exit(sp, 1);
955 			}
956 
957 		}
958 
959 		meta_mc_log(MC_LOG2, gettext("Start step completed: %s"),
960 		    meta_print_hrtime(gethrtime() - start_time));
961 
962 		break;
963 
964 	case MC_STOP:
965 		/*
966 		 * Stop Step
967 		 *
968 		 * - ???
969 		 */
970 
971 		/* don't expect any more arguments to follow the step name */
972 		if (argc != 0)
973 			usage(sp, 1);
974 
975 		break;
976 
977 	case MC_ABORT:
978 		/*
979 		 * Abort Step
980 		 *
981 		 * - Abort rpc.mdcommd
982 		 */
983 
984 		/* don't expect any more arguments to follow the step name */
985 		if (argc != 0)
986 			usage(sp, 1);
987 
988 		meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"),
989 		    meta_print_hrtime(0));
990 
991 		/*
992 		 * Does local set exist? If not, exit with 0
993 		 * since there's no reason to have this node panic if
994 		 * the local set cannot be started.
995 		 */
996 		if ((local_sp = load_local_set(ep)) == NULL) {
997 			md_exit(local_sp, 0);
998 		}
999 
1000 		/*
1001 		 * abort the rpc.mdcommd.  The abort is only issued on this node
1002 		 * meaning that the abort reconfig step is called on this
1003 		 * node before a panic while the rest of the cluster will
1004 		 * undergo a reconfig cycle.
1005 		 * There is no time relation between this node running a
1006 		 * reconfig abort and the rest of the cluster
1007 		 * running a reconfig cycle meaning that this node may
1008 		 * panic before, during or after the cluster has run
1009 		 * a reconfig cycle.
1010 		 */
1011 		mdmn_abort();
1012 
1013 		meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"),
1014 		    meta_print_hrtime(gethrtime() - start_time));
1015 
1016 		break;
1017 
1018 	case MC_RETURN:
1019 		/*
1020 		 * Return Step
1021 		 *
1022 		 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL
1023 		 *   and release local set lock.  Grabbing the local set
1024 		 *   lock allows any active metaset/metadb commands to
1025 		 *   terminate gracefully and will keep a metaset/metadb
1026 		 *   command from starting until the DRAIN ALL is issued.
1027 		 *   The metaset/metadb commands can issue
1028 		 *   DRAIN ALL/RESUME ALL commands to rpc.mdcommd,
1029 		 *   so the return step must not issue the DRAIN ALL command
1030 		 *   until metaset/metadb have finished or metaset may issue
1031 		 *   a RESUME ALL after this return reconfig step has issued
1032 		 *   the DRAIN ALL command.
1033 		 *   After this reconfig step has issued the DRAIN_ALL and
1034 		 *   released the local set lock, metaset/metadb will fail
1035 		 *   when attempting to contact the rpc.mdcommd and will
1036 		 *   terminate without making any configuration changes.
1037 		 *   The DRAIN ALL command will keep all other meta* commands
1038 		 *   from running during the reconfig cycle (these commands
1039 		 *   will wait until the rpc.mdcommd is resumed) since the
1040 		 *   reconfig cycle may be changing the diskset configuration.
1041 		 */
1042 
1043 		/* expect the nodelist to follow the step name */
1044 		if (argc < 1)
1045 			usage(sp, 1);
1046 
1047 		meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"),
1048 		    meta_print_hrtime(0));
1049 
1050 		/*
1051 		 * Does local set exist? If not, exit with 0
1052 		 * since there's no reason to have this node panic if
1053 		 * the local set cannot be started.
1054 		 */
1055 		if ((local_sp = load_local_set(ep)) == NULL) {
1056 			md_exit(local_sp, 0);
1057 		}
1058 
1059 		/*
1060 		 * Suspend any mirror resyncs that are in progress. This
1061 		 * stops unnecessary timeouts.
1062 		 */
1063 		meta_mirror_resync_block_all();
1064 
1065 		if (meta_lock(local_sp, TRUE, ep) != 0) {
1066 			mde_perror(ep, "");
1067 			md_exit(local_sp, 1);
1068 		}
1069 
1070 		/*
1071 		 * All metaset and metadb commands on this node have now
1072 		 * terminated gracefully.  Now, issue a drain all to
1073 		 * the rpc.mdcommd.  Any meta command issued after the
1074 		 * drain all will either spin sending the command to the
1075 		 * master until after the reconfig cycle has finished OR
1076 		 * will terminate gracefully (metaset/metadb).
1077 		 */
1078 		if ((max_sets = get_max_sets(ep)) == 0) {
1079 			mde_perror(ep, "");
1080 			md_exit(sp, 1);
1081 		}
1082 
1083 		/* start walking through all possible disksets */
1084 		for (setno = 1; setno < max_sets; setno++) {
1085 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1086 				if (mdiserror(ep, MDE_NO_SET)) {
1087 					/* No set for this setno - continue */
1088 					mdclrerror(ep);
1089 					continue;
1090 				} else {
1091 					mde_perror(ep, gettext("Unable to "
1092 					    "get set %d information"), setno);
1093 					md_exit(sp, 1);
1094 				}
1095 			}
1096 
1097 			/* only check multi-node disksets */
1098 			if (!meta_is_mn_set(sp, ep)) {
1099 				mdclrerror(ep);
1100 				continue;
1101 			}
1102 
1103 			meta_mc_log(MC_LOG3, gettext("Return - block parse "
1104 			    "messages for set %s: %s"), sp->setname,
1105 			    meta_print_hrtime(gethrtime() - start_time));
1106 
1107 			/*
1108 			 * Mddb parse messages are sent amongst the nodes
1109 			 * in a diskset whenever the locator block or
1110 			 * locator names structure has been changed.
1111 			 * A locator block change could occur as a result
1112 			 * of a disk failure during the reconfig cycle,
1113 			 * so block the mddb parse messages while the
1114 			 * rpc.mdcommd is suspended during the reconfig cycle.
1115 			 */
1116 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1117 				(void) memset(&mbp, 0, sizeof (mbp));
1118 				mbp.c_setno = setno;
1119 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
1120 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1121 				    &mbp.c_mde, NULL)) {
1122 					mdstealerror(ep, &mbp.c_mde);
1123 					mde_perror(ep, gettext("Could not "
1124 					    "block set %s"), sp->setname);
1125 					md_exit(sp, 1);
1126 				}
1127 			}
1128 
1129 			/* suspend commd and spin waiting for drain */
1130 			while ((ret_val = mdmn_suspend(setno,
1131 			    MD_COMM_ALL_CLASSES, commd_timeout)) ==
1132 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
1133 				sleep(1);
1134 			}
1135 
1136 			if (ret_val) {
1137 				md_eprintf(gettext("Could not suspend "
1138 				    "rpc.mdcommd for set %s\n"), sp->setname);
1139 				md_exit(sp, 1);
1140 			}
1141 		}
1142 		/*
1143 		 * Resume all I/Os for this node for all MN sets in
1144 		 * case master node had suspended I/Os but panic'd
1145 		 * before resuming I/Os.  In case of failure, exit
1146 		 * with a 1 since unable to resume I/Os on this node.
1147 		 */
1148 		if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) {
1149 			mde_perror(ep, gettext(
1150 			    "Unable to resume I/O on node %s for all sets"),
1151 			    mynode());
1152 			md_exit(sp, 1);
1153 		}
1154 
1155 
1156 		/*
1157 		 * Can now unlock local set lock.  New metaset/metadb
1158 		 * commands are now held off using drain all.
1159 		 */
1160 		(void) meta_unlock(local_sp, ep);
1161 
1162 		meta_mc_log(MC_LOG2, gettext("Return step completed: %s"),
1163 		    meta_print_hrtime(gethrtime() - start_time));
1164 
1165 		break;
1166 
1167 	case MC_STEP1:
1168 		/*
1169 		 * Step 1
1170 		 *
1171 		 * - Populate nodelist file if we are on clustering
1172 		 *   and pick a master node for each MN diskset.
1173 		 */
1174 
1175 		/* expect the nodelist to follow the step name */
1176 		if (argc < 1)
1177 			usage(sp, 1);
1178 
1179 		meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"),
1180 		    meta_print_hrtime(0));
1181 
1182 		/* Always write nodelist file even if no local set exists */
1183 		if (clust == SDSSC_OKAY) {
1184 			/* skip to the nodelist args */
1185 			if (meta_write_nodelist(argc, argv, ep) != 0) {
1186 				mde_perror(ep, gettext(
1187 				    "Could not populate nodelist file"));
1188 				md_exit(sp, 1);
1189 			}
1190 		}
1191 
1192 		/*
1193 		 * Does local set exist? If not, exit with 0
1194 		 * since there's no reason to have this node panic if
1195 		 * the local set cannot be started.
1196 		 */
1197 		if ((local_sp = load_local_set(ep)) == NULL) {
1198 			md_exit(local_sp, 0);
1199 		}
1200 
1201 		/*
1202 		 * At this point, all meta* commands are blocked across
1203 		 * all disksets since the master rpc.mdcommd has drained or
1204 		 * the master node has died.
1205 		 * If a metaset or metadb command had been in progress
1206 		 * at the start of the reconfig cycle, this command has
1207 		 * either completed or it has been terminated due to
1208 		 * the death of the master node.
1209 		 *
1210 		 * This means that it is now ok to remove any
1211 		 * outstanding clnt_locks associated with multinode
1212 		 * disksets on this node due to a node panic during
1213 		 * a metaset operation.  This allows the routines that
1214 		 * choose the master to use rpc.metad to determine the
1215 		 * master of the diskset.
1216 		 */
1217 		if (clnt_clr_mnsetlock(mynode(), ep) != 0) {
1218 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1219 			    "clear locks failed %s"),
1220 			    meta_print_hrtime(gethrtime() - start_time));
1221 			md_exit(local_sp, 1);
1222 		}
1223 
1224 		/*
1225 		 * Call reconfig_choose_master to choose a master for
1226 		 * each MN diskset, update the nodelist for each diskset
1227 		 * given the member information and send a reinit message
1228 		 * to rpc.mdcommd to reload the nodelist.
1229 		 */
1230 		rval = meta_reconfig_choose_master(commd_timeout, ep);
1231 		if (rval == 205) {
1232 			/*
1233 			 * NOTE: Should issue call to reboot remote host that
1234 			 * is causing the RPC failure.  Clustering to
1235 			 * provide interface in the future.  This should
1236 			 * stop a never-ending set of 205 reconfig cycles.
1237 			 * Remote host causing failure is stored in
1238 			 * ep->host if ep is an RPC error.
1239 			 * if (mdanyrpcerror(ep))
1240 			 * 	reboot (ep->host);
1241 			 */
1242 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1243 			    "choose master failure of 205 %s"),
1244 			    meta_print_hrtime(gethrtime() - start_time));
1245 			md_exit(local_sp, 205);
1246 		} else if (rval != 0) {
1247 			meta_mc_log(MC_LOG2, gettext("Step1 failure: "
1248 			    "choose master failure %s"),
1249 			    meta_print_hrtime(gethrtime() - start_time));
1250 			md_exit(local_sp, 1);
1251 		}
1252 
1253 		meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"),
1254 		    meta_print_hrtime(gethrtime() - start_time));
1255 
1256 		md_exit(local_sp, rval);
1257 		break;
1258 
1259 	case MC_STEP2:
1260 		/*
1261 		 * Step 2
1262 		 *
1263 		 * In Step 2, each node walks the list of disksets.  If a
1264 		 * node is a master of a MN diskset, it synchronizes
1265 		 * the local set USER records for that diskset.
1266 		 *
1267 		 * If disks exist in the diskset and there is a joined
1268 		 * (owner) node in the diskset, the master will also:
1269 		 *	- synchronize the diskset mddbs to the master
1270 		 *	- play the change log
1271 		 *
1272 		 * The master node will now attempt to join any unjoined
1273 		 * nodes that are currently members in the membership list.
1274 		 */
1275 
1276 		/* expect the nodelist to follow the step name */
1277 		if (argc < 1)
1278 			usage(sp, 1);
1279 
1280 		meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"),
1281 		    meta_print_hrtime(0));
1282 
1283 		/*
1284 		 * Does local set exist? If not, exit with 0
1285 		 * since there's no reason to have this node panic if
1286 		 * the local set cannot be started.
1287 		 */
1288 		if ((local_sp = load_local_set(ep)) == NULL) {
1289 			md_exit(local_sp, 0);
1290 		}
1291 
1292 		if ((max_sets = get_max_sets(ep)) == 0) {
1293 			mde_perror(ep, "");
1294 			md_exit(local_sp, 1);
1295 		}
1296 
1297 		/* start walking through all possible disksets */
1298 		for (setno = 1; setno < max_sets; setno++) {
1299 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1300 				if (mdiserror(ep, MDE_NO_SET)) {
1301 					/* No set for this setno - continue */
1302 					mdclrerror(ep);
1303 					continue;
1304 				} else if (mdanyrpcerror(ep)) {
1305 					/* Fail on RPC failure to self */
1306 					mde_perror(ep, gettext(
1307 					    "Unable to get information for "
1308 					    "set number %d"), setno);
1309 					md_exit(local_sp, 1);
1310 				} else {
1311 					mde_perror(ep, gettext(
1312 					    "Unable to get information for "
1313 					    "set number %d"), setno);
1314 					mdclrerror(ep);
1315 					continue;
1316 				}
1317 			}
1318 
1319 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1320 				if (mdanyrpcerror(ep)) {
1321 					/* Fail on RPC failure to self */
1322 					mde_perror(ep, gettext(
1323 					    "Unable to get information for "
1324 					    "set number %d"), setno);
1325 					md_exit(local_sp, 1);
1326 				}
1327 				mde_perror(ep, gettext("Unable to get set "
1328 				    "%s desc information"), sp->setname);
1329 				mdclrerror(ep);
1330 				continue;
1331 			}
1332 
1333 			/* Only check MN disksets */
1334 			if (!(MD_MNSET_DESC(sd))) {
1335 				continue;
1336 			}
1337 
1338 			/* All actions in step 2 are driven by master */
1339 			if (!(sd->sd_mn_am_i_master)) {
1340 				continue;
1341 			}
1342 
1343 			meta_mc_log(MC_LOG3, gettext("Step2 - begin record "
1344 			    "synchronization for set %s: %s"), sp->setname,
1345 			    meta_print_hrtime(gethrtime() - start_time));
1346 
1347 			/*
1348 			 * Synchronize the USER records in the local mddbs
1349 			 * for hosts that are members.  The USER records
1350 			 * contain set, drive and host information.
1351 			 */
1352 			rval = meta_mnsync_user_records(sp, ep);
1353 			if (rval != 0) {
1354 				mde_perror(ep, gettext(
1355 				    "Synchronization of user records "
1356 				    "in set %s failed\n"), sp->setname);
1357 				if (rval == 205) {
1358 					/*
1359 					 * NOTE: Should issue call to reboot
1360 					 * remote host that is causing the RPC
1361 					 * failure.  Clustering to provide
1362 					 * interface in the future.  This
1363 					 * should stop a never-ending set of
1364 					 * 205 reconfig cycles.
1365 					 * Remote host causing failure is
1366 					 * stored in ep->host if ep is an
1367 					 * RPC error.
1368 					 * if (mdanyrpcerror(ep))
1369 					 * 	reboot (ep->host);
1370 					 */
1371 					md_exit(local_sp, 205);
1372 				} else {
1373 					md_exit(local_sp, 1);
1374 				}
1375 			}
1376 
1377 			/* Reget sd since sync_user_recs may have flushed it */
1378 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1379 				mde_perror(ep, gettext("Unable to get set "
1380 				    "%s desc information"), sp->setname);
1381 				md_exit(local_sp, 1);
1382 			}
1383 
1384 			dd = metaget_drivedesc(sp,
1385 			    (MD_BASICNAME_OK | PRINT_FAST), ep);
1386 			if (! mdisok(ep)) {
1387 				mde_perror(ep, gettext("Unable to get set "
1388 				    "%s drive information"), sp->setname);
1389 				md_exit(local_sp, 1);
1390 			}
1391 
1392 			/*
1393 			 * No drives in set, continue to next set.
1394 			 */
1395 			if (dd == NULL) {
1396 				/* Done with this set */
1397 				continue;
1398 			}
1399 
1400 			meta_mc_log(MC_LOG3, gettext("Step2 - local set user "
1401 			    "records completed for set %s: %s"), sp->setname,
1402 			    meta_print_hrtime(gethrtime() - start_time));
1403 
1404 			/*
1405 			 * Synchronize the diskset mddbs for hosts
1406 			 * that are members.  This may involve
1407 			 * playing the changelog and writing out
1408 			 * to the diskset mddbs.
1409 			 */
1410 			rval = meta_mnsync_diskset_mddbs(sp, ep);
1411 			if (rval != 0) {
1412 				mde_perror(ep, gettext(
1413 				    "Synchronization of diskset mddbs "
1414 				    "in set %s failed\n"), sp->setname);
1415 				meta_mc_log(MC_LOG3, gettext("Step2 - diskset "
1416 				    "mddb synchronization failed for "
1417 				    "set %s: %s"), sp->setname,
1418 				    meta_print_hrtime(gethrtime() -
1419 				    start_time));
1420 				if (rval == 205) {
1421 					/*
1422 					 * NOTE: Should issue call to reboot
1423 					 * remote host that is causing the RPC
1424 					 * failure.  Clustering to provide
1425 					 * interface in the future.  This
1426 					 * should stop a never-ending set of
1427 					 * 205 reconfig cycles.
1428 					 * Remote host causing failure is
1429 					 * stored in ep->host if ep is an
1430 					 * RPC error.
1431 					 * if (mdanyrpcerror(ep))
1432 					 * 	reboot (ep->host);
1433 					 */
1434 					md_exit(local_sp, 205);
1435 				} else if (rval == 1) {
1436 					continue;
1437 				} else {
1438 					md_exit(local_sp, 1);
1439 				}
1440 			}
1441 
1442 			meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb "
1443 			    "synchronization completed for set %s: %s"),
1444 			    sp->setname,
1445 			    meta_print_hrtime(gethrtime() - start_time));
1446 
1447 			/* Join the starting nodes to the diskset */
1448 			rval = meta_mnjoin_all(sp, ep);
1449 			if (rval != 0) {
1450 				mde_perror(ep, gettext(
1451 				    "Join of non-owner (starting) nodes "
1452 				    "in set %s failed\n"), sp->setname);
1453 				meta_mc_log(MC_LOG3, gettext("Step2 - non owner"
1454 				    "nodes joined for set %s: %s"),
1455 				    sp->setname,
1456 				    meta_print_hrtime(gethrtime() -
1457 				    start_time));
1458 				if (rval == 205) {
1459 					/*
1460 					 * NOTE: Should issue call to reboot
1461 					 * remote host that is causing the RPC
1462 					 * failure.  Clustering to provide
1463 					 * interface in the future.  This
1464 					 * should stop a never-ending set of
1465 					 * 205 reconfig cycles.
1466 					 * Remote host causing failure is
1467 					 * stored in ep->host if ep is an
1468 					 * RPC error.
1469 					 * if (mdanyrpcerror(ep))
1470 					 * 	reboot (ep->host);
1471 					 */
1472 					md_exit(local_sp, 205);
1473 				} else {
1474 					md_exit(local_sp, 1);
1475 				}
1476 			}
1477 
1478 			meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes "
1479 			    "joined for set %s: %s"), sp->setname,
1480 			    meta_print_hrtime(gethrtime() - start_time));
1481 
1482 		}
1483 
1484 		meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"),
1485 		    meta_print_hrtime(gethrtime() - start_time));
1486 
1487 		break;
1488 
1489 	case MC_STEP3:
1490 		/*
1491 		 * Step 3
1492 		 *
1493 		 * For all multinode sets do,
1494 		 * - Reinitialise rpc.mdcommd
1495 		 * - Reset mirror owners to null if the current owner is
1496 		 *   no longer in the membership list
1497 		 */
1498 
1499 		/* expect the nodelist to follow the step name */
1500 		if (argc < 1)
1501 			usage(sp, 1);
1502 
1503 		meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"),
1504 		    meta_print_hrtime(0));
1505 
1506 		/*
1507 		 * Does local set exist? If not, exit with 0
1508 		 * since there's no reason to have this node panic if
1509 		 * the local set cannot be started.
1510 		 */
1511 		if ((local_sp = load_local_set(ep)) == NULL) {
1512 			md_exit(local_sp, 0);
1513 		}
1514 
1515 		/*
1516 		 * walk through all sets on this node which could include:
1517 		 *	- MN disksets
1518 		 *	- traditional disksets
1519 		 *	- non-existent disksets
1520 		 * start mirror resync for all MN sets
1521 		 */
1522 		if ((max_sets = get_max_sets(ep)) == 0) {
1523 			mde_perror(ep, "");
1524 			md_exit(local_sp, 1);
1525 		}
1526 
1527 		/* start walking through all possible disksets */
1528 		for (setno = 1; setno < max_sets; setno++) {
1529 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1530 				if (mdiserror(ep, MDE_NO_SET)) {
1531 					/* No set for this setno - continue */
1532 					mdclrerror(ep);
1533 					continue;
1534 				} else {
1535 					mde_perror(ep, gettext("Unable to "
1536 					    "get set %d information"), setno);
1537 					md_exit(local_sp, 1);
1538 				}
1539 			}
1540 
1541 			/* only check multi-node disksets */
1542 			if (!meta_is_mn_set(sp, ep)) {
1543 				mdclrerror(ep);
1544 				continue;
1545 			}
1546 
1547 			if (meta_lock(sp, TRUE, ep) != 0) {
1548 				mde_perror(ep, "");
1549 				md_exit(local_sp, 1);
1550 			}
1551 
1552 			/* If this node isn't joined to set, do nothing */
1553 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1554 				if (!mdisok(ep)) {
1555 					mde_perror(ep, gettext("Could "
1556 					    "not get set %s ownership"),
1557 					    sp->setname);
1558 					md_exit(sp, 1);
1559 				}
1560 				mdclrerror(ep);
1561 				meta_unlock(sp, ep);
1562 				continue;
1563 			}
1564 
1565 			meta_mc_log(MC_LOG3, gettext("Step3 - begin "
1566 			    "re-initialising rpc.mdcommd and resetting mirror "
1567 			    "owners for set %s: %s"), sp->setname,
1568 			    meta_print_hrtime(gethrtime() - start_time));
1569 
1570 			/* reinitialise rpc.mdcommd with new nodelist */
1571 			if (mdmn_reinit_set(setno, commd_timeout)) {
1572 				md_eprintf(gettext(
1573 				    "Could not re-initialise rpc.mdcommd for "
1574 				    "set %s\n"), sp->setname);
1575 				md_exit(sp, 1);
1576 			}
1577 
1578 			(void) memset(&cfg, 0, sizeof (cfg));
1579 			cfg.c_id = 0;
1580 			cfg.c_setno = sp->setno;
1581 			if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1582 			    NULL) != 0) {
1583 				mdstealerror(ep, &cfg.c_mde);
1584 				mde_perror(ep, gettext("Could "
1585 				    "not get set %s information"),
1586 				    sp->setname);
1587 				md_exit(sp, 1);
1588 			}
1589 
1590 			/* Don't do anything else if set is stale */
1591 			if (cfg.c_flags & MDDB_C_STALE) {
1592 				meta_unlock(sp, ep);
1593 				mdclrerror(ep);
1594 				continue;
1595 			}
1596 
1597 			/* reset mirror owners */
1598 			if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) {
1599 				md_exit(sp, 1);
1600 			}
1601 
1602 			meta_unlock(sp, ep);
1603 
1604 			meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd "
1605 			    "re-initialised and mirror owners reset for "
1606 			    "set %s: %s"), sp->setname,
1607 			    meta_print_hrtime(gethrtime() - start_time));
1608 		}
1609 
1610 		meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"),
1611 		    meta_print_hrtime(gethrtime() - start_time));
1612 
1613 		break;
1614 
1615 	case MC_STEP4:
1616 		/*
1617 		 * Step 4
1618 		 *
1619 		 * For all multinode sets do:
1620 		 * - Resume the rpc.mdcommd messages.  Must resume all
1621 		 *	sets before issuing I/O to any set since an error
1622 		 * 	encountered in a commd suspended set could be
1623 		 *	blocked waiting for commd in another set to resume.
1624 		 *	(This happens since the daemon queues service
1625 		 *	all sets).  An open of a soft partition causes
1626 		 *	a read of the watermarks during the open.
1627 		 * - If set is non-writable (not an owner or STALE), then
1628 		 *	continue to next set.
1629 		 *
1630 		 * For all multinode sets do,
1631 		 * - Reset ABR states for all mirrors, ie clear ABR if not
1632 		 *	open on any node.
1633 		 * - Reset ABR states for all soft partitions, ie clear ABR if
1634 		 *	not open on any node.
1635 		 * - For all slave nodes that have entered through the start
1636 		 *	step, update the ABR state to that of the master and
1637 		 *	get the submirror state from the master
1638 		 * - meta_lock set
1639 		 * - Resync all mirrors
1640 		 * - unlock meta_lock for this set.
1641 		 * - Choose a new owner for any orphaned resyncs
1642 		 *
1643 		 * There is one potential issue here when concurrently
1644 		 * resetting and updating the ABR state. If the master has ABR
1645 		 * set, but should no longer have because the only node that
1646 		 * had the metadevice open and had ABR set has panicked, the
1647 		 * master will send a message to all nodes to clear the ABR
1648 		 * state. Meanwhile any node that has come through the
1649 		 * start step will get tstate from the master and will update
1650 		 * ABR if it was set in tstate. So, we appear to have a problem
1651 		 * if the following sequence occurs:-
1652 		 * - The slave gets tstate with ABR set
1653 		 * - The master sends a message to clear ABR
1654 		 * - The slave updates ABR with the value it got from tstate.
1655 		 * We now have the master with ABR clear and the slave with ABR
1656 		 * set. Fortunately, having set ABR, the slave will close the
1657 		 * metadevice after setting ABR and as there are no nodes with
1658 		 * the device open, the close will send a message to clear ABR
1659 		 * on all nodes. So, the nodes will all have ABR unset.
1660 		 */
1661 
1662 		/* expect the nodelist to follow the step name */
1663 		if (argc < 1)
1664 			usage(sp, 1);
1665 
1666 		meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"),
1667 		    meta_print_hrtime(0));
1668 
1669 		/*
1670 		 * Does local set exist? If not, exit with 0
1671 		 * since there's no reason to have this node panic if
1672 		 * the local set cannot be started.
1673 		 */
1674 		if ((local_sp = load_local_set(ep)) == NULL) {
1675 			md_exit(local_sp, 0);
1676 		}
1677 
1678 		/*
1679 		 * walk through all sets on this node which could include:
1680 		 *	- MN disksets
1681 		 *	- traditional disksets
1682 		 *	- non-existent disksets
1683 		 * start mirror resync for all MN sets
1684 		 */
1685 		if ((max_sets = get_max_sets(ep)) == 0) {
1686 			mde_perror(ep, "");
1687 			md_exit(local_sp, 1);
1688 		}
1689 
1690 		/* Clear set_info structure */
1691 		for (setno = 1; setno < max_sets; setno++) {
1692 			set_info[setno] = 0;
1693 		}
1694 
1695 		/* start walking through all possible disksets */
1696 		for (setno = 1; setno < max_sets; setno++) {
1697 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1698 				if (mdiserror(ep, MDE_NO_SET)) {
1699 					/* No set for this setno - continue */
1700 					mdclrerror(ep);
1701 					continue;
1702 				} else {
1703 					mde_perror(ep, gettext("Unable to "
1704 					    "get set %d information"), setno);
1705 					md_exit(local_sp, 1);
1706 				}
1707 			}
1708 
1709 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1710 				mde_perror(ep, gettext("Unable to get set "
1711 				    "%s desc information"), sp->setname);
1712 				mdclrerror(ep);
1713 				continue;
1714 			}
1715 
1716 			/* only check multi-node disksets */
1717 			if (!meta_is_mn_set(sp, ep)) {
1718 				mdclrerror(ep);
1719 				continue;
1720 			}
1721 
1722 			set_info[setno] |= SET_INFO_MN;
1723 
1724 			/*
1725 			 * If not an owner (all mddbs failed) or stale
1726 			 * (< 50% mddbs operational), then set is
1727 			 * non-writable so just resume commd and
1728 			 * unblock mddb messages.
1729 			 */
1730 			mdclrerror(ep);
1731 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1732 				set_info[setno] |= SET_INFO_NO_WR;
1733 			}
1734 			if (!mdisok(ep)) {
1735 				mde_perror(ep, gettext("Could "
1736 				    "not get set %s ownership"),
1737 				    sp->setname);
1738 				md_exit(local_sp, 1);
1739 			}
1740 			/* Set is owned - is it stale? */
1741 			if (!set_info[setno] & SET_INFO_NO_WR) {
1742 				(void) memset(&cfg, 0, sizeof (cfg));
1743 				cfg.c_id = 0;
1744 				cfg.c_setno = sp->setno;
1745 				if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1746 				    NULL) != 0) {
1747 					mdstealerror(ep, &cfg.c_mde);
1748 					mde_perror(ep, gettext("Could "
1749 					    "not get set %s information"),
1750 					    sp->setname);
1751 					md_exit(local_sp, 1);
1752 				}
1753 				if (cfg.c_flags & MDDB_C_STALE) {
1754 					set_info[setno] |= SET_INFO_NO_WR;
1755 				}
1756 			}
1757 
1758 			/* resume rpc.mdcommd */
1759 			if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0,
1760 			    commd_timeout)) {
1761 				md_eprintf(gettext("Unable to resume "
1762 				    "rpc.mdcommd for set %s\n"), sp->setname);
1763 				md_exit(local_sp, 1);
1764 			}
1765 			meta_ping_mnset(setno);
1766 
1767 			/* Unblock mddb parse messages */
1768 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1769 				(void) memset(&mbp, 0, sizeof (mbp));
1770 				mbp.c_setno = setno;
1771 				mbp.c_blk_flags = MDDB_UNBLOCK_PARSE;
1772 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1773 				    &mbp.c_mde, NULL)) {
1774 					mdstealerror(ep, &mbp.c_mde);
1775 					mde_perror(ep, gettext("Could not "
1776 					    "unblock set %s"), sp->setname);
1777 					md_exit(local_sp, 1);
1778 				}
1779 			}
1780 			meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd "
1781 			    "resumed and messages unblocked for set %s: %s"),
1782 			    sp->setname,
1783 			    meta_print_hrtime(gethrtime() - start_time));
1784 		}
1785 
1786 		for (setno = 1; setno < max_sets; setno++) {
1787 			int			start_step;
1788 
1789 			/* Skip traditional disksets. */
1790 			if ((set_info[setno] & SET_INFO_MN) == 0)
1791 				continue;
1792 
1793 			/*
1794 			 * If already determined that this set is
1795 			 * a non-writable set, then just continue
1796 			 * to next set since there's nothing else
1797 			 * to do for a non-writable set.
1798 			 */
1799 			if (set_info[setno] & SET_INFO_NO_WR)
1800 				continue;
1801 
1802 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1803 				if (mdiserror(ep, MDE_NO_SET)) {
1804 					/* No set for this setno - continue */
1805 					mdclrerror(ep);
1806 					continue;
1807 				} else {
1808 					mde_perror(ep, gettext("Unable to "
1809 					    "get set %d information"), setno);
1810 					md_exit(local_sp, 1);
1811 				}
1812 			}
1813 
1814 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1815 				mde_perror(ep, gettext("Unable to get set "
1816 				    "%s desc information"), sp->setname);
1817 				mdclrerror(ep);
1818 				continue;
1819 			}
1820 
1821 			/* See if this node came through the start step */
1822 			(void) memset(&sf, 0, sizeof (sf));
1823 			sf.sf_setno = sp->setno;
1824 			sf.sf_flags = MDDB_NM_GET;
1825 			/* Use magic to help protect ioctl against attack. */
1826 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1827 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1828 			    &sf.sf_mde, NULL)) {
1829 				mdstealerror(ep, &sf.sf_mde);
1830 				mde_perror(ep, gettext("Could not get "
1831 				    "start_step flag for set %s"), sp->setname);
1832 				md_exit(local_sp, 1);
1833 			}
1834 			start_step =
1835 			    (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0;
1836 
1837 			/*
1838 			 * We can now reset the start_step flag for the set
1839 			 * if it was already set.
1840 			 */
1841 			if (start_step) {
1842 				(void) memset(&sf, 0, sizeof (sf));
1843 					sf.sf_setno = sp->setno;
1844 				sf.sf_setflags = MD_SET_MN_START_RC;
1845 				sf.sf_flags = MDDB_NM_RESET;
1846 				/*
1847 				 * Use magic to help protect ioctl
1848 				 * against attack.
1849 				 */
1850 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1851 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1852 				    &sf.sf_mde, NULL)) {
1853 					mdstealerror(ep, &sf.sf_mde);
1854 					mde_perror(ep,
1855 					    gettext("Could not reset "
1856 					    "start_step flag for set %s"),
1857 					    sp->setname);
1858 				}
1859 			}
1860 
1861 			meta_mc_log(MC_LOG3, gettext("Step4 - begin setting "
1862 			    "ABR state and restarting io's for "
1863 			    "set %s: %s"), sp->setname,
1864 			    meta_print_hrtime(gethrtime() - start_time));
1865 
1866 
1867 			/*
1868 			 * If we are not the master and we have come through
1869 			 * the start step, we must update the ABR states
1870 			 * for mirrors and soft partitions. Also the submirror
1871 			 * states need to be synchronised so that we see the
1872 			 * same status as other previously joined members.
1873 			 * This _must_ be done before starting the resync.
1874 			 */
1875 			if (!(sd->sd_mn_am_i_master) && start_step) {
1876 				if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR,
1877 				    ep) == -1) {
1878 					md_exit(local_sp, 1);
1879 				}
1880 				if (reset_state(UPDATE_ABR, sp, MD_SP,
1881 				    ep) == -1) {
1882 					md_exit(local_sp, 1);
1883 				}
1884 				/*
1885 				 * Mark the fact that we've got the mirror
1886 				 * state. This allows the resync thread to
1887 				 * determine if _it_ needs to issue this. This
1888 				 * can happen if a node is added to a set after
1889 				 * a reconfig cycle has completed.
1890 				 */
1891 				(void) memset(&sf, 0, sizeof (sf));
1892 					sf.sf_setno = sp->setno;
1893 				sf.sf_setflags = MD_SET_MN_MIR_STATE_RC;
1894 				sf.sf_flags = MDDB_NM_SET;
1895 				/*
1896 				 * Use magic to help protect ioctl
1897 				 * against attack.
1898 				 */
1899 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1900 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1901 				    &sf.sf_mde, NULL)) {
1902 					mdstealerror(ep, &sf.sf_mde);
1903 					mde_perror(ep,
1904 					    gettext("Could not set "
1905 					    "submirror state flag for set %s"),
1906 					    sp->setname);
1907 				}
1908 			}
1909 
1910 			/*
1911 			 * All remaining actions are only performed by the
1912 			 * master
1913 			 */
1914 			if (!(sd->sd_mn_am_i_master)) {
1915 				if (meta_lock(sp, TRUE, ep) != 0) {
1916 					mde_perror(ep, "");
1917 					md_exit(local_sp, 1);
1918 				}
1919 				meta_mirror_resync_unblock(sp);
1920 				meta_unlock(sp, ep);
1921 				continue;
1922 			}
1923 
1924 			/*
1925 			 * If the master came through the start step, this
1926 			 * implies that all of the nodes must have done the
1927 			 * same and hence there can be no applications
1928 			 * running. Hence no need to reset ABR
1929 			 */
1930 			if (!start_step) {
1931 				/* Reset ABR state for mirrors */
1932 				if (reset_state(RESET_ABR, sp, MD_MIRROR,
1933 				    ep) == -1) {
1934 					md_exit(local_sp, 1);
1935 				}
1936 				/* ...and now the same for soft partitions */
1937 				if (reset_state(RESET_ABR, sp, MD_SP,
1938 				    ep) == -1) {
1939 					md_exit(local_sp, 1);
1940 				}
1941 			}
1942 
1943 			/*
1944 			 * choose owners for orphaned resyncs and reset
1945 			 * non-orphaned resyncs so that an owner node that
1946 			 * reboots will restart the resync if needed.
1947 			 */
1948 			if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1)
1949 				md_exit(local_sp, 1);
1950 
1951 			/*
1952 			 * Must unlock set lock before meta_mirror_resync_all
1953 			 * sends a message to run the metasync command
1954 			 * which also grabs the meta_lock.
1955 			 */
1956 			if (meta_lock(sp, TRUE, ep) != 0) {
1957 				mde_perror(ep, "");
1958 				md_exit(local_sp, 1);
1959 			}
1960 			meta_mirror_resync_unblock(sp);
1961 			meta_unlock(sp, ep);
1962 
1963 			/* resync all mirrors in set */
1964 			if (meta_mirror_resync_all(sp, 0, ep) != 0) {
1965 				mde_perror(ep, gettext("Mirror resyncs "
1966 				    "failed for set %s"), sp->setname);
1967 				md_exit(local_sp, 1);
1968 			}
1969 
1970 			meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted "
1971 			    "for set %s: %s"), sp->setname,
1972 			    meta_print_hrtime(gethrtime() - start_time));
1973 		}
1974 
1975 		meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"),
1976 		    meta_print_hrtime(gethrtime() - start_time));
1977 
1978 		break;
1979 
1980 	default:
1981 		usage(sp, 1);
1982 		break;
1983 	}
1984 
1985 	md_exit(sp, 0);
1986 	/* NOTREACHED */
1987 	return (0);
1988 }
1989