xref: /titanic_41/usr/src/cmd/lvm/util/metaclust.c (revision 99fd4d22c4d79c3cbb29109c221e7868cfa07333)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <meta.h>
28 #include <sdssc.h>
29 #include <signal.h>
30 #include <syslog.h>
31 #include <sys/types.h>
32 #include <sys/wait.h>
33 #include <sys/lvm/md_mirror.h>
34 #include <metad.h>
35 
/* Highest protocol version supported; also the default for -V. */
#define	MY_VERSION		"1.0"
/* Largest verbosity level accepted by the -d option. */
#define	MAX_DEBUG_LEVEL		5

/*
 * Action bits for the 'mode' argument of reset_state().
 */
#define	RESET_OWNER		0x0001	/* clear owners that left the cluster */
#define	CHOOSE_OWNER		0x0002	/* pick or re-establish an owner */
#define	RESET_ABR		0x0004	/* open/close device to reset ABR */
#define	UPDATE_ABR		0x0008	/* copy ABR state from the master */
#define	GET_MIRROR_STATE	0x0010	/* fetch mirror state from master */

/*
 * Per-set information bits.
 * NOTE(review): presumably stored in main()'s set_info[] array - the
 * consumer is not visible in this part of the file; confirm.
 */
#define	SET_INFO_NO_WR	0x0002
#define	SET_INFO_MN	0x0004
47 
/*
 * Every reconfiguration step metaclust knows about.  MC_UNK (zero) is the
 * initial value of the global 'stepnum' and stays in place when the step
 * name given on the command line is not found in step_table.
 */
typedef enum stpnum {
	MC_UNK		= 0,
	MC_START	= 1,
	MC_STOP		= 2,
	MC_ABORT	= 3,
	MC_RETURN	= 4,
	MC_STEP1	= 5,
	MC_STEP2	= 6,
	MC_STEP3	= 7,
	MC_STEP4	= 8
} stepnum_t;
62 
63 /*
64  * Structure for step_name -> step_number mapping
65  */
66 struct step_t {
67 	char		*step_nam;
68 	stepnum_t	step_num;
69 };
70 
71 /*
72  * Step name to step number mapping table
73  * This table MUST be sorted alphabetically in ascending order of step name
74  */
75 static struct step_t step_table[] = {
76 	{ "abort",	MC_ABORT },
77 	{ "return",	MC_RETURN },
78 	{ "start",	MC_START },
79 	{ "step1",	MC_STEP1 },
80 	{ "step2",	MC_STEP2 },
81 	{ "step3",	MC_STEP3 },
82 	{ "step4",	MC_STEP4 },
83 	{ "stop",	MC_STOP }
84 };
85 
86 /*
87  * If support for a different version is added, the new version number should
88  * be appended to the version_table below. This list will be searched to
89  * determine if a version requested via the -V option is supported or not.
90  */
91 static char *version_table[] = {
92 	MY_VERSION
93 };
94 
95 uint_t	timeout = 0;			/* disable timeout by default */
96 char	*version = MY_VERSION;		/* use latest version by default */
97 int	stepnum = MC_UNK;		/* reconfiguration step number */
98 pid_t	c_pid;				/* child process id */
99 
100 /*
101  * Binary search comparison routine
102  */
103 static int
mc_compare(const void * stp1,const void * stp2)104 mc_compare(const void *stp1, const void *stp2)
105 {
106 	return (strcmp((const char *)stp1,
107 	    ((const struct step_t *)stp2)->step_nam));
108 }
109 
110 /*
111  * Timeout expiry alarm signal handler
112  */
113 /*ARGSUSED*/
114 static void
sigalarmhandler(int sig)115 sigalarmhandler(int sig)
116 {
117 	int	i, n, ret, stat_loc = 0;
118 	FILE	*pgcore;
119 	char	corecmd[256];
120 
121 	n = sizeof (step_table) / sizeof (step_table[0]);
122 	for (i = 0; i < n; i++) {
123 		if (stepnum == step_table[i].step_num)
124 			break;
125 	}
126 
127 	assert(i != n);
128 
129 	meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"),
130 	    step_table[i].step_nam,
131 	    meta_print_hrtime(gethrtime() - start_time));
132 
133 	/*
134 	 * See what the child was actually doing when the timeout expired.
135 	 * A core-dump of this would be _really_ good, so let's just
136 	 * try a 'gcore -g c_pid' and hope
137 	 */
138 
139 	(void) memset(corecmd, 0, sizeof (corecmd));
140 	(void) snprintf(corecmd, sizeof (corecmd),
141 	    "/bin/gcore -g %d >/dev/null 2>&1", (int)c_pid);
142 
143 	pgcore = popen(corecmd, "r");
144 
145 	if (pgcore == NULL) {
146 		meta_mc_log(MC_LOG1, gettext("Could not grab core for pid %s"),
147 		    c_pid);
148 	} else {
149 		(void) pclose(pgcore);
150 	}
151 
152 	if ((ret = kill(c_pid, SIGKILL)) == 0) {
153 		/*
154 		 * The child will wait forever until the status is retrieved
155 		 * so get it now. Keep retrying if the call is interrupted.
156 		 *
157 		 * The possible results are,
158 		 *
159 		 *	- child killed successfully
160 		 *	- signal sent but child not killed
161 		 *	- waitpid failed/interrupted
162 		 */
163 		(void) sleep(2);
164 		while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) {
165 			if (errno != EINTR) {
166 				break;
167 			}
168 		}
169 		if ((ret == c_pid) || (errno == ECHILD)) {
170 			ret = 0;
171 		} else {
172 			ret = 1;
173 		}
174 	} else if (errno == ESRCH) {
175 		/*
176 		 * If the kill did not catch the child then it means the child
177 		 * exited immediately after the timeout occured.
178 		 */
179 		ret = 0;
180 	}
181 
182 	/*
183 	 * make sure not to exit with 205 for any steps other than step1-step4.
184 	 * Suncluster reconfiguration can't handle it otherwise.
185 	 */
186 	switch (stepnum) {
187 	case MC_STEP1:
188 	case MC_STEP2:
189 	case MC_STEP3:
190 	case MC_STEP4:
191 		/*
192 		 * If the child was killed successfully return 205 for a
193 		 * new reconfig cycle otherwise send 1 to panic the node.
194 		 */
195 		if (ret != 0) {
196 			md_eprintf(gettext("Could not kill child\n"));
197 			exit(1);
198 		} else {
199 			exit(205);
200 		}
201 		break;
202 	case MC_START:
203 	case MC_STOP:
204 	case MC_ABORT:
205 	case MC_RETURN:
206 	default:
207 		exit(1);
208 		break;
209 	}
210 }
211 
212 /*
213  * Attempt to load local set.
214  * Returns:
215  *	pointer to mdsetname_t for local set (local_sp) is successful.
216  *	0 if failure
217  *		if there are no local set mddbs, no error message is printed.
218  *		Otherwise, error message is printed so that user
219  *		can determine why the local set didn't start.
220  */
221 mdsetname_t *
load_local_set(md_error_t * ep)222 load_local_set(md_error_t *ep)
223 {
224 	mdsetname_t	*local_sp = NULL;
225 
226 	/* Does local set exist? If not, give no error */
227 	if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) {
228 		return (0);
229 	}
230 
231 	/*
232 	 * snarf local set
233 	 * If fails with MDE_DB_NODB, then just return 1 printing
234 	 * no failure.
235 	 * Otherwise, print error message, and return 1.
236 	 */
237 	if (meta_setup_db_locations(ep) != 0) {
238 		if (!(mdismddberror(ep, MDE_DB_NODB)))
239 			mde_perror(ep, "");
240 		return (0);
241 	}
242 
243 	/* local set loaded successfully */
244 	return (local_sp);
245 }
246 
247 /*
248  * Purpose:	Compose a full path name for a metadevice
249  *
250  * On entry:	sp	- setname pointer
251  *		mnum	- minor number of metadevice
252  *		pathname - pointer to array to return path string
253  *		pathlen	- max length of pathname array
254  */
255 static int
compose_path(mdsetname_t * sp,int mnum,char * pathname,int pathlen)256 compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen)
257 {
258 	int	rtn;
259 	mdname_t	*np;
260 	md_error_t	status = mdnullerror;
261 
262 	if (MD_MIN2SET(mnum) != sp->setno) {
263 		md_eprintf(gettext("minor number 0x%x invalid for set %d\n"),
264 		    mnum, sp->setno);
265 		return (-1);
266 	}
267 
268 	if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) {
269 		return (-1);
270 	}
271 
272 	rtn = snprintf(pathname, pathlen, "%s", np->rname);
273 
274 	if ((pathname[0] == '\0') || (rtn >= pathlen)) {
275 		md_eprintf(gettext(
276 		    "Could not create path for device %s\n"),
277 		    get_mdname(sp, mnum));
278 		return (-1);
279 	}
280 	return (0);
281 }
282 
283 /*
284  * Purpose:	Walk through all the devices specified for the given set
285  *		and do the action specified in mode
286  */
287 static int
reset_state(uint_t mode,mdsetname_t * sp,char * drivername,md_error_t * ep)288 reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep)
289 {
290 	mdnamelist_t			*devnlp = NULL;
291 	mdnamelist_t			*p;
292 	mdname_t			*devnp = NULL;
293 	md_set_mmown_params_t		ownpar_p;
294 	md_set_mmown_params_t		*ownpar = &ownpar_p;
295 	md_unit_t			*mm;
296 	int				mirror_dev = 0;
297 	mndiskset_membershiplist_t	*nl;
298 	int				cnt;
299 	int				has_parent;
300 	md_mn_get_mir_state_t		mir_state_p;
301 	md_mn_get_mir_state_t		*mir_state = &mir_state_p;
302 
303 	/*
304 	 * if we are choosing or resetting the owners then make sure
305 	 * we are only doing it for mirror devices
306 	 */
307 	mirror_dev = (strcmp(MD_MIRROR, drivername) == 0);
308 	if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) {
309 		return (-1);
310 	}
311 
312 	/* get a list of all the metadevices for current set */
313 	if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) {
314 		mde_perror(ep, gettext("Could not get mirrors for set %s"),
315 		    sp->setname);
316 		return (-1);
317 	} else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
318 		mde_perror(ep, gettext(
319 		    "Could not get soft partitions for set %s"), sp->setname);
320 		return (-1);
321 	}
322 
323 	/* If resetting the owner, get the known membership list */
324 	if (mode & RESET_OWNER) {
325 		if (meta_read_nodelist(&cnt, &nl, ep)) {
326 			mde_perror(ep, "Could not get nodelist");
327 			return (-1);
328 		}
329 	}
330 
331 	/* for each metadevice */
332 	for (p = devnlp; (p != NULL); p = p->next) {
333 		devnp = p->namep;
334 
335 		/*
336 		 * Get the current setting for mirror ABR state and all of the
337 		 * submirror state and flags from the master node. We only
338 		 * perform this when going through a 'start' cycle.
339 		 */
340 		if ((mode & GET_MIRROR_STATE) && mirror_dev) {
341 			char	*miscname;
342 
343 			/*
344 			 * Ensure that we ignore soft-parts that are returned
345 			 * from the meta_get_mirror_names() call
346 			 */
347 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
348 				goto out;
349 			if (strcmp(miscname, MD_MIRROR) != 0)
350 				continue;
351 
352 			mir_state->mnum = meta_getminor(devnp->dev);
353 			MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno);
354 			meta_mc_log(MC_LOG4, gettext("Getting mirror state"
355 			    " for %s: %s"), get_mdname(sp, mir_state->mnum),
356 			    meta_print_hrtime(gethrtime() - start_time));
357 
358 			if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep,
359 			    "MD_MN_GET_MIRROR_STATE") != 0) {
360 				mde_perror(ep, gettext("Unable to get "
361 				    "mirror state for %s"),
362 				    get_mdname(sp, mir_state->mnum));
363 				goto out;
364 			} else {
365 				continue;
366 			}
367 		}
368 
369 		/* check if this is a top level metadevice */
370 		if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL)
371 			goto out;
372 		if (MD_HAS_PARENT(MD_PARENT(mm))) {
373 			has_parent = 1;
374 		} else {
375 			has_parent = 0;
376 		}
377 		Free(mm);
378 
379 		if (mode & (RESET_OWNER | CHOOSE_OWNER)) {
380 			char	*miscname;
381 
382 			/*
383 			 * we can only do these for mirrors so make sure we
384 			 * really have a mirror device and not a softpartition
385 			 * imitating one. meta_get_mirror_names seems to think
386 			 * softparts on top of a mirror are mirrors!
387 			 */
388 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
389 				goto out;
390 			if (strcmp(miscname, MD_MIRROR) != 0)
391 				continue;
392 
393 			(void) memset(ownpar, 0, sizeof (*ownpar));
394 			ownpar->d.mnum = meta_getminor(devnp->dev);
395 			MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno);
396 
397 			meta_mc_log(MC_LOG4, gettext("Setting owner "
398 			    "for %s: %s"), get_mdname(sp, ownpar->d.mnum),
399 			    meta_print_hrtime(gethrtime() - start_time));
400 
401 			/* get the current owner id */
402 			if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep,
403 			    "MD_MN_GET_MM_OWNER") != 0) {
404 				mde_perror(ep, gettext("Unable to get "
405 				    "mirror owner for %s"),
406 				    get_mdname(sp, ownpar->d.mnum));
407 				goto out;
408 			}
409 		}
410 
411 		if (mode & RESET_OWNER) {
412 			if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) {
413 				mdclrerror(ep);
414 				continue;
415 			}
416 
417 			/*
418 			 * reset owner only if the current owner is
419 			 * not in the membership list
420 			 * Also kill the resync thread so that when the resync
421 			 * is started, it will perform an optimized resync
422 			 * for any resync regions that were dirty when the
423 			 * current owner left the membership.
424 			 */
425 			if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) {
426 				if (meta_mn_change_owner(&ownpar,
427 				    sp->setno, ownpar->d.mnum,
428 				    MD_MN_MIRROR_UNOWNED,
429 				    MD_MN_MM_ALLOW_CHANGE) == -1) {
430 					md_eprintf(gettext(
431 					    "Unable to reset mirror owner "
432 					    "for %s\n"),
433 					    get_mdname(sp, ownpar->d.mnum));
434 					goto out;
435 				}
436 				if (meta_mirror_resync(sp, devnp, 0, ep,
437 				    MD_RESYNC_KILL_NO_WAIT) != 0) {
438 					md_eprintf(gettext(
439 					    "Unable to kill resync for"
440 					    " %s\n"),
441 					    get_mdname(sp, ownpar->d.mnum));
442 					goto out;
443 				}
444 			}
445 		}
446 
447 		if (mode & CHOOSE_OWNER) {
448 			/*
449 			 * only orphaned resyncs will have no owner.
450 			 * if that is the case choose a new owner. Otherwise
451 			 * re-establish the existing owner. This covers the
452 			 * case where a node that owned the mirror
453 			 * reboots/panics and comes back into the cluster before
454 			 * the reconfig cycle has completed. In this case the
455 			 * other cluster nodes will have the mirror owner marked
456 			 * as the rebooted node while it has the owner marked
457 			 * as 'None'. We have to reestablish the ownership so
458 			 * that the subsequent resync can continue.
459 			 */
460 			if (meta_mn_change_owner(&ownpar, sp->setno,
461 			    ownpar->d.mnum, ownpar->d.owner,
462 			    MD_MN_MM_CHOOSE_OWNER) == -1) {
463 				md_eprintf(gettext("Unable to choose "
464 				    "mirror owner for %s\n"),
465 				    get_mdname(sp, ownpar->d.mnum));
466 				goto out;
467 			}
468 		}
469 
470 		/*
471 		 * For RESET_ABR and UPDATE_ABR - only handle top
472 		 * level metadevices.
473 		 */
474 		if (has_parent)
475 			continue;
476 
477 		if (mode & RESET_ABR) {
478 			/*
479 			 * Reset the ABR (application based recovery)
480 			 * value on all nodes. We are dealing with
481 			 * the possibility that we have ABR set but the
482 			 * only node that had the device open with ABR has
483 			 * left the cluster. We simply open and close the
484 			 * device and if this is the last close in the
485 			 * cluster, ABR will be cleared on all nodes.
486 			 */
487 			char		*miscname;
488 			char		name[MAXPATHLEN];
489 			int		mnum, fd;
490 
491 			name[0] = '\0';
492 			mnum = meta_getminor(devnp->dev);
493 
494 			/*
495 			 * Ensure that we don't include soft-parts in the
496 			 * mirror-only call to RESET_ABR. meta_get_mirror_names
497 			 * returns a bogus list that includes all soft-parts
498 			 * built on mirrors.
499 			 */
500 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
501 				goto out;
502 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
503 				continue;
504 
505 			meta_mc_log(MC_LOG4, gettext("Re-setting ABR state "
506 			    "for %s: %s"), get_mdname(sp, mnum),
507 			    meta_print_hrtime(gethrtime() - start_time));
508 
509 			/* compose the absolute device path and open it */
510 			if (compose_path(sp, mnum, &name[0],
511 			    sizeof (name)) != 0)
512 				goto out;
513 			if ((fd = open(name, O_RDWR, 0)) < 0) {
514 				md_perror(gettext("Could not open device %s"),
515 				    name);
516 				continue;
517 			}
518 
519 			(void) close(fd);
520 		}
521 
522 		if (mode & UPDATE_ABR) {
523 			/*
524 			 * Update the ABR value on this node. We obtain the
525 			 * current ABR state from the master node.
526 			 */
527 
528 			char		*miscname;
529 			char		name[MAXPATHLEN];
530 			int		mnum, fd;
531 			volcap_t	vc;
532 			uint_t		tstate;
533 
534 			name[0] = '\0';
535 			mnum = meta_getminor(devnp->dev);
536 
537 			/*
538 			 * Ensure that we don't include soft-parts in the
539 			 * mirror-only call to UPDATE_ABR. meta_get_mirror_names
540 			 * returns a bogus list that includes all soft-parts
541 			 * built on mirrors.
542 			 */
543 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
544 				goto out;
545 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
546 				continue;
547 
548 			/* Get tstate from Master */
549 			if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep)
550 			    != 0)
551 				continue;
552 			/* If not set on the master, nothing to do */
553 			if (!(tstate & MD_ABR_CAP))
554 				continue;
555 
556 			meta_mc_log(MC_LOG4, gettext("Updating ABR state "
557 			    "for %s: %s"), get_mdname(sp, mnum),
558 			    meta_print_hrtime(gethrtime() - start_time));
559 
560 			/* compose the absolute device path and open it */
561 			if (compose_path(sp, mnum, &name[0],
562 			    sizeof (name)) != 0)
563 				goto out;
564 			if ((fd = open(name, O_RDWR, 0)) < 0) {
565 				md_perror(gettext("Could not open device %s"),
566 				    name);
567 				continue;
568 			}
569 
570 			/* set ABR state */
571 			vc.vc_info = 0;
572 			vc.vc_set = 0;
573 			if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
574 				/*
575 				 * Ignore if device does not support this
576 				 * ioctl
577 				 */
578 				if ((errno != ENOTTY) && (errno != ENOTSUP)) {
579 					md_perror(gettext("Could not get "
580 					    "ABR/DMR state for device %s"),
581 					    name);
582 				}
583 				(void) close(fd);
584 				continue;
585 			}
586 			if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) {
587 				(void) close(fd);
588 				continue;
589 			}
590 
591 			vc.vc_set = DKV_ABR_CAP;
592 			if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
593 				md_perror(gettext(
594 				    "Could not set ABR state for "
595 				    "device %s"), name);
596 				(void) close(fd);
597 				goto out;
598 			} else {
599 				md_eprintf(gettext(
600 				    "Setting ABR state on device %s\n"), name);
601 			}
602 
603 			(void) close(fd);
604 		}
605 	}
606 
607 	/* cleanup */
608 	if (mode & RESET_OWNER) {
609 		meta_free_nodelist(nl);
610 	}
611 	metafreenamelist(devnlp);
612 	return (0);
613 
614 out:
615 	/* cleanup */
616 	if (mode & RESET_OWNER) {
617 		meta_free_nodelist(nl);
618 	}
619 	metafreenamelist(devnlp);
620 	return (-1);
621 }
622 
623 /*
624  * Print usage message
625  */
626 static void
usage(mdsetname_t * sp,int eval)627 usage(mdsetname_t *sp, int eval)
628 {
629 	(void) fprintf(stderr, gettext("usage:"
630 	    "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n"
631 	    "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n"
632 	    "\t%s [-V version] [-t timeout] [-d level] abort | stop\n"
633 	    "\t%s [-V | -? | -h]\n"),
634 	    myname, myname, myname, myname);
635 	if (!eval) {
636 		(void) fprintf(stderr, gettext("\n"
637 		    "\tValid debug (-d) levels are 1-%d for increasing "
638 		    "verbosity.\n\tDefault is -d 3.\n\n"
639 		    "\tValid step values are: return | step1 | step2 | "
640 		    "step3 | step4\n\n"
641 		    "\tNodelist is a space-separated list of node id's\n\n"),
642 		    MAX_DEBUG_LEVEL);
643 	}
644 	md_exit(sp, eval);
645 }
646 
647 /*
648  * Input:	Input takes a config step name followed by a list of
649  *		possible node id's.
650  *
651  * Returns:	  0 - Success
652  *		  1 - Fail
653  *			Node will be removed from cluster membership
654  *			by forcing node to panic.
655  *		205 - Unsuccessful. Start another reconfig cycle.
656  *			Problem was encountered that could be fixed by
657  *			running another reconfig cycle.
658  *			Problem could be a result of a failure to read
659  *			the nodelist file or that all work could not be
660  *			accomplished in a reconfig step in the amount of
661  *			time given so another reconfig cycle is needed in
662  *			order to finish the current step.
663  */
664 int
main(int argc,char ** argv)665 main(int argc, char **argv)
666 {
667 	mdsetname_t		*sp = NULL;
668 	md_error_t		status = mdnullerror;
669 	md_error_t		*ep = &status;
670 	set_t			max_sets, setno;
671 	int			c, clust = 0;
672 	struct sigaction	nsa, osa;
673 	struct step_t		*step_ptr;
674 	mdsetname_t		*local_sp = NULL;
675 	md_drive_desc		*dd;
676 	int			rval = 0;
677 	md_set_desc		*sd;
678 	mddb_block_parm_t	mbp;
679 	uint_t			debug = 3; /* log upto MC_LOG3 by default */
680 	int			version_table_size;
681 	mddb_setflags_config_t	sf;
682 	int			ret_val;
683 	mddb_config_t		cfg;
684 	int			set_info[MD_MAXSETS];
685 	long			commd_timeout = 0;
686 
687 	/*
688 	 * Get the locale set up before calling any other routines
689 	 * with messages to output.  Just in case we're not in a build
690 	 * environment, make sure that TEXT_DOMAIN gets set to
691 	 * something.
692 	 */
693 #if !defined(TEXT_DOMAIN)
694 #define	TEXT_DOMAIN "SYS_TEST"
695 #endif
696 	(void) setlocale(LC_ALL, "");
697 	(void) textdomain(TEXT_DOMAIN);
698 
699 	if ((clust = sdssc_bind_library()) == SDSSC_ERROR) {
700 		md_eprintf(gettext("Interface error with libsds_sc.so\n"));
701 		exit(1);
702 	}
703 
704 	if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) {
705 		mde_perror(ep, "");
706 		md_exit(sp, 1);
707 	}
708 
709 	/*
710 	 * open log and enable libmeta logging. Do it here explicitly
711 	 * rather than letting md_init() do it because we are not really
712 	 * a daemon and that is what md_init() opens the log as.
713 	 */
714 	openlog("metaclust", LOG_CONS, LOG_USER);
715 
716 	version_table_size = sizeof (version_table) / sizeof (version_table[0]);
717 
718 	optind = 1;
719 	opterr = 0;
720 	while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) {
721 		switch (c) {
722 		case 'h':
723 			usage(sp, 0);
724 			break;
725 
726 		case 'd':
727 			if (sscanf(optarg, "%u", &debug) != 1) {
728 				md_eprintf(gettext("Invalid debug level\n"));
729 				md_exit(sp, 1);
730 			} else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) {
731 				debug = min(max(debug, 1), MAX_DEBUG_LEVEL);
732 				md_eprintf(gettext("Debug level must be "
733 				    "between 1 and %d inclusive.\n"),
734 				    MAX_DEBUG_LEVEL);
735 				md_eprintf(gettext("Debug level set to %d.\n"),
736 				    debug);
737 			}
738 			break;
739 
740 		case 'V':
741 			version = Strdup(optarg);
742 			break;
743 
744 		case 't':
745 			if (sscanf(optarg, "%u", &timeout) != 1) {
746 				md_eprintf(gettext("Invalid timeout value\n"));
747 				md_exit(sp, 1);
748 			}
749 			break;
750 
751 		case '?':
752 			if (optopt == '?') {
753 				usage(sp, 0);
754 			} else if (optopt == 'V') {
755 				int	i;
756 
757 				(void) fprintf(stdout, gettext(
758 				    "%s: Versions Supported:"), myname);
759 				for (i = 0; i < version_table_size; i++) {
760 					(void) fprintf(stdout, " %s",
761 					    version_table[i]);
762 				}
763 				(void) fprintf(stdout, "\n");
764 				md_exit(sp, 0);
765 			}
766 			/*FALLTHROUGH*/
767 
768 		default:
769 			usage(sp, 1);
770 			break;
771 		}
772 	}
773 
774 	/* initialise the debug level and start time */
775 	setup_mc_log(debug);
776 
777 	/*
778 	 * check that the version specified (if any) is supported.
779 	 */
780 	if (version != NULL) {
781 		int	i, found = 0;
782 
783 		for (i = 0; i < version_table_size; i++) {
784 			if (strcmp(version, version_table[i]) == 0) {
785 				found = 1;
786 				break;
787 			}
788 		}
789 		if (!found) {
790 			md_eprintf(gettext("Version %s not supported\n"),
791 			    version);
792 			md_exit(sp, 1);
793 		}
794 	}
795 
796 	argc -= optind;
797 	argv += optind;
798 
799 	/* parse arguments */
800 	if (argc <= 0) {
801 		usage(sp, 1);
802 	}
803 
804 	/* convert the step name to the corresponding number */
805 	step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) /
806 	    sizeof (step_table[0])), sizeof (step_table[0]), mc_compare);
807 	if (step_ptr != NULL) {
808 		stepnum = step_ptr->step_num;
809 	}
810 
811 	--argc;
812 	++argv;
813 
814 	/* set timeout alarm signal, a value of 0 will disable timeout */
815 	if (timeout > 0) {
816 		int	stat_loc = 0;
817 		commd_timeout = (long)(timeout * .75);
818 
819 		c_pid = fork();
820 
821 		if (c_pid == (pid_t)-1) {
822 			md_perror(gettext("Unable to fork"));
823 			md_exit(sp, 1);
824 		} else if (c_pid) {
825 			/* parent */
826 			nsa.sa_flags = 0;
827 			if (sigfillset(&nsa.sa_mask) < 0) {
828 				md_perror(gettext("Unable to set signal mask"));
829 				md_exit(sp, 1);
830 			}
831 
832 			nsa.sa_handler = sigalarmhandler;
833 			if (sigaction(SIGALRM, &nsa, &osa) == -1) {
834 				md_perror(gettext("Unable to set alarm "
835 				    "handler"));
836 				md_exit(sp, 1);
837 			}
838 
839 			(void) alarm(timeout);
840 
841 			/*
842 			 * wait for child to exit or timeout to expire.
843 			 * keep retrying if the call is interrupted
844 			 */
845 			while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) {
846 				if (errno != EINTR) {
847 					break;
848 				}
849 			}
850 			if (ret_val == c_pid) {
851 				/* exit with the childs exit value */
852 				exit(WEXITSTATUS(stat_loc));
853 			} else if (errno == ECHILD) {
854 				md_exit(sp, 0);
855 			} else {
856 				perror(myname);
857 				md_exit(sp, 1);
858 			}
859 		}
860 	}
861 
862 	/*
863 	 * If a timeout value is given, everything from this point onwards is
864 	 * executed in the child process.
865 	 */
866 
867 	switch (stepnum) {
868 	case MC_START:
869 		/*
870 		 * Start Step
871 		 *
872 		 * - Suspend all rpc.mdcommd messages
873 		 */
874 
875 		/* expect the local node id to be given only */
876 		if (argc != 1)
877 			usage(sp, 1);
878 
879 		meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"),
880 		    meta_print_hrtime(0));
881 
882 		/*
883 		 * With multinode disksets configured we need to
884 		 * update all replicas on all cluster nodes to have
885 		 * the same status. If local replicas on a cluster
886 		 * node are not accessible we need to panic this
887 		 * node, otherwise we abort in the reconfig cycle
888 		 * and failfast/reboot the "good" cluster node too.
889 		 * To avoid a total cluster outage in the above case
890 		 * we panic only the failing node via md_exit(.., 1).
891 		 */
892 		if ((local_sp = load_local_set(ep)) == NULL) {
893 			/* panic the node */
894 			md_exit(local_sp, 1);
895 		}
896 
897 		if ((max_sets = get_max_sets(ep)) == 0) {
898 			mde_perror(ep, "");
899 			md_exit(sp, 1);
900 		}
901 
902 		/* start walking through all possible disksets */
903 		for (setno = 1; setno < max_sets; setno++) {
904 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
905 				if (mdiserror(ep, MDE_NO_SET)) {
906 					/* No set for this setno - continue */
907 					mdclrerror(ep);
908 					continue;
909 				} else {
910 					mde_perror(ep, gettext("Unable to "
911 					    "get set %d information"), setno);
912 					md_exit(sp, 1);
913 				}
914 			}
915 
916 			/* only check multi-node disksets */
917 			if (!meta_is_mn_set(sp, ep)) {
918 				mdclrerror(ep);
919 				continue;
920 			}
921 
922 			meta_mc_log(MC_LOG3, gettext("Start - block parse "
923 			    "messages for set %s: %s"), sp->setname,
924 			    meta_print_hrtime(gethrtime() - start_time));
925 
926 			/*
927 			 * Mddb parse messages are sent amongst the nodes
928 			 * in a diskset whenever the locator block or
929 			 * locator names structure has been changed.
930 			 * A locator block change could occur as a result
931 			 * of a disk failure during the reconfig cycle,
932 			 * so block the mddb parse messages while the
933 			 * rpc.mdcommd is suspended during the reconfig cycle.
934 			 */
935 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
936 				(void) memset(&mbp, 0, sizeof (mbp));
937 				mbp.c_setno = setno;
938 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
939 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
940 				    &mbp.c_mde, NULL)) {
941 					(void) mdstealerror(ep, &mbp.c_mde);
942 					mde_perror(ep, gettext("Could not "
943 					    "block set %s"), sp->setname);
944 					md_exit(sp, 1);
945 				}
946 			}
947 
948 			/* suspend commd and spin waiting for drain */
949 			while ((ret_val = mdmn_suspend(setno,
950 			    MD_COMM_ALL_CLASSES, commd_timeout)) ==
951 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
952 				(void) sleep(1);
953 			}
954 
955 			if (ret_val) {
956 				md_eprintf(gettext("Could not suspend "
957 				    "rpc.mdcommd for set %s\n"), sp->setname);
958 				md_exit(sp, 1);
959 			}
960 
961 			/*
962 			 * Set start step flag for set. This is set to indicate
963 			 * that this node entered the reconfig cycle through
964 			 * the start step.  This is used during the reconfig
965 			 * cycle to determine whether the node had entered
966 			 * through the start step or the return step.
967 			 */
968 			(void) memset(&sf, 0, sizeof (sf));
969 			sf.sf_setno = sp->setno;
970 			sf.sf_setflags = MD_SET_MN_START_RC;
971 			sf.sf_flags = MDDB_NM_SET;
972 			/* Use magic to help protect ioctl against attack. */
973 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
974 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
975 			    &sf.sf_mde, NULL)) {
976 				(void) mdstealerror(ep, &sf.sf_mde);
977 				mde_perror(ep, gettext("Could not set "
978 				    "start_step flag for set %s"), sp->setname);
979 				md_exit(sp, 1);
980 			}
981 
982 		}
983 
984 		meta_mc_log(MC_LOG2, gettext("Start step completed: %s"),
985 		    meta_print_hrtime(gethrtime() - start_time));
986 
987 		break;
988 
989 	case MC_STOP:
990 		/*
991 		 * Stop Step
992 		 *
993 		 * - ???
994 		 */
995 
996 		/* don't expect any more arguments to follow the step name */
997 		if (argc != 0)
998 			usage(sp, 1);
999 
1000 		break;
1001 
1002 	case MC_ABORT:
1003 		/*
1004 		 * Abort Step
1005 		 *
1006 		 * - Abort rpc.mdcommd
1007 		 */
1008 
1009 		/* don't expect any more arguments to follow the step name */
1010 		if (argc != 0)
1011 			usage(sp, 1);
1012 
1013 		meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"),
1014 		    meta_print_hrtime(0));
1015 
1016 		/*
1017 		 * Does local set exist? If not, exit with 0
1018 		 * since there's no reason to have this node panic if
1019 		 * the local set cannot be started.
1020 		 */
1021 		if ((local_sp = load_local_set(ep)) == NULL) {
1022 			md_exit(local_sp, 0);
1023 		}
1024 
1025 		/*
1026 		 * abort the rpc.mdcommd.  The abort is only issued on this node
1027 		 * meaning that the abort reconfig step is called on this
1028 		 * node before a panic while the rest of the cluster will
1029 		 * undergo a reconfig cycle.
1030 		 * There is no time relation between this node running a
1031 		 * reconfig abort and the rest of the cluster
1032 		 * running a reconfig cycle meaning that this node may
1033 		 * panic before, during or after the cluster has run
1034 		 * a reconfig cycle.
1035 		 */
1036 		mdmn_abort();
1037 
1038 		meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"),
1039 		    meta_print_hrtime(gethrtime() - start_time));
1040 
1041 		break;
1042 
1043 	case MC_RETURN:
1044 		/*
1045 		 * Return Step
1046 		 *
1047 		 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL
1048 		 *   and release local set lock.  Grabbing the local set
1049 		 *   lock allows any active metaset/metadb commands to
1050 		 *   terminate gracefully and will keep a metaset/metadb
1051 		 *   command from starting until the DRAIN ALL is issued.
1052 		 *   The metaset/metadb commands can issue
1053 		 *   DRAIN ALL/RESUME ALL commands to rpc.mdcommd,
1054 		 *   so the return step must not issue the DRAIN ALL command
1055 		 *   until metaset/metadb have finished or metaset may issue
1056 		 *   a RESUME ALL after this return reconfig step has issued
1057 		 *   the DRAIN ALL command.
1058 		 *   After this reconfig step has issued the DRAIN_ALL and
1059 		 *   released the local set lock, metaset/metadb will fail
1060 		 *   when attempting to contact the rpc.mdcommd and will
1061 		 *   terminate without making any configuration changes.
1062 		 *   The DRAIN ALL command will keep all other meta* commands
1063 		 *   from running during the reconfig cycle (these commands
1064 		 *   will wait until the rpc.mdcommd is resumed) since the
1065 		 *   reconfig cycle may be changing the diskset configuration.
1066 		 */
1067 
1068 		/* expect the nodelist to follow the step name */
1069 		if (argc < 1)
1070 			usage(sp, 1);
1071 
1072 		meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"),
1073 		    meta_print_hrtime(0));
1074 
1075 		/*
1076 		 * Does local set exist? If not, exit with 0
1077 		 * since there's no reason to have this node panic if
1078 		 * the local set cannot be started.
1079 		 */
1080 		if ((local_sp = load_local_set(ep)) == NULL) {
1081 			md_exit(local_sp, 0);
1082 		}
1083 
1084 		/*
1085 		 * Suspend any mirror resyncs that are in progress. This
1086 		 * stops unnecessary timeouts.
1087 		 */
1088 		meta_mirror_resync_block_all();
1089 
1090 		if (meta_lock(local_sp, TRUE, ep) != 0) {
1091 			mde_perror(ep, "");
1092 			md_exit(local_sp, 1);
1093 		}
1094 
1095 		/*
1096 		 * All metaset and metadb commands on this node have now
1097 		 * terminated gracefully.  Now, issue a drain all to
1098 		 * the rpc.mdcommd.  Any meta command issued after the
1099 		 * drain all will either spin sending the command to the
1100 		 * master until after the reconfig cycle has finished OR
1101 		 * will terminate gracefully (metaset/metadb).
1102 		 */
1103 		if ((max_sets = get_max_sets(ep)) == 0) {
1104 			mde_perror(ep, "");
1105 			md_exit(sp, 1);
1106 		}
1107 
1108 		/* start walking through all possible disksets */
1109 		for (setno = 1; setno < max_sets; setno++) {
1110 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1111 				if (mdiserror(ep, MDE_NO_SET)) {
1112 					/* No set for this setno - continue */
1113 					mdclrerror(ep);
1114 					continue;
1115 				} else {
1116 					mde_perror(ep, gettext("Unable to "
1117 					    "get set %d information"), setno);
1118 					md_exit(sp, 1);
1119 				}
1120 			}
1121 
1122 			/* only check multi-node disksets */
1123 			if (!meta_is_mn_set(sp, ep)) {
1124 				mdclrerror(ep);
1125 				continue;
1126 			}
1127 
1128 			meta_mc_log(MC_LOG3, gettext("Return - block parse "
1129 			    "messages for set %s: %s"), sp->setname,
1130 			    meta_print_hrtime(gethrtime() - start_time));
1131 
1132 			/*
1133 			 * Mddb parse messages are sent amongst the nodes
1134 			 * in a diskset whenever the locator block or
1135 			 * locator names structure has been changed.
1136 			 * A locator block change could occur as a result
1137 			 * of a disk failure during the reconfig cycle,
1138 			 * so block the mddb parse messages while the
1139 			 * rpc.mdcommd is suspended during the reconfig cycle.
1140 			 */
1141 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1142 				(void) memset(&mbp, 0, sizeof (mbp));
1143 				mbp.c_setno = setno;
1144 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
1145 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1146 				    &mbp.c_mde, NULL)) {
1147 					(void) mdstealerror(ep, &mbp.c_mde);
1148 					mde_perror(ep, gettext("Could not "
1149 					    "block set %s"), sp->setname);
1150 					md_exit(sp, 1);
1151 				}
1152 			}
1153 
1154 			/* suspend commd and spin waiting for drain */
1155 			while ((ret_val = mdmn_suspend(setno,
1156 			    MD_COMM_ALL_CLASSES, commd_timeout)) ==
1157 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
1158 				(void) sleep(1);
1159 			}
1160 
1161 			if (ret_val) {
1162 				md_eprintf(gettext("Could not suspend "
1163 				    "rpc.mdcommd for set %s\n"), sp->setname);
1164 				md_exit(sp, 1);
1165 			}
1166 		}
1167 		/*
1168 		 * Resume all I/Os for this node for all MN sets in
1169 		 * case master node had suspended I/Os but panic'd
1170 		 * before resuming I/Os.  In case of failure, exit
1171 		 * with a 1 since unable to resume I/Os on this node.
1172 		 */
1173 		if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) {
1174 			mde_perror(ep, gettext(
1175 			    "Unable to resume I/O on node %s for all sets"),
1176 			    mynode());
1177 			md_exit(sp, 1);
1178 		}
1179 
1180 
1181 		/*
1182 		 * Can now unlock local set lock.  New metaset/metadb
1183 		 * commands are now held off using drain all.
1184 		 */
1185 		(void) meta_unlock(local_sp, ep);
1186 
1187 		meta_mc_log(MC_LOG2, gettext("Return step completed: %s"),
1188 		    meta_print_hrtime(gethrtime() - start_time));
1189 
1190 		break;
1191 
1192 	case MC_STEP1:
1193 		/*
1194 		 * Step 1
1195 		 *
1196 		 * - Populate nodelist file if we are on clustering
1197 		 *   and pick a master node for each MN diskset.
1198 		 */
1199 
1200 		/* expect the nodelist to follow the step name */
1201 		if (argc < 1)
1202 			usage(sp, 1);
1203 
1204 		meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"),
1205 		    meta_print_hrtime(0));
1206 
1207 		/* Always write nodelist file even if no local set exists */
1208 		if (clust == SDSSC_OKAY) {
1209 			/* skip to the nodelist args */
1210 			if (meta_write_nodelist(argc, argv, ep) != 0) {
1211 				mde_perror(ep, gettext(
1212 				    "Could not populate nodelist file"));
1213 				md_exit(sp, 1);
1214 			}
1215 		}
1216 
1217 		/*
1218 		 * Does local set exist? If not, exit with 0
1219 		 * since there's no reason to have this node panic if
1220 		 * the local set cannot be started.
1221 		 */
1222 		if ((local_sp = load_local_set(ep)) == NULL) {
1223 			md_exit(local_sp, 0);
1224 		}
1225 
1226 		/*
1227 		 * At this point, all meta* commands are blocked across
1228 		 * all disksets since the master rpc.mdcommd has drained or
1229 		 * the master node has died.
1230 		 * If a metaset or metadb command had been in progress
1231 		 * at the start of the reconfig cycle, this command has
1232 		 * either completed or it has been terminated due to
1233 		 * the death of the master node.
1234 		 *
1235 		 * This means that it is now ok to remove any
1236 		 * outstanding clnt_locks associated with multinode
1237 		 * disksets on this node due to a node panic during
1238 		 * a metaset operation.  This allows the routines that
1239 		 * choose the master to use rpc.metad to determine the
1240 		 * master of the diskset.
1241 		 */
1242 		if (clnt_clr_mnsetlock(mynode(), ep) != 0) {
1243 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1244 			    "clear locks failed %s"),
1245 			    meta_print_hrtime(gethrtime() - start_time));
1246 			md_exit(local_sp, 1);
1247 		}
1248 
1249 		/*
1250 		 * Call reconfig_choose_master to choose a master for
1251 		 * each MN diskset, update the nodelist for each diskset
1252 		 * given the member information and send a reinit message
1253 		 * to rpc.mdcommd to reload the nodelist.
1254 		 */
1255 		rval = meta_reconfig_choose_master(commd_timeout, ep);
1256 		if (rval == 205) {
1257 			/*
1258 			 * NOTE: Should issue call to reboot remote host that
1259 			 * is causing the RPC failure.  Clustering to
1260 			 * provide interface in the future.  This should
1261 			 * stop a never-ending set of 205 reconfig cycles.
1262 			 * Remote host causing failure is stored in
1263 			 * ep->host if ep is an RPC error.
1264 			 * if (mdanyrpcerror(ep))
1265 			 * 	reboot (ep->host);
1266 			 */
1267 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1268 			    "choose master failure of 205 %s"),
1269 			    meta_print_hrtime(gethrtime() - start_time));
1270 			md_exit(local_sp, 205);
1271 		} else if (rval != 0) {
1272 			meta_mc_log(MC_LOG2, gettext("Step1 failure: "
1273 			    "choose master failure %s"),
1274 			    meta_print_hrtime(gethrtime() - start_time));
1275 			md_exit(local_sp, 1);
1276 		}
1277 
1278 		meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"),
1279 		    meta_print_hrtime(gethrtime() - start_time));
1280 
1281 		md_exit(local_sp, rval);
1282 		break;
1283 
1284 	case MC_STEP2:
1285 		/*
1286 		 * Step 2
1287 		 *
1288 		 * In Step 2, each node walks the list of disksets.  If a
1289 		 * node is a master of a MN diskset, it synchronizes
1290 		 * the local set USER records for that diskset.
1291 		 *
1292 		 * If disks exist in the diskset and there is a joined
1293 		 * (owner) node in the diskset, the master will also:
1294 		 *	- synchronize the diskset mddbs to the master
1295 		 *	- play the change log
1296 		 *
1297 		 * The master node will now attempt to join any unjoined
1298 		 * nodes that are currently members in the membership list.
1299 		 */
1300 
1301 		/* expect the nodelist to follow the step name */
1302 		if (argc < 1)
1303 			usage(sp, 1);
1304 
1305 		meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"),
1306 		    meta_print_hrtime(0));
1307 
1308 		/*
1309 		 * Does local set exist? If not, exit with 0
1310 		 * since there's no reason to have this node panic if
1311 		 * the local set cannot be started.
1312 		 */
1313 		if ((local_sp = load_local_set(ep)) == NULL) {
1314 			md_exit(local_sp, 0);
1315 		}
1316 
1317 		if ((max_sets = get_max_sets(ep)) == 0) {
1318 			mde_perror(ep, "");
1319 			md_exit(local_sp, 1);
1320 		}
1321 
1322 		/* start walking through all possible disksets */
1323 		for (setno = 1; setno < max_sets; setno++) {
1324 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1325 				if (mdiserror(ep, MDE_NO_SET)) {
1326 					/* No set for this setno - continue */
1327 					mdclrerror(ep);
1328 					continue;
1329 				} else if (mdanyrpcerror(ep)) {
1330 					/* Fail on RPC failure to self */
1331 					mde_perror(ep, gettext(
1332 					    "Unable to get information for "
1333 					    "set number %d"), setno);
1334 					md_exit(local_sp, 1);
1335 				} else {
1336 					mde_perror(ep, gettext(
1337 					    "Unable to get information for "
1338 					    "set number %d"), setno);
1339 					mdclrerror(ep);
1340 					continue;
1341 				}
1342 			}
1343 
1344 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1345 				if (mdanyrpcerror(ep)) {
1346 					/* Fail on RPC failure to self */
1347 					mde_perror(ep, gettext(
1348 					    "Unable to get information for "
1349 					    "set number %d"), setno);
1350 					md_exit(local_sp, 1);
1351 				}
1352 				mde_perror(ep, gettext("Unable to get set "
1353 				    "%s desc information"), sp->setname);
1354 				mdclrerror(ep);
1355 				continue;
1356 			}
1357 
1358 			/* Only check MN disksets */
1359 			if (!(MD_MNSET_DESC(sd))) {
1360 				continue;
1361 			}
1362 
1363 			/* All actions in step 2 are driven by master */
1364 			if (!(sd->sd_mn_am_i_master)) {
1365 				continue;
1366 			}
1367 
1368 			meta_mc_log(MC_LOG3, gettext("Step2 - begin record "
1369 			    "synchronization for set %s: %s"), sp->setname,
1370 			    meta_print_hrtime(gethrtime() - start_time));
1371 
1372 			/*
1373 			 * Synchronize the USER records in the local mddbs
1374 			 * for hosts that are members.  The USER records
1375 			 * contain set, drive and host information.
1376 			 */
1377 			rval = meta_mnsync_user_records(sp, ep);
1378 			if (rval != 0) {
1379 				mde_perror(ep, gettext(
1380 				    "Synchronization of user records "
1381 				    "in set %s failed\n"), sp->setname);
1382 				if (rval == 205) {
1383 					/*
1384 					 * NOTE: Should issue call to reboot
1385 					 * remote host that is causing the RPC
1386 					 * failure.  Clustering to provide
1387 					 * interface in the future.  This
1388 					 * should stop a never-ending set of
1389 					 * 205 reconfig cycles.
1390 					 * Remote host causing failure is
1391 					 * stored in ep->host if ep is an
1392 					 * RPC error.
1393 					 * if (mdanyrpcerror(ep))
1394 					 * 	reboot (ep->host);
1395 					 */
1396 					md_exit(local_sp, 205);
1397 				} else {
1398 					md_exit(local_sp, 1);
1399 				}
1400 			}
1401 
1402 			/* Reget sd since sync_user_recs may have flushed it */
1403 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1404 				mde_perror(ep, gettext("Unable to get set "
1405 				    "%s desc information"), sp->setname);
1406 				md_exit(local_sp, 1);
1407 			}
1408 
1409 			dd = metaget_drivedesc(sp,
1410 			    (MD_BASICNAME_OK | PRINT_FAST), ep);
1411 			if (! mdisok(ep)) {
1412 				mde_perror(ep, gettext("Unable to get set "
1413 				    "%s drive information"), sp->setname);
1414 				md_exit(local_sp, 1);
1415 			}
1416 
1417 			/*
1418 			 * No drives in set, continue to next set.
1419 			 */
1420 			if (dd == NULL) {
1421 				/* Done with this set */
1422 				continue;
1423 			}
1424 
1425 			meta_mc_log(MC_LOG3, gettext("Step2 - local set user "
1426 			    "records completed for set %s: %s"), sp->setname,
1427 			    meta_print_hrtime(gethrtime() - start_time));
1428 
1429 			/*
1430 			 * Synchronize the diskset mddbs for hosts
1431 			 * that are members.  This may involve
1432 			 * playing the changelog and writing out
1433 			 * to the diskset mddbs.
1434 			 */
1435 			rval = meta_mnsync_diskset_mddbs(sp, ep);
1436 			if (rval != 0) {
1437 				mde_perror(ep, gettext(
1438 				    "Synchronization of diskset mddbs "
1439 				    "in set %s failed\n"), sp->setname);
1440 				meta_mc_log(MC_LOG3, gettext("Step2 - diskset "
1441 				    "mddb synchronization failed for "
1442 				    "set %s: %s"), sp->setname,
1443 				    meta_print_hrtime(gethrtime() -
1444 				    start_time));
1445 				if (rval == 205) {
1446 					/*
1447 					 * NOTE: Should issue call to reboot
1448 					 * remote host that is causing the RPC
1449 					 * failure.  Clustering to provide
1450 					 * interface in the future.  This
1451 					 * should stop a never-ending set of
1452 					 * 205 reconfig cycles.
1453 					 * Remote host causing failure is
1454 					 * stored in ep->host if ep is an
1455 					 * RPC error.
1456 					 * if (mdanyrpcerror(ep))
1457 					 * 	reboot (ep->host);
1458 					 */
1459 					md_exit(local_sp, 205);
1460 				} else if (rval == 1) {
1461 					continue;
1462 				} else {
1463 					md_exit(local_sp, 1);
1464 				}
1465 			}
1466 
1467 			meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb "
1468 			    "synchronization completed for set %s: %s"),
1469 			    sp->setname,
1470 			    meta_print_hrtime(gethrtime() - start_time));
1471 
1472 			/* Join the starting nodes to the diskset */
1473 			rval = meta_mnjoin_all(sp, ep);
1474 			if (rval != 0) {
1475 				mde_perror(ep, gettext(
1476 				    "Join of non-owner (starting) nodes "
1477 				    "in set %s failed\n"), sp->setname);
1478 				meta_mc_log(MC_LOG3, gettext("Step2 - non owner"
1479 				    "nodes joined for set %s: %s"),
1480 				    sp->setname,
1481 				    meta_print_hrtime(gethrtime() -
1482 				    start_time));
1483 				if (rval == 205) {
1484 					/*
1485 					 * NOTE: Should issue call to reboot
1486 					 * remote host that is causing the RPC
1487 					 * failure.  Clustering to provide
1488 					 * interface in the future.  This
1489 					 * should stop a never-ending set of
1490 					 * 205 reconfig cycles.
1491 					 * Remote host causing failure is
1492 					 * stored in ep->host if ep is an
1493 					 * RPC error.
1494 					 * if (mdanyrpcerror(ep))
1495 					 * 	reboot (ep->host);
1496 					 */
1497 					md_exit(local_sp, 205);
1498 				} else {
1499 					md_exit(local_sp, 1);
1500 				}
1501 			}
1502 
1503 			meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes "
1504 			    "joined for set %s: %s"), sp->setname,
1505 			    meta_print_hrtime(gethrtime() - start_time));
1506 
1507 		}
1508 
1509 		meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"),
1510 		    meta_print_hrtime(gethrtime() - start_time));
1511 
1512 		break;
1513 
1514 	case MC_STEP3:
1515 		/*
1516 		 * Step 3
1517 		 *
1518 		 * For all multinode sets do,
1519 		 * - Reinitialise rpc.mdcommd
1520 		 * - Reset mirror owners to null if the current owner is
1521 		 *   no longer in the membership list
1522 		 */
1523 
1524 		/* expect the nodelist to follow the step name */
1525 		if (argc < 1)
1526 			usage(sp, 1);
1527 
1528 		meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"),
1529 		    meta_print_hrtime(0));
1530 
1531 		/*
1532 		 * Does local set exist? If not, exit with 0
1533 		 * since there's no reason to have this node panic if
1534 		 * the local set cannot be started.
1535 		 */
1536 		if ((local_sp = load_local_set(ep)) == NULL) {
1537 			md_exit(local_sp, 0);
1538 		}
1539 
1540 		/*
1541 		 * walk through all sets on this node which could include:
1542 		 *	- MN disksets
1543 		 *	- traditional disksets
1544 		 *	- non-existent disksets
1545 		 * start mirror resync for all MN sets
1546 		 */
1547 		if ((max_sets = get_max_sets(ep)) == 0) {
1548 			mde_perror(ep, "");
1549 			md_exit(local_sp, 1);
1550 		}
1551 
1552 		/* start walking through all possible disksets */
1553 		for (setno = 1; setno < max_sets; setno++) {
1554 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1555 				if (mdiserror(ep, MDE_NO_SET)) {
1556 					/* No set for this setno - continue */
1557 					mdclrerror(ep);
1558 					continue;
1559 				} else {
1560 					mde_perror(ep, gettext("Unable to "
1561 					    "get set %d information"), setno);
1562 					md_exit(local_sp, 1);
1563 				}
1564 			}
1565 
1566 			/* only check multi-node disksets */
1567 			if (!meta_is_mn_set(sp, ep)) {
1568 				mdclrerror(ep);
1569 				continue;
1570 			}
1571 
1572 			if (meta_lock(sp, TRUE, ep) != 0) {
1573 				mde_perror(ep, "");
1574 				md_exit(local_sp, 1);
1575 			}
1576 
1577 			/* If this node isn't joined to set, do nothing */
1578 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1579 				if (!mdisok(ep)) {
1580 					mde_perror(ep, gettext("Could "
1581 					    "not get set %s ownership"),
1582 					    sp->setname);
1583 					md_exit(sp, 1);
1584 				}
1585 				mdclrerror(ep);
1586 				(void) meta_unlock(sp, ep);
1587 				continue;
1588 			}
1589 
1590 			meta_mc_log(MC_LOG3, gettext("Step3 - begin "
1591 			    "re-initialising rpc.mdcommd and resetting mirror "
1592 			    "owners for set %s: %s"), sp->setname,
1593 			    meta_print_hrtime(gethrtime() - start_time));
1594 
1595 			/* reinitialise rpc.mdcommd with new nodelist */
1596 			if (mdmn_reinit_set(setno, commd_timeout)) {
1597 				md_eprintf(gettext(
1598 				    "Could not re-initialise rpc.mdcommd for "
1599 				    "set %s\n"), sp->setname);
1600 				md_exit(sp, 1);
1601 			}
1602 
1603 			(void) memset(&cfg, 0, sizeof (cfg));
1604 			cfg.c_id = 0;
1605 			cfg.c_setno = sp->setno;
1606 			if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1607 			    NULL) != 0) {
1608 				(void) mdstealerror(ep, &cfg.c_mde);
1609 				mde_perror(ep, gettext("Could "
1610 				    "not get set %s information"),
1611 				    sp->setname);
1612 				md_exit(sp, 1);
1613 			}
1614 
1615 			/* Don't do anything else if set is stale */
1616 			if (cfg.c_flags & MDDB_C_STALE) {
1617 				(void) meta_unlock(sp, ep);
1618 				mdclrerror(ep);
1619 				continue;
1620 			}
1621 
1622 			/* reset mirror owners */
1623 			if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) {
1624 				md_exit(sp, 1);
1625 			}
1626 
1627 			(void) meta_unlock(sp, ep);
1628 
1629 			meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd "
1630 			    "re-initialised and mirror owners reset for "
1631 			    "set %s: %s"), sp->setname,
1632 			    meta_print_hrtime(gethrtime() - start_time));
1633 		}
1634 
1635 		meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"),
1636 		    meta_print_hrtime(gethrtime() - start_time));
1637 
1638 		break;
1639 
1640 	case MC_STEP4:
1641 		/*
1642 		 * Step 4
1643 		 *
1644 		 * For all multinode sets do:
1645 		 * - Resume the rpc.mdcommd messages.  Must resume all
1646 		 *	sets before issuing I/O to any set since an error
1647 		 * 	encountered in a commd suspended set could be
1648 		 *	blocked waiting for commd in another set to resume.
1649 		 *	(This happens since the daemon queues service
1650 		 *	all sets).  An open of a soft partition causes
1651 		 *	a read of the watermarks during the open.
1652 		 * - If set is non-writable (not an owner or STALE), then
1653 		 *	continue to next set.
1654 		 *
1655 		 * For all multinode sets do,
1656 		 * - Reset ABR states for all mirrors, ie clear ABR if not
1657 		 *	open on any node.
1658 		 * - Reset ABR states for all soft partitions, ie clear ABR if
1659 		 *	not open on any node.
1660 		 * - For all slave nodes that have entered through the start
1661 		 *	step, update the ABR state to that of the master and
1662 		 *	get the submirror state from the master
1663 		 * - meta_lock set
1664 		 * - Resync all mirrors
1665 		 * - unlock meta_lock for this set.
1666 		 * - Choose a new owner for any orphaned resyncs
1667 		 *
1668 		 * There is one potential issue here when concurrently
1669 		 * resetting and updating the ABR state. If the master has ABR
1670 		 * set, but should no longer have because the only node that
1671 		 * had the metadevice open and had ABR set has panicked, the
1672 		 * master will send a message to all nodes to clear the ABR
1673 		 * state. Meanwhile any node that has come through the
1674 		 * start step will get tstate from the master and will update
1675 		 * ABR if it was set in tstate. So, we appear to have a problem
1676 		 * if the following sequence occurs:-
1677 		 * - The slave gets tstate with ABR set
1678 		 * - The master sends a message to clear ABR
1679 		 * - The slave updates ABR with the value it got from tstate.
1680 		 * We now have the master with ABR clear and the slave with ABR
1681 		 * set. Fortunately, having set ABR, the slave will close the
1682 		 * metadevice after setting ABR and as there are no nodes with
1683 		 * the device open, the close will send a message to clear ABR
1684 		 * on all nodes. So, the nodes will all have ABR unset.
1685 		 */
1686 
1687 		/* expect the nodelist to follow the step name */
1688 		if (argc < 1)
1689 			usage(sp, 1);
1690 
1691 		meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"),
1692 		    meta_print_hrtime(0));
1693 
1694 		/*
1695 		 * Does local set exist? If not, exit with 0
1696 		 * since there's no reason to have this node panic if
1697 		 * the local set cannot be started.
1698 		 */
1699 		if ((local_sp = load_local_set(ep)) == NULL) {
1700 			md_exit(local_sp, 0);
1701 		}
1702 
1703 		/*
1704 		 * walk through all sets on this node which could include:
1705 		 *	- MN disksets
1706 		 *	- traditional disksets
1707 		 *	- non-existent disksets
1708 		 * start mirror resync for all MN sets
1709 		 */
1710 		if ((max_sets = get_max_sets(ep)) == 0) {
1711 			mde_perror(ep, "");
1712 			md_exit(local_sp, 1);
1713 		}
1714 
1715 		/* Clear set_info structure */
1716 		for (setno = 1; setno < max_sets; setno++) {
1717 			set_info[setno] = 0;
1718 		}
1719 
1720 		/* start walking through all possible disksets */
1721 		for (setno = 1; setno < max_sets; setno++) {
1722 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1723 				if (mdiserror(ep, MDE_NO_SET)) {
1724 					/* No set for this setno - continue */
1725 					mdclrerror(ep);
1726 					continue;
1727 				} else {
1728 					mde_perror(ep, gettext("Unable to "
1729 					    "get set %d information"), setno);
1730 					md_exit(local_sp, 1);
1731 				}
1732 			}
1733 
1734 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1735 				mde_perror(ep, gettext("Unable to get set "
1736 				    "%s desc information"), sp->setname);
1737 				mdclrerror(ep);
1738 				continue;
1739 			}
1740 
1741 			/* only check multi-node disksets */
1742 			if (!meta_is_mn_set(sp, ep)) {
1743 				mdclrerror(ep);
1744 				continue;
1745 			}
1746 
1747 			set_info[setno] |= SET_INFO_MN;
1748 
1749 			/*
1750 			 * If not an owner (all mddbs failed) or stale
1751 			 * (< 50% mddbs operational), then set is
1752 			 * non-writable so just resume commd and
1753 			 * unblock mddb messages.
1754 			 */
1755 			mdclrerror(ep);
1756 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1757 				set_info[setno] |= SET_INFO_NO_WR;
1758 			}
1759 			if (!mdisok(ep)) {
1760 				mde_perror(ep, gettext("Could "
1761 				    "not get set %s ownership"),
1762 				    sp->setname);
1763 				md_exit(local_sp, 1);
1764 			}
1765 			/* Set is owned - is it stale? */
1766 			if (!set_info[setno] & SET_INFO_NO_WR) {
1767 				(void) memset(&cfg, 0, sizeof (cfg));
1768 				cfg.c_id = 0;
1769 				cfg.c_setno = sp->setno;
1770 				if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1771 				    NULL) != 0) {
1772 					(void) mdstealerror(ep, &cfg.c_mde);
1773 					mde_perror(ep, gettext("Could "
1774 					    "not get set %s information"),
1775 					    sp->setname);
1776 					md_exit(local_sp, 1);
1777 				}
1778 				if (cfg.c_flags & MDDB_C_STALE) {
1779 					set_info[setno] |= SET_INFO_NO_WR;
1780 				}
1781 			}
1782 
1783 			/* resume rpc.mdcommd */
1784 			if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0,
1785 			    commd_timeout)) {
1786 				md_eprintf(gettext("Unable to resume "
1787 				    "rpc.mdcommd for set %s\n"), sp->setname);
1788 				md_exit(local_sp, 1);
1789 			}
1790 
1791 			/* Unblock mddb parse messages */
1792 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1793 				(void) memset(&mbp, 0, sizeof (mbp));
1794 				mbp.c_setno = setno;
1795 				mbp.c_blk_flags = MDDB_UNBLOCK_PARSE;
1796 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1797 				    &mbp.c_mde, NULL)) {
1798 					(void) mdstealerror(ep, &mbp.c_mde);
1799 					mde_perror(ep, gettext("Could not "
1800 					    "unblock set %s"), sp->setname);
1801 					md_exit(local_sp, 1);
1802 				}
1803 			}
1804 			meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd "
1805 			    "resumed and messages unblocked for set %s: %s"),
1806 			    sp->setname,
1807 			    meta_print_hrtime(gethrtime() - start_time));
1808 		}
1809 
1810 		for (setno = 1; setno < max_sets; setno++) {
1811 			int			start_step;
1812 
1813 			/* Skip traditional disksets. */
1814 			if ((set_info[setno] & SET_INFO_MN) == 0)
1815 				continue;
1816 
1817 			/*
1818 			 * If already determined that this set is
1819 			 * a non-writable set, then just continue
1820 			 * to next set since there's nothing else
1821 			 * to do for a non-writable set.
1822 			 */
1823 			if (set_info[setno] & SET_INFO_NO_WR)
1824 				continue;
1825 
1826 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1827 				if (mdiserror(ep, MDE_NO_SET)) {
1828 					/* No set for this setno - continue */
1829 					mdclrerror(ep);
1830 					continue;
1831 				} else {
1832 					mde_perror(ep, gettext("Unable to "
1833 					    "get set %d information"), setno);
1834 					md_exit(local_sp, 1);
1835 				}
1836 			}
1837 
1838 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1839 				mde_perror(ep, gettext("Unable to get set "
1840 				    "%s desc information"), sp->setname);
1841 				mdclrerror(ep);
1842 				continue;
1843 			}
1844 
1845 			/* See if this node came through the start step */
1846 			(void) memset(&sf, 0, sizeof (sf));
1847 			sf.sf_setno = sp->setno;
1848 			sf.sf_flags = MDDB_NM_GET;
1849 			/* Use magic to help protect ioctl against attack. */
1850 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1851 			if (metaioctl(MD_MN_GET_SETFLAGS, &sf,
1852 			    &sf.sf_mde, NULL)) {
1853 				(void) mdstealerror(ep, &sf.sf_mde);
1854 				mde_perror(ep, gettext("Could not get "
1855 				    "start_step flag for set %s"), sp->setname);
1856 				md_exit(local_sp, 1);
1857 			}
1858 			start_step =
1859 			    (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0;
1860 
1861 			/*
1862 			 * We can now reset the start_step flag for the set
1863 			 * if it was already set.
1864 			 */
1865 			if (start_step) {
1866 				(void) memset(&sf, 0, sizeof (sf));
1867 					sf.sf_setno = sp->setno;
1868 				sf.sf_setflags = MD_SET_MN_START_RC;
1869 				sf.sf_flags = MDDB_NM_RESET;
1870 				/*
1871 				 * Use magic to help protect ioctl
1872 				 * against attack.
1873 				 */
1874 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1875 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1876 				    &sf.sf_mde, NULL)) {
1877 					(void) mdstealerror(ep, &sf.sf_mde);
1878 					mde_perror(ep,
1879 					    gettext("Could not reset "
1880 					    "start_step flag for set %s"),
1881 					    sp->setname);
1882 				}
1883 			}
1884 
1885 			meta_mc_log(MC_LOG3, gettext("Step4 - begin setting "
1886 			    "ABR state and restarting io's for "
1887 			    "set %s: %s"), sp->setname,
1888 			    meta_print_hrtime(gethrtime() - start_time));
1889 
1890 
1891 			/*
1892 			 * If we are not the master and we have come through
1893 			 * the start step, we must update the ABR states
1894 			 * for mirrors and soft partitions. Also the submirror
1895 			 * states need to be synchronised so that we see the
1896 			 * same status as other previously joined members.
1897 			 * This _must_ be done before starting the resync.
1898 			 */
1899 			if (!(sd->sd_mn_am_i_master) && start_step) {
1900 				if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR,
1901 				    ep) == -1) {
1902 					md_exit(local_sp, 1);
1903 				}
1904 				if (reset_state(UPDATE_ABR, sp, MD_SP,
1905 				    ep) == -1) {
1906 					md_exit(local_sp, 1);
1907 				}
1908 				/*
1909 				 * Mark the fact that we've got the mirror
1910 				 * state. This allows the resync thread to
1911 				 * determine if _it_ needs to issue this. This
1912 				 * can happen if a node is added to a set after
1913 				 * a reconfig cycle has completed.
1914 				 */
1915 				(void) memset(&sf, 0, sizeof (sf));
1916 					sf.sf_setno = sp->setno;
1917 				sf.sf_setflags = MD_SET_MN_MIR_STATE_RC;
1918 				sf.sf_flags = MDDB_NM_SET;
1919 				/*
1920 				 * Use magic to help protect ioctl
1921 				 * against attack.
1922 				 */
1923 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1924 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1925 				    &sf.sf_mde, NULL)) {
1926 					(void) mdstealerror(ep, &sf.sf_mde);
1927 					mde_perror(ep,
1928 					    gettext("Could not set "
1929 					    "submirror state flag for set %s"),
1930 					    sp->setname);
1931 				}
1932 			}
1933 
1934 			/*
1935 			 * All remaining actions are only performed by the
1936 			 * master
1937 			 */
1938 			if (!(sd->sd_mn_am_i_master)) {
1939 				if (meta_lock(sp, TRUE, ep) != 0) {
1940 					mde_perror(ep, "");
1941 					md_exit(local_sp, 1);
1942 				}
1943 				meta_mirror_resync_unblock(sp);
1944 				(void) meta_unlock(sp, ep);
1945 				continue;
1946 			}
1947 
1948 			/*
1949 			 * If the master came through the start step, this
1950 			 * implies that all of the nodes must have done the
1951 			 * same and hence there can be no applications
1952 			 * running. Hence no need to reset ABR
1953 			 */
1954 			if (!start_step) {
1955 				/* Reset ABR state for mirrors */
1956 				if (reset_state(RESET_ABR, sp, MD_MIRROR,
1957 				    ep) == -1) {
1958 					md_exit(local_sp, 1);
1959 				}
1960 				/* ...and now the same for soft partitions */
1961 				if (reset_state(RESET_ABR, sp, MD_SP,
1962 				    ep) == -1) {
1963 					md_exit(local_sp, 1);
1964 				}
1965 			}
1966 
1967 			/*
1968 			 * choose owners for orphaned resyncs and reset
1969 			 * non-orphaned resyncs so that an owner node that
1970 			 * reboots will restart the resync if needed.
1971 			 */
1972 			if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1)
1973 				md_exit(local_sp, 1);
1974 
1975 			/*
1976 			 * Must unlock set lock before meta_mirror_resync_all
1977 			 * sends a message to run the metasync command
1978 			 * which also grabs the meta_lock.
1979 			 */
1980 			if (meta_lock(sp, TRUE, ep) != 0) {
1981 				mde_perror(ep, "");
1982 				md_exit(local_sp, 1);
1983 			}
1984 			meta_mirror_resync_unblock(sp);
1985 			(void) meta_unlock(sp, ep);
1986 
1987 			/* resync all mirrors in set */
1988 			if (meta_mirror_resync_all(sp, 0, ep) != 0) {
1989 				mde_perror(ep, gettext("Mirror resyncs "
1990 				    "failed for set %s"), sp->setname);
1991 				md_exit(local_sp, 1);
1992 			}
1993 
1994 			meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted "
1995 			    "for set %s: %s"), sp->setname,
1996 			    meta_print_hrtime(gethrtime() - start_time));
1997 		}
1998 
1999 		meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"),
2000 		    meta_print_hrtime(gethrtime() - start_time));
2001 
2002 		break;
2003 
2004 	default:
2005 		usage(sp, 1);
2006 		break;
2007 	}
2008 
2009 	md_exit(sp, 0);
2010 	/* NOTREACHED */
2011 	return (0);
2012 }
2013