xref: /titanic_41/usr/src/cmd/lvm/util/metaclust.c (revision 35a5a3587fd94b666239c157d3722745250ccbd7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <meta.h>
28 #include <sdssc.h>
29 #include <signal.h>
30 #include <syslog.h>
31 #include <sys/types.h>
32 #include <sys/wait.h>
33 #include <sys/lvm/md_mirror.h>
34 #include <metad.h>
35 
/*
 * MY_VERSION is the only entry of version_table[] and the default value of
 * the global `version`.  MAX_DEBUG_LEVEL is the upper bound accepted for
 * the -d option in main().
 */
#define	MY_VERSION		"1.0"	/* the highest supported version */
#define	MAX_DEBUG_LEVEL		5	/* maximum verbosity level */

/*
 * Bit flags for the `mode` argument of reset_state().  Each one is tested
 * there with a bitwise AND, so several may be OR-ed together.
 */
#define	RESET_OWNER		0x0001
#define	CHOOSE_OWNER		0x0002
#define	RESET_ABR		0x0004
#define	UPDATE_ABR		0x0008
#define	GET_MIRROR_STATE	0x0010

/*
 * NOTE(review): presumably per-set flag bits kept in main()'s set_info[]
 * array -- confirm against the reconfig step code.
 */
#define	SET_INFO_NO_WR	0x0002
#define	SET_INFO_MN	0x0004
47 
/*
 * This table defines all the metaclust reconfig steps we understand
 *
 * MC_UNK (0) is the initial value of the global `stepnum` and is what it
 * keeps when the step name given on the command line is not found in
 * step_table[].
 */
typedef enum stpnum {
	MC_UNK = 0,
	MC_START,
	MC_STOP,
	MC_ABORT,
	MC_RETURN,
	MC_STEP1,
	MC_STEP2,
	MC_STEP3,
	MC_STEP4
} stepnum_t;
62 
/*
 * Structure for step_name -> step_number mapping
 *
 * One element of step_table[]: step_nam is the name as typed on the
 * command line, step_num the matching stepnum_t value.
 */
struct step_t {
	char		*step_nam;
	stepnum_t	step_num;
};
70 
/*
 * Step name to step number mapping table
 * This table MUST be sorted alphabetically in ascending order of step name
 * because main() looks names up in it with bsearch() and mc_compare(),
 * which orders entries by strcmp() on step_nam.
 */
static struct step_t step_table[] = {
	{ "abort",	MC_ABORT },
	{ "return",	MC_RETURN },
	{ "start",	MC_START },
	{ "step1",	MC_STEP1 },
	{ "step2",	MC_STEP2 },
	{ "step3",	MC_STEP3 },
	{ "step4",	MC_STEP4 },
	{ "stop",	MC_STOP }
};
85 
/*
 * If support for a different version is added, the new version number should
 * be appended to the version_table below. This list will be searched to
 * determine if a version requested via the -V option is supported or not.
 * The search in main() is a linear strcmp() scan, so the order of entries
 * does not matter.
 */
static char *version_table[] = {
	MY_VERSION
};
94 
/*
 * Process-wide state shared between main() and sigalarmhandler().
 *
 * timeout - seconds passed to alarm(); set from -t, 0 means no fork and no
 *	     alarm.
 * version - protocol version checked against version_table[]; set from -V.
 * stepnum - stepnum_t value of the step named on the command line.
 * c_pid   - result of fork() in main(); the handler kills and reaps it.
 */
uint_t	timeout = 0;			/* disable timeout by default */
char	*version = MY_VERSION;		/* use latest version by default */
int	stepnum = MC_UNK;		/* reconfiguration step number */
pid_t	c_pid;				/* child process id */
99 
100 /*
101  * Binary search comparison routine
102  */
103 static int
104 mc_compare(const void *stp1, const void *stp2)
105 {
106 	return (strcmp((const char *)stp1,
107 	    ((const struct step_t *)stp2)->step_nam));
108 }
109 
110 /*
111  * Timeout expiry alarm signal handler
112  */
113 /*ARGSUSED*/
114 static void
115 sigalarmhandler(int sig)
116 {
117 	int	i, n, ret, stat_loc = 0;
118 	FILE	*pgcore;
119 	char	corecmd[256];
120 
121 	n = sizeof (step_table) / sizeof (step_table[0]);
122 	for (i = 0; i < n; i++) {
123 		if (stepnum == step_table[i].step_num)
124 			break;
125 	}
126 
127 	assert(i != n);
128 
129 	meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"),
130 	    step_table[i].step_nam,
131 	    meta_print_hrtime(gethrtime() - start_time));
132 
133 	/*
134 	 * See what the child was actually doing when the timeout expired.
135 	 * A core-dump of this would be _really_ good, so let's just
136 	 * try a 'gcore -g c_pid' and hope
137 	 */
138 
139 	(void) memset(corecmd, 0, sizeof (corecmd));
140 	(void) snprintf(corecmd, sizeof (corecmd),
141 	    "/bin/gcore -g %d >/dev/null 2>&1", (int)c_pid);
142 
143 	pgcore = popen(corecmd, "r");
144 
145 	if (pgcore == NULL) {
146 		meta_mc_log(MC_LOG1, gettext("Could not grab core for pid %s"),
147 		    c_pid);
148 	} else {
149 		(void) pclose(pgcore);
150 	}
151 
152 	if ((ret = kill(c_pid, SIGKILL)) == 0) {
153 		/*
154 		 * The child will wait forever until the status is retrieved
155 		 * so get it now. Keep retrying if the call is interrupted.
156 		 *
157 		 * The possible results are,
158 		 *
159 		 *	- child killed successfully
160 		 *	- signal sent but child not killed
161 		 *	- waitpid failed/interrupted
162 		 */
163 		(void) sleep(2);
164 		while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) {
165 			if (errno != EINTR) {
166 				break;
167 			}
168 		}
169 		if ((ret == c_pid) || (errno == ECHILD)) {
170 			ret = 0;
171 		} else {
172 			ret = 1;
173 		}
174 	} else if (errno == ESRCH) {
175 		/*
176 		 * If the kill did not catch the child then it means the child
177 		 * exited immediately after the timeout occured.
178 		 */
179 		ret = 0;
180 	}
181 
182 	/*
183 	 * make sure not to exit with 205 for any steps other than step1-step4.
184 	 * Suncluster reconfiguration can't handle it otherwise.
185 	 */
186 	switch (stepnum) {
187 	case MC_STEP1:
188 	case MC_STEP2:
189 	case MC_STEP3:
190 	case MC_STEP4:
191 		/*
192 		 * If the child was killed successfully return 205 for a
193 		 * new reconfig cycle otherwise send 1 to panic the node.
194 		 */
195 		if (ret != 0) {
196 			md_eprintf(gettext("Could not kill child\n"));
197 			exit(1);
198 		} else {
199 			exit(205);
200 		}
201 		break;
202 	case MC_START:
203 	case MC_STOP:
204 	case MC_ABORT:
205 	case MC_RETURN:
206 	default:
207 		exit(1);
208 		break;
209 	}
210 }
211 
212 /*
213  * Attempt to load local set.
214  * Returns:
215  *	pointer to mdsetname_t for local set (local_sp) is successful.
216  *	0 if failure
217  *		if there are no local set mddbs, no error message is printed.
218  *		Otherwise, error message is printed so that user
219  *		can determine why the local set didn't start.
220  */
221 mdsetname_t *
222 load_local_set(md_error_t *ep)
223 {
224 	mdsetname_t	*local_sp = NULL;
225 
226 	/* Does local set exist? If not, give no error */
227 	if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) {
228 		return (0);
229 	}
230 
231 	/*
232 	 * snarf local set
233 	 * If fails with MDE_DB_NODB, then just return 1 printing
234 	 * no failure.
235 	 * Otherwise, print error message, and return 1.
236 	 */
237 	if (meta_setup_db_locations(ep) != 0) {
238 		if (!(mdismddberror(ep, MDE_DB_NODB)))
239 			mde_perror(ep, "");
240 		return (0);
241 	}
242 
243 	/* local set loaded successfully */
244 	return (local_sp);
245 }
246 
247 /*
248  * Purpose:	Compose a full path name for a metadevice
249  *
250  * On entry:	sp	- setname pointer
251  *		mnum	- minor number of metadevice
252  *		pathname - pointer to array to return path string
253  *		pathlen	- max length of pathname array
254  */
255 static int
256 compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen)
257 {
258 	int	rtn;
259 	mdname_t	*np;
260 	md_error_t	status = mdnullerror;
261 
262 	if (MD_MIN2SET(mnum) != sp->setno) {
263 		md_eprintf(gettext("minor number 0x%x invalid for set %d\n"),
264 		    mnum, sp->setno);
265 		return (-1);
266 	}
267 
268 	if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) {
269 		return (-1);
270 	}
271 
272 	rtn = snprintf(pathname, pathlen, "%s", np->rname);
273 
274 	if ((pathname[0] == '\0') || (rtn >= pathlen)) {
275 		md_eprintf(gettext(
276 		    "Could not create path for device %s\n"),
277 		    get_mdname(sp, mnum));
278 		return (-1);
279 	}
280 	return (0);
281 }
282 
283 /*
284  * Purpose:	Walk through all the devices specified for the given set
285  *		and do the action specified in mode
286  */
287 static int
288 reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep)
289 {
290 	mdnamelist_t			*devnlp = NULL;
291 	mdnamelist_t			*p;
292 	mdname_t			*devnp = NULL;
293 	md_set_mmown_params_t		ownpar_p;
294 	md_set_mmown_params_t		*ownpar = &ownpar_p;
295 	md_unit_t			*mm;
296 	int				mirror_dev = 0;
297 	mndiskset_membershiplist_t	*nl;
298 	int				cnt;
299 	int				has_parent;
300 	md_mn_get_mir_state_t		mir_state_p;
301 	md_mn_get_mir_state_t		*mir_state = &mir_state_p;
302 
303 	/*
304 	 * if we are choosing or resetting the owners then make sure
305 	 * we are only doing it for mirror devices
306 	 */
307 	mirror_dev = (strcmp(MD_MIRROR, drivername) == 0);
308 	if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) {
309 		return (-1);
310 	}
311 
312 	/* get a list of all the metadevices for current set */
313 	if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) {
314 		mde_perror(ep, gettext("Could not get mirrors for set %s"),
315 		    sp->setname);
316 		return (-1);
317 	} else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
318 		mde_perror(ep, gettext(
319 		    "Could not get soft partitions for set %s"), sp->setname);
320 		return (-1);
321 	}
322 
323 	/* If resetting the owner, get the known membership list */
324 	if (mode & RESET_OWNER) {
325 		if (meta_read_nodelist(&cnt, &nl, ep)) {
326 			mde_perror(ep, "Could not get nodelist");
327 			return (-1);
328 		}
329 	}
330 
331 	/* for each metadevice */
332 	for (p = devnlp; (p != NULL); p = p->next) {
333 		devnp = p->namep;
334 
335 		/*
336 		 * Get the current setting for mirror ABR state and all of the
337 		 * submirror state and flags from the master node. We only
338 		 * perform this when going through a 'start' cycle.
339 		 */
340 		if ((mode & GET_MIRROR_STATE) && mirror_dev) {
341 			char	*miscname;
342 
343 			/*
344 			 * Ensure that we ignore soft-parts that are returned
345 			 * from the meta_get_mirror_names() call
346 			 */
347 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
348 				goto out;
349 			if (strcmp(miscname, MD_MIRROR) != 0)
350 				continue;
351 
352 			mir_state->mnum = meta_getminor(devnp->dev);
353 			MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno);
354 			meta_mc_log(MC_LOG4, gettext("Getting mirror state"
355 			    " for %s: %s"), get_mdname(sp, mir_state->mnum),
356 			    meta_print_hrtime(gethrtime() - start_time));
357 
358 			if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep,
359 			    "MD_MN_GET_MIRROR_STATE") != 0) {
360 				mde_perror(ep, gettext("Unable to get "
361 				    "mirror state for %s"),
362 				    get_mdname(sp, mir_state->mnum));
363 				goto out;
364 			} else {
365 				continue;
366 			}
367 		}
368 
369 		/* check if this is a top level metadevice */
370 		if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL)
371 			goto out;
372 		if (MD_HAS_PARENT(MD_PARENT(mm))) {
373 			has_parent = 1;
374 		} else {
375 			has_parent = 0;
376 		}
377 		Free(mm);
378 
379 		if (mode & (RESET_OWNER | CHOOSE_OWNER)) {
380 			char	*miscname;
381 
382 			/*
383 			 * we can only do these for mirrors so make sure we
384 			 * really have a mirror device and not a softpartition
385 			 * imitating one. meta_get_mirror_names seems to think
386 			 * softparts on top of a mirror are mirrors!
387 			 */
388 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
389 				goto out;
390 			if (strcmp(miscname, MD_MIRROR) != 0)
391 				continue;
392 
393 			(void) memset(ownpar, 0, sizeof (*ownpar));
394 			ownpar->d.mnum = meta_getminor(devnp->dev);
395 			MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno);
396 
397 			meta_mc_log(MC_LOG4, gettext("Setting owner "
398 			    "for %s: %s"), get_mdname(sp, ownpar->d.mnum),
399 			    meta_print_hrtime(gethrtime() - start_time));
400 
401 			/* get the current owner id */
402 			if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep,
403 			    "MD_MN_GET_MM_OWNER") != 0) {
404 				mde_perror(ep, gettext("Unable to get "
405 				    "mirror owner for %s"),
406 				    get_mdname(sp, ownpar->d.mnum));
407 				goto out;
408 			}
409 		}
410 
411 		if (mode & RESET_OWNER) {
412 			if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) {
413 				mdclrerror(ep);
414 				continue;
415 			}
416 
417 			/*
418 			 * reset owner only if the current owner is
419 			 * not in the membership list
420 			 * Also kill the resync thread so that when the resync
421 			 * is started, it will perform an optimized resync
422 			 * for any resync regions that were dirty when the
423 			 * current owner left the membership.
424 			 */
425 			if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) {
426 				if (meta_mn_change_owner(&ownpar,
427 				    sp->setno, ownpar->d.mnum,
428 				    MD_MN_MIRROR_UNOWNED,
429 				    MD_MN_MM_ALLOW_CHANGE) == -1) {
430 					md_eprintf(gettext(
431 					    "Unable to reset mirror owner "
432 					    "for %s\n"),
433 					    get_mdname(sp, ownpar->d.mnum));
434 					goto out;
435 				}
436 				if (meta_mirror_resync(sp, devnp, 0, ep,
437 				    MD_RESYNC_KILL_NO_WAIT) != 0) {
438 					md_eprintf(gettext(
439 					    "Unable to kill resync for"
440 					    " %s\n"),
441 					    get_mdname(sp, ownpar->d.mnum));
442 					goto out;
443 				}
444 			}
445 		}
446 
447 		if (mode & CHOOSE_OWNER) {
448 			/*
449 			 * only orphaned resyncs will have no owner.
450 			 * if that is the case choose a new owner. Otherwise
451 			 * re-establish the existing owner. This covers the
452 			 * case where a node that owned the mirror
453 			 * reboots/panics and comes back into the cluster before
454 			 * the reconfig cycle has completed. In this case the
455 			 * other cluster nodes will have the mirror owner marked
456 			 * as the rebooted node while it has the owner marked
457 			 * as 'None'. We have to reestablish the ownership so
458 			 * that the subsequent resync can continue.
459 			 */
460 			if (meta_mn_change_owner(&ownpar, sp->setno,
461 			    ownpar->d.mnum, ownpar->d.owner,
462 			    MD_MN_MM_CHOOSE_OWNER) == -1) {
463 				md_eprintf(gettext("Unable to choose "
464 				    "mirror owner for %s\n"),
465 				    get_mdname(sp, ownpar->d.mnum));
466 				goto out;
467 			}
468 		}
469 
470 		/*
471 		 * For RESET_ABR and UPDATE_ABR - only handle top
472 		 * level metadevices.
473 		 */
474 		if (has_parent)
475 			continue;
476 
477 		if (mode & RESET_ABR) {
478 			/*
479 			 * Reset the ABR (application based recovery)
480 			 * value on all nodes. We are dealing with
481 			 * the possibility that we have ABR set but the
482 			 * only node that had the device open with ABR has
483 			 * left the cluster. We simply open and close the
484 			 * device and if this is the last close in the
485 			 * cluster, ABR will be cleared on all nodes.
486 			 */
487 			char		*miscname;
488 			char		name[MAXPATHLEN];
489 			int		mnum, fd;
490 
491 			name[0] = '\0';
492 			mnum = meta_getminor(devnp->dev);
493 
494 			/*
495 			 * Ensure that we don't include soft-parts in the
496 			 * mirror-only call to RESET_ABR. meta_get_mirror_names
497 			 * returns a bogus list that includes all soft-parts
498 			 * built on mirrors.
499 			 */
500 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
501 				goto out;
502 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
503 				continue;
504 
505 			meta_mc_log(MC_LOG4, gettext("Re-setting ABR state "
506 			    "for %s: %s"), get_mdname(sp, mnum),
507 			    meta_print_hrtime(gethrtime() - start_time));
508 
509 			/* compose the absolute device path and open it */
510 			if (compose_path(sp, mnum, &name[0],
511 			    sizeof (name)) != 0)
512 				goto out;
513 			if ((fd = open(name, O_RDWR, 0)) < 0) {
514 				md_perror(gettext("Could not open device %s"),
515 				    name);
516 				continue;
517 			}
518 
519 			(void) close(fd);
520 		}
521 
522 		if (mode & UPDATE_ABR) {
523 			/*
524 			 * Update the ABR value on this node. We obtain the
525 			 * current ABR state from the master node.
526 			 */
527 
528 			char		*miscname;
529 			char		name[MAXPATHLEN];
530 			int		mnum, fd;
531 			volcap_t	vc;
532 			uint_t		tstate;
533 
534 			name[0] = '\0';
535 			mnum = meta_getminor(devnp->dev);
536 
537 			/*
538 			 * Ensure that we don't include soft-parts in the
539 			 * mirror-only call to UPDATE_ABR. meta_get_mirror_names
540 			 * returns a bogus list that includes all soft-parts
541 			 * built on mirrors.
542 			 */
543 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
544 				goto out;
545 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
546 				continue;
547 
548 			/* Get tstate from Master */
549 			if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep)
550 			    != 0)
551 				continue;
552 			/* If not set on the master, nothing to do */
553 			if (!(tstate & MD_ABR_CAP))
554 				continue;
555 
556 			meta_mc_log(MC_LOG4, gettext("Updating ABR state "
557 			    "for %s: %s"), get_mdname(sp, mnum),
558 			    meta_print_hrtime(gethrtime() - start_time));
559 
560 			/* compose the absolute device path and open it */
561 			if (compose_path(sp, mnum, &name[0],
562 			    sizeof (name)) != 0)
563 				goto out;
564 			if ((fd = open(name, O_RDWR, 0)) < 0) {
565 				md_perror(gettext("Could not open device %s"),
566 				    name);
567 				continue;
568 			}
569 
570 			/* set ABR state */
571 			vc.vc_info = 0;
572 			vc.vc_set = 0;
573 			if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
574 				/*
575 				 * Ignore if device does not support this
576 				 * ioctl
577 				 */
578 				if ((errno != ENOTTY) && (errno != ENOTSUP)) {
579 					md_perror(gettext("Could not get "
580 					    "ABR/DMR state for device %s"),
581 					    name);
582 				}
583 				(void) close(fd);
584 				continue;
585 			}
586 			if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) {
587 				(void) close(fd);
588 				continue;
589 			}
590 
591 			vc.vc_set = DKV_ABR_CAP;
592 			if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
593 				md_perror(gettext(
594 				    "Could not set ABR state for "
595 				    "device %s"), name);
596 				(void) close(fd);
597 				goto out;
598 			} else {
599 				md_eprintf(gettext(
600 				    "Setting ABR state on device %s\n"), name);
601 			}
602 
603 			(void) close(fd);
604 		}
605 	}
606 
607 	/* cleanup */
608 	if (mode & RESET_OWNER) {
609 		meta_free_nodelist(nl);
610 	}
611 	metafreenamelist(devnlp);
612 	return (0);
613 
614 out:
615 	/* cleanup */
616 	if (mode & RESET_OWNER) {
617 		meta_free_nodelist(nl);
618 	}
619 	metafreenamelist(devnlp);
620 	return (-1);
621 }
622 
623 /*
624  * Print usage message
625  */
626 static void
627 usage(mdsetname_t *sp, int eval)
628 {
629 	(void) fprintf(stderr, gettext("usage:"
630 	    "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n"
631 	    "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n"
632 	    "\t%s [-V version] [-t timeout] [-d level] abort | stop\n"
633 	    "\t%s [-V | -? | -h]\n"),
634 	    myname, myname, myname, myname);
635 	if (!eval) {
636 		(void) fprintf(stderr, gettext("\n"
637 		    "\tValid debug (-d) levels are 1-%d for increasing "
638 		    "verbosity.\n\tDefault is -d 3.\n\n"
639 		    "\tValid step values are: return | step1 | step2 | "
640 		    "step3 | step4\n\n"
641 		    "\tNodelist is a space-separated list of node id's\n\n"),
642 		    MAX_DEBUG_LEVEL);
643 	}
644 	md_exit(sp, eval);
645 }
646 
647 /*
648  * Input:	Input takes a config step name followed by a list of
649  *		possible node id's.
650  *
651  * Returns:	  0 - Success
652  *		  1 - Fail
653  *			Node will be removed from cluster membership
654  *			by forcing node to panic.
655  *		205 - Unsuccessful. Start another reconfig cycle.
656  *			Problem was encountered that could be fixed by
657  *			running another reconfig cycle.
658  *			Problem could be a result of a failure to read
659  *			the nodelist file or that all work could not be
660  *			accomplished in a reconfig step in the amount of
661  *			time given so another reconfig cycle is needed in
662  *			order to finish the current step.
663  */
664 int
665 main(int argc, char **argv)
666 {
667 	mdsetname_t		*sp = NULL;
668 	md_error_t		status = mdnullerror;
669 	md_error_t		*ep = &status;
670 	set_t			max_sets, setno;
671 	int			c, clust = 0;
672 	struct sigaction	nsa, osa;
673 	struct step_t		*step_ptr;
674 	mdsetname_t		*local_sp = NULL;
675 	md_drive_desc		*dd;
676 	int			rval = 0;
677 	md_set_desc		*sd;
678 	mddb_block_parm_t	mbp;
679 	uint_t			debug = 3; /* log upto MC_LOG3 by default */
680 	int			version_table_size;
681 	mddb_setflags_config_t	sf;
682 	int			ret_val;
683 	mddb_config_t		cfg;
684 	int			set_info[MD_MAXSETS];
685 	long			commd_timeout = 0;
686 
	/*
	 * Get the locale set up before calling any other routines
	 * with messages to output.  Just in case we're not in a build
	 * environment, make sure that TEXT_DOMAIN gets set to
	 * something.
	 */
693 #if !defined(TEXT_DOMAIN)
694 #define	TEXT_DOMAIN "SYS_TEST"
695 #endif
696 	(void) setlocale(LC_ALL, "");
697 	(void) textdomain(TEXT_DOMAIN);
698 
699 	if ((clust = sdssc_bind_library()) == SDSSC_ERROR) {
700 		md_eprintf(gettext("Interface error with libsds_sc.so\n"));
701 		exit(1);
702 	}
703 
704 	if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) {
705 		mde_perror(ep, "");
706 		md_exit(sp, 1);
707 	}
708 
709 	/*
710 	 * open log and enable libmeta logging. Do it here explicitly
711 	 * rather than letting md_init() do it because we are not really
712 	 * a daemon and that is what md_init() opens the log as.
713 	 */
714 	openlog("metaclust", LOG_CONS, LOG_USER);
715 
716 	version_table_size = sizeof (version_table) / sizeof (version_table[0]);
717 
718 	optind = 1;
719 	opterr = 0;
720 	while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) {
721 		switch (c) {
722 		case 'h':
723 			usage(sp, 0);
724 			break;
725 
726 		case 'd':
727 			if (sscanf(optarg, "%u", &debug) != 1) {
728 				md_eprintf(gettext("Invalid debug level\n"));
729 				md_exit(sp, 1);
730 			} else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) {
731 				debug = min(max(debug, 1), MAX_DEBUG_LEVEL);
732 				md_eprintf(gettext("Debug level must be "
733 				    "between 1 and %d inclusive.\n"),
734 				    MAX_DEBUG_LEVEL);
735 				md_eprintf(gettext("Debug level set to %d.\n"),
736 				    debug);
737 			}
738 			break;
739 
740 		case 'V':
741 			version = Strdup(optarg);
742 			break;
743 
744 		case 't':
745 			if (sscanf(optarg, "%u", &timeout) != 1) {
746 				md_eprintf(gettext("Invalid timeout value\n"));
747 				md_exit(sp, 1);
748 			}
749 			break;
750 
751 		case '?':
752 			if (optopt == '?') {
753 				usage(sp, 0);
754 			} else if (optopt == 'V') {
755 				int	i;
756 
757 				(void) fprintf(stdout, gettext(
758 				    "%s: Versions Supported:"), myname);
759 				for (i = 0; i < version_table_size; i++) {
760 					(void) fprintf(stdout, " %s",
761 					    version_table[i]);
762 				}
763 				(void) fprintf(stdout, "\n");
764 				md_exit(sp, 0);
765 			}
766 			/*FALLTHROUGH*/
767 
768 		default:
769 			usage(sp, 1);
770 			break;
771 		}
772 	}
773 
774 	/* initialise the debug level and start time */
775 	setup_mc_log(debug);
776 
777 	/*
778 	 * check that the version specified (if any) is supported.
779 	 */
780 	if (version != NULL) {
781 		int	i, found = 0;
782 
783 		for (i = 0; i < version_table_size; i++) {
784 			if (strcmp(version, version_table[i]) == 0) {
785 				found = 1;
786 				break;
787 			}
788 		}
789 		if (!found) {
790 			md_eprintf(gettext("Version %s not supported\n"),
791 			    version);
792 			md_exit(sp, 1);
793 		}
794 	}
795 
796 	argc -= optind;
797 	argv += optind;
798 
799 	/* parse arguments */
800 	if (argc <= 0) {
801 		usage(sp, 1);
802 	}
803 
804 	/* convert the step name to the corresponding number */
805 	step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) /
806 	    sizeof (step_table[0])), sizeof (step_table[0]), mc_compare);
807 	if (step_ptr != NULL) {
808 		stepnum = step_ptr->step_num;
809 	}
810 
811 	--argc;
812 	++argv;
813 
814 	/* set timeout alarm signal, a value of 0 will disable timeout */
815 	if (timeout > 0) {
816 		int	stat_loc = 0;
817 		commd_timeout = (long)(timeout * .75);
818 
819 		c_pid = fork();
820 
821 		if (c_pid == (pid_t)-1) {
822 			md_perror(gettext("Unable to fork"));
823 			md_exit(sp, 1);
824 		} else if (c_pid) {
825 			/* parent */
826 			nsa.sa_flags = 0;
827 			if (sigfillset(&nsa.sa_mask) < 0) {
828 				md_perror(gettext("Unable to set signal mask"));
829 				md_exit(sp, 1);
830 			}
831 
832 			nsa.sa_handler = sigalarmhandler;
833 			if (sigaction(SIGALRM, &nsa, &osa) == -1) {
834 				md_perror(gettext("Unable to set alarm "
835 				    "handler"));
836 				md_exit(sp, 1);
837 			}
838 
839 			(void) alarm(timeout);
840 
841 			/*
842 			 * wait for child to exit or timeout to expire.
843 			 * keep retrying if the call is interrupted
844 			 */
845 			while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) {
846 				if (errno != EINTR) {
847 					break;
848 				}
849 			}
850 			if (ret_val == c_pid) {
851 				/* exit with the childs exit value */
852 				exit(WEXITSTATUS(stat_loc));
853 			} else if (errno == ECHILD) {
854 				md_exit(sp, 0);
855 			} else {
856 				perror(myname);
857 				md_exit(sp, 1);
858 			}
859 		}
860 	}
861 
862 	/*
863 	 * If a timeout value is given, everything from this point onwards is
864 	 * executed in the child process.
865 	 */
866 
867 	switch (stepnum) {
868 	case MC_START:
869 		/*
870 		 * Start Step
871 		 *
872 		 * - Suspend all rpc.mdcommd messages
873 		 */
874 
875 		/* expect the local node id to be given only */
876 		if (argc != 1)
877 			usage(sp, 1);
878 
879 		meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"),
880 		    meta_print_hrtime(0));
881 
882 		/*
883 		 * Does local set exist? If not, exit with 0
884 		 * since there's no reason to have this node panic if
885 		 * the local set cannot be started.
886 		 */
887 		if ((local_sp = load_local_set(ep)) == NULL) {
888 			md_exit(local_sp, 0);
889 		}
890 
891 		if ((max_sets = get_max_sets(ep)) == 0) {
892 			mde_perror(ep, "");
893 			md_exit(sp, 1);
894 		}
895 
896 		/* start walking through all possible disksets */
897 		for (setno = 1; setno < max_sets; setno++) {
898 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
899 				if (mdiserror(ep, MDE_NO_SET)) {
900 					/* No set for this setno - continue */
901 					mdclrerror(ep);
902 					continue;
903 				} else {
904 					mde_perror(ep, gettext("Unable to "
905 					    "get set %d information"), setno);
906 					md_exit(sp, 1);
907 				}
908 			}
909 
910 			/* only check multi-node disksets */
911 			if (!meta_is_mn_set(sp, ep)) {
912 				mdclrerror(ep);
913 				continue;
914 			}
915 
916 			meta_mc_log(MC_LOG3, gettext("Start - block parse "
917 			    "messages for set %s: %s"), sp->setname,
918 			    meta_print_hrtime(gethrtime() - start_time));
919 
920 			/*
921 			 * Mddb parse messages are sent amongst the nodes
922 			 * in a diskset whenever the locator block or
923 			 * locator names structure has been changed.
924 			 * A locator block change could occur as a result
925 			 * of a disk failure during the reconfig cycle,
926 			 * so block the mddb parse messages while the
927 			 * rpc.mdcommd is suspended during the reconfig cycle.
928 			 */
929 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
930 				(void) memset(&mbp, 0, sizeof (mbp));
931 				mbp.c_setno = setno;
932 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
933 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
934 				    &mbp.c_mde, NULL)) {
935 					(void) mdstealerror(ep, &mbp.c_mde);
936 					mde_perror(ep, gettext("Could not "
937 					    "block set %s"), sp->setname);
938 					md_exit(sp, 1);
939 				}
940 			}
941 
942 			/* suspend commd and spin waiting for drain */
943 			while ((ret_val = mdmn_suspend(setno,
944 			    MD_COMM_ALL_CLASSES, commd_timeout)) ==
945 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
946 				(void) sleep(1);
947 			}
948 
949 			if (ret_val) {
950 				md_eprintf(gettext("Could not suspend "
951 				    "rpc.mdcommd for set %s\n"), sp->setname);
952 				md_exit(sp, 1);
953 			}
954 
955 			/*
956 			 * Set start step flag for set. This is set to indicate
957 			 * that this node entered the reconfig cycle through
958 			 * the start step.  This is used during the reconfig
959 			 * cycle to determine whether the node had entered
960 			 * through the start step or the return step.
961 			 */
962 			(void) memset(&sf, 0, sizeof (sf));
963 			sf.sf_setno = sp->setno;
964 			sf.sf_setflags = MD_SET_MN_START_RC;
965 			sf.sf_flags = MDDB_NM_SET;
966 			/* Use magic to help protect ioctl against attack. */
967 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
968 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
969 			    &sf.sf_mde, NULL)) {
970 				(void) mdstealerror(ep, &sf.sf_mde);
971 				mde_perror(ep, gettext("Could not set "
972 				    "start_step flag for set %s"), sp->setname);
973 				md_exit(sp, 1);
974 			}
975 
976 		}
977 
978 		meta_mc_log(MC_LOG2, gettext("Start step completed: %s"),
979 		    meta_print_hrtime(gethrtime() - start_time));
980 
981 		break;
982 
983 	case MC_STOP:
984 		/*
985 		 * Stop Step
986 		 *
987 		 * - ???
988 		 */
989 
990 		/* don't expect any more arguments to follow the step name */
991 		if (argc != 0)
992 			usage(sp, 1);
993 
994 		break;
995 
996 	case MC_ABORT:
997 		/*
998 		 * Abort Step
999 		 *
1000 		 * - Abort rpc.mdcommd
1001 		 */
1002 
1003 		/* don't expect any more arguments to follow the step name */
1004 		if (argc != 0)
1005 			usage(sp, 1);
1006 
1007 		meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"),
1008 		    meta_print_hrtime(0));
1009 
1010 		/*
1011 		 * Does local set exist? If not, exit with 0
1012 		 * since there's no reason to have this node panic if
1013 		 * the local set cannot be started.
1014 		 */
1015 		if ((local_sp = load_local_set(ep)) == NULL) {
1016 			md_exit(local_sp, 0);
1017 		}
1018 
		/*
		 * abort the rpc.mdcommd.  The abort is only issued on this node
		 * meaning that the abort reconfig step is called on this
		 * node before a panic while the rest of the cluster will
		 * undergo a reconfig cycle.
		 * There is no time relation between this node running a
		 * reconfig abort and the rest of the cluster
		 * running a reconfig cycle meaning that this node may
		 * panic before, during or after the cluster has run
		 * a reconfig cycle.
		 */
1030 		mdmn_abort();
1031 
1032 		meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"),
1033 		    meta_print_hrtime(gethrtime() - start_time));
1034 
1035 		break;
1036 
1037 	case MC_RETURN:
1038 		/*
1039 		 * Return Step
1040 		 *
1041 		 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL
1042 		 *   and release local set lock.  Grabbing the local set
1043 		 *   lock allows any active metaset/metadb commands to
1044 		 *   terminate gracefully and will keep a metaset/metadb
1045 		 *   command from starting until the DRAIN ALL is issued.
1046 		 *   The metaset/metadb commands can issue
1047 		 *   DRAIN ALL/RESUME ALL commands to rpc.mdcommd,
1048 		 *   so the return step must not issue the DRAIN ALL command
1049 		 *   until metaset/metadb have finished or metaset may issue
1050 		 *   a RESUME ALL after this return reconfig step has issued
1051 		 *   the DRAIN ALL command.
1052 		 *   After this reconfig step has issued the DRAIN_ALL and
1053 		 *   released the local set lock, metaset/metadb will fail
1054 		 *   when attempting to contact the rpc.mdcommd and will
1055 		 *   terminate without making any configuration changes.
1056 		 *   The DRAIN ALL command will keep all other meta* commands
1057 		 *   from running during the reconfig cycle (these commands
1058 		 *   will wait until the rpc.mdcommd is resumed) since the
1059 		 *   reconfig cycle may be changing the diskset configuration.
1060 		 */
1061 
1062 		/* expect the nodelist to follow the step name */
1063 		if (argc < 1)
1064 			usage(sp, 1);
1065 
1066 		meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"),
1067 		    meta_print_hrtime(0));
1068 
1069 		/*
1070 		 * Does local set exist? If not, exit with 0
1071 		 * since there's no reason to have this node panic if
1072 		 * the local set cannot be started.
1073 		 */
1074 		if ((local_sp = load_local_set(ep)) == NULL) {
1075 			md_exit(local_sp, 0);
1076 		}
1077 
1078 		/*
1079 		 * Suspend any mirror resyncs that are in progress. This
1080 		 * stops unnecessary timeouts.
1081 		 */
1082 		meta_mirror_resync_block_all();
1083 
1084 		if (meta_lock(local_sp, TRUE, ep) != 0) {
1085 			mde_perror(ep, "");
1086 			md_exit(local_sp, 1);
1087 		}
1088 
1089 		/*
1090 		 * All metaset and metadb commands on this node have now
1091 		 * terminated gracefully.  Now, issue a drain all to
1092 		 * the rpc.mdcommd.  Any meta command issued after the
1093 		 * drain all will either spin sending the command to the
1094 		 * master until after the reconfig cycle has finished OR
1095 		 * will terminate gracefully (metaset/metadb).
1096 		 */
1097 		if ((max_sets = get_max_sets(ep)) == 0) {
1098 			mde_perror(ep, "");
1099 			md_exit(sp, 1);
1100 		}
1101 
1102 		/* start walking through all possible disksets */
1103 		for (setno = 1; setno < max_sets; setno++) {
1104 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1105 				if (mdiserror(ep, MDE_NO_SET)) {
1106 					/* No set for this setno - continue */
1107 					mdclrerror(ep);
1108 					continue;
1109 				} else {
1110 					mde_perror(ep, gettext("Unable to "
1111 					    "get set %d information"), setno);
1112 					md_exit(sp, 1);
1113 				}
1114 			}
1115 
1116 			/* only check multi-node disksets */
1117 			if (!meta_is_mn_set(sp, ep)) {
1118 				mdclrerror(ep);
1119 				continue;
1120 			}
1121 
1122 			meta_mc_log(MC_LOG3, gettext("Return - block parse "
1123 			    "messages for set %s: %s"), sp->setname,
1124 			    meta_print_hrtime(gethrtime() - start_time));
1125 
1126 			/*
1127 			 * Mddb parse messages are sent amongst the nodes
1128 			 * in a diskset whenever the locator block or
1129 			 * locator names structure has been changed.
1130 			 * A locator block change could occur as a result
1131 			 * of a disk failure during the reconfig cycle,
1132 			 * so block the mddb parse messages while the
1133 			 * rpc.mdcommd is suspended during the reconfig cycle.
1134 			 */
1135 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1136 				(void) memset(&mbp, 0, sizeof (mbp));
1137 				mbp.c_setno = setno;
1138 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
1139 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1140 				    &mbp.c_mde, NULL)) {
1141 					(void) mdstealerror(ep, &mbp.c_mde);
1142 					mde_perror(ep, gettext("Could not "
1143 					    "block set %s"), sp->setname);
1144 					md_exit(sp, 1);
1145 				}
1146 			}
1147 
1148 			/* suspend commd and spin waiting for drain */
1149 			while ((ret_val = mdmn_suspend(setno,
1150 			    MD_COMM_ALL_CLASSES, commd_timeout)) ==
1151 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
1152 				(void) sleep(1);
1153 			}
1154 
1155 			if (ret_val) {
1156 				md_eprintf(gettext("Could not suspend "
1157 				    "rpc.mdcommd for set %s\n"), sp->setname);
1158 				md_exit(sp, 1);
1159 			}
1160 		}
1161 		/*
1162 		 * Resume all I/Os for this node for all MN sets in
1163 		 * case master node had suspended I/Os but panic'd
1164 		 * before resuming I/Os.  In case of failure, exit
1165 		 * with a 1 since unable to resume I/Os on this node.
1166 		 */
1167 		if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) {
1168 			mde_perror(ep, gettext(
1169 			    "Unable to resume I/O on node %s for all sets"),
1170 			    mynode());
1171 			md_exit(sp, 1);
1172 		}
1173 
1174 
1175 		/*
1176 		 * Can now unlock local set lock.  New metaset/metadb
1177 		 * commands are now held off using drain all.
1178 		 */
1179 		(void) meta_unlock(local_sp, ep);
1180 
1181 		meta_mc_log(MC_LOG2, gettext("Return step completed: %s"),
1182 		    meta_print_hrtime(gethrtime() - start_time));
1183 
1184 		break;
1185 
1186 	case MC_STEP1:
1187 		/*
1188 		 * Step 1
1189 		 *
1190 		 * - Populate nodelist file if we are on clustering
1191 		 *   and pick a master node for each MN diskset.
1192 		 */
1193 
1194 		/* expect the nodelist to follow the step name */
1195 		if (argc < 1)
1196 			usage(sp, 1);
1197 
1198 		meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"),
1199 		    meta_print_hrtime(0));
1200 
1201 		/* Always write nodelist file even if no local set exists */
1202 		if (clust == SDSSC_OKAY) {
1203 			/* skip to the nodelist args */
1204 			if (meta_write_nodelist(argc, argv, ep) != 0) {
1205 				mde_perror(ep, gettext(
1206 				    "Could not populate nodelist file"));
1207 				md_exit(sp, 1);
1208 			}
1209 		}
1210 
1211 		/*
1212 		 * Does local set exist? If not, exit with 0
1213 		 * since there's no reason to have this node panic if
1214 		 * the local set cannot be started.
1215 		 */
1216 		if ((local_sp = load_local_set(ep)) == NULL) {
1217 			md_exit(local_sp, 0);
1218 		}
1219 
1220 		/*
1221 		 * At this point, all meta* commands are blocked across
1222 		 * all disksets since the master rpc.mdcommd has drained or
1223 		 * the master node has died.
1224 		 * If a metaset or metadb command had been in progress
1225 		 * at the start of the reconfig cycle, this command has
1226 		 * either completed or it has been terminated due to
1227 		 * the death of the master node.
1228 		 *
1229 		 * This means that it is now ok to remove any
1230 		 * outstanding clnt_locks associated with multinode
1231 		 * disksets on this node due to a node panic during
1232 		 * a metaset operation.  This allows the routines that
1233 		 * choose the master to use rpc.metad to determine the
1234 		 * master of the diskset.
1235 		 */
1236 		if (clnt_clr_mnsetlock(mynode(), ep) != 0) {
1237 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1238 			    "clear locks failed %s"),
1239 			    meta_print_hrtime(gethrtime() - start_time));
1240 			md_exit(local_sp, 1);
1241 		}
1242 
1243 		/*
1244 		 * Call reconfig_choose_master to choose a master for
1245 		 * each MN diskset, update the nodelist for each diskset
1246 		 * given the member information and send a reinit message
1247 		 * to rpc.mdcommd to reload the nodelist.
1248 		 */
1249 		rval = meta_reconfig_choose_master(commd_timeout, ep);
1250 		if (rval == 205) {
1251 			/*
1252 			 * NOTE: Should issue call to reboot remote host that
1253 			 * is causing the RPC failure.  Clustering to
1254 			 * provide interface in the future.  This should
1255 			 * stop a never-ending set of 205 reconfig cycles.
1256 			 * Remote host causing failure is stored in
1257 			 * ep->host if ep is an RPC error.
1258 			 * if (mdanyrpcerror(ep))
1259 			 * 	reboot (ep->host);
1260 			 */
1261 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1262 			    "choose master failure of 205 %s"),
1263 			    meta_print_hrtime(gethrtime() - start_time));
1264 			md_exit(local_sp, 205);
1265 		} else if (rval != 0) {
1266 			meta_mc_log(MC_LOG2, gettext("Step1 failure: "
1267 			    "choose master failure %s"),
1268 			    meta_print_hrtime(gethrtime() - start_time));
1269 			md_exit(local_sp, 1);
1270 		}
1271 
1272 		meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"),
1273 		    meta_print_hrtime(gethrtime() - start_time));
1274 
1275 		md_exit(local_sp, rval);
1276 		break;
1277 
1278 	case MC_STEP2:
1279 		/*
1280 		 * Step 2
1281 		 *
1282 		 * In Step 2, each node walks the list of disksets.  If a
1283 		 * node is a master of a MN diskset, it synchronizes
1284 		 * the local set USER records for that diskset.
1285 		 *
1286 		 * If disks exist in the diskset and there is a joined
1287 		 * (owner) node in the diskset, the master will also:
1288 		 *	- synchronize the diskset mddbs to the master
1289 		 *	- play the change log
1290 		 *
1291 		 * The master node will now attempt to join any unjoined
1292 		 * nodes that are currently members in the membership list.
1293 		 */
1294 
1295 		/* expect the nodelist to follow the step name */
1296 		if (argc < 1)
1297 			usage(sp, 1);
1298 
1299 		meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"),
1300 		    meta_print_hrtime(0));
1301 
1302 		/*
1303 		 * Does local set exist? If not, exit with 0
1304 		 * since there's no reason to have this node panic if
1305 		 * the local set cannot be started.
1306 		 */
1307 		if ((local_sp = load_local_set(ep)) == NULL) {
1308 			md_exit(local_sp, 0);
1309 		}
1310 
1311 		if ((max_sets = get_max_sets(ep)) == 0) {
1312 			mde_perror(ep, "");
1313 			md_exit(local_sp, 1);
1314 		}
1315 
1316 		/* start walking through all possible disksets */
1317 		for (setno = 1; setno < max_sets; setno++) {
1318 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1319 				if (mdiserror(ep, MDE_NO_SET)) {
1320 					/* No set for this setno - continue */
1321 					mdclrerror(ep);
1322 					continue;
1323 				} else if (mdanyrpcerror(ep)) {
1324 					/* Fail on RPC failure to self */
1325 					mde_perror(ep, gettext(
1326 					    "Unable to get information for "
1327 					    "set number %d"), setno);
1328 					md_exit(local_sp, 1);
1329 				} else {
1330 					mde_perror(ep, gettext(
1331 					    "Unable to get information for "
1332 					    "set number %d"), setno);
1333 					mdclrerror(ep);
1334 					continue;
1335 				}
1336 			}
1337 
1338 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1339 				if (mdanyrpcerror(ep)) {
1340 					/* Fail on RPC failure to self */
1341 					mde_perror(ep, gettext(
1342 					    "Unable to get information for "
1343 					    "set number %d"), setno);
1344 					md_exit(local_sp, 1);
1345 				}
1346 				mde_perror(ep, gettext("Unable to get set "
1347 				    "%s desc information"), sp->setname);
1348 				mdclrerror(ep);
1349 				continue;
1350 			}
1351 
1352 			/* Only check MN disksets */
1353 			if (!(MD_MNSET_DESC(sd))) {
1354 				continue;
1355 			}
1356 
1357 			/* All actions in step 2 are driven by master */
1358 			if (!(sd->sd_mn_am_i_master)) {
1359 				continue;
1360 			}
1361 
1362 			meta_mc_log(MC_LOG3, gettext("Step2 - begin record "
1363 			    "synchronization for set %s: %s"), sp->setname,
1364 			    meta_print_hrtime(gethrtime() - start_time));
1365 
1366 			/*
1367 			 * Synchronize the USER records in the local mddbs
1368 			 * for hosts that are members.  The USER records
1369 			 * contain set, drive and host information.
1370 			 */
1371 			rval = meta_mnsync_user_records(sp, ep);
1372 			if (rval != 0) {
1373 				mde_perror(ep, gettext(
1374 				    "Synchronization of user records "
1375 				    "in set %s failed\n"), sp->setname);
1376 				if (rval == 205) {
1377 					/*
1378 					 * NOTE: Should issue call to reboot
1379 					 * remote host that is causing the RPC
1380 					 * failure.  Clustering to provide
1381 					 * interface in the future.  This
1382 					 * should stop a never-ending set of
1383 					 * 205 reconfig cycles.
1384 					 * Remote host causing failure is
1385 					 * stored in ep->host if ep is an
1386 					 * RPC error.
1387 					 * if (mdanyrpcerror(ep))
1388 					 * 	reboot (ep->host);
1389 					 */
1390 					md_exit(local_sp, 205);
1391 				} else {
1392 					md_exit(local_sp, 1);
1393 				}
1394 			}
1395 
1396 			/* Reget sd since sync_user_recs may have flushed it */
1397 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1398 				mde_perror(ep, gettext("Unable to get set "
1399 				    "%s desc information"), sp->setname);
1400 				md_exit(local_sp, 1);
1401 			}
1402 
1403 			dd = metaget_drivedesc(sp,
1404 			    (MD_BASICNAME_OK | PRINT_FAST), ep);
1405 			if (! mdisok(ep)) {
1406 				mde_perror(ep, gettext("Unable to get set "
1407 				    "%s drive information"), sp->setname);
1408 				md_exit(local_sp, 1);
1409 			}
1410 
1411 			/*
1412 			 * No drives in set, continue to next set.
1413 			 */
1414 			if (dd == NULL) {
1415 				/* Done with this set */
1416 				continue;
1417 			}
1418 
1419 			meta_mc_log(MC_LOG3, gettext("Step2 - local set user "
1420 			    "records completed for set %s: %s"), sp->setname,
1421 			    meta_print_hrtime(gethrtime() - start_time));
1422 
1423 			/*
1424 			 * Synchronize the diskset mddbs for hosts
1425 			 * that are members.  This may involve
1426 			 * playing the changelog and writing out
1427 			 * to the diskset mddbs.
1428 			 */
1429 			rval = meta_mnsync_diskset_mddbs(sp, ep);
1430 			if (rval != 0) {
1431 				mde_perror(ep, gettext(
1432 				    "Synchronization of diskset mddbs "
1433 				    "in set %s failed\n"), sp->setname);
1434 				meta_mc_log(MC_LOG3, gettext("Step2 - diskset "
1435 				    "mddb synchronization failed for "
1436 				    "set %s: %s"), sp->setname,
1437 				    meta_print_hrtime(gethrtime() -
1438 				    start_time));
1439 				if (rval == 205) {
1440 					/*
1441 					 * NOTE: Should issue call to reboot
1442 					 * remote host that is causing the RPC
1443 					 * failure.  Clustering to provide
1444 					 * interface in the future.  This
1445 					 * should stop a never-ending set of
1446 					 * 205 reconfig cycles.
1447 					 * Remote host causing failure is
1448 					 * stored in ep->host if ep is an
1449 					 * RPC error.
1450 					 * if (mdanyrpcerror(ep))
1451 					 * 	reboot (ep->host);
1452 					 */
1453 					md_exit(local_sp, 205);
1454 				} else if (rval == 1) {
1455 					continue;
1456 				} else {
1457 					md_exit(local_sp, 1);
1458 				}
1459 			}
1460 
1461 			meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb "
1462 			    "synchronization completed for set %s: %s"),
1463 			    sp->setname,
1464 			    meta_print_hrtime(gethrtime() - start_time));
1465 
1466 			/* Join the starting nodes to the diskset */
1467 			rval = meta_mnjoin_all(sp, ep);
1468 			if (rval != 0) {
1469 				mde_perror(ep, gettext(
1470 				    "Join of non-owner (starting) nodes "
1471 				    "in set %s failed\n"), sp->setname);
1472 				meta_mc_log(MC_LOG3, gettext("Step2 - non owner"
1473 				    "nodes joined for set %s: %s"),
1474 				    sp->setname,
1475 				    meta_print_hrtime(gethrtime() -
1476 				    start_time));
1477 				if (rval == 205) {
1478 					/*
1479 					 * NOTE: Should issue call to reboot
1480 					 * remote host that is causing the RPC
1481 					 * failure.  Clustering to provide
1482 					 * interface in the future.  This
1483 					 * should stop a never-ending set of
1484 					 * 205 reconfig cycles.
1485 					 * Remote host causing failure is
1486 					 * stored in ep->host if ep is an
1487 					 * RPC error.
1488 					 * if (mdanyrpcerror(ep))
1489 					 * 	reboot (ep->host);
1490 					 */
1491 					md_exit(local_sp, 205);
1492 				} else {
1493 					md_exit(local_sp, 1);
1494 				}
1495 			}
1496 
1497 			meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes "
1498 			    "joined for set %s: %s"), sp->setname,
1499 			    meta_print_hrtime(gethrtime() - start_time));
1500 
1501 		}
1502 
1503 		meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"),
1504 		    meta_print_hrtime(gethrtime() - start_time));
1505 
1506 		break;
1507 
1508 	case MC_STEP3:
1509 		/*
1510 		 * Step 3
1511 		 *
1512 		 * For all multinode sets do,
1513 		 * - Reinitialise rpc.mdcommd
1514 		 * - Reset mirror owners to null if the current owner is
1515 		 *   no longer in the membership list
1516 		 */
1517 
1518 		/* expect the nodelist to follow the step name */
1519 		if (argc < 1)
1520 			usage(sp, 1);
1521 
1522 		meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"),
1523 		    meta_print_hrtime(0));
1524 
1525 		/*
1526 		 * Does local set exist? If not, exit with 0
1527 		 * since there's no reason to have this node panic if
1528 		 * the local set cannot be started.
1529 		 */
1530 		if ((local_sp = load_local_set(ep)) == NULL) {
1531 			md_exit(local_sp, 0);
1532 		}
1533 
1534 		/*
1535 		 * walk through all sets on this node which could include:
1536 		 *	- MN disksets
1537 		 *	- traditional disksets
1538 		 *	- non-existent disksets
1539 		 * start mirror resync for all MN sets
1540 		 */
1541 		if ((max_sets = get_max_sets(ep)) == 0) {
1542 			mde_perror(ep, "");
1543 			md_exit(local_sp, 1);
1544 		}
1545 
1546 		/* start walking through all possible disksets */
1547 		for (setno = 1; setno < max_sets; setno++) {
1548 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1549 				if (mdiserror(ep, MDE_NO_SET)) {
1550 					/* No set for this setno - continue */
1551 					mdclrerror(ep);
1552 					continue;
1553 				} else {
1554 					mde_perror(ep, gettext("Unable to "
1555 					    "get set %d information"), setno);
1556 					md_exit(local_sp, 1);
1557 				}
1558 			}
1559 
1560 			/* only check multi-node disksets */
1561 			if (!meta_is_mn_set(sp, ep)) {
1562 				mdclrerror(ep);
1563 				continue;
1564 			}
1565 
1566 			if (meta_lock(sp, TRUE, ep) != 0) {
1567 				mde_perror(ep, "");
1568 				md_exit(local_sp, 1);
1569 			}
1570 
1571 			/* If this node isn't joined to set, do nothing */
1572 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1573 				if (!mdisok(ep)) {
1574 					mde_perror(ep, gettext("Could "
1575 					    "not get set %s ownership"),
1576 					    sp->setname);
1577 					md_exit(sp, 1);
1578 				}
1579 				mdclrerror(ep);
1580 				(void) meta_unlock(sp, ep);
1581 				continue;
1582 			}
1583 
1584 			meta_mc_log(MC_LOG3, gettext("Step3 - begin "
1585 			    "re-initialising rpc.mdcommd and resetting mirror "
1586 			    "owners for set %s: %s"), sp->setname,
1587 			    meta_print_hrtime(gethrtime() - start_time));
1588 
1589 			/* reinitialise rpc.mdcommd with new nodelist */
1590 			if (mdmn_reinit_set(setno, commd_timeout)) {
1591 				md_eprintf(gettext(
1592 				    "Could not re-initialise rpc.mdcommd for "
1593 				    "set %s\n"), sp->setname);
1594 				md_exit(sp, 1);
1595 			}
1596 
1597 			(void) memset(&cfg, 0, sizeof (cfg));
1598 			cfg.c_id = 0;
1599 			cfg.c_setno = sp->setno;
1600 			if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1601 			    NULL) != 0) {
1602 				(void) mdstealerror(ep, &cfg.c_mde);
1603 				mde_perror(ep, gettext("Could "
1604 				    "not get set %s information"),
1605 				    sp->setname);
1606 				md_exit(sp, 1);
1607 			}
1608 
1609 			/* Don't do anything else if set is stale */
1610 			if (cfg.c_flags & MDDB_C_STALE) {
1611 				(void) meta_unlock(sp, ep);
1612 				mdclrerror(ep);
1613 				continue;
1614 			}
1615 
1616 			/* reset mirror owners */
1617 			if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) {
1618 				md_exit(sp, 1);
1619 			}
1620 
1621 			(void) meta_unlock(sp, ep);
1622 
1623 			meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd "
1624 			    "re-initialised and mirror owners reset for "
1625 			    "set %s: %s"), sp->setname,
1626 			    meta_print_hrtime(gethrtime() - start_time));
1627 		}
1628 
1629 		meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"),
1630 		    meta_print_hrtime(gethrtime() - start_time));
1631 
1632 		break;
1633 
1634 	case MC_STEP4:
1635 		/*
1636 		 * Step 4
1637 		 *
1638 		 * For all multinode sets do:
1639 		 * - Resume the rpc.mdcommd messages.  Must resume all
1640 		 *	sets before issuing I/O to any set since an error
1641 		 * 	encountered in a commd suspended set could be
1642 		 *	blocked waiting for commd in another set to resume.
1643 		 *	(This happens since the daemon queues service
1644 		 *	all sets).  An open of a soft partition causes
1645 		 *	a read of the watermarks during the open.
1646 		 * - If set is non-writable (not an owner or STALE), then
1647 		 *	continue to next set.
1648 		 *
1649 		 * For all multinode sets do,
1650 		 * - Reset ABR states for all mirrors, ie clear ABR if not
1651 		 *	open on any node.
1652 		 * - Reset ABR states for all soft partitions, ie clear ABR if
1653 		 *	not open on any node.
1654 		 * - For all slave nodes that have entered through the start
1655 		 *	step, update the ABR state to that of the master and
1656 		 *	get the submirror state from the master
1657 		 * - meta_lock set
1658 		 * - Resync all mirrors
1659 		 * - unlock meta_lock for this set.
1660 		 * - Choose a new owner for any orphaned resyncs
1661 		 *
1662 		 * There is one potential issue here when concurrently
1663 		 * resetting and updating the ABR state. If the master has ABR
1664 		 * set, but should no longer have because the only node that
1665 		 * had the metadevice open and had ABR set has panicked, the
1666 		 * master will send a message to all nodes to clear the ABR
1667 		 * state. Meanwhile any node that has come through the
1668 		 * start step will get tstate from the master and will update
1669 		 * ABR if it was set in tstate. So, we appear to have a problem
1670 		 * if the following sequence occurs:-
1671 		 * - The slave gets tstate with ABR set
1672 		 * - The master sends a message to clear ABR
1673 		 * - The slave updates ABR with the value it got from tstate.
1674 		 * We now have the master with ABR clear and the slave with ABR
1675 		 * set. Fortunately, having set ABR, the slave will close the
1676 		 * metadevice after setting ABR and as there are no nodes with
1677 		 * the device open, the close will send a message to clear ABR
1678 		 * on all nodes. So, the nodes will all have ABR unset.
1679 		 */
1680 
1681 		/* expect the nodelist to follow the step name */
1682 		if (argc < 1)
1683 			usage(sp, 1);
1684 
1685 		meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"),
1686 		    meta_print_hrtime(0));
1687 
1688 		/*
1689 		 * Does local set exist? If not, exit with 0
1690 		 * since there's no reason to have this node panic if
1691 		 * the local set cannot be started.
1692 		 */
1693 		if ((local_sp = load_local_set(ep)) == NULL) {
1694 			md_exit(local_sp, 0);
1695 		}
1696 
1697 		/*
1698 		 * walk through all sets on this node which could include:
1699 		 *	- MN disksets
1700 		 *	- traditional disksets
1701 		 *	- non-existent disksets
1702 		 * start mirror resync for all MN sets
1703 		 */
1704 		if ((max_sets = get_max_sets(ep)) == 0) {
1705 			mde_perror(ep, "");
1706 			md_exit(local_sp, 1);
1707 		}
1708 
1709 		/* Clear set_info structure */
1710 		for (setno = 1; setno < max_sets; setno++) {
1711 			set_info[setno] = 0;
1712 		}
1713 
1714 		/* start walking through all possible disksets */
1715 		for (setno = 1; setno < max_sets; setno++) {
1716 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1717 				if (mdiserror(ep, MDE_NO_SET)) {
1718 					/* No set for this setno - continue */
1719 					mdclrerror(ep);
1720 					continue;
1721 				} else {
1722 					mde_perror(ep, gettext("Unable to "
1723 					    "get set %d information"), setno);
1724 					md_exit(local_sp, 1);
1725 				}
1726 			}
1727 
1728 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1729 				mde_perror(ep, gettext("Unable to get set "
1730 				    "%s desc information"), sp->setname);
1731 				mdclrerror(ep);
1732 				continue;
1733 			}
1734 
1735 			/* only check multi-node disksets */
1736 			if (!meta_is_mn_set(sp, ep)) {
1737 				mdclrerror(ep);
1738 				continue;
1739 			}
1740 
1741 			set_info[setno] |= SET_INFO_MN;
1742 
1743 			/*
1744 			 * If not an owner (all mddbs failed) or stale
1745 			 * (< 50% mddbs operational), then set is
1746 			 * non-writable so just resume commd and
1747 			 * unblock mddb messages.
1748 			 */
1749 			mdclrerror(ep);
1750 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1751 				set_info[setno] |= SET_INFO_NO_WR;
1752 			}
1753 			if (!mdisok(ep)) {
1754 				mde_perror(ep, gettext("Could "
1755 				    "not get set %s ownership"),
1756 				    sp->setname);
1757 				md_exit(local_sp, 1);
1758 			}
1759 			/* Set is owned - is it stale? */
1760 			if (!set_info[setno] & SET_INFO_NO_WR) {
1761 				(void) memset(&cfg, 0, sizeof (cfg));
1762 				cfg.c_id = 0;
1763 				cfg.c_setno = sp->setno;
1764 				if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1765 				    NULL) != 0) {
1766 					(void) mdstealerror(ep, &cfg.c_mde);
1767 					mde_perror(ep, gettext("Could "
1768 					    "not get set %s information"),
1769 					    sp->setname);
1770 					md_exit(local_sp, 1);
1771 				}
1772 				if (cfg.c_flags & MDDB_C_STALE) {
1773 					set_info[setno] |= SET_INFO_NO_WR;
1774 				}
1775 			}
1776 
1777 			/* resume rpc.mdcommd */
1778 			if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0,
1779 			    commd_timeout)) {
1780 				md_eprintf(gettext("Unable to resume "
1781 				    "rpc.mdcommd for set %s\n"), sp->setname);
1782 				md_exit(local_sp, 1);
1783 			}
1784 
1785 			/* Unblock mddb parse messages */
1786 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1787 				(void) memset(&mbp, 0, sizeof (mbp));
1788 				mbp.c_setno = setno;
1789 				mbp.c_blk_flags = MDDB_UNBLOCK_PARSE;
1790 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1791 				    &mbp.c_mde, NULL)) {
1792 					(void) mdstealerror(ep, &mbp.c_mde);
1793 					mde_perror(ep, gettext("Could not "
1794 					    "unblock set %s"), sp->setname);
1795 					md_exit(local_sp, 1);
1796 				}
1797 			}
1798 			meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd "
1799 			    "resumed and messages unblocked for set %s: %s"),
1800 			    sp->setname,
1801 			    meta_print_hrtime(gethrtime() - start_time));
1802 		}
1803 
1804 		for (setno = 1; setno < max_sets; setno++) {
1805 			int			start_step;
1806 
1807 			/* Skip traditional disksets. */
1808 			if ((set_info[setno] & SET_INFO_MN) == 0)
1809 				continue;
1810 
1811 			/*
1812 			 * If already determined that this set is
1813 			 * a non-writable set, then just continue
1814 			 * to next set since there's nothing else
1815 			 * to do for a non-writable set.
1816 			 */
1817 			if (set_info[setno] & SET_INFO_NO_WR)
1818 				continue;
1819 
1820 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1821 				if (mdiserror(ep, MDE_NO_SET)) {
1822 					/* No set for this setno - continue */
1823 					mdclrerror(ep);
1824 					continue;
1825 				} else {
1826 					mde_perror(ep, gettext("Unable to "
1827 					    "get set %d information"), setno);
1828 					md_exit(local_sp, 1);
1829 				}
1830 			}
1831 
1832 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1833 				mde_perror(ep, gettext("Unable to get set "
1834 				    "%s desc information"), sp->setname);
1835 				mdclrerror(ep);
1836 				continue;
1837 			}
1838 
1839 			/* See if this node came through the start step */
1840 			(void) memset(&sf, 0, sizeof (sf));
1841 			sf.sf_setno = sp->setno;
1842 			sf.sf_flags = MDDB_NM_GET;
1843 			/* Use magic to help protect ioctl against attack. */
1844 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1845 			if (metaioctl(MD_MN_GET_SETFLAGS, &sf,
1846 			    &sf.sf_mde, NULL)) {
1847 				(void) mdstealerror(ep, &sf.sf_mde);
1848 				mde_perror(ep, gettext("Could not get "
1849 				    "start_step flag for set %s"), sp->setname);
1850 				md_exit(local_sp, 1);
1851 			}
1852 			start_step =
1853 			    (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0;
1854 
1855 			/*
1856 			 * We can now reset the start_step flag for the set
1857 			 * if it was already set.
1858 			 */
1859 			if (start_step) {
1860 				(void) memset(&sf, 0, sizeof (sf));
1861 					sf.sf_setno = sp->setno;
1862 				sf.sf_setflags = MD_SET_MN_START_RC;
1863 				sf.sf_flags = MDDB_NM_RESET;
1864 				/*
1865 				 * Use magic to help protect ioctl
1866 				 * against attack.
1867 				 */
1868 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1869 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1870 				    &sf.sf_mde, NULL)) {
1871 					(void) mdstealerror(ep, &sf.sf_mde);
1872 					mde_perror(ep,
1873 					    gettext("Could not reset "
1874 					    "start_step flag for set %s"),
1875 					    sp->setname);
1876 				}
1877 			}
1878 
1879 			meta_mc_log(MC_LOG3, gettext("Step4 - begin setting "
1880 			    "ABR state and restarting io's for "
1881 			    "set %s: %s"), sp->setname,
1882 			    meta_print_hrtime(gethrtime() - start_time));
1883 
1884 
1885 			/*
1886 			 * If we are not the master and we have come through
1887 			 * the start step, we must update the ABR states
1888 			 * for mirrors and soft partitions. Also the submirror
1889 			 * states need to be synchronised so that we see the
1890 			 * same status as other previously joined members.
1891 			 * This _must_ be done before starting the resync.
1892 			 */
1893 			if (!(sd->sd_mn_am_i_master) && start_step) {
1894 				if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR,
1895 				    ep) == -1) {
1896 					md_exit(local_sp, 1);
1897 				}
1898 				if (reset_state(UPDATE_ABR, sp, MD_SP,
1899 				    ep) == -1) {
1900 					md_exit(local_sp, 1);
1901 				}
1902 				/*
1903 				 * Mark the fact that we've got the mirror
1904 				 * state. This allows the resync thread to
1905 				 * determine if _it_ needs to issue this. This
1906 				 * can happen if a node is added to a set after
1907 				 * a reconfig cycle has completed.
1908 				 */
1909 				(void) memset(&sf, 0, sizeof (sf));
1910 					sf.sf_setno = sp->setno;
1911 				sf.sf_setflags = MD_SET_MN_MIR_STATE_RC;
1912 				sf.sf_flags = MDDB_NM_SET;
1913 				/*
1914 				 * Use magic to help protect ioctl
1915 				 * against attack.
1916 				 */
1917 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1918 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1919 				    &sf.sf_mde, NULL)) {
1920 					(void) mdstealerror(ep, &sf.sf_mde);
1921 					mde_perror(ep,
1922 					    gettext("Could not set "
1923 					    "submirror state flag for set %s"),
1924 					    sp->setname);
1925 				}
1926 			}
1927 
1928 			/*
1929 			 * All remaining actions are only performed by the
1930 			 * master
1931 			 */
1932 			if (!(sd->sd_mn_am_i_master)) {
1933 				if (meta_lock(sp, TRUE, ep) != 0) {
1934 					mde_perror(ep, "");
1935 					md_exit(local_sp, 1);
1936 				}
1937 				meta_mirror_resync_unblock(sp);
1938 				(void) meta_unlock(sp, ep);
1939 				continue;
1940 			}
1941 
1942 			/*
1943 			 * If the master came through the start step, this
1944 			 * implies that all of the nodes must have done the
1945 			 * same and hence there can be no applications
1946 			 * running. Hence no need to reset ABR
1947 			 */
1948 			if (!start_step) {
1949 				/* Reset ABR state for mirrors */
1950 				if (reset_state(RESET_ABR, sp, MD_MIRROR,
1951 				    ep) == -1) {
1952 					md_exit(local_sp, 1);
1953 				}
1954 				/* ...and now the same for soft partitions */
1955 				if (reset_state(RESET_ABR, sp, MD_SP,
1956 				    ep) == -1) {
1957 					md_exit(local_sp, 1);
1958 				}
1959 			}
1960 
1961 			/*
1962 			 * choose owners for orphaned resyncs and reset
1963 			 * non-orphaned resyncs so that an owner node that
1964 			 * reboots will restart the resync if needed.
1965 			 */
1966 			if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1)
1967 				md_exit(local_sp, 1);
1968 
1969 			/*
1970 			 * Must unlock set lock before meta_mirror_resync_all
1971 			 * sends a message to run the metasync command
1972 			 * which also grabs the meta_lock.
1973 			 */
1974 			if (meta_lock(sp, TRUE, ep) != 0) {
1975 				mde_perror(ep, "");
1976 				md_exit(local_sp, 1);
1977 			}
1978 			meta_mirror_resync_unblock(sp);
1979 			(void) meta_unlock(sp, ep);
1980 
1981 			/* resync all mirrors in set */
1982 			if (meta_mirror_resync_all(sp, 0, ep) != 0) {
1983 				mde_perror(ep, gettext("Mirror resyncs "
1984 				    "failed for set %s"), sp->setname);
1985 				md_exit(local_sp, 1);
1986 			}
1987 
1988 			meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted "
1989 			    "for set %s: %s"), sp->setname,
1990 			    meta_print_hrtime(gethrtime() - start_time));
1991 		}
1992 
1993 		meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"),
1994 		    meta_print_hrtime(gethrtime() - start_time));
1995 
1996 		break;
1997 
1998 	default:
1999 		usage(sp, 1);
2000 		break;
2001 	}
2002 
2003 	md_exit(sp, 0);
2004 	/* NOTREACHED */
2005 	return (0);
2006 }
2007