xref: /titanic_41/usr/src/cmd/lvm/util/metaclust.c (revision 447e4a639cd3f814fbae624e7188d5adaa83f841)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #include <meta.h>
31 #include <sdssc.h>
32 #include <signal.h>
33 #include <syslog.h>
34 #include <sys/types.h>
35 #include <sys/wait.h>
36 #include <sys/lvm/md_mirror.h>
37 #include <metad.h>
38 
#define	MY_VERSION		"1.0"	/* the highest supported version */
#define	MAX_DEBUG_LEVEL		5	/* maximum verbosity level */

/*
 * Bit flags for the 'mode' argument of reset_state().  They are tested
 * individually with '&', so several may be OR'ed together in one call.
 */
#define	RESET_OWNER		0x0001	/* unown mirrors whose owner left */
#define	CHOOSE_OWNER		0x0002	/* choose/re-establish mirror owner */
#define	RESET_ABR		0x0004	/* open+close top level devices */
#define	UPDATE_ABR		0x0008	/* copy ABR setting from the master */
#define	GET_MIRROR_STATE	0x0010	/* issue MD_MN_GET_MIRROR_STATE */

/*
 * NOTE(review): these look like per-set flag bits, presumably kept in the
 * set_info[] array declared in main() - they are not used in the part of
 * the file visible here, so confirm against the rest of main().
 */
#define	SET_INFO_NO_WR	0x0002
#define	SET_INFO_MN	0x0004
50 
/*
 * This table defines all the metaclust reconfig steps we understand.
 * MC_UNK (0) stands for "no/unrecognised step"; it is the value the
 * global stepnum starts out with.
 */
typedef enum stpnum {
	MC_UNK = 0,
	MC_START,
	MC_STOP,
	MC_ABORT,
	MC_RETURN,
	MC_STEP1,
	MC_STEP2,
	MC_STEP3,
	MC_STEP4
} stepnum_t;

/*
 * Structure for step_name -> step_number mapping.
 * step_nam refers to a string literal and is therefore const-qualified.
 */
struct step_t {
	const char	*step_nam;
	stepnum_t	step_num;
};

/*
 * Step name to step number mapping table.
 * This table MUST be sorted alphabetically in ascending (strcmp) order of
 * step name because it is looked up with bsearch().  The table is never
 * modified at run time, hence it is const.
 */
static const struct step_t step_table[] = {
	{ "abort",	MC_ABORT },
	{ "return",	MC_RETURN },
	{ "start",	MC_START },
	{ "step1",	MC_STEP1 },
	{ "step2",	MC_STEP2 },
	{ "step3",	MC_STEP3 },
	{ "step4",	MC_STEP4 },
	{ "stop",	MC_STOP }
};
88 
/*
 * If support for a different version is added, the new version number should
 * be appended to the version_table below. This list will be searched to
 * determine if a version requested via the -V option is supported or not.
 *
 * main() walks the table linearly and compares entries with strcmp(), so
 * no particular ordering of the entries is required.
 */
static char *version_table[] = {
	MY_VERSION
};
97 
/*
 * Program-wide state.  timeout and version are filled in from the -t and
 * -V options in main(); stepnum is set from the step name argument and
 * c_pid from fork().  stepnum and c_pid are also read by sigalarmhandler().
 */
uint_t	timeout = 0;			/* disable timeout by default */
char	*version = MY_VERSION;		/* use latest version by default */
int	stepnum = MC_UNK;		/* reconfiguration step number */
pid_t	c_pid;				/* child process id */
102 
103 /*
104  * Binary search comparison routine
105  */
106 static int
107 mc_compare(const void *stp1, const void *stp2)
108 {
109 	return (strcmp((const char *)stp1,
110 	    ((const struct step_t *)stp2)->step_nam));
111 }
112 
113 /*
114  * Timeout expiry alarm signal handler
115  */
116 /*ARGSUSED*/
117 static void
118 sigalarmhandler(int sig)
119 {
120 	int	i, n, ret, stat_loc = 0;
121 
122 	n = sizeof (step_table) / sizeof (step_table[0]);
123 	for (i = 0; i < n; i++) {
124 		if (stepnum == step_table[i].step_num)
125 			break;
126 	}
127 
128 	assert(i != n);
129 
130 	meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"),
131 	    step_table[i].step_nam,
132 	    meta_print_hrtime(gethrtime() - start_time));
133 
134 	if ((ret = kill(c_pid, SIGKILL)) == 0) {
135 		/*
136 		 * The child will wait forever until the status is retrieved
137 		 * so get it now. Keep retrying if the call is interrupted.
138 		 *
139 		 * The possible results are,
140 		 *
141 		 *	- child killed successfully
142 		 *	- signal sent but child not killed
143 		 *	- waitpid failed/interrupted
144 		 */
145 		sleep(2);
146 		while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) {
147 			if (errno != EINTR) {
148 				break;
149 			}
150 		}
151 		if ((ret == c_pid) || (errno == ECHILD)) {
152 			ret = 0;
153 		} else {
154 			ret = 1;
155 		}
156 	} else if (errno == ESRCH) {
157 		/*
158 		 * If the kill did not catch the child then it means the child
159 		 * exited immediately after the timeout occured.
160 		 */
161 		ret = 0;
162 	}
163 
164 	/*
165 	 * make sure not to exit with 205 for any steps other than step1-step4.
166 	 * Suncluster reconfiguration can't handle it otherwise.
167 	 */
168 	switch (stepnum) {
169 	case MC_STEP1:
170 	case MC_STEP2:
171 	case MC_STEP3:
172 	case MC_STEP4:
173 		/*
174 		 * If the child was killed successfully return 205 for a
175 		 * new reconfig cycle otherwise send 1 to panic the node.
176 		 */
177 		if (ret != 0) {
178 			md_eprintf(gettext("Could not kill child\n"));
179 			exit(1);
180 		} else {
181 			exit(205);
182 		}
183 		break;
184 	case MC_START:
185 	case MC_STOP:
186 	case MC_ABORT:
187 	case MC_RETURN:
188 	default:
189 		exit(1);
190 		break;
191 	}
192 }
193 
194 /*
195  * Attempt to load local set.
196  * Returns:
197  *	pointer to mdsetname_t for local set (local_sp) is successful.
198  *	0 if failure
199  *		if there are no local set mddbs, no error message is printed.
200  *		Otherwise, error message is printed so that user
201  *		can determine why the local set didn't start.
202  */
203 mdsetname_t *
204 load_local_set(md_error_t *ep)
205 {
206 	mdsetname_t	*local_sp = NULL;
207 
208 	/* Does local set exist? If not, give no error */
209 	if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) {
210 		return (0);
211 	}
212 
213 	/*
214 	 * snarf local set
215 	 * If fails with MDE_DB_NODB, then just return 1 printing
216 	 * no failure.
217 	 * Otherwise, print error message, and return 1.
218 	 */
219 	if (meta_setup_db_locations(ep) != 0) {
220 		if (!(mdismddberror(ep, MDE_DB_NODB)))
221 			mde_perror(ep, "");
222 		return (0);
223 	}
224 
225 	/* local set loaded successfully */
226 	return (local_sp);
227 }
228 
229 /*
230  * Purpose:	Compose a full path name for a metadevice
231  *
232  * On entry:	sp	- setname pointer
233  *		mnum	- minor number of metadevice
234  *		pathname - pointer to array to return path string
235  *		pathlen	- max length of pathname array
236  */
237 static int
238 compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen)
239 {
240 	int	rtn;
241 
242 	if (MD_MIN2SET(mnum) != sp->setno) {
243 		md_eprintf(gettext("minor number 0x%x invalid for set %d\n"),
244 		    mnum, sp->setno);
245 		return (-1);
246 	}
247 	rtn = snprintf(pathname, pathlen, "/dev/md/%s/rdsk/d%u",
248 	    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
249 
250 	if ((pathname[0] == '\0') || (rtn >= pathlen)) {
251 		md_eprintf(gettext(
252 		    "Could not create path for device %s/d%u\n"),
253 		    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
254 		return (-1);
255 	}
256 	return (0);
257 }
258 
259 /*
260  * Purpose:	Walk through all the devices specified for the given set
261  *		and do the action specified in mode
262  */
263 static int
264 reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep)
265 {
266 	mdnamelist_t			*devnlp = NULL;
267 	mdnamelist_t			*p;
268 	mdname_t			*devnp = NULL;
269 	md_set_mmown_params_t		ownpar_p;
270 	md_set_mmown_params_t		*ownpar = &ownpar_p;
271 	md_unit_t			*mm;
272 	int				mirror_dev = 0;
273 	mndiskset_membershiplist_t	*nl;
274 	int				cnt;
275 	int				has_parent;
276 	md_mn_get_mir_state_t		mir_state_p;
277 	md_mn_get_mir_state_t		*mir_state = &mir_state_p;
278 
279 	/*
280 	 * if we are choosing or resetting the owners then make sure
281 	 * we are only doing it for mirror devices
282 	 */
283 	mirror_dev = (strcmp(MD_MIRROR, drivername) == 0);
284 	if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) {
285 		return (-1);
286 	}
287 
288 	/* get a list of all the metadevices for current set */
289 	if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) {
290 		mde_perror(ep, gettext("Could not get mirrors for set %s"),
291 		    sp->setname);
292 		return (-1);
293 	} else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
294 		mde_perror(ep, gettext(
295 		    "Could not get soft partitions for set %s"), sp->setname);
296 		return (-1);
297 	}
298 
299 	/* If resetting the owner, get the known membership list */
300 	if (mode & RESET_OWNER) {
301 		if (meta_read_nodelist(&cnt, &nl, ep)) {
302 			mde_perror(ep, "Could not get nodelist");
303 			return (-1);
304 		}
305 	}
306 
307 	/* for each metadevice */
308 	for (p = devnlp; (p != NULL); p = p->next) {
309 		devnp = p->namep;
310 
311 		/*
312 		 * Get the current setting for mirror ABR state and all of the
313 		 * submirror state and flags from the master node. We only
314 		 * perform this when going through a 'start' cycle.
315 		 */
316 		if ((mode & GET_MIRROR_STATE) && mirror_dev) {
317 			char	*miscname;
318 
319 			/*
320 			 * Ensure that we ignore soft-parts that are returned
321 			 * from the meta_get_mirror_names() call
322 			 */
323 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
324 				goto out;
325 			if (strcmp(miscname, MD_MIRROR) != 0)
326 				continue;
327 
328 			mir_state->mnum = meta_getminor(devnp->dev);
329 			MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno);
330 			meta_mc_log(MC_LOG4, gettext("Getting mirror state"
331 			    " for %s/d%u: %s"), sp->setname,
332 			    (unsigned)MD_MIN2UNIT(mir_state->mnum),
333 			    meta_print_hrtime(gethrtime() - start_time));
334 
335 			if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep,
336 			    "MD_MN_GET_MIRROR_STATE") != 0) {
337 				mde_perror(ep, gettext("Unable to get "
338 				    "mirror state for %s/d%u"), sp->setname,
339 				    (unsigned)MD_MIN2UNIT(mir_state->mnum));
340 				goto out;
341 			} else {
342 				continue;
343 			}
344 		}
345 
346 		/* check if this is a top level metadevice */
347 		if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL)
348 			goto out;
349 		if (MD_HAS_PARENT(MD_PARENT(mm))) {
350 			has_parent = 1;
351 		} else {
352 			has_parent = 0;
353 		}
354 		Free(mm);
355 
356 		if (mode & (RESET_OWNER | CHOOSE_OWNER)) {
357 			char	*miscname;
358 
359 			/*
360 			 * we can only do these for mirrors so make sure we
361 			 * really have a mirror device and not a softpartition
362 			 * imitating one. meta_get_mirror_names seems to think
363 			 * softparts on top of a mirror are mirrors!
364 			 */
365 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
366 				goto out;
367 			if (strcmp(miscname, MD_MIRROR) != 0)
368 				continue;
369 
370 			(void) memset(ownpar, 0, sizeof (*ownpar));
371 			ownpar->d.mnum = meta_getminor(devnp->dev);
372 			MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno);
373 
374 			meta_mc_log(MC_LOG4, gettext("Setting owner "
375 			    "for %s/d%u: %s"), sp->setname,
376 			    (unsigned)MD_MIN2UNIT(ownpar->d.mnum),
377 			    meta_print_hrtime(gethrtime() - start_time));
378 
379 			/* get the current owner id */
380 			if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep,
381 			    "MD_MN_GET_MM_OWNER") != 0) {
382 				mde_perror(ep, gettext("Unable to get "
383 				    "mirror owner for %s/d%u"), sp->setname,
384 				    (unsigned)MD_MIN2UNIT(ownpar->d.mnum));
385 				goto out;
386 			}
387 		}
388 
389 		if (mode & RESET_OWNER) {
390 			if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) {
391 				mdclrerror(ep);
392 				continue;
393 			}
394 
395 			/*
396 			 * reset owner only if the current owner is
397 			 * not in the membership list
398 			 * Also kill the resync thread so that when the resync
399 			 * is started, it will perform an optimized resync
400 			 * for any resync regions that were dirty when the
401 			 * current owner left the membership.
402 			 */
403 			if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) {
404 				if (meta_mn_change_owner(&ownpar,
405 				    sp->setno, ownpar->d.mnum,
406 				    MD_MN_MIRROR_UNOWNED,
407 				    MD_MN_MM_ALLOW_CHANGE) == -1) {
408 					md_eprintf(gettext(
409 					    "Unable to reset mirror owner "
410 					    "for %s/d%u\n"), sp->setname,
411 					    (unsigned)MD_MIN2UNIT(
412 					    ownpar->d.mnum));
413 					goto out;
414 				}
415 				if (meta_mirror_resync(sp, devnp, 0, ep,
416 				    MD_RESYNC_KILL_NO_WAIT) != 0) {
417 					md_eprintf(gettext(
418 					    "Unable to kill resync for"
419 					    " %s/d%u\n"), sp->setname,
420 					    (unsigned)MD_MIN2UNIT(
421 					    ownpar->d.mnum));
422 					goto out;
423 				}
424 			}
425 		}
426 
427 		if (mode & CHOOSE_OWNER) {
428 			/*
429 			 * only orphaned resyncs will have no owner.
430 			 * if that is the case choose a new owner. Otherwise
431 			 * re-establish the existing owner. This covers the
432 			 * case where a node that owned the mirror
433 			 * reboots/panics and comes back into the cluster before
434 			 * the reconfig cycle has completed. In this case the
435 			 * other cluster nodes will have the mirror owner marked
436 			 * as the rebooted node while it has the owner marked
437 			 * as 'None'. We have to reestablish the ownership so
438 			 * that the subsequent resync can continue.
439 			 */
440 			if (meta_mn_change_owner(&ownpar, sp->setno,
441 			    ownpar->d.mnum, ownpar->d.owner,
442 			    MD_MN_MM_CHOOSE_OWNER) == -1) {
443 				md_eprintf(gettext("Unable to choose "
444 				    "mirror owner for %s/d%u\n"), sp->setname,
445 				    (unsigned)MD_MIN2UNIT(ownpar->d.mnum));
446 				goto out;
447 			}
448 		}
449 
450 		/*
451 		 * For RESET_ABR and UPDATE_ABR - only handle top
452 		 * level metadevices.
453 		 */
454 		if (has_parent)
455 			continue;
456 
457 		if (mode & RESET_ABR) {
458 			/*
459 			 * Reset the ABR (application based recovery)
460 			 * value on all nodes. We are dealing with
461 			 * the possibility that we have ABR set but the
462 			 * only node that had the device open with ABR has
463 			 * left the cluster. We simply open and close the
464 			 * device and if this is the last close in the
465 			 * cluster, ABR will be cleared on all nodes.
466 			 */
467 			char		*miscname;
468 			char		name[MD_MAX_CTDLEN];
469 			int		mnum, fd;
470 
471 			name[0] = '\0';
472 			mnum = meta_getminor(devnp->dev);
473 
474 			/*
475 			 * Ensure that we don't include soft-parts in the
476 			 * mirror-only call to RESET_ABR. meta_get_mirror_names
477 			 * returns a bogus list that includes all soft-parts
478 			 * built on mirrors.
479 			 */
480 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
481 				goto out;
482 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
483 				continue;
484 
485 			meta_mc_log(MC_LOG4, gettext("Re-setting ABR state "
486 			    "for %s/d%u: %s"), sp->setname,
487 			    (unsigned)MD_MIN2UNIT(mnum),
488 			    meta_print_hrtime(gethrtime() - start_time));
489 
490 			/* compose the absolute device path and open it */
491 			if (compose_path(sp, mnum, &name[0],
492 			    sizeof (name)) != 0)
493 				goto out;
494 			if ((fd = open(name, O_RDWR, 0)) < 0) {
495 				md_perror(gettext("Could not open device %s"),
496 				    name);
497 				continue;
498 			}
499 
500 			(void) close(fd);
501 		}
502 
503 		if (mode & UPDATE_ABR) {
504 			/*
505 			 * Update the ABR value on this node. We obtain the
506 			 * current ABR state from the master node.
507 			 */
508 
509 			char		*miscname;
510 			char		name[MD_MAX_CTDLEN];
511 			int		mnum, fd;
512 			volcap_t	vc;
513 			uint_t		tstate;
514 
515 			name[0] = '\0';
516 			mnum = meta_getminor(devnp->dev);
517 
518 			/*
519 			 * Ensure that we don't include soft-parts in the
520 			 * mirror-only call to UPDATE_ABR. meta_get_mirror_names
521 			 * returns a bogus list that includes all soft-parts
522 			 * built on mirrors.
523 			 */
524 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
525 				goto out;
526 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
527 				continue;
528 
529 			/* Get tstate from Master */
530 			if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep)
531 			    != 0)
532 				continue;
533 			/* If not set on the master, nothing to do */
534 			if (!(tstate & MD_ABR_CAP))
535 				continue;
536 
537 			meta_mc_log(MC_LOG4, gettext("Updating ABR state "
538 			    "for %s/d%u: %s"), sp->setname,
539 			    (unsigned)MD_MIN2UNIT(mnum),
540 			    meta_print_hrtime(gethrtime() - start_time));
541 
542 			/* compose the absolute device path and open it */
543 			if (compose_path(sp, mnum, &name[0],
544 			    sizeof (name)) != 0)
545 				goto out;
546 			if ((fd = open(name, O_RDWR, 0)) < 0) {
547 				md_perror(gettext("Could not open device %s"),
548 				    name);
549 				continue;
550 			}
551 
552 			/* set ABR state */
553 			vc.vc_info = 0;
554 			vc.vc_set = 0;
555 			if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
556 				/*
557 				 * Ignore if device does not support this
558 				 * ioctl
559 				 */
560 				if ((errno != ENOTTY) && (errno != ENOTSUP)) {
561 					md_perror(gettext("Could not get "
562 					    "ABR/DMR state for device %s"),
563 					    name);
564 				}
565 				(void) close(fd);
566 				continue;
567 			}
568 			if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) {
569 				(void) close(fd);
570 				continue;
571 			}
572 
573 			vc.vc_set = DKV_ABR_CAP;
574 			if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
575 				md_perror(gettext(
576 				    "Could not set ABR state for "
577 				    "device %s"), name);
578 				(void) close(fd);
579 				goto out;
580 			} else {
581 				md_eprintf(gettext(
582 				    "Setting ABR state on device %s\n"), name);
583 			}
584 
585 			(void) close(fd);
586 		}
587 	}
588 
589 	/* cleanup */
590 	if (mode & RESET_OWNER) {
591 		meta_free_nodelist(nl);
592 	}
593 	metafreenamelist(devnlp);
594 	return (0);
595 
596 out:
597 	/* cleanup */
598 	if (mode & RESET_OWNER) {
599 		meta_free_nodelist(nl);
600 	}
601 	metafreenamelist(devnlp);
602 	return (-1);
603 }
604 
605 /*
606  * Print usage message
607  */
608 static void
609 usage(mdsetname_t *sp, int eval)
610 {
611 	(void) fprintf(stderr, gettext("usage:"
612 	    "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n"
613 	    "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n"
614 	    "\t%s [-V version] [-t timeout] [-d level] abort | stop\n"
615 	    "\t%s [-V | -? | -h]\n"),
616 	    myname, myname, myname, myname);
617 	if (!eval) {
618 		fprintf(stderr, gettext("\n"
619 		    "\tValid debug (-d) levels are 1-%d for increasing "
620 		    "verbosity.\n\tDefault is -d 3.\n\n"
621 		    "\tValid step values are: return | step1 | step2 | "
622 		    "step3 | step4\n\n"
623 		    "\tNodelist is a space-separated list of node id's\n\n"),
624 		    MAX_DEBUG_LEVEL);
625 	}
626 	md_exit(sp, eval);
627 }
628 
629 /*
630  * Input:	Input takes a config step name followed by a list of
631  *		possible node id's.
632  *
633  * Returns:	  0 - Success
634  *		  1 - Fail
635  *			Node will be removed from cluster membership
636  *			by forcing node to panic.
637  *		205 - Unsuccessful. Start another reconfig cycle.
638  *			Problem was encountered that could be fixed by
639  *			running another reconfig cycle.
640  *			Problem could be a result of a failure to read
641  *			the nodelist file or that all work could not be
642  *			accomplished in a reconfig step in the amount of
643  *			time given so another reconfig cycle is needed in
644  *			order to finish the current step.
645  */
646 int
647 main(int argc, char **argv)
648 {
649 	mdsetname_t		*sp = NULL;
650 	md_error_t		status = mdnullerror;
651 	md_error_t		*ep = &status;
652 	set_t			max_sets, setno;
653 	int			c, clust = 0;
654 	struct sigaction	nsa, osa;
655 	struct step_t		*step_ptr;
656 	mdsetname_t		*local_sp = NULL;
657 	md_drive_desc		*dd;
658 	int			rval = 0;
659 	md_set_desc		*sd;
660 	mddb_block_parm_t	mbp;
661 	uint_t			debug = 3; /* log upto MC_LOG3 by default */
662 	int			version_table_size;
663 	mddb_setflags_config_t	sf;
664 	int			ret_val;
665 	mddb_config_t		cfg;
666 	int			set_info[MD_MAXSETS];
667 
668 	/*
669 	 * Get the locale set up before calling any other routines
670 	 * with messages to ouput.  Just in case we're not in a build
671 	 * environment, make sure that TEXT_DOMAIN gets set to
672 	 * something.
673 	 */
674 #if !defined(TEXT_DOMAIN)
675 #define	TEXT_DOMAIN "SYS_TEST"
676 #endif
677 	(void) setlocale(LC_ALL, "");
678 	(void) textdomain(TEXT_DOMAIN);
679 
680 	if ((clust = sdssc_bind_library()) == SDSSC_ERROR) {
681 		md_eprintf(gettext("Interface error with libsds_sc.so\n"));
682 		exit(1);
683 	}
684 
685 	if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) {
686 		mde_perror(ep, "");
687 		md_exit(sp, 1);
688 	}
689 
690 	/*
691 	 * open log and enable libmeta logging. Do it here explicitly
692 	 * rather than letting md_init() do it because we are not really
693 	 * a daemon and that is what md_init() opens the log as.
694 	 */
695 	openlog("metaclust", LOG_CONS, LOG_USER);
696 
697 	version_table_size = sizeof (version_table) / sizeof (version_table[0]);
698 
699 	optind = 1;
700 	opterr = 0;
701 	while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) {
702 		switch (c) {
703 		case 'h':
704 			usage(sp, 0);
705 			break;
706 
707 		case 'd':
708 			if (sscanf(optarg, "%u", &debug) != 1) {
709 				md_eprintf(gettext("Invalid debug level\n"));
710 				md_exit(sp, 1);
711 			} else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) {
712 				debug = min(max(debug, 1), MAX_DEBUG_LEVEL);
713 				md_eprintf(gettext("Debug level must be "
714 				    "between 1 and %d inclusive.\n"),
715 				    MAX_DEBUG_LEVEL);
716 				md_eprintf(gettext("Debug level set to %d.\n"),
717 				    debug);
718 			}
719 			break;
720 
721 		case 'V':
722 			version = Strdup(optarg);
723 			break;
724 
725 		case 't':
726 			if (sscanf(optarg, "%u", &timeout) != 1) {
727 				md_eprintf(gettext("Invalid timeout value\n"));
728 				md_exit(sp, 1);
729 			}
730 			break;
731 
732 		case '?':
733 			if (optopt == '?') {
734 				usage(sp, 0);
735 			} else if (optopt == 'V') {
736 				int	i;
737 
738 				fprintf(stdout, gettext(
739 				    "%s: Versions Supported:"), myname);
740 				for (i = 0; i < version_table_size; i++) {
741 					fprintf(stdout, " %s",
742 					    version_table[i]);
743 				}
744 				fprintf(stdout, "\n");
745 				md_exit(sp, 0);
746 			}
747 			/*FALLTHROUGH*/
748 
749 		default:
750 			usage(sp, 1);
751 			break;
752 		}
753 	}
754 
755 	/* initialise the debug level and start time */
756 	setup_mc_log(debug);
757 
758 	/*
759 	 * check that the version specified (if any) is supported.
760 	 */
761 	if (version != NULL) {
762 		int	i, found = 0;
763 
764 		for (i = 0; i < version_table_size; i++) {
765 			if (strcmp(version, version_table[i]) == 0) {
766 				found = 1;
767 				break;
768 			}
769 		}
770 		if (!found) {
771 			md_eprintf(gettext("Version %s not supported\n"),
772 			    version);
773 			md_exit(sp, 1);
774 		}
775 	}
776 
777 	argc -= optind;
778 	argv += optind;
779 
780 	/* parse arguments */
781 	if (argc <= 0) {
782 		usage(sp, 1);
783 	}
784 
785 	/* convert the step name to the corresponding number */
786 	step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) /
787 	    sizeof (step_table[0])), sizeof (step_table[0]), mc_compare);
788 	if (step_ptr != NULL) {
789 		stepnum = step_ptr->step_num;
790 	}
791 
792 	--argc;
793 	++argv;
794 
795 	/* set timeout alarm signal, a value of 0 will disable timeout */
796 	if (timeout > 0) {
797 		int	stat_loc = 0;
798 
799 		c_pid = fork();
800 
801 		if (c_pid == (pid_t)-1) {
802 			md_perror(gettext("Unable to fork"));
803 			md_exit(sp, 1);
804 		} else if (c_pid) {
805 			/* parent */
806 			nsa.sa_flags = 0;
807 			if (sigfillset(&nsa.sa_mask) < 0) {
808 				md_perror(gettext("Unable to set signal mask"));
809 				md_exit(sp, 1);
810 			}
811 
812 			nsa.sa_handler = sigalarmhandler;
813 			if (sigaction(SIGALRM, &nsa, &osa) == -1) {
814 				md_perror(gettext("Unable to set alarm "
815 				    "handler"));
816 				md_exit(sp, 1);
817 			}
818 
819 			(void) alarm(timeout);
820 
821 			/*
822 			 * wait for child to exit or timeout to expire.
823 			 * keep retrying if the call is interrupted
824 			 */
825 			while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) {
826 				if (errno != EINTR) {
827 					break;
828 				}
829 			}
830 			if (ret_val == c_pid) {
831 				/* exit with the childs exit value */
832 				exit(WEXITSTATUS(stat_loc));
833 			} else if (errno == ECHILD) {
834 				md_exit(sp, 0);
835 			} else {
836 				perror(myname);
837 				md_exit(sp, 1);
838 			}
839 		}
840 	}
841 
842 	/*
843 	 * If a timeout value is given, everything from this point onwards is
844 	 * executed in the child process.
845 	 */
846 
847 	switch (stepnum) {
848 	case MC_START:
849 		/*
850 		 * Start Step
851 		 *
852 		 * - Suspend all rpc.mdcommd messages
853 		 */
854 
855 		/* expect the local node id to be given only */
856 		if (argc != 1)
857 			usage(sp, 1);
858 
859 		meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"),
860 		    meta_print_hrtime(0));
861 
862 		/*
863 		 * Does local set exist? If not, exit with 0
864 		 * since there's no reason to have this node panic if
865 		 * the local set cannot be started.
866 		 */
867 		if ((local_sp = load_local_set(ep)) == NULL) {
868 			md_exit(local_sp, 0);
869 		}
870 
871 		if ((max_sets = get_max_sets(ep)) == 0) {
872 			mde_perror(ep, "");
873 			md_exit(sp, 1);
874 		}
875 
876 		/* start walking through all possible disksets */
877 		for (setno = 1; setno < max_sets; setno++) {
878 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
879 				if (mdiserror(ep, MDE_NO_SET)) {
880 					/* No set for this setno - continue */
881 					mdclrerror(ep);
882 					continue;
883 				} else {
884 					mde_perror(ep, gettext("Unable to "
885 					    "get set %d information"), setno);
886 					md_exit(sp, 1);
887 				}
888 			}
889 
890 			/* only check multi-node disksets */
891 			if (!meta_is_mn_set(sp, ep)) {
892 				mdclrerror(ep);
893 				continue;
894 			}
895 
896 			meta_mc_log(MC_LOG3, gettext("Start - block parse "
897 			    "messages for set %s: %s"), sp->setname,
898 			    meta_print_hrtime(gethrtime() - start_time));
899 
900 			/*
901 			 * Mddb parse messages are sent amongst the nodes
902 			 * in a diskset whenever the locator block or
903 			 * locator names structure has been changed.
904 			 * A locator block change could occur as a result
905 			 * of a disk failure during the reconfig cycle,
906 			 * so block the mddb parse messages while the
907 			 * rpc.mdcommd is suspended during the reconfig cycle.
908 			 */
909 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
910 				(void) memset(&mbp, 0, sizeof (mbp));
911 				mbp.c_setno = setno;
912 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
913 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
914 				    &mbp.c_mde, NULL)) {
915 					mdstealerror(ep, &mbp.c_mde);
916 					mde_perror(ep, gettext("Could not "
917 					    "block set %s"), sp->setname);
918 					md_exit(sp, 1);
919 				}
920 			}
921 
922 			/* suspend commd and spin waiting for drain */
923 			while ((ret_val = mdmn_suspend(setno,
924 			    MD_COMM_ALL_CLASSES)) ==
925 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
926 				sleep(1);
927 			}
928 
929 			if (ret_val) {
930 				md_eprintf(gettext("Could not suspend "
931 				    "rpc.mdcommd for set %s\n"), sp->setname);
932 				md_exit(sp, 1);
933 			}
934 
935 			/*
936 			 * Set start step flag for set. This is set to indicate
937 			 * that this node entered the reconfig cycle through
938 			 * the start step.  This is used during the reconfig
939 			 * cycle to determine whether the node had entered
940 			 * through the start step or the return step.
941 			 */
942 			(void) memset(&sf, 0, sizeof (sf));
943 			sf.sf_setno = sp->setno;
944 			sf.sf_setflags = MD_SET_MN_START_RC;
945 			sf.sf_flags = MDDB_NM_SET;
946 			/* Use magic to help protect ioctl against attack. */
947 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
948 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
949 			    &sf.sf_mde, NULL)) {
950 				mdstealerror(ep, &sf.sf_mde);
951 				mde_perror(ep, gettext("Could not set "
952 				    "start_step flag for set %s"), sp->setname);
953 				md_exit(sp, 1);
954 			}
955 
956 		}
957 
958 		meta_mc_log(MC_LOG2, gettext("Start step completed: %s"),
959 		    meta_print_hrtime(gethrtime() - start_time));
960 
961 		break;
962 
963 	case MC_STOP:
964 		/*
965 		 * Stop Step
966 		 *
967 		 * - ???
968 		 */
969 
970 		/* don't expect any more arguments to follow the step name */
971 		if (argc != 0)
972 			usage(sp, 1);
973 
974 		break;
975 
976 	case MC_ABORT:
977 		/*
978 		 * Abort Step
979 		 *
980 		 * - Abort rpc.mdcommd
981 		 */
982 
983 		/* don't expect any more arguments to follow the step name */
984 		if (argc != 0)
985 			usage(sp, 1);
986 
987 		meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"),
988 		    meta_print_hrtime(0));
989 
990 		/*
991 		 * Does local set exist? If not, exit with 0
992 		 * since there's no reason to have this node panic if
993 		 * the local set cannot be started.
994 		 */
995 		if ((local_sp = load_local_set(ep)) == NULL) {
996 			md_exit(local_sp, 0);
997 		}
998 
999 		/*
1000 		 * abort the rpc.mdcommd.  The abort is only issued on this node
1001 		 * meaning that the abort reconfig step is called on this
1002 		 * node before a panic while the rest of the cluster will
1003 		 * undergo a reconfig cycle.
1004 		 * There is no time relation between this node running a
1005 		 * reconfig abort and the the rest of the cluster
1006 		 * running a reconfig cycle meaning that this node may
1007 		 * panic before, during or after the cluster has run
1008 		 * a reconfig cycle.
1009 		 */
1010 		mdmn_abort();
1011 
1012 		meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"),
1013 		    meta_print_hrtime(gethrtime() - start_time));
1014 
1015 		break;
1016 
1017 	case MC_RETURN:
1018 		/*
1019 		 * Return Step
1020 		 *
1021 		 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL
1022 		 *   and release local set lock.  Grabbing the local set
1023 		 *   lock allows any active metaset/metadb commands to
1024 		 *   terminate gracefully and will keep a metaset/metadb
1025 		 *   command from starting until the DRAIN ALL is issued.
1026 		 *   The metaset/metadb commands can issue
1027 		 *   DRAIN ALL/RESUME ALL commands to rpc.mdcommd,
1028 		 *   so the return step must not issue the DRAIN ALL command
1029 		 *   until metaset/metadb have finished or metaset may issue
1030 		 *   a RESUME ALL after this return reconfig step has issued
1031 		 *   the DRAIN ALL command.
1032 		 *   After this reconfig step has issued the DRAIN_ALL and
1033 		 *   released the local set lock, metaset/metadb will fail
1034 		 *   when attempting to contact the rpc.mdcommd and will
1035 		 *   terminate without making any configuration changes.
1036 		 *   The DRAIN ALL command will keep all other meta* commands
1037 		 *   from running during the reconfig cycle (these commands
1038 		 *   will wait until the rpc.mdcommd is resumed) since the
1039 		 *   reconfig cycle may be changing the diskset configuration.
1040 		 */
1041 
1042 		/* expect the nodelist to follow the step name */
1043 		if (argc < 1)
1044 			usage(sp, 1);
1045 
1046 		meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"),
1047 		    meta_print_hrtime(0));
1048 
1049 		/*
1050 		 * Does local set exist? If not, exit with 0
1051 		 * since there's no reason to have this node panic if
1052 		 * the local set cannot be started.
1053 		 */
1054 		if ((local_sp = load_local_set(ep)) == NULL) {
1055 			md_exit(local_sp, 0);
1056 		}
1057 
1058 		/*
1059 		 * Suspend any mirror resyncs that are in progress. This
1060 		 * stops unnecessary timeouts.
1061 		 */
1062 		meta_mirror_resync_block_all();
1063 
1064 		if (meta_lock(local_sp, TRUE, ep) != 0) {
1065 			mde_perror(ep, "");
1066 			md_exit(local_sp, 1);
1067 		}
1068 
1069 		/*
1070 		 * All metaset and metadb commands on this node have now
1071 		 * terminated gracefully.  Now, issue a drain all to
1072 		 * the rpc.mdcommd.  Any meta command issued after the
1073 		 * drain all will either spin sending the command to the
1074 		 * master until after the reconfig cycle has finished OR
1075 		 * will terminate gracefully (metaset/metadb).
1076 		 */
1077 		if ((max_sets = get_max_sets(ep)) == 0) {
1078 			mde_perror(ep, "");
1079 			md_exit(sp, 1);
1080 		}
1081 
1082 		/* start walking through all possible disksets */
1083 		for (setno = 1; setno < max_sets; setno++) {
1084 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1085 				if (mdiserror(ep, MDE_NO_SET)) {
1086 					/* No set for this setno - continue */
1087 					mdclrerror(ep);
1088 					continue;
1089 				} else {
1090 					mde_perror(ep, gettext("Unable to "
1091 					    "get set %d information"), setno);
1092 					md_exit(sp, 1);
1093 				}
1094 			}
1095 
1096 			/* only check multi-node disksets */
1097 			if (!meta_is_mn_set(sp, ep)) {
1098 				mdclrerror(ep);
1099 				continue;
1100 			}
1101 
1102 			meta_mc_log(MC_LOG3, gettext("Return - block parse "
1103 			    "messages for set %s: %s"), sp->setname,
1104 			    meta_print_hrtime(gethrtime() - start_time));
1105 
1106 			/*
1107 			 * Mddb parse messages are sent amongst the nodes
1108 			 * in a diskset whenever the locator block or
1109 			 * locator names structure has been changed.
1110 			 * A locator block change could occur as a result
1111 			 * of a disk failure during the reconfig cycle,
1112 			 * so block the mddb parse messages while the
1113 			 * rpc.mdcommd is suspended during the reconfig cycle.
1114 			 */
1115 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1116 				(void) memset(&mbp, 0, sizeof (mbp));
1117 				mbp.c_setno = setno;
1118 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
1119 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1120 				    &mbp.c_mde, NULL)) {
1121 					mdstealerror(ep, &mbp.c_mde);
1122 					mde_perror(ep, gettext("Could not "
1123 					    "block set %s"), sp->setname);
1124 					md_exit(sp, 1);
1125 				}
1126 			}
1127 
1128 			/* suspend commd and spin waiting for drain */
1129 			while ((ret_val = mdmn_suspend(setno,
1130 			    MD_COMM_ALL_CLASSES)) ==
1131 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
1132 				sleep(1);
1133 			}
1134 
1135 			if (ret_val) {
1136 				md_eprintf(gettext("Could not suspend "
1137 				    "rpc.mdcommd for set %s\n"), sp->setname);
1138 				md_exit(sp, 1);
1139 			}
1140 		}
1141 		/*
1142 		 * Resume all I/Os for this node for all MN sets in
1143 		 * case master node had suspended I/Os but panic'd
1144 		 * before resuming I/Os.  In case of failure, exit
1145 		 * with a 1 since unable to resume I/Os on this node.
1146 		 */
1147 		if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) {
1148 			mde_perror(ep, gettext(
1149 			    "Unable to resume I/O on node %s for all sets"),
1150 			    mynode());
1151 			md_exit(sp, 1);
1152 		}
1153 
1154 
1155 		/*
1156 		 * Can now unlock local set lock.  New metaset/metadb
1157 		 * commands are now held off using drain all.
1158 		 */
1159 		(void) meta_unlock(local_sp, ep);
1160 
1161 		meta_mc_log(MC_LOG2, gettext("Return step completed: %s"),
1162 		    meta_print_hrtime(gethrtime() - start_time));
1163 
1164 		break;
1165 
1166 	case MC_STEP1:
1167 		/*
1168 		 * Step 1
1169 		 *
1170 		 * - Populate nodelist file if we are on clustering
1171 		 *   and pick a master node for each MN diskset.
1172 		 */
1173 
1174 		/* expect the nodelist to follow the step name */
1175 		if (argc < 1)
1176 			usage(sp, 1);
1177 
1178 		meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"),
1179 		    meta_print_hrtime(0));
1180 
1181 		/* Always write nodelist file even if no local set exists */
1182 		if (clust == SDSSC_OKAY) {
1183 			/* skip to the nodelist args */
1184 			if (meta_write_nodelist(argc, argv, ep) != 0) {
1185 				mde_perror(ep, gettext(
1186 				    "Could not populate nodelist file"));
1187 				md_exit(sp, 1);
1188 			}
1189 		}
1190 
1191 		/*
1192 		 * Does local set exist? If not, exit with 0
1193 		 * since there's no reason to have this node panic if
1194 		 * the local set cannot be started.
1195 		 */
1196 		if ((local_sp = load_local_set(ep)) == NULL) {
1197 			md_exit(local_sp, 0);
1198 		}
1199 
1200 		/*
1201 		 * At this point, all meta* commands are blocked across
1202 		 * all disksets since the master rpc.mdcommd has drained or
1203 		 * the master node has died.
1204 		 * If a metaset or metadb command had been in progress
1205 		 * at the start of the reconfig cycle, this command has
1206 		 * either completed or it has been terminated due to
1207 		 * the death of the master node.
1208 		 *
1209 		 * This means that it is now ok to remove any
1210 		 * outstanding clnt_locks associated with multinode
1211 		 * disksets on this node due to a node panic during
1212 		 * a metaset operation.  This allows the routines that
1213 		 * choose the master to use rpc.metad to determine the
1214 		 * master of the diskset.
1215 		 */
1216 		if (clnt_clr_mnsetlock(mynode(), ep) != 0) {
1217 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1218 			    "clear locks failed %s"),
1219 			    meta_print_hrtime(gethrtime() - start_time));
1220 			md_exit(local_sp, 1);
1221 		}
1222 
1223 		/*
1224 		 * Call reconfig_choose_master to choose a master for
1225 		 * each MN diskset, update the nodelist for each diskset
1226 		 * given the member information and send a reinit message
1227 		 * to rpc.mdcommd to reload the nodelist.
1228 		 */
1229 		rval = meta_reconfig_choose_master(ep);
1230 		if (rval == 205) {
1231 			/*
1232 			 * NOTE: Should issue call to reboot remote host that
1233 			 * is causing the RPC failure.  Clustering to
1234 			 * provide interface in the future.  This should
1235 			 * stop a never-ending set of 205 reconfig cycles.
1236 			 * Remote host causing failure is stored in
1237 			 * ep->host if ep is an RPC error.
1238 			 * if (mdanyrpcerror(ep))
1239 			 * 	reboot (ep->host);
1240 			 */
1241 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1242 			    "choose master failure of 205 %s"),
1243 			    meta_print_hrtime(gethrtime() - start_time));
1244 			md_exit(local_sp, 205);
1245 		} else if (rval != 0) {
1246 			meta_mc_log(MC_LOG2, gettext("Step1 failure: "
1247 			    "choose master failure %s"),
1248 			    meta_print_hrtime(gethrtime() - start_time));
1249 			md_exit(local_sp, 1);
1250 		}
1251 
1252 		meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"),
1253 		    meta_print_hrtime(gethrtime() - start_time));
1254 
1255 		md_exit(local_sp, rval);
1256 		break;
1257 
1258 	case MC_STEP2:
1259 		/*
1260 		 * Step 2
1261 		 *
1262 		 * In Step 2, each node walks the list of disksets.  If a
1263 		 * node is a master of a MN diskset, it synchronizes
1264 		 * the local set USER records for that diskset.
1265 		 *
1266 		 * If disks exist in the diskset and there is a joined
1267 		 * (owner) node in the diskset, the master will also:
1268 		 *	- synchronize the diskset mddbs to the master
1269 		 *	- play the change log
1270 		 *
1271 		 * The master node will now attempt to join any unjoined
1272 		 * nodes that are currently members in the membership list.
1273 		 */
1274 
1275 		/* expect the nodelist to follow the step name */
1276 		if (argc < 1)
1277 			usage(sp, 1);
1278 
1279 		meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"),
1280 		    meta_print_hrtime(0));
1281 
1282 		/*
1283 		 * Does local set exist? If not, exit with 0
1284 		 * since there's no reason to have this node panic if
1285 		 * the local set cannot be started.
1286 		 */
1287 		if ((local_sp = load_local_set(ep)) == NULL) {
1288 			md_exit(local_sp, 0);
1289 		}
1290 
1291 		if ((max_sets = get_max_sets(ep)) == 0) {
1292 			mde_perror(ep, "");
1293 			md_exit(local_sp, 1);
1294 		}
1295 
1296 		/* start walking through all possible disksets */
1297 		for (setno = 1; setno < max_sets; setno++) {
1298 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1299 				if (mdiserror(ep, MDE_NO_SET)) {
1300 					/* No set for this setno - continue */
1301 					mdclrerror(ep);
1302 					continue;
1303 				} else if (mdanyrpcerror(ep)) {
1304 					/* Fail on RPC failure to self */
1305 					mde_perror(ep, gettext(
1306 					    "Unable to get information for "
1307 					    "set number %d"), setno);
1308 					md_exit(local_sp, 1);
1309 				} else {
1310 					mde_perror(ep, gettext(
1311 					    "Unable to get information for "
1312 					    "set number %d"), setno);
1313 					mdclrerror(ep);
1314 					continue;
1315 				}
1316 			}
1317 
1318 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1319 				if (mdanyrpcerror(ep)) {
1320 					/* Fail on RPC failure to self */
1321 					mde_perror(ep, gettext(
1322 					    "Unable to get information for "
1323 					    "set number %d"), setno);
1324 					md_exit(local_sp, 1);
1325 				}
1326 				mde_perror(ep, gettext("Unable to get set "
1327 				    "%s desc information"), sp->setname);
1328 				mdclrerror(ep);
1329 				continue;
1330 			}
1331 
1332 			/* Only check MN disksets */
1333 			if (!(MD_MNSET_DESC(sd))) {
1334 				continue;
1335 			}
1336 
1337 			/* All actions in step 2 are driven by master */
1338 			if (!(sd->sd_mn_am_i_master)) {
1339 				continue;
1340 			}
1341 
1342 			meta_mc_log(MC_LOG3, gettext("Step2 - begin record "
1343 			    "synchronization for set %s: %s"), sp->setname,
1344 			    meta_print_hrtime(gethrtime() - start_time));
1345 
1346 			/*
1347 			 * Synchronize the USER records in the local mddbs
1348 			 * for hosts that are members.  The USER records
1349 			 * contain set, drive and host information.
1350 			 */
1351 			rval = meta_mnsync_user_records(sp, ep);
1352 			if (rval != 0) {
1353 				mde_perror(ep, gettext(
1354 				    "Synchronization of user records "
1355 				    "in set %s failed\n"), sp->setname);
1356 				if (rval == 205) {
1357 					/*
1358 					 * NOTE: Should issue call to reboot
1359 					 * remote host that is causing the RPC
1360 					 * failure.  Clustering to provide
1361 					 * interface in the future.  This
1362 					 * should stop a never-ending set of
1363 					 * 205 reconfig cycles.
1364 					 * Remote host causing failure is
1365 					 * stored in ep->host if ep is an
1366 					 * RPC error.
1367 					 * if (mdanyrpcerror(ep))
1368 					 * 	reboot (ep->host);
1369 					 */
1370 					md_exit(local_sp, 205);
1371 				} else {
1372 					md_exit(local_sp, 1);
1373 				}
1374 			}
1375 
1376 			/* Reget sd since sync_user_recs may have flushed it */
1377 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1378 				mde_perror(ep, gettext("Unable to get set "
1379 				    "%s desc information"), sp->setname);
1380 				md_exit(local_sp, 1);
1381 			}
1382 
1383 			dd = metaget_drivedesc(sp,
1384 			    (MD_BASICNAME_OK | PRINT_FAST), ep);
1385 			if (! mdisok(ep)) {
1386 				mde_perror(ep, gettext("Unable to get set "
1387 				    "%s drive information"), sp->setname);
1388 				md_exit(local_sp, 1);
1389 			}
1390 
1391 			/*
1392 			 * No drives in set, continue to next set.
1393 			 */
1394 			if (dd == NULL) {
1395 				/* Done with this set */
1396 				continue;
1397 			}
1398 
1399 			meta_mc_log(MC_LOG3, gettext("Step2 - local set user "
1400 			    "records completed for set %s: %s"), sp->setname,
1401 			    meta_print_hrtime(gethrtime() - start_time));
1402 
1403 			/*
1404 			 * Synchronize the diskset mddbs for hosts
1405 			 * that are members.  This may involve
1406 			 * playing the changelog and writing out
1407 			 * to the diskset mddbs.
1408 			 */
1409 			rval = meta_mnsync_diskset_mddbs(sp, ep);
1410 			if (rval != 0) {
1411 				mde_perror(ep, gettext(
1412 				    "Synchronization of diskset mddbs "
1413 				    "in set %s failed\n"), sp->setname);
1414 				meta_mc_log(MC_LOG3, gettext("Step2 - diskset "
1415 				    "mddb synchronization failed for "
1416 				    "set %s: %s"), sp->setname,
1417 				    meta_print_hrtime(gethrtime() -
1418 				    start_time));
1419 				if (rval == 205) {
1420 					/*
1421 					 * NOTE: Should issue call to reboot
1422 					 * remote host that is causing the RPC
1423 					 * failure.  Clustering to provide
1424 					 * interface in the future.  This
1425 					 * should stop a never-ending set of
1426 					 * 205 reconfig cycles.
1427 					 * Remote host causing failure is
1428 					 * stored in ep->host if ep is an
1429 					 * RPC error.
1430 					 * if (mdanyrpcerror(ep))
1431 					 * 	reboot (ep->host);
1432 					 */
1433 					md_exit(local_sp, 205);
1434 				} else if (rval == 1) {
1435 					continue;
1436 				} else {
1437 					md_exit(local_sp, 1);
1438 				}
1439 			}
1440 
1441 			meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb "
1442 			    "synchronization completed for set %s: %s"),
1443 			    sp->setname,
1444 			    meta_print_hrtime(gethrtime() - start_time));
1445 
1446 			/* Join the starting nodes to the diskset */
1447 			rval = meta_mnjoin_all(sp, ep);
1448 			if (rval != 0) {
1449 				mde_perror(ep, gettext(
1450 				    "Join of non-owner (starting) nodes "
1451 				    "in set %s failed\n"), sp->setname);
1452 				meta_mc_log(MC_LOG3, gettext("Step2 - non owner"
1453 				    "nodes joined for set %s: %s"),
1454 				    sp->setname,
1455 				    meta_print_hrtime(gethrtime() -
1456 				    start_time));
1457 				if (rval == 205) {
1458 					/*
1459 					 * NOTE: Should issue call to reboot
1460 					 * remote host that is causing the RPC
1461 					 * failure.  Clustering to provide
1462 					 * interface in the future.  This
1463 					 * should stop a never-ending set of
1464 					 * 205 reconfig cycles.
1465 					 * Remote host causing failure is
1466 					 * stored in ep->host if ep is an
1467 					 * RPC error.
1468 					 * if (mdanyrpcerror(ep))
1469 					 * 	reboot (ep->host);
1470 					 */
1471 					md_exit(local_sp, 205);
1472 				} else {
1473 					md_exit(local_sp, 1);
1474 				}
1475 			}
1476 
1477 			meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes "
1478 			    "joined for set %s: %s"), sp->setname,
1479 			    meta_print_hrtime(gethrtime() - start_time));
1480 
1481 		}
1482 
1483 		meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"),
1484 		    meta_print_hrtime(gethrtime() - start_time));
1485 
1486 		break;
1487 
1488 	case MC_STEP3:
1489 		/*
1490 		 * Step 3
1491 		 *
1492 		 * For all multinode sets do,
1493 		 * - Reinitialise rpc.mdcommd
1494 		 * - Reset mirror owners to null if the current owner is
1495 		 *   no longer in the membership list
1496 		 */
1497 
1498 		/* expect the nodelist to follow the step name */
1499 		if (argc < 1)
1500 			usage(sp, 1);
1501 
1502 		meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"),
1503 		    meta_print_hrtime(0));
1504 
1505 		/*
1506 		 * Does local set exist? If not, exit with 0
1507 		 * since there's no reason to have this node panic if
1508 		 * the local set cannot be started.
1509 		 */
1510 		if ((local_sp = load_local_set(ep)) == NULL) {
1511 			md_exit(local_sp, 0);
1512 		}
1513 
1514 		/*
1515 		 * walk through all sets on this node which could include:
1516 		 *	- MN disksets
1517 		 *	- traditional disksets
1518 		 *	- non-existent disksets
1519 		 * reinitialise rpc.mdcommd and reset mirror owners for MN sets
1520 		 */
1521 		if ((max_sets = get_max_sets(ep)) == 0) {
1522 			mde_perror(ep, "");
1523 			md_exit(local_sp, 1);
1524 		}
1525 
1526 		/* start walking through all possible disksets */
1527 		for (setno = 1; setno < max_sets; setno++) {
1528 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1529 				if (mdiserror(ep, MDE_NO_SET)) {
1530 					/* No set for this setno - continue */
1531 					mdclrerror(ep);
1532 					continue;
1533 				} else {
1534 					mde_perror(ep, gettext("Unable to "
1535 					    "get set %d information"), setno);
1536 					md_exit(local_sp, 1);
1537 				}
1538 			}
1539 
1540 			/* only check multi-node disksets */
1541 			if (!meta_is_mn_set(sp, ep)) {
1542 				mdclrerror(ep);
1543 				continue;
1544 			}
1545 
1546 			if (meta_lock(sp, TRUE, ep) != 0) {
1547 				mde_perror(ep, "");
1548 				md_exit(local_sp, 1);
1549 			}
1550 
1551 			/* If this node isn't joined to set, do nothing */
1552 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1553 				if (!mdisok(ep)) {
1554 					mde_perror(ep, gettext("Could "
1555 					    "not get set %s ownership"),
1556 					    sp->setname);
1557 					md_exit(sp, 1);
1558 				}
1559 				mdclrerror(ep);
1560 				meta_unlock(sp, ep);
1561 				continue;
1562 			}
1563 
1564 			meta_mc_log(MC_LOG3, gettext("Step3 - begin "
1565 			    "re-initialising rpc.mdcommd and resetting mirror "
1566 			    "owners for set %s: %s"), sp->setname,
1567 			    meta_print_hrtime(gethrtime() - start_time));
1568 
1569 			/* reinitialise rpc.mdcommd with new nodelist */
1570 			if (mdmn_reinit_set(setno)) {
1571 				md_eprintf(gettext(
1572 				    "Could not re-initialise rpc.mdcommd for "
1573 				    "set %s\n"), sp->setname);
1574 				md_exit(sp, 1);
1575 			}
1576 
1577 			(void) memset(&cfg, 0, sizeof (cfg));
1578 			cfg.c_id = 0;
1579 			cfg.c_setno = sp->setno;
1580 			if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1581 			    NULL) != 0) {
1582 				mdstealerror(ep, &cfg.c_mde);
1583 				mde_perror(ep, gettext("Could "
1584 				    "not get set %s information"),
1585 				    sp->setname);
1586 				md_exit(sp, 1);
1587 			}
1588 
1589 			/* Don't do anything else if set is stale */
1590 			if (cfg.c_flags & MDDB_C_STALE) {
1591 				meta_unlock(sp, ep);
1592 				mdclrerror(ep);
1593 				continue;
1594 			}
1595 
1596 			/* reset mirror owners */
1597 			if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) {
1598 				md_exit(sp, 1);
1599 			}
1600 
1601 			meta_unlock(sp, ep);
1602 
1603 			meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd "
1604 			    "re-initialised and mirror owners reset for "
1605 			    "set %s: %s"), sp->setname,
1606 			    meta_print_hrtime(gethrtime() - start_time));
1607 		}
1608 
1609 		meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"),
1610 		    meta_print_hrtime(gethrtime() - start_time));
1611 
1612 		break;
1613 
1614 	case MC_STEP4:
1615 		/*
1616 		 * Step 4
1617 		 *
1618 		 * For all multinode sets do:
1619 		 * - Resume the rpc.mdcommd messages.  Must resume all
1620 		 *	sets before issuing I/O to any set since an error
1621 		 * 	encountered in a commd suspended set could be
1622 		 *	blocked waiting for commd in another set to resume.
1623 		 *	(This happens since the daemon queues service
1624 		 *	all sets).  An open of a soft partition causes
1625 		 *	a read of the watermarks during the open.
1626 		 * - If set is non-writable (not an owner or STALE), then
1627 		 *	continue to next set.
1628 		 *
1629 		 * For all multinode sets do,
1630 		 * - Reset ABR states for all mirrors, ie clear ABR if not
1631 		 *	open on any node.
1632 		 * - Reset ABR states for all soft partitions, ie clear ABR if
1633 		 *	not open on any node.
1634 		 * - For all slave nodes that have entered through the start
1635 		 *	step, update the ABR state to that of the master and
1636 		 *	get the submirror state from the master
1637 		 * - meta_lock set
1638 		 * - Resync all mirrors
1639 		 * - unlock meta_lock for this set.
1640 		 * - Choose a new owner for any orphaned resyncs
1641 		 *
1642 		 * There is one potential issue here when concurrently
1643 		 * resetting and updating the ABR state. If the master has ABR
1644 		 * set, but should no longer have because the only node that
1645 		 * had the metadevice open and had ABR set has panicked, the
1646 		 * master will send a message to all nodes to clear the ABR
1647 		 * state. Meanwhile any node that has come through the
1648 		 * start step will get tstate from the master and will update
1649 		 * ABR if it was set in tstate. So, we appear to have a problem
1650 		 * if the following sequence occurs:-
1651 		 * - The slave gets tstate with ABR set
1652 		 * - The master sends a message to clear ABR
1653 		 * - The slave updates ABR with the value it got from tstate.
1654 		 * We now have the master with ABR clear and the slave with ABR
1655 		 * set. Fortunately, having set ABR, the slave will close the
1656 		 * metadevice after setting ABR and as there are no nodes with
1657 		 * the device open, the close will send a message to clear ABR
1658 		 * on all nodes. So, the nodes will all have ABR unset.
1659 		 */
1660 
1661 		/* expect the nodelist to follow the step name */
1662 		if (argc < 1)
1663 			usage(sp, 1);
1664 
1665 		meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"),
1666 		    meta_print_hrtime(0));
1667 
1668 		/*
1669 		 * Does local set exist? If not, exit with 0
1670 		 * since there's no reason to have this node panic if
1671 		 * the local set cannot be started.
1672 		 */
1673 		if ((local_sp = load_local_set(ep)) == NULL) {
1674 			md_exit(local_sp, 0);
1675 		}
1676 
1677 		/*
1678 		 * walk through all sets on this node which could include:
1679 		 *	- MN disksets
1680 		 *	- traditional disksets
1681 		 *	- non-existent disksets
1682 		 * start mirror resync for all MN sets
1683 		 */
1684 		if ((max_sets = get_max_sets(ep)) == 0) {
1685 			mde_perror(ep, "");
1686 			md_exit(local_sp, 1);
1687 		}
1688 
1689 		/* Clear set_info structure */
1690 		for (setno = 1; setno < max_sets; setno++) {
1691 			set_info[setno] = 0;
1692 		}
1693 
1694 		/* start walking through all possible disksets */
1695 		for (setno = 1; setno < max_sets; setno++) {
1696 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1697 				if (mdiserror(ep, MDE_NO_SET)) {
1698 					/* No set for this setno - continue */
1699 					mdclrerror(ep);
1700 					continue;
1701 				} else {
1702 					mde_perror(ep, gettext("Unable to "
1703 					    "get set %d information"), setno);
1704 					md_exit(local_sp, 1);
1705 				}
1706 			}
1707 
1708 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1709 				mde_perror(ep, gettext("Unable to get set "
1710 				    "%s desc information"), sp->setname);
1711 				mdclrerror(ep);
1712 				continue;
1713 			}
1714 
1715 			/* only check multi-node disksets */
1716 			if (!meta_is_mn_set(sp, ep)) {
1717 				mdclrerror(ep);
1718 				continue;
1719 			}
1720 
1721 			set_info[setno] |= SET_INFO_MN;
1722 
1723 			/*
1724 			 * If not an owner (all mddbs failed) or stale
1725 			 * (< 50% mddbs operational), then set is
1726 			 * non-writable so just resume commd and
1727 			 * unblock mddb messages.
1728 			 */
1729 			mdclrerror(ep);
1730 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1731 				set_info[setno] |= SET_INFO_NO_WR;
1732 			}
1733 			if (!mdisok(ep)) {
1734 				mde_perror(ep, gettext("Could "
1735 				    "not get set %s ownership"),
1736 				    sp->setname);
1737 				md_exit(local_sp, 1);
1738 			}
1739 			/* Set is owned - is it stale? */
1740 			if (!set_info[setno] & SET_INFO_NO_WR) {
1741 				(void) memset(&cfg, 0, sizeof (cfg));
1742 				cfg.c_id = 0;
1743 				cfg.c_setno = sp->setno;
1744 				if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1745 				    NULL) != 0) {
1746 					mdstealerror(ep, &cfg.c_mde);
1747 					mde_perror(ep, gettext("Could "
1748 					    "not get set %s information"),
1749 					    sp->setname);
1750 					md_exit(local_sp, 1);
1751 				}
1752 				if (cfg.c_flags & MDDB_C_STALE) {
1753 					set_info[setno] |= SET_INFO_NO_WR;
1754 				}
1755 			}
1756 
1757 			/* resume rpc.mdcommd */
1758 			if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0)) {
1759 				md_eprintf(gettext("Unable to resume "
1760 				    "rpc.mdcommd for set %s\n"), sp->setname);
1761 				md_exit(local_sp, 1);
1762 			}
1763 			meta_ping_mnset(setno);
1764 
1765 			/* Unblock mddb parse messages */
1766 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1767 				(void) memset(&mbp, 0, sizeof (mbp));
1768 				mbp.c_setno = setno;
1769 				mbp.c_blk_flags = MDDB_UNBLOCK_PARSE;
1770 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1771 				    &mbp.c_mde, NULL)) {
1772 					mdstealerror(ep, &mbp.c_mde);
1773 					mde_perror(ep, gettext("Could not "
1774 					    "unblock set %s"), sp->setname);
1775 					md_exit(local_sp, 1);
1776 				}
1777 			}
1778 			meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd "
1779 			    "resumed and messages unblocked for set %s: %s"),
1780 			    sp->setname,
1781 			    meta_print_hrtime(gethrtime() - start_time));
1782 		}
1783 
1784 		for (setno = 1; setno < max_sets; setno++) {
1785 			int			start_step;
1786 
1787 			/* Skip traditional disksets. */
1788 			if ((set_info[setno] & SET_INFO_MN) == 0)
1789 				continue;
1790 
1791 			/*
1792 			 * If already determined that this set is
1793 			 * a non-writable set, then just continue
1794 			 * to next set since there's nothing else
1795 			 * to do for a non-writable set.
1796 			 */
1797 			if (set_info[setno] & SET_INFO_NO_WR)
1798 				continue;
1799 
1800 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1801 				if (mdiserror(ep, MDE_NO_SET)) {
1802 					/* No set for this setno - continue */
1803 					mdclrerror(ep);
1804 					continue;
1805 				} else {
1806 					mde_perror(ep, gettext("Unable to "
1807 					    "get set %d information"), setno);
1808 					md_exit(local_sp, 1);
1809 				}
1810 			}
1811 
1812 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1813 				mde_perror(ep, gettext("Unable to get set "
1814 				    "%s desc information"), sp->setname);
1815 				mdclrerror(ep);
1816 				continue;
1817 			}
1818 
1819 			/* See if this node came through the start step */
1820 			(void) memset(&sf, 0, sizeof (sf));
1821 			sf.sf_setno = sp->setno;
1822 			sf.sf_flags = MDDB_NM_GET;
1823 			/* Use magic to help protect ioctl against attack. */
1824 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1825 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1826 			    &sf.sf_mde, NULL)) {
1827 				mdstealerror(ep, &sf.sf_mde);
1828 				mde_perror(ep, gettext("Could not get "
1829 				    "start_step flag for set %s"), sp->setname);
1830 				md_exit(local_sp, 1);
1831 			}
1832 			start_step =
1833 			    (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0;
1834 
1835 			/*
1836 			 * We can now reset the start_step flag for the set
1837 			 * if it was already set.
1838 			 */
1839 			if (start_step) {
1840 				(void) memset(&sf, 0, sizeof (sf));
1841 					sf.sf_setno = sp->setno;
1842 				sf.sf_setflags = MD_SET_MN_START_RC;
1843 				sf.sf_flags = MDDB_NM_RESET;
1844 				/*
1845 				 * Use magic to help protect ioctl
1846 				 * against attack.
1847 				 */
1848 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1849 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1850 				    &sf.sf_mde, NULL)) {
1851 					mdstealerror(ep, &sf.sf_mde);
1852 					mde_perror(ep,
1853 					    gettext("Could not reset "
1854 					    "start_step flag for set %s"),
1855 					    sp->setname);
1856 				}
1857 			}
1858 
1859 			meta_mc_log(MC_LOG3, gettext("Step4 - begin setting "
1860 			    "ABR state and restarting io's for "
1861 			    "set %s: %s"), sp->setname,
1862 			    meta_print_hrtime(gethrtime() - start_time));
1863 
1864 
1865 			/*
1866 			 * If we are not the master and we have come through
1867 			 * the start step, we must update the ABR states
1868 			 * for mirrors and soft partitions. Also the submirror
1869 			 * states need to be synchronised so that we see the
1870 			 * same status as other previously joined members.
1871 			 * This _must_ be done before starting the resync.
1872 			 */
1873 			if (!(sd->sd_mn_am_i_master) && start_step) {
1874 				if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR,
1875 				    ep) == -1) {
1876 					md_exit(local_sp, 1);
1877 				}
1878 				if (reset_state(UPDATE_ABR, sp, MD_SP,
1879 				    ep) == -1) {
1880 					md_exit(local_sp, 1);
1881 				}
1882 				/*
1883 				 * Mark the fact that we've got the mirror
1884 				 * state. This allows the resync thread to
1885 				 * determine if _it_ needs to issue this. This
1886 				 * can happen if a node is added to a set after
1887 				 * a reconfig cycle has completed.
1888 				 */
1889 				(void) memset(&sf, 0, sizeof (sf));
1890 					sf.sf_setno = sp->setno;
1891 				sf.sf_setflags = MD_SET_MN_MIR_STATE_RC;
1892 				sf.sf_flags = MDDB_NM_SET;
1893 				/*
1894 				 * Use magic to help protect ioctl
1895 				 * against attack.
1896 				 */
1897 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1898 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1899 				    &sf.sf_mde, NULL)) {
1900 					mdstealerror(ep, &sf.sf_mde);
1901 					mde_perror(ep,
1902 					    gettext("Could not set "
1903 					    "submirror state flag for set %s"),
1904 					    sp->setname);
1905 				}
1906 			}
1907 
1908 			/*
1909 			 * All remaining actions are only performed by the
1910 			 * master
1911 			 */
1912 			if (!(sd->sd_mn_am_i_master)) {
1913 				if (meta_lock(sp, TRUE, ep) != 0) {
1914 					mde_perror(ep, "");
1915 					md_exit(local_sp, 1);
1916 				}
1917 				meta_mirror_resync_unblock(sp);
1918 				meta_unlock(sp, ep);
1919 				continue;
1920 			}
1921 
1922 			/*
1923 			 * If the master came through the start step, this
1924 			 * implies that all of the nodes must have done the
1925 			 * same and hence there can be no applications
1926 			 * running. Hence no need to reset ABR
1927 			 */
1928 			if (!start_step) {
1929 				/* Reset ABR state for mirrors */
1930 				if (reset_state(RESET_ABR, sp, MD_MIRROR,
1931 				    ep) == -1) {
1932 					md_exit(local_sp, 1);
1933 				}
1934 				/* ...and now the same for soft partitions */
1935 				if (reset_state(RESET_ABR, sp, MD_SP,
1936 				    ep) == -1) {
1937 					md_exit(local_sp, 1);
1938 				}
1939 			}
1940 
1941 			/*
1942 			 * choose owners for orphaned resyncs and reset
1943 			 * non-orphaned resyncs so that an owner node that
1944 			 * reboots will restart the resync if needed.
1945 			 */
1946 			if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1)
1947 				md_exit(local_sp, 1);
1948 
1949 			/*
1950 			 * Must unlock set lock before meta_mirror_resync_all
1951 			 * sends a message to run the metasync command
1952 			 * which also grabs the meta_lock.
1953 			 */
1954 			if (meta_lock(sp, TRUE, ep) != 0) {
1955 				mde_perror(ep, "");
1956 				md_exit(local_sp, 1);
1957 			}
1958 			meta_mirror_resync_unblock(sp);
1959 			meta_unlock(sp, ep);
1960 
1961 			/* resync all mirrors in set */
1962 			if (meta_mirror_resync_all(sp, 0, ep) != 0) {
1963 				mde_perror(ep, gettext("Mirror resyncs "
1964 				    "failed for set %s"), sp->setname);
1965 				md_exit(local_sp, 1);
1966 			}
1967 
1968 			meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted "
1969 			    "for set %s: %s"), sp->setname,
1970 			    meta_print_hrtime(gethrtime() - start_time));
1971 		}
1972 
1973 		meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"),
1974 		    meta_print_hrtime(gethrtime() - start_time));
1975 
1976 		break;
1977 
1978 	default:
1979 		usage(sp, 1);
1980 		break;
1981 	}
1982 
1983 	md_exit(sp, 0);
1984 	/* NOTREACHED */
1985 	return (0);
1986 }
1987