xref: /titanic_41/usr/src/cmd/lvm/util/metaclust.c (revision 0b6016e6ff70af39f99c9cc28e0c2207c8f5413c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <meta.h>
30 #include <sdssc.h>
31 #include <signal.h>
32 #include <syslog.h>
33 #include <sys/types.h>
34 #include <sys/wait.h>
35 #include <sys/lvm/md_mirror.h>
36 #include <metad.h>
37 
#define	MY_VERSION		"1.0"	/* the highest supported version */
#define	MAX_DEBUG_LEVEL		5	/* maximum verbosity level */

/*
 * Action flags for the "mode" argument of reset_state().
 * They are tested individually with '&', so several may be OR'ed together.
 */
#define	RESET_OWNER		0x0001	/* clear owner that left membership */
#define	CHOOSE_OWNER		0x0002	/* choose/re-establish mirror owner */
#define	RESET_ABR		0x0004	/* open+close top level devices */
#define	UPDATE_ABR		0x0008	/* copy ABR state from the master */
#define	GET_MIRROR_STATE	0x0010	/* issue MD_MN_GET_MIRROR_STATE */

/*
 * NOTE(review): the consumers of these two flags are not visible in this
 * part of the file; presumably they are per-set bits kept in main()'s
 * set_info[] array - confirm before relying on that.
 */
#define	SET_INFO_NO_WR	0x0002
#define	SET_INFO_MN	0x0004
49 
/*
 * Every metaclust reconfig step we understand.
 * MC_UNK (zero) is the "no/unrecognised step" value that the global
 * stepnum starts out with.
 */
typedef enum stpnum {
	MC_UNK		= 0,
	MC_START	= 1,
	MC_STOP		= 2,
	MC_ABORT	= 3,
	MC_RETURN	= 4,
	MC_STEP1	= 5,
	MC_STEP2	= 6,
	MC_STEP3	= 7,
	MC_STEP4	= 8
} stepnum_t;
64 
65 /*
66  * Structure for step_name -> step_number mapping
67  */
68 struct step_t {
69 	char		*step_nam;
70 	stepnum_t	step_num;
71 };
72 
73 /*
74  * Step name to step number mapping table
75  * This table MUST be sorted alphabetically in ascending order of step name
76  */
77 static struct step_t step_table[] = {
78 	{ "abort",	MC_ABORT },
79 	{ "return",	MC_RETURN },
80 	{ "start",	MC_START },
81 	{ "step1",	MC_STEP1 },
82 	{ "step2",	MC_STEP2 },
83 	{ "step3",	MC_STEP3 },
84 	{ "step4",	MC_STEP4 },
85 	{ "stop",	MC_STOP }
86 };
87 
88 /*
89  * If support for a different version is added, the new version number should
90  * be appended to the version_table below. This list will be searched to
91  * determine if a version requested via the -V option is supported or not.
92  */
93 static char *version_table[] = {
94 	MY_VERSION
95 };
96 
97 uint_t	timeout = 0;			/* disable timeout by default */
98 char	*version = MY_VERSION;		/* use latest version by default */
99 int	stepnum = MC_UNK;		/* reconfiguration step number */
100 pid_t	c_pid;				/* child process id */
101 
102 /*
103  * Binary search comparison routine
104  */
105 static int
106 mc_compare(const void *stp1, const void *stp2)
107 {
108 	return (strcmp((const char *)stp1,
109 	    ((const struct step_t *)stp2)->step_nam));
110 }
111 
112 /*
113  * Timeout expiry alarm signal handler
114  */
115 /*ARGSUSED*/
116 static void
117 sigalarmhandler(int sig)
118 {
119 	int	i, n, ret, stat_loc = 0;
120 
121 	n = sizeof (step_table) / sizeof (step_table[0]);
122 	for (i = 0; i < n; i++) {
123 		if (stepnum == step_table[i].step_num)
124 			break;
125 	}
126 
127 	assert(i != n);
128 
129 	meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"),
130 	    step_table[i].step_nam,
131 	    meta_print_hrtime(gethrtime() - start_time));
132 
133 	if ((ret = kill(c_pid, SIGKILL)) == 0) {
134 		/*
135 		 * The child will wait forever until the status is retrieved
136 		 * so get it now. Keep retrying if the call is interrupted.
137 		 *
138 		 * The possible results are,
139 		 *
140 		 *	- child killed successfully
141 		 *	- signal sent but child not killed
142 		 *	- waitpid failed/interrupted
143 		 */
144 		sleep(2);
145 		while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) {
146 			if (errno != EINTR) {
147 				break;
148 			}
149 		}
150 		if ((ret == c_pid) || (errno == ECHILD)) {
151 			ret = 0;
152 		} else {
153 			ret = 1;
154 		}
155 	} else if (errno == ESRCH) {
156 		/*
157 		 * If the kill did not catch the child then it means the child
158 		 * exited immediately after the timeout occured.
159 		 */
160 		ret = 0;
161 	}
162 
163 	/*
164 	 * make sure not to exit with 205 for any steps other than step1-step4.
165 	 * Suncluster reconfiguration can't handle it otherwise.
166 	 */
167 	switch (stepnum) {
168 	case MC_STEP1:
169 	case MC_STEP2:
170 	case MC_STEP3:
171 	case MC_STEP4:
172 		/*
173 		 * If the child was killed successfully return 205 for a
174 		 * new reconfig cycle otherwise send 1 to panic the node.
175 		 */
176 		if (ret != 0) {
177 			md_eprintf(gettext("Could not kill child\n"));
178 			exit(1);
179 		} else {
180 			exit(205);
181 		}
182 		break;
183 	case MC_START:
184 	case MC_STOP:
185 	case MC_ABORT:
186 	case MC_RETURN:
187 	default:
188 		exit(1);
189 		break;
190 	}
191 }
192 
193 /*
194  * Attempt to load local set.
195  * Returns:
196  *	pointer to mdsetname_t for local set (local_sp) is successful.
197  *	0 if failure
198  *		if there are no local set mddbs, no error message is printed.
199  *		Otherwise, error message is printed so that user
200  *		can determine why the local set didn't start.
201  */
202 mdsetname_t *
203 load_local_set(md_error_t *ep)
204 {
205 	mdsetname_t	*local_sp = NULL;
206 
207 	/* Does local set exist? If not, give no error */
208 	if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) {
209 		return (0);
210 	}
211 
212 	/*
213 	 * snarf local set
214 	 * If fails with MDE_DB_NODB, then just return 1 printing
215 	 * no failure.
216 	 * Otherwise, print error message, and return 1.
217 	 */
218 	if (meta_setup_db_locations(ep) != 0) {
219 		if (!(mdismddberror(ep, MDE_DB_NODB)))
220 			mde_perror(ep, "");
221 		return (0);
222 	}
223 
224 	/* local set loaded successfully */
225 	return (local_sp);
226 }
227 
228 /*
229  * Purpose:	Compose a full path name for a metadevice
230  *
231  * On entry:	sp	- setname pointer
232  *		mnum	- minor number of metadevice
233  *		pathname - pointer to array to return path string
234  *		pathlen	- max length of pathname array
235  */
236 static int
237 compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen)
238 {
239 	int	rtn;
240 	mdname_t	*np;
241 	md_error_t	status = mdnullerror;
242 
243 	if (MD_MIN2SET(mnum) != sp->setno) {
244 		md_eprintf(gettext("minor number 0x%x invalid for set %d\n"),
245 		    mnum, sp->setno);
246 		return (-1);
247 	}
248 
249 	if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) {
250 		return (-1);
251 	}
252 
253 	rtn = snprintf(pathname, pathlen, "%s", np->rname);
254 
255 	if ((pathname[0] == '\0') || (rtn >= pathlen)) {
256 		md_eprintf(gettext(
257 		    "Could not create path for device %s\n"),
258 		    get_mdname(sp, mnum));
259 		return (-1);
260 	}
261 	return (0);
262 }
263 
264 /*
265  * Purpose:	Walk through all the devices specified for the given set
266  *		and do the action specified in mode
267  */
268 static int
269 reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep)
270 {
271 	mdnamelist_t			*devnlp = NULL;
272 	mdnamelist_t			*p;
273 	mdname_t			*devnp = NULL;
274 	md_set_mmown_params_t		ownpar_p;
275 	md_set_mmown_params_t		*ownpar = &ownpar_p;
276 	md_unit_t			*mm;
277 	int				mirror_dev = 0;
278 	mndiskset_membershiplist_t	*nl;
279 	int				cnt;
280 	int				has_parent;
281 	md_mn_get_mir_state_t		mir_state_p;
282 	md_mn_get_mir_state_t		*mir_state = &mir_state_p;
283 
284 	/*
285 	 * if we are choosing or resetting the owners then make sure
286 	 * we are only doing it for mirror devices
287 	 */
288 	mirror_dev = (strcmp(MD_MIRROR, drivername) == 0);
289 	if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) {
290 		return (-1);
291 	}
292 
293 	/* get a list of all the metadevices for current set */
294 	if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) {
295 		mde_perror(ep, gettext("Could not get mirrors for set %s"),
296 		    sp->setname);
297 		return (-1);
298 	} else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
299 		mde_perror(ep, gettext(
300 		    "Could not get soft partitions for set %s"), sp->setname);
301 		return (-1);
302 	}
303 
304 	/* If resetting the owner, get the known membership list */
305 	if (mode & RESET_OWNER) {
306 		if (meta_read_nodelist(&cnt, &nl, ep)) {
307 			mde_perror(ep, "Could not get nodelist");
308 			return (-1);
309 		}
310 	}
311 
312 	/* for each metadevice */
313 	for (p = devnlp; (p != NULL); p = p->next) {
314 		devnp = p->namep;
315 
316 		/*
317 		 * Get the current setting for mirror ABR state and all of the
318 		 * submirror state and flags from the master node. We only
319 		 * perform this when going through a 'start' cycle.
320 		 */
321 		if ((mode & GET_MIRROR_STATE) && mirror_dev) {
322 			char	*miscname;
323 
324 			/*
325 			 * Ensure that we ignore soft-parts that are returned
326 			 * from the meta_get_mirror_names() call
327 			 */
328 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
329 				goto out;
330 			if (strcmp(miscname, MD_MIRROR) != 0)
331 				continue;
332 
333 			mir_state->mnum = meta_getminor(devnp->dev);
334 			MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno);
335 			meta_mc_log(MC_LOG4, gettext("Getting mirror state"
336 			    " for %s: %s"), get_mdname(sp, mir_state->mnum),
337 			    meta_print_hrtime(gethrtime() - start_time));
338 
339 			if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep,
340 			    "MD_MN_GET_MIRROR_STATE") != 0) {
341 				mde_perror(ep, gettext("Unable to get "
342 				    "mirror state for %s"),
343 				    get_mdname(sp, mir_state->mnum));
344 				goto out;
345 			} else {
346 				continue;
347 			}
348 		}
349 
350 		/* check if this is a top level metadevice */
351 		if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL)
352 			goto out;
353 		if (MD_HAS_PARENT(MD_PARENT(mm))) {
354 			has_parent = 1;
355 		} else {
356 			has_parent = 0;
357 		}
358 		Free(mm);
359 
360 		if (mode & (RESET_OWNER | CHOOSE_OWNER)) {
361 			char	*miscname;
362 
363 			/*
364 			 * we can only do these for mirrors so make sure we
365 			 * really have a mirror device and not a softpartition
366 			 * imitating one. meta_get_mirror_names seems to think
367 			 * softparts on top of a mirror are mirrors!
368 			 */
369 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
370 				goto out;
371 			if (strcmp(miscname, MD_MIRROR) != 0)
372 				continue;
373 
374 			(void) memset(ownpar, 0, sizeof (*ownpar));
375 			ownpar->d.mnum = meta_getminor(devnp->dev);
376 			MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno);
377 
378 			meta_mc_log(MC_LOG4, gettext("Setting owner "
379 			    "for %s: %s"), get_mdname(sp, ownpar->d.mnum),
380 			    meta_print_hrtime(gethrtime() - start_time));
381 
382 			/* get the current owner id */
383 			if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep,
384 			    "MD_MN_GET_MM_OWNER") != 0) {
385 				mde_perror(ep, gettext("Unable to get "
386 				    "mirror owner for %s"),
387 				    get_mdname(sp, ownpar->d.mnum));
388 				goto out;
389 			}
390 		}
391 
392 		if (mode & RESET_OWNER) {
393 			if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) {
394 				mdclrerror(ep);
395 				continue;
396 			}
397 
398 			/*
399 			 * reset owner only if the current owner is
400 			 * not in the membership list
401 			 * Also kill the resync thread so that when the resync
402 			 * is started, it will perform an optimized resync
403 			 * for any resync regions that were dirty when the
404 			 * current owner left the membership.
405 			 */
406 			if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) {
407 				if (meta_mn_change_owner(&ownpar,
408 				    sp->setno, ownpar->d.mnum,
409 				    MD_MN_MIRROR_UNOWNED,
410 				    MD_MN_MM_ALLOW_CHANGE) == -1) {
411 					md_eprintf(gettext(
412 					    "Unable to reset mirror owner "
413 					    "for %s\n"),
414 					    get_mdname(sp, ownpar->d.mnum));
415 					goto out;
416 				}
417 				if (meta_mirror_resync(sp, devnp, 0, ep,
418 				    MD_RESYNC_KILL_NO_WAIT) != 0) {
419 					md_eprintf(gettext(
420 					    "Unable to kill resync for"
421 					    " %s\n"),
422 					    get_mdname(sp, ownpar->d.mnum));
423 					goto out;
424 				}
425 			}
426 		}
427 
428 		if (mode & CHOOSE_OWNER) {
429 			/*
430 			 * only orphaned resyncs will have no owner.
431 			 * if that is the case choose a new owner. Otherwise
432 			 * re-establish the existing owner. This covers the
433 			 * case where a node that owned the mirror
434 			 * reboots/panics and comes back into the cluster before
435 			 * the reconfig cycle has completed. In this case the
436 			 * other cluster nodes will have the mirror owner marked
437 			 * as the rebooted node while it has the owner marked
438 			 * as 'None'. We have to reestablish the ownership so
439 			 * that the subsequent resync can continue.
440 			 */
441 			if (meta_mn_change_owner(&ownpar, sp->setno,
442 			    ownpar->d.mnum, ownpar->d.owner,
443 			    MD_MN_MM_CHOOSE_OWNER) == -1) {
444 				md_eprintf(gettext("Unable to choose "
445 				    "mirror owner for %s\n"),
446 				    get_mdname(sp, ownpar->d.mnum));
447 				goto out;
448 			}
449 		}
450 
451 		/*
452 		 * For RESET_ABR and UPDATE_ABR - only handle top
453 		 * level metadevices.
454 		 */
455 		if (has_parent)
456 			continue;
457 
458 		if (mode & RESET_ABR) {
459 			/*
460 			 * Reset the ABR (application based recovery)
461 			 * value on all nodes. We are dealing with
462 			 * the possibility that we have ABR set but the
463 			 * only node that had the device open with ABR has
464 			 * left the cluster. We simply open and close the
465 			 * device and if this is the last close in the
466 			 * cluster, ABR will be cleared on all nodes.
467 			 */
468 			char		*miscname;
469 			char		name[MAXPATHLEN];
470 			int		mnum, fd;
471 
472 			name[0] = '\0';
473 			mnum = meta_getminor(devnp->dev);
474 
475 			/*
476 			 * Ensure that we don't include soft-parts in the
477 			 * mirror-only call to RESET_ABR. meta_get_mirror_names
478 			 * returns a bogus list that includes all soft-parts
479 			 * built on mirrors.
480 			 */
481 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
482 				goto out;
483 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
484 				continue;
485 
486 			meta_mc_log(MC_LOG4, gettext("Re-setting ABR state "
487 			    "for %s: %s"), get_mdname(sp, mnum),
488 			    meta_print_hrtime(gethrtime() - start_time));
489 
490 			/* compose the absolute device path and open it */
491 			if (compose_path(sp, mnum, &name[0],
492 			    sizeof (name)) != 0)
493 				goto out;
494 			if ((fd = open(name, O_RDWR, 0)) < 0) {
495 				md_perror(gettext("Could not open device %s"),
496 				    name);
497 				continue;
498 			}
499 
500 			(void) close(fd);
501 		}
502 
503 		if (mode & UPDATE_ABR) {
504 			/*
505 			 * Update the ABR value on this node. We obtain the
506 			 * current ABR state from the master node.
507 			 */
508 
509 			char		*miscname;
510 			char		name[MAXPATHLEN];
511 			int		mnum, fd;
512 			volcap_t	vc;
513 			uint_t		tstate;
514 
515 			name[0] = '\0';
516 			mnum = meta_getminor(devnp->dev);
517 
518 			/*
519 			 * Ensure that we don't include soft-parts in the
520 			 * mirror-only call to UPDATE_ABR. meta_get_mirror_names
521 			 * returns a bogus list that includes all soft-parts
522 			 * built on mirrors.
523 			 */
524 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
525 				goto out;
526 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
527 				continue;
528 
529 			/* Get tstate from Master */
530 			if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep)
531 			    != 0)
532 				continue;
533 			/* If not set on the master, nothing to do */
534 			if (!(tstate & MD_ABR_CAP))
535 				continue;
536 
537 			meta_mc_log(MC_LOG4, gettext("Updating ABR state "
538 			    "for %s: %s"), get_mdname(sp, mnum),
539 			    meta_print_hrtime(gethrtime() - start_time));
540 
541 			/* compose the absolute device path and open it */
542 			if (compose_path(sp, mnum, &name[0],
543 			    sizeof (name)) != 0)
544 				goto out;
545 			if ((fd = open(name, O_RDWR, 0)) < 0) {
546 				md_perror(gettext("Could not open device %s"),
547 				    name);
548 				continue;
549 			}
550 
551 			/* set ABR state */
552 			vc.vc_info = 0;
553 			vc.vc_set = 0;
554 			if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
555 				/*
556 				 * Ignore if device does not support this
557 				 * ioctl
558 				 */
559 				if ((errno != ENOTTY) && (errno != ENOTSUP)) {
560 					md_perror(gettext("Could not get "
561 					    "ABR/DMR state for device %s"),
562 					    name);
563 				}
564 				(void) close(fd);
565 				continue;
566 			}
567 			if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) {
568 				(void) close(fd);
569 				continue;
570 			}
571 
572 			vc.vc_set = DKV_ABR_CAP;
573 			if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
574 				md_perror(gettext(
575 				    "Could not set ABR state for "
576 				    "device %s"), name);
577 				(void) close(fd);
578 				goto out;
579 			} else {
580 				md_eprintf(gettext(
581 				    "Setting ABR state on device %s\n"), name);
582 			}
583 
584 			(void) close(fd);
585 		}
586 	}
587 
588 	/* cleanup */
589 	if (mode & RESET_OWNER) {
590 		meta_free_nodelist(nl);
591 	}
592 	metafreenamelist(devnlp);
593 	return (0);
594 
595 out:
596 	/* cleanup */
597 	if (mode & RESET_OWNER) {
598 		meta_free_nodelist(nl);
599 	}
600 	metafreenamelist(devnlp);
601 	return (-1);
602 }
603 
604 /*
605  * Print usage message
606  */
607 static void
608 usage(mdsetname_t *sp, int eval)
609 {
610 	(void) fprintf(stderr, gettext("usage:"
611 	    "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n"
612 	    "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n"
613 	    "\t%s [-V version] [-t timeout] [-d level] abort | stop\n"
614 	    "\t%s [-V | -? | -h]\n"),
615 	    myname, myname, myname, myname);
616 	if (!eval) {
617 		fprintf(stderr, gettext("\n"
618 		    "\tValid debug (-d) levels are 1-%d for increasing "
619 		    "verbosity.\n\tDefault is -d 3.\n\n"
620 		    "\tValid step values are: return | step1 | step2 | "
621 		    "step3 | step4\n\n"
622 		    "\tNodelist is a space-separated list of node id's\n\n"),
623 		    MAX_DEBUG_LEVEL);
624 	}
625 	md_exit(sp, eval);
626 }
627 
628 /*
629  * Input:	Input takes a config step name followed by a list of
630  *		possible node id's.
631  *
632  * Returns:	  0 - Success
633  *		  1 - Fail
634  *			Node will be removed from cluster membership
635  *			by forcing node to panic.
636  *		205 - Unsuccessful. Start another reconfig cycle.
637  *			Problem was encountered that could be fixed by
638  *			running another reconfig cycle.
639  *			Problem could be a result of a failure to read
640  *			the nodelist file or that all work could not be
641  *			accomplished in a reconfig step in the amount of
642  *			time given so another reconfig cycle is needed in
643  *			order to finish the current step.
644  */
645 int
646 main(int argc, char **argv)
647 {
648 	mdsetname_t		*sp = NULL;
649 	md_error_t		status = mdnullerror;
650 	md_error_t		*ep = &status;
651 	set_t			max_sets, setno;
652 	int			c, clust = 0;
653 	struct sigaction	nsa, osa;
654 	struct step_t		*step_ptr;
655 	mdsetname_t		*local_sp = NULL;
656 	md_drive_desc		*dd;
657 	int			rval = 0;
658 	md_set_desc		*sd;
659 	mddb_block_parm_t	mbp;
660 	uint_t			debug = 3; /* log upto MC_LOG3 by default */
661 	int			version_table_size;
662 	mddb_setflags_config_t	sf;
663 	int			ret_val;
664 	mddb_config_t		cfg;
665 	int			set_info[MD_MAXSETS];
666 
667 	/*
668 	 * Get the locale set up before calling any other routines
669 	 * with messages to output.  Just in case we're not in a build
670 	 * environment, make sure that TEXT_DOMAIN gets set to
671 	 * something.
672 	 */
673 #if !defined(TEXT_DOMAIN)
674 #define	TEXT_DOMAIN "SYS_TEST"
675 #endif
676 	(void) setlocale(LC_ALL, "");
677 	(void) textdomain(TEXT_DOMAIN);
678 
679 	if ((clust = sdssc_bind_library()) == SDSSC_ERROR) {
680 		md_eprintf(gettext("Interface error with libsds_sc.so\n"));
681 		exit(1);
682 	}
683 
684 	if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) {
685 		mde_perror(ep, "");
686 		md_exit(sp, 1);
687 	}
688 
689 	/*
690 	 * open log and enable libmeta logging. Do it here explicitly
691 	 * rather than letting md_init() do it because we are not really
692 	 * a daemon and that is what md_init() opens the log as.
693 	 */
694 	openlog("metaclust", LOG_CONS, LOG_USER);
695 
696 	version_table_size = sizeof (version_table) / sizeof (version_table[0]);
697 
698 	optind = 1;
699 	opterr = 0;
700 	while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) {
701 		switch (c) {
702 		case 'h':
703 			usage(sp, 0);
704 			break;
705 
706 		case 'd':
707 			if (sscanf(optarg, "%u", &debug) != 1) {
708 				md_eprintf(gettext("Invalid debug level\n"));
709 				md_exit(sp, 1);
710 			} else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) {
711 				debug = min(max(debug, 1), MAX_DEBUG_LEVEL);
712 				md_eprintf(gettext("Debug level must be "
713 				    "between 1 and %d inclusive.\n"),
714 				    MAX_DEBUG_LEVEL);
715 				md_eprintf(gettext("Debug level set to %d.\n"),
716 				    debug);
717 			}
718 			break;
719 
720 		case 'V':
721 			version = Strdup(optarg);
722 			break;
723 
724 		case 't':
725 			if (sscanf(optarg, "%u", &timeout) != 1) {
726 				md_eprintf(gettext("Invalid timeout value\n"));
727 				md_exit(sp, 1);
728 			}
729 			break;
730 
731 		case '?':
732 			if (optopt == '?') {
733 				usage(sp, 0);
734 			} else if (optopt == 'V') {
735 				int	i;
736 
737 				fprintf(stdout, gettext(
738 				    "%s: Versions Supported:"), myname);
739 				for (i = 0; i < version_table_size; i++) {
740 					fprintf(stdout, " %s",
741 					    version_table[i]);
742 				}
743 				fprintf(stdout, "\n");
744 				md_exit(sp, 0);
745 			}
746 			/*FALLTHROUGH*/
747 
748 		default:
749 			usage(sp, 1);
750 			break;
751 		}
752 	}
753 
754 	/* initialise the debug level and start time */
755 	setup_mc_log(debug);
756 
757 	/*
758 	 * check that the version specified (if any) is supported.
759 	 */
760 	if (version != NULL) {
761 		int	i, found = 0;
762 
763 		for (i = 0; i < version_table_size; i++) {
764 			if (strcmp(version, version_table[i]) == 0) {
765 				found = 1;
766 				break;
767 			}
768 		}
769 		if (!found) {
770 			md_eprintf(gettext("Version %s not supported\n"),
771 			    version);
772 			md_exit(sp, 1);
773 		}
774 	}
775 
776 	argc -= optind;
777 	argv += optind;
778 
779 	/* parse arguments */
780 	if (argc <= 0) {
781 		usage(sp, 1);
782 	}
783 
784 	/* convert the step name to the corresponding number */
785 	step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) /
786 	    sizeof (step_table[0])), sizeof (step_table[0]), mc_compare);
787 	if (step_ptr != NULL) {
788 		stepnum = step_ptr->step_num;
789 	}
790 
791 	--argc;
792 	++argv;
793 
794 	/* set timeout alarm signal, a value of 0 will disable timeout */
795 	if (timeout > 0) {
796 		int	stat_loc = 0;
797 
798 		c_pid = fork();
799 
800 		if (c_pid == (pid_t)-1) {
801 			md_perror(gettext("Unable to fork"));
802 			md_exit(sp, 1);
803 		} else if (c_pid) {
804 			/* parent */
805 			nsa.sa_flags = 0;
806 			if (sigfillset(&nsa.sa_mask) < 0) {
807 				md_perror(gettext("Unable to set signal mask"));
808 				md_exit(sp, 1);
809 			}
810 
811 			nsa.sa_handler = sigalarmhandler;
812 			if (sigaction(SIGALRM, &nsa, &osa) == -1) {
813 				md_perror(gettext("Unable to set alarm "
814 				    "handler"));
815 				md_exit(sp, 1);
816 			}
817 
818 			(void) alarm(timeout);
819 
820 			/*
821 			 * wait for child to exit or timeout to expire.
822 			 * keep retrying if the call is interrupted
823 			 */
824 			while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) {
825 				if (errno != EINTR) {
826 					break;
827 				}
828 			}
829 			if (ret_val == c_pid) {
830 				/* exit with the childs exit value */
831 				exit(WEXITSTATUS(stat_loc));
832 			} else if (errno == ECHILD) {
833 				md_exit(sp, 0);
834 			} else {
835 				perror(myname);
836 				md_exit(sp, 1);
837 			}
838 		}
839 	}
840 
841 	/*
842 	 * If a timeout value is given, everything from this point onwards is
843 	 * executed in the child process.
844 	 */
845 
846 	switch (stepnum) {
847 	case MC_START:
848 		/*
849 		 * Start Step
850 		 *
851 		 * - Suspend all rpc.mdcommd messages
852 		 */
853 
854 		/* expect the local node id to be given only */
855 		if (argc != 1)
856 			usage(sp, 1);
857 
858 		meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"),
859 		    meta_print_hrtime(0));
860 
861 		/*
862 		 * Does local set exist? If not, exit with 0
863 		 * since there's no reason to have this node panic if
864 		 * the local set cannot be started.
865 		 */
866 		if ((local_sp = load_local_set(ep)) == NULL) {
867 			md_exit(local_sp, 0);
868 		}
869 
870 		if ((max_sets = get_max_sets(ep)) == 0) {
871 			mde_perror(ep, "");
872 			md_exit(sp, 1);
873 		}
874 
875 		/* start walking through all possible disksets */
876 		for (setno = 1; setno < max_sets; setno++) {
877 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
878 				if (mdiserror(ep, MDE_NO_SET)) {
879 					/* No set for this setno - continue */
880 					mdclrerror(ep);
881 					continue;
882 				} else {
883 					mde_perror(ep, gettext("Unable to "
884 					    "get set %d information"), setno);
885 					md_exit(sp, 1);
886 				}
887 			}
888 
889 			/* only check multi-node disksets */
890 			if (!meta_is_mn_set(sp, ep)) {
891 				mdclrerror(ep);
892 				continue;
893 			}
894 
895 			meta_mc_log(MC_LOG3, gettext("Start - block parse "
896 			    "messages for set %s: %s"), sp->setname,
897 			    meta_print_hrtime(gethrtime() - start_time));
898 
899 			/*
900 			 * Mddb parse messages are sent amongst the nodes
901 			 * in a diskset whenever the locator block or
902 			 * locator names structure has been changed.
903 			 * A locator block change could occur as a result
904 			 * of a disk failure during the reconfig cycle,
905 			 * so block the mddb parse messages while the
906 			 * rpc.mdcommd is suspended during the reconfig cycle.
907 			 */
908 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
909 				(void) memset(&mbp, 0, sizeof (mbp));
910 				mbp.c_setno = setno;
911 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
912 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
913 				    &mbp.c_mde, NULL)) {
914 					mdstealerror(ep, &mbp.c_mde);
915 					mde_perror(ep, gettext("Could not "
916 					    "block set %s"), sp->setname);
917 					md_exit(sp, 1);
918 				}
919 			}
920 
921 			/* suspend commd and spin waiting for drain */
922 			while ((ret_val = mdmn_suspend(setno,
923 			    MD_COMM_ALL_CLASSES)) ==
924 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
925 				sleep(1);
926 			}
927 
928 			if (ret_val) {
929 				md_eprintf(gettext("Could not suspend "
930 				    "rpc.mdcommd for set %s\n"), sp->setname);
931 				md_exit(sp, 1);
932 			}
933 
934 			/*
935 			 * Set start step flag for set. This is set to indicate
936 			 * that this node entered the reconfig cycle through
937 			 * the start step.  This is used during the reconfig
938 			 * cycle to determine whether the node had entered
939 			 * through the start step or the return step.
940 			 */
941 			(void) memset(&sf, 0, sizeof (sf));
942 			sf.sf_setno = sp->setno;
943 			sf.sf_setflags = MD_SET_MN_START_RC;
944 			sf.sf_flags = MDDB_NM_SET;
945 			/* Use magic to help protect ioctl against attack. */
946 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
947 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
948 			    &sf.sf_mde, NULL)) {
949 				mdstealerror(ep, &sf.sf_mde);
950 				mde_perror(ep, gettext("Could not set "
951 				    "start_step flag for set %s"), sp->setname);
952 				md_exit(sp, 1);
953 			}
954 
955 		}
956 
957 		meta_mc_log(MC_LOG2, gettext("Start step completed: %s"),
958 		    meta_print_hrtime(gethrtime() - start_time));
959 
960 		break;
961 
962 	case MC_STOP:
963 		/*
964 		 * Stop Step
965 		 *
966 		 * - ???
967 		 */
968 
969 		/* don't expect any more arguments to follow the step name */
970 		if (argc != 0)
971 			usage(sp, 1);
972 
973 		break;
974 
975 	case MC_ABORT:
976 		/*
977 		 * Abort Step
978 		 *
979 		 * - Abort rpc.mdcommd
980 		 */
981 
982 		/* don't expect any more arguments to follow the step name */
983 		if (argc != 0)
984 			usage(sp, 1);
985 
986 		meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"),
987 		    meta_print_hrtime(0));
988 
989 		/*
990 		 * Does local set exist? If not, exit with 0
991 		 * since there's no reason to have this node panic if
992 		 * the local set cannot be started.
993 		 */
994 		if ((local_sp = load_local_set(ep)) == NULL) {
995 			md_exit(local_sp, 0);
996 		}
997 
998 		/*
999 		 * abort the rpc.mdcommd.  The abort is only issued on this node
1000 		 * meaning that the abort reconfig step is called on this
1001 		 * node before a panic while the rest of the cluster will
1002 		 * undergo a reconfig cycle.
1003 		 * There is no time relation between this node running a
1004 		 * reconfig abort and the rest of the cluster
1005 		 * running a reconfig cycle meaning that this node may
1006 		 * panic before, during or after the cluster has run
1007 		 * a reconfig cycle.
1008 		 */
1009 		mdmn_abort();
1010 
1011 		meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"),
1012 		    meta_print_hrtime(gethrtime() - start_time));
1013 
1014 		break;
1015 
1016 	case MC_RETURN:
1017 		/*
1018 		 * Return Step
1019 		 *
1020 		 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL
1021 		 *   and release local set lock.  Grabbing the local set
1022 		 *   lock allows any active metaset/metadb commands to
1023 		 *   terminate gracefully and will keep a metaset/metadb
1024 		 *   command from starting until the DRAIN ALL is issued.
1025 		 *   The metaset/metadb commands can issue
1026 		 *   DRAIN ALL/RESUME ALL commands to rpc.mdcommd,
1027 		 *   so the return step must not issue the DRAIN ALL command
1028 		 *   until metaset/metadb have finished or metaset may issue
1029 		 *   a RESUME ALL after this return reconfig step has issued
1030 		 *   the DRAIN ALL command.
1031 		 *   After this reconfig step has issued the DRAIN_ALL and
1032 		 *   released the local set lock, metaset/metadb will fail
1033 		 *   when attempting to contact the rpc.mdcommd and will
1034 		 *   terminate without making any configuration changes.
1035 		 *   The DRAIN ALL command will keep all other meta* commands
1036 		 *   from running during the reconfig cycle (these commands
1037 		 *   will wait until the rpc.mdcommd is resumed) since the
1038 		 *   reconfig cycle may be changing the diskset configuration.
1039 		 */
1040 
1041 		/* expect the nodelist to follow the step name */
1042 		if (argc < 1)
1043 			usage(sp, 1);
1044 
1045 		meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"),
1046 		    meta_print_hrtime(0));
1047 
1048 		/*
1049 		 * Does local set exist? If not, exit with 0
1050 		 * since there's no reason to have this node panic if
1051 		 * the local set cannot be started.
1052 		 */
1053 		if ((local_sp = load_local_set(ep)) == NULL) {
1054 			md_exit(local_sp, 0);
1055 		}
1056 
1057 		/*
1058 		 * Suspend any mirror resyncs that are in progress. This
1059 		 * stops unnecessary timeouts.
1060 		 */
1061 		meta_mirror_resync_block_all();
1062 
1063 		if (meta_lock(local_sp, TRUE, ep) != 0) {
1064 			mde_perror(ep, "");
1065 			md_exit(local_sp, 1);
1066 		}
1067 
1068 		/*
1069 		 * All metaset and metadb commands on this node have now
1070 		 * terminated gracefully.  Now, issue a drain all to
1071 		 * the rpc.mdcommd.  Any meta command issued after the
1072 		 * drain all will either spin sending the command to the
1073 		 * master until after the reconfig cycle has finished OR
1074 		 * will terminate gracefully (metaset/metadb).
1075 		 */
1076 		if ((max_sets = get_max_sets(ep)) == 0) {
1077 			mde_perror(ep, "");
1078 			md_exit(sp, 1);
1079 		}
1080 
1081 		/* start walking through all possible disksets */
1082 		for (setno = 1; setno < max_sets; setno++) {
1083 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1084 				if (mdiserror(ep, MDE_NO_SET)) {
1085 					/* No set for this setno - continue */
1086 					mdclrerror(ep);
1087 					continue;
1088 				} else {
1089 					mde_perror(ep, gettext("Unable to "
1090 					    "get set %d information"), setno);
1091 					md_exit(sp, 1);
1092 				}
1093 			}
1094 
1095 			/* only check multi-node disksets */
1096 			if (!meta_is_mn_set(sp, ep)) {
1097 				mdclrerror(ep);
1098 				continue;
1099 			}
1100 
1101 			meta_mc_log(MC_LOG3, gettext("Return - block parse "
1102 			    "messages for set %s: %s"), sp->setname,
1103 			    meta_print_hrtime(gethrtime() - start_time));
1104 
1105 			/*
1106 			 * Mddb parse messages are sent amongst the nodes
1107 			 * in a diskset whenever the locator block or
1108 			 * locator names structure has been changed.
1109 			 * A locator block change could occur as a result
1110 			 * of a disk failure during the reconfig cycle,
1111 			 * so block the mddb parse messages while the
1112 			 * rpc.mdcommd is suspended during the reconfig cycle.
1113 			 */
1114 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1115 				(void) memset(&mbp, 0, sizeof (mbp));
1116 				mbp.c_setno = setno;
1117 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
1118 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1119 				    &mbp.c_mde, NULL)) {
1120 					mdstealerror(ep, &mbp.c_mde);
1121 					mde_perror(ep, gettext("Could not "
1122 					    "block set %s"), sp->setname);
1123 					md_exit(sp, 1);
1124 				}
1125 			}
1126 
1127 			/* suspend commd and spin waiting for drain */
1128 			while ((ret_val = mdmn_suspend(setno,
1129 			    MD_COMM_ALL_CLASSES)) ==
1130 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
1131 				sleep(1);
1132 			}
1133 
1134 			if (ret_val) {
1135 				md_eprintf(gettext("Could not suspend "
1136 				    "rpc.mdcommd for set %s\n"), sp->setname);
1137 				md_exit(sp, 1);
1138 			}
1139 		}
1140 		/*
1141 		 * Resume all I/Os for this node for all MN sets in
1142 		 * case master node had suspended I/Os but panic'd
1143 		 * before resuming I/Os.  In case of failure, exit
1144 		 * with a 1 since unable to resume I/Os on this node.
1145 		 */
1146 		if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) {
1147 			mde_perror(ep, gettext(
1148 			    "Unable to resume I/O on node %s for all sets"),
1149 			    mynode());
1150 			md_exit(sp, 1);
1151 		}
1152 
1153 
1154 		/*
1155 		 * Can now unlock local set lock.  New metaset/metadb
1156 		 * commands are now held off using drain all.
1157 		 */
1158 		(void) meta_unlock(local_sp, ep);
1159 
1160 		meta_mc_log(MC_LOG2, gettext("Return step completed: %s"),
1161 		    meta_print_hrtime(gethrtime() - start_time));
1162 
1163 		break;
1164 
1165 	case MC_STEP1:
1166 		/*
1167 		 * Step 1
1168 		 *
1169 		 * - Populate nodelist file if we are on clustering
1170 		 *   and pick a master node for each MN diskset.
1171 		 */
1172 
1173 		/* expect the nodelist to follow the step name */
1174 		if (argc < 1)
1175 			usage(sp, 1);
1176 
1177 		meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"),
1178 		    meta_print_hrtime(0));
1179 
1180 		/* Always write nodelist file even if no local set exists */
1181 		if (clust == SDSSC_OKAY) {
1182 			/* skip to the nodelist args */
1183 			if (meta_write_nodelist(argc, argv, ep) != 0) {
1184 				mde_perror(ep, gettext(
1185 				    "Could not populate nodelist file"));
1186 				md_exit(sp, 1);
1187 			}
1188 		}
1189 
1190 		/*
1191 		 * Does local set exist? If not, exit with 0
1192 		 * since there's no reason to have this node panic if
1193 		 * the local set cannot be started.
1194 		 */
1195 		if ((local_sp = load_local_set(ep)) == NULL) {
1196 			md_exit(local_sp, 0);
1197 		}
1198 
1199 		/*
1200 		 * At this point, all meta* commands are blocked across
1201 		 * all disksets since the master rpc.mdcommd has drained or
1202 		 * the master node has died.
1203 		 * If a metaset or metadb command had been in progress
1204 		 * at the start of the reconfig cycle, this command has
1205 		 * either completed or it has been terminated due to
1206 		 * the death of the master node.
1207 		 *
1208 		 * This means that it is now ok to remove any
1209 		 * outstanding clnt_locks associated with multinode
1210 		 * disksets on this node due to a node panic during
1211 		 * a metaset operation.  This allows the routines that
1212 		 * choose the master to use rpc.metad to determine the
1213 		 * master of the diskset.
1214 		 */
1215 		if (clnt_clr_mnsetlock(mynode(), ep) != 0) {
1216 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1217 			    "clear locks failed %s"),
1218 			    meta_print_hrtime(gethrtime() - start_time));
1219 			md_exit(local_sp, 1);
1220 		}
1221 
1222 		/*
1223 		 * Call reconfig_choose_master to choose a master for
1224 		 * each MN diskset, update the nodelist for each diskset
1225 		 * given the member information and send a reinit message
1226 		 * to rpc.mdcommd to reload the nodelist.
1227 		 */
1228 		rval = meta_reconfig_choose_master(ep);
1229 		if (rval == 205) {
1230 			/*
1231 			 * NOTE: Should issue call to reboot remote host that
1232 			 * is causing the RPC failure.  Clustering to
1233 			 * provide interface in the future.  This should
1234 			 * stop a never-ending set of 205 reconfig cycles.
1235 			 * Remote host causing failure is stored in
1236 			 * ep->host if ep is an RPC error.
1237 			 * if (mdanyrpcerror(ep))
1238 			 * 	reboot (ep->host);
1239 			 */
1240 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1241 			    "choose master failure of 205 %s"),
1242 			    meta_print_hrtime(gethrtime() - start_time));
1243 			md_exit(local_sp, 205);
1244 		} else if (rval != 0) {
1245 			meta_mc_log(MC_LOG2, gettext("Step1 failure: "
1246 			    "choose master failure %s"),
1247 			    meta_print_hrtime(gethrtime() - start_time));
1248 			md_exit(local_sp, 1);
1249 		}
1250 
1251 		meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"),
1252 		    meta_print_hrtime(gethrtime() - start_time));
1253 
1254 		md_exit(local_sp, rval);
1255 		break;
1256 
1257 	case MC_STEP2:
1258 		/*
1259 		 * Step 2
1260 		 *
1261 		 * In Step 2, each node walks the list of disksets.  If a
1262 		 * node is a master of a MN diskset, it synchronizes
1263 		 * the local set USER records for that diskset.
1264 		 *
1265 		 * If disks exist in the diskset and there is a joined
1266 		 * (owner) node in the diskset, the master will also:
1267 		 *	- synchronize the diskset mddbs to the master
1268 		 *	- play the change log
1269 		 *
1270 		 * The master node will now attempt to join any unjoined
1271 		 * nodes that are currently members in the membership list.
1272 		 */
1273 
1274 		/* expect the nodelist to follow the step name */
1275 		if (argc < 1)
1276 			usage(sp, 1);
1277 
1278 		meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"),
1279 		    meta_print_hrtime(0));
1280 
1281 		/*
1282 		 * Does local set exist? If not, exit with 0
1283 		 * since there's no reason to have this node panic if
1284 		 * the local set cannot be started.
1285 		 */
1286 		if ((local_sp = load_local_set(ep)) == NULL) {
1287 			md_exit(local_sp, 0);
1288 		}
1289 
1290 		if ((max_sets = get_max_sets(ep)) == 0) {
1291 			mde_perror(ep, "");
1292 			md_exit(local_sp, 1);
1293 		}
1294 
1295 		/* start walking through all possible disksets */
1296 		for (setno = 1; setno < max_sets; setno++) {
1297 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1298 				if (mdiserror(ep, MDE_NO_SET)) {
1299 					/* No set for this setno - continue */
1300 					mdclrerror(ep);
1301 					continue;
1302 				} else if (mdanyrpcerror(ep)) {
1303 					/* Fail on RPC failure to self */
1304 					mde_perror(ep, gettext(
1305 					    "Unable to get information for "
1306 					    "set number %d"), setno);
1307 					md_exit(local_sp, 1);
1308 				} else {
1309 					mde_perror(ep, gettext(
1310 					    "Unable to get information for "
1311 					    "set number %d"), setno);
1312 					mdclrerror(ep);
1313 					continue;
1314 				}
1315 			}
1316 
1317 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1318 				if (mdanyrpcerror(ep)) {
1319 					/* Fail on RPC failure to self */
1320 					mde_perror(ep, gettext(
1321 					    "Unable to get information for "
1322 					    "set number %d"), setno);
1323 					md_exit(local_sp, 1);
1324 				}
1325 				mde_perror(ep, gettext("Unable to get set "
1326 				    "%s desc information"), sp->setname);
1327 				mdclrerror(ep);
1328 				continue;
1329 			}
1330 
1331 			/* Only check MN disksets */
1332 			if (!(MD_MNSET_DESC(sd))) {
1333 				continue;
1334 			}
1335 
1336 			/* All actions in step 2 are driven by master */
1337 			if (!(sd->sd_mn_am_i_master)) {
1338 				continue;
1339 			}
1340 
1341 			meta_mc_log(MC_LOG3, gettext("Step2 - begin record "
1342 			    "synchronization for set %s: %s"), sp->setname,
1343 			    meta_print_hrtime(gethrtime() - start_time));
1344 
1345 			/*
1346 			 * Synchronize the USER records in the local mddbs
1347 			 * for hosts that are members.  The USER records
1348 			 * contain set, drive and host information.
1349 			 */
1350 			rval = meta_mnsync_user_records(sp, ep);
1351 			if (rval != 0) {
1352 				mde_perror(ep, gettext(
1353 				    "Synchronization of user records "
1354 				    "in set %s failed\n"), sp->setname);
1355 				if (rval == 205) {
1356 					/*
1357 					 * NOTE: Should issue call to reboot
1358 					 * remote host that is causing the RPC
1359 					 * failure.  Clustering to provide
1360 					 * interface in the future.  This
1361 					 * should stop a never-ending set of
1362 					 * 205 reconfig cycles.
1363 					 * Remote host causing failure is
1364 					 * stored in ep->host if ep is an
1365 					 * RPC error.
1366 					 * if (mdanyrpcerror(ep))
1367 					 * 	reboot (ep->host);
1368 					 */
1369 					md_exit(local_sp, 205);
1370 				} else {
1371 					md_exit(local_sp, 1);
1372 				}
1373 			}
1374 
1375 			/* Reget sd since sync_user_recs may have flushed it */
1376 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1377 				mde_perror(ep, gettext("Unable to get set "
1378 				    "%s desc information"), sp->setname);
1379 				md_exit(local_sp, 1);
1380 			}
1381 
1382 			dd = metaget_drivedesc(sp,
1383 			    (MD_BASICNAME_OK | PRINT_FAST), ep);
1384 			if (! mdisok(ep)) {
1385 				mde_perror(ep, gettext("Unable to get set "
1386 				    "%s drive information"), sp->setname);
1387 				md_exit(local_sp, 1);
1388 			}
1389 
1390 			/*
1391 			 * No drives in set, continue to next set.
1392 			 */
1393 			if (dd == NULL) {
1394 				/* Done with this set */
1395 				continue;
1396 			}
1397 
1398 			meta_mc_log(MC_LOG3, gettext("Step2 - local set user "
1399 			    "records completed for set %s: %s"), sp->setname,
1400 			    meta_print_hrtime(gethrtime() - start_time));
1401 
1402 			/*
1403 			 * Synchronize the diskset mddbs for hosts
1404 			 * that are members.  This may involve
1405 			 * playing the changelog and writing out
1406 			 * to the diskset mddbs.
1407 			 */
1408 			rval = meta_mnsync_diskset_mddbs(sp, ep);
1409 			if (rval != 0) {
1410 				mde_perror(ep, gettext(
1411 				    "Synchronization of diskset mddbs "
1412 				    "in set %s failed\n"), sp->setname);
1413 				meta_mc_log(MC_LOG3, gettext("Step2 - diskset "
1414 				    "mddb synchronization failed for "
1415 				    "set %s: %s"), sp->setname,
1416 				    meta_print_hrtime(gethrtime() -
1417 				    start_time));
1418 				if (rval == 205) {
1419 					/*
1420 					 * NOTE: Should issue call to reboot
1421 					 * remote host that is causing the RPC
1422 					 * failure.  Clustering to provide
1423 					 * interface in the future.  This
1424 					 * should stop a never-ending set of
1425 					 * 205 reconfig cycles.
1426 					 * Remote host causing failure is
1427 					 * stored in ep->host if ep is an
1428 					 * RPC error.
1429 					 * if (mdanyrpcerror(ep))
1430 					 * 	reboot (ep->host);
1431 					 */
1432 					md_exit(local_sp, 205);
1433 				} else if (rval == 1) {
1434 					continue;
1435 				} else {
1436 					md_exit(local_sp, 1);
1437 				}
1438 			}
1439 
1440 			meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb "
1441 			    "synchronization completed for set %s: %s"),
1442 			    sp->setname,
1443 			    meta_print_hrtime(gethrtime() - start_time));
1444 
1445 			/* Join the starting nodes to the diskset */
1446 			rval = meta_mnjoin_all(sp, ep);
1447 			if (rval != 0) {
1448 				mde_perror(ep, gettext(
1449 				    "Join of non-owner (starting) nodes "
1450 				    "in set %s failed\n"), sp->setname);
1451 				meta_mc_log(MC_LOG3, gettext("Step2 - non owner"
1452 				    "nodes joined for set %s: %s"),
1453 				    sp->setname,
1454 				    meta_print_hrtime(gethrtime() -
1455 				    start_time));
1456 				if (rval == 205) {
1457 					/*
1458 					 * NOTE: Should issue call to reboot
1459 					 * remote host that is causing the RPC
1460 					 * failure.  Clustering to provide
1461 					 * interface in the future.  This
1462 					 * should stop a never-ending set of
1463 					 * 205 reconfig cycles.
1464 					 * Remote host causing failure is
1465 					 * stored in ep->host if ep is an
1466 					 * RPC error.
1467 					 * if (mdanyrpcerror(ep))
1468 					 * 	reboot (ep->host);
1469 					 */
1470 					md_exit(local_sp, 205);
1471 				} else {
1472 					md_exit(local_sp, 1);
1473 				}
1474 			}
1475 
1476 			meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes "
1477 			    "joined for set %s: %s"), sp->setname,
1478 			    meta_print_hrtime(gethrtime() - start_time));
1479 
1480 		}
1481 
1482 		meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"),
1483 		    meta_print_hrtime(gethrtime() - start_time));
1484 
1485 		break;
1486 
1487 	case MC_STEP3:
1488 		/*
1489 		 * Step 3
1490 		 *
1491 		 * For all multinode sets do,
1492 		 * - Reinitialise rpc.mdcommd
1493 		 * - Reset mirror owners to null if the current owner is
1494 		 *   no longer in the membership list
1495 		 */
1496 
1497 		/* expect the nodelist to follow the step name */
1498 		if (argc < 1)
1499 			usage(sp, 1);
1500 
1501 		meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"),
1502 		    meta_print_hrtime(0));
1503 
1504 		/*
1505 		 * Does local set exist? If not, exit with 0
1506 		 * since there's no reason to have this node panic if
1507 		 * the local set cannot be started.
1508 		 */
1509 		if ((local_sp = load_local_set(ep)) == NULL) {
1510 			md_exit(local_sp, 0);
1511 		}
1512 
1513 		/*
1514 		 * walk through all sets on this node which could include:
1515 		 *	- MN disksets
1516 		 *	- traditional disksets
1517 		 *	- non-existent disksets
1518 		 * start mirror resync for all MN sets
1519 		 */
1520 		if ((max_sets = get_max_sets(ep)) == 0) {
1521 			mde_perror(ep, "");
1522 			md_exit(local_sp, 1);
1523 		}
1524 
1525 		/* start walking through all possible disksets */
1526 		for (setno = 1; setno < max_sets; setno++) {
1527 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1528 				if (mdiserror(ep, MDE_NO_SET)) {
1529 					/* No set for this setno - continue */
1530 					mdclrerror(ep);
1531 					continue;
1532 				} else {
1533 					mde_perror(ep, gettext("Unable to "
1534 					    "get set %d information"), setno);
1535 					md_exit(local_sp, 1);
1536 				}
1537 			}
1538 
1539 			/* only check multi-node disksets */
1540 			if (!meta_is_mn_set(sp, ep)) {
1541 				mdclrerror(ep);
1542 				continue;
1543 			}
1544 
1545 			if (meta_lock(sp, TRUE, ep) != 0) {
1546 				mde_perror(ep, "");
1547 				md_exit(local_sp, 1);
1548 			}
1549 
1550 			/* If this node isn't joined to set, do nothing */
1551 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1552 				if (!mdisok(ep)) {
1553 					mde_perror(ep, gettext("Could "
1554 					    "not get set %s ownership"),
1555 					    sp->setname);
1556 					md_exit(sp, 1);
1557 				}
1558 				mdclrerror(ep);
1559 				meta_unlock(sp, ep);
1560 				continue;
1561 			}
1562 
1563 			meta_mc_log(MC_LOG3, gettext("Step3 - begin "
1564 			    "re-initialising rpc.mdcommd and resetting mirror "
1565 			    "owners for set %s: %s"), sp->setname,
1566 			    meta_print_hrtime(gethrtime() - start_time));
1567 
1568 			/* reinitialise rpc.mdcommd with new nodelist */
1569 			if (mdmn_reinit_set(setno)) {
1570 				md_eprintf(gettext(
1571 				    "Could not re-initialise rpc.mdcommd for "
1572 				    "set %s\n"), sp->setname);
1573 				md_exit(sp, 1);
1574 			}
1575 
1576 			(void) memset(&cfg, 0, sizeof (cfg));
1577 			cfg.c_id = 0;
1578 			cfg.c_setno = sp->setno;
1579 			if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1580 			    NULL) != 0) {
1581 				mdstealerror(ep, &cfg.c_mde);
1582 				mde_perror(ep, gettext("Could "
1583 				    "not get set %s information"),
1584 				    sp->setname);
1585 				md_exit(sp, 1);
1586 			}
1587 
1588 			/* Don't do anything else if set is stale */
1589 			if (cfg.c_flags & MDDB_C_STALE) {
1590 				meta_unlock(sp, ep);
1591 				mdclrerror(ep);
1592 				continue;
1593 			}
1594 
1595 			/* reset mirror owners */
1596 			if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) {
1597 				md_exit(sp, 1);
1598 			}
1599 
1600 			meta_unlock(sp, ep);
1601 
1602 			meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd "
1603 			    "re-initialised and mirror owners reset for "
1604 			    "set %s: %s"), sp->setname,
1605 			    meta_print_hrtime(gethrtime() - start_time));
1606 		}
1607 
1608 		meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"),
1609 		    meta_print_hrtime(gethrtime() - start_time));
1610 
1611 		break;
1612 
1613 	case MC_STEP4:
1614 		/*
1615 		 * Step 4
1616 		 *
1617 		 * For all multinode sets do:
1618 		 * - Resume the rpc.mdcommd messages.  Must resume all
1619 		 *	sets before issuing I/O to any set since an error
1620 		 * 	encountered in a commd suspended set could be
1621 		 *	blocked waiting for commd in another set to resume.
1622 		 *	(This happens since the daemon queues service
1623 		 *	all sets).  An open of a soft partition causes
1624 		 *	a read of the watermarks during the open.
1625 		 * - If set is non-writable (not an owner or STALE), then
1626 		 *	continue to next set.
1627 		 *
1628 		 * For all multinode sets do,
1629 		 * - Reset ABR states for all mirrors, ie clear ABR if not
1630 		 *	open on any node.
1631 		 * - Reset ABR states for all soft partitions, ie clear ABR if
1632 		 *	not open on any node.
1633 		 * - For all slave nodes that have entered through the start
1634 		 *	step, update the ABR state to that of the master and
1635 		 *	get the submirror state from the master
1636 		 * - meta_lock set
1637 		 * - Resync all mirrors
1638 		 * - unlock meta_lock for this set.
1639 		 * - Choose a new owner for any orphaned resyncs
1640 		 *
1641 		 * There is one potential issue here when concurrently
1642 		 * resetting and updating the ABR state. If the master has ABR
1643 		 * set, but should no longer have because the only node that
1644 		 * had the metadevice open and had ABR set has panicked, the
1645 		 * master will send a message to all nodes to clear the ABR
1646 		 * state. Meanwhile any node that has come through the
1647 		 * start step will get tstate from the master and will update
1648 		 * ABR if it was set in tstate. So, we appear to have a problem
1649 		 * if the following sequence occurs:-
1650 		 * - The slave gets tstate with ABR set
1651 		 * - The master sends a message to clear ABR
1652 		 * - The slave updates ABR with the value it got from tstate.
1653 		 * We now have the master with ABR clear and the slave with ABR
1654 		 * set. Fortunately, having set ABR, the slave will close the
1655 		 * metadevice after setting ABR and as there are no nodes with
1656 		 * the device open, the close will send a message to clear ABR
1657 		 * on all nodes. So, the nodes will all have ABR unset.
1658 		 */
1659 
1660 		/* expect the nodelist to follow the step name */
1661 		if (argc < 1)
1662 			usage(sp, 1);
1663 
1664 		meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"),
1665 		    meta_print_hrtime(0));
1666 
1667 		/*
1668 		 * Does local set exist? If not, exit with 0
1669 		 * since there's no reason to have this node panic if
1670 		 * the local set cannot be started.
1671 		 */
1672 		if ((local_sp = load_local_set(ep)) == NULL) {
1673 			md_exit(local_sp, 0);
1674 		}
1675 
1676 		/*
1677 		 * walk through all sets on this node which could include:
1678 		 *	- MN disksets
1679 		 *	- traditional disksets
1680 		 *	- non-existent disksets
1681 		 * start mirror resync for all MN sets
1682 		 */
1683 		if ((max_sets = get_max_sets(ep)) == 0) {
1684 			mde_perror(ep, "");
1685 			md_exit(local_sp, 1);
1686 		}
1687 
1688 		/* Clear set_info structure */
1689 		for (setno = 1; setno < max_sets; setno++) {
1690 			set_info[setno] = 0;
1691 		}
1692 
1693 		/* start walking through all possible disksets */
1694 		for (setno = 1; setno < max_sets; setno++) {
1695 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1696 				if (mdiserror(ep, MDE_NO_SET)) {
1697 					/* No set for this setno - continue */
1698 					mdclrerror(ep);
1699 					continue;
1700 				} else {
1701 					mde_perror(ep, gettext("Unable to "
1702 					    "get set %d information"), setno);
1703 					md_exit(local_sp, 1);
1704 				}
1705 			}
1706 
1707 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1708 				mde_perror(ep, gettext("Unable to get set "
1709 				    "%s desc information"), sp->setname);
1710 				mdclrerror(ep);
1711 				continue;
1712 			}
1713 
1714 			/* only check multi-node disksets */
1715 			if (!meta_is_mn_set(sp, ep)) {
1716 				mdclrerror(ep);
1717 				continue;
1718 			}
1719 
1720 			set_info[setno] |= SET_INFO_MN;
1721 
1722 			/*
1723 			 * If not an owner (all mddbs failed) or stale
1724 			 * (< 50% mddbs operational), then set is
1725 			 * non-writable so just resume commd and
1726 			 * unblock mddb messages.
1727 			 */
1728 			mdclrerror(ep);
1729 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1730 				set_info[setno] |= SET_INFO_NO_WR;
1731 			}
1732 			if (!mdisok(ep)) {
1733 				mde_perror(ep, gettext("Could "
1734 				    "not get set %s ownership"),
1735 				    sp->setname);
1736 				md_exit(local_sp, 1);
1737 			}
1738 			/* Set is owned - is it stale? */
1739 			if (!set_info[setno] & SET_INFO_NO_WR) {
1740 				(void) memset(&cfg, 0, sizeof (cfg));
1741 				cfg.c_id = 0;
1742 				cfg.c_setno = sp->setno;
1743 				if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1744 				    NULL) != 0) {
1745 					mdstealerror(ep, &cfg.c_mde);
1746 					mde_perror(ep, gettext("Could "
1747 					    "not get set %s information"),
1748 					    sp->setname);
1749 					md_exit(local_sp, 1);
1750 				}
1751 				if (cfg.c_flags & MDDB_C_STALE) {
1752 					set_info[setno] |= SET_INFO_NO_WR;
1753 				}
1754 			}
1755 
1756 			/* resume rpc.mdcommd */
1757 			if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0)) {
1758 				md_eprintf(gettext("Unable to resume "
1759 				    "rpc.mdcommd for set %s\n"), sp->setname);
1760 				md_exit(local_sp, 1);
1761 			}
1762 			meta_ping_mnset(setno);
1763 
1764 			/* Unblock mddb parse messages */
1765 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1766 				(void) memset(&mbp, 0, sizeof (mbp));
1767 				mbp.c_setno = setno;
1768 				mbp.c_blk_flags = MDDB_UNBLOCK_PARSE;
1769 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1770 				    &mbp.c_mde, NULL)) {
1771 					mdstealerror(ep, &mbp.c_mde);
1772 					mde_perror(ep, gettext("Could not "
1773 					    "unblock set %s"), sp->setname);
1774 					md_exit(local_sp, 1);
1775 				}
1776 			}
1777 			meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd "
1778 			    "resumed and messages unblocked for set %s: %s"),
1779 			    sp->setname,
1780 			    meta_print_hrtime(gethrtime() - start_time));
1781 		}
1782 
1783 		for (setno = 1; setno < max_sets; setno++) {
1784 			int			start_step;
1785 
1786 			/* Skip traditional disksets. */
1787 			if ((set_info[setno] & SET_INFO_MN) == 0)
1788 				continue;
1789 
1790 			/*
1791 			 * If already determined that this set is
1792 			 * a non-writable set, then just continue
1793 			 * to next set since there's nothing else
1794 			 * to do for a non-writable set.
1795 			 */
1796 			if (set_info[setno] & SET_INFO_NO_WR)
1797 				continue;
1798 
1799 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1800 				if (mdiserror(ep, MDE_NO_SET)) {
1801 					/* No set for this setno - continue */
1802 					mdclrerror(ep);
1803 					continue;
1804 				} else {
1805 					mde_perror(ep, gettext("Unable to "
1806 					    "get set %d information"), setno);
1807 					md_exit(local_sp, 1);
1808 				}
1809 			}
1810 
1811 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1812 				mde_perror(ep, gettext("Unable to get set "
1813 				    "%s desc information"), sp->setname);
1814 				mdclrerror(ep);
1815 				continue;
1816 			}
1817 
1818 			/* See if this node came through the start step */
1819 			(void) memset(&sf, 0, sizeof (sf));
1820 			sf.sf_setno = sp->setno;
1821 			sf.sf_flags = MDDB_NM_GET;
1822 			/* Use magic to help protect ioctl against attack. */
1823 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1824 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1825 			    &sf.sf_mde, NULL)) {
1826 				mdstealerror(ep, &sf.sf_mde);
1827 				mde_perror(ep, gettext("Could not get "
1828 				    "start_step flag for set %s"), sp->setname);
1829 				md_exit(local_sp, 1);
1830 			}
1831 			start_step =
1832 			    (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0;
1833 
1834 			/*
1835 			 * We can now reset the start_step flag for the set
1836 			 * if it was already set.
1837 			 */
1838 			if (start_step) {
1839 				(void) memset(&sf, 0, sizeof (sf));
1840 					sf.sf_setno = sp->setno;
1841 				sf.sf_setflags = MD_SET_MN_START_RC;
1842 				sf.sf_flags = MDDB_NM_RESET;
1843 				/*
1844 				 * Use magic to help protect ioctl
1845 				 * against attack.
1846 				 */
1847 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1848 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1849 				    &sf.sf_mde, NULL)) {
1850 					mdstealerror(ep, &sf.sf_mde);
1851 					mde_perror(ep,
1852 					    gettext("Could not reset "
1853 					    "start_step flag for set %s"),
1854 					    sp->setname);
1855 				}
1856 			}
1857 
1858 			meta_mc_log(MC_LOG3, gettext("Step4 - begin setting "
1859 			    "ABR state and restarting io's for "
1860 			    "set %s: %s"), sp->setname,
1861 			    meta_print_hrtime(gethrtime() - start_time));
1862 
1863 
1864 			/*
1865 			 * If we are not the master and we have come through
1866 			 * the start step, we must update the ABR states
1867 			 * for mirrors and soft partitions. Also the submirror
1868 			 * states need to be synchronised so that we see the
1869 			 * same status as other previously joined members.
1870 			 * This _must_ be done before starting the resync.
1871 			 */
1872 			if (!(sd->sd_mn_am_i_master) && start_step) {
1873 				if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR,
1874 				    ep) == -1) {
1875 					md_exit(local_sp, 1);
1876 				}
1877 				if (reset_state(UPDATE_ABR, sp, MD_SP,
1878 				    ep) == -1) {
1879 					md_exit(local_sp, 1);
1880 				}
1881 				/*
1882 				 * Mark the fact that we've got the mirror
1883 				 * state. This allows the resync thread to
1884 				 * determine if _it_ needs to issue this. This
1885 				 * can happen if a node is added to a set after
1886 				 * a reconfig cycle has completed.
1887 				 */
1888 				(void) memset(&sf, 0, sizeof (sf));
1889 					sf.sf_setno = sp->setno;
1890 				sf.sf_setflags = MD_SET_MN_MIR_STATE_RC;
1891 				sf.sf_flags = MDDB_NM_SET;
1892 				/*
1893 				 * Use magic to help protect ioctl
1894 				 * against attack.
1895 				 */
1896 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1897 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1898 				    &sf.sf_mde, NULL)) {
1899 					mdstealerror(ep, &sf.sf_mde);
1900 					mde_perror(ep,
1901 					    gettext("Could not set "
1902 					    "submirror state flag for set %s"),
1903 					    sp->setname);
1904 				}
1905 			}
1906 
1907 			/*
1908 			 * All remaining actions are only performed by the
1909 			 * master
1910 			 */
1911 			if (!(sd->sd_mn_am_i_master)) {
1912 				if (meta_lock(sp, TRUE, ep) != 0) {
1913 					mde_perror(ep, "");
1914 					md_exit(local_sp, 1);
1915 				}
1916 				meta_mirror_resync_unblock(sp);
1917 				meta_unlock(sp, ep);
1918 				continue;
1919 			}
1920 
1921 			/*
1922 			 * If the master came through the start step, this
1923 			 * implies that all of the nodes must have done the
1924 			 * same and hence there can be no applications
1925 			 * running. Hence no need to reset ABR
1926 			 */
1927 			if (!start_step) {
1928 				/* Reset ABR state for mirrors */
1929 				if (reset_state(RESET_ABR, sp, MD_MIRROR,
1930 				    ep) == -1) {
1931 					md_exit(local_sp, 1);
1932 				}
1933 				/* ...and now the same for soft partitions */
1934 				if (reset_state(RESET_ABR, sp, MD_SP,
1935 				    ep) == -1) {
1936 					md_exit(local_sp, 1);
1937 				}
1938 			}
1939 
1940 			/*
1941 			 * choose owners for orphaned resyncs and reset
1942 			 * non-orphaned resyncs so that an owner node that
1943 			 * reboots will restart the resync if needed.
1944 			 */
1945 			if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1)
1946 				md_exit(local_sp, 1);
1947 
1948 			/*
1949 			 * Must unlock set lock before meta_mirror_resync_all
1950 			 * sends a message to run the metasync command
1951 			 * which also grabs the meta_lock.
1952 			 */
1953 			if (meta_lock(sp, TRUE, ep) != 0) {
1954 				mde_perror(ep, "");
1955 				md_exit(local_sp, 1);
1956 			}
1957 			meta_mirror_resync_unblock(sp);
1958 			meta_unlock(sp, ep);
1959 
1960 			/* resync all mirrors in set */
1961 			if (meta_mirror_resync_all(sp, 0, ep) != 0) {
1962 				mde_perror(ep, gettext("Mirror resyncs "
1963 				    "failed for set %s"), sp->setname);
1964 				md_exit(local_sp, 1);
1965 			}
1966 
1967 			meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted "
1968 			    "for set %s: %s"), sp->setname,
1969 			    meta_print_hrtime(gethrtime() - start_time));
1970 		}
1971 
1972 		meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"),
1973 		    meta_print_hrtime(gethrtime() - start_time));
1974 
1975 		break;
1976 
1977 	default:
1978 		usage(sp, 1);
1979 		break;
1980 	}
1981 
1982 	md_exit(sp, 0);
1983 	/* NOTREACHED */
1984 	return (0);
1985 }
1986