xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_set.c (revision afd1ac7b1c9a8cdf273c865aa5e9a14620341443)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Just in case we're not in a build environment, make sure that
31  * TEXT_DOMAIN gets set to something.
32  */
33 #if !defined(TEXT_DOMAIN)
34 #define	TEXT_DOMAIN "SYS_TEST"
35 #endif
36 
37 /*
38  * Metadevice diskset interfaces
39  */
40 
41 #include "meta_set_prv.h"
42 #include <meta.h>
43 #include <metad.h>
44 #include <mdmn_changelog.h>
45 #include <sys/lvm/md_crc.h>
46 #include <sys/utsname.h>
47 #include <sdssc.h>
48 
49 #include <sys/sysevent/eventdefs.h>
50 #include <sys/sysevent/svm.h>
51 extern	char	*blkname(char *);
52 
53 static md_drive_desc *
54 dr2drivedesc(
55 	mdsetname_t	*sp,
56 	side_t		sideno,
57 	int		flags,
58 	md_error_t	*ep
59 )
60 {
61 	md_set_record	*sr;
62 	md_drive_record	*dr;
63 	mddrivename_t	*dnp;
64 	md_drive_desc	*dd_head = NULL;
65 	md_set_desc	*sd;
66 
67 	if (flags & MD_BYPASS_DAEMON) {
68 		if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
69 			return (NULL);
70 		sd = metaget_setdesc(sp, ep);
71 		sideno = getnodeside(mynode(), sd);
72 		sp = metafakesetname(sp->setno, sr->sr_setname);
73 	} else {
74 		if ((sr = getsetbyname(sp->setname, ep)) == NULL)
75 			return (NULL);
76 	}
77 
78 	assert(sideno != MD_SIDEWILD);
79 
80 	/*
81 	 * WARNING:
82 	 * The act of getting the dnp from the namespace means that we
83 	 * will get the devid of the disk as recorded in the namespace.
84 	 * This devid has the potential to be stale if the disk is being
85 	 * replaced via a rebind, this means that any code that relies
86 	 * on any of the dnp information should take the appropriate action
87 	 * to preserve that information. For example in the rebind code the
88 	 * devid of the new disk is saved off and then copied back in once
89 	 * the code that has called this function has completed.
90 	 */
91 	for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
92 		if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
93 		    flags, ep)) == NULL) {
94 			if (!(flags & MD_BYPASS_DAEMON))
95 				free_sr(sr);
96 			metafreedrivedesc(&dd_head);
97 			return (NULL);
98 		}
99 
100 		(void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
101 		    dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
102 	}
103 
104 	if (!(flags & MD_BYPASS_DAEMON)) {
105 		free_sr(sr);
106 	}
107 	return (dd_head);
108 }
109 
110 static int
111 get_sidenmlist(
112 	mdsetname_t	*sp,
113 	mddrivename_t	*dnp,
114 	md_error_t	*ep
115 )
116 {
117 	md_set_desc	*sd;
118 	mdsidenames_t	*sn, **sn_next;
119 	int		i;
120 
121 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
122 		return (-1);
123 
124 	metaflushsidenames(dnp);
125 	sn_next = &dnp->side_names;
126 	if (MD_MNSET_DESC(sd)) {
127 		/*
128 		 * Only get sidenames for this node since
129 		 * that is the only side information stored in
130 		 * the local mddb for a multi-node diskset.
131 		 */
132 		if (sd->sd_mn_mynode) {
133 			sn = Zalloc(sizeof (*sn));
134 			sn->sideno = sd->sd_mn_mynode->nd_nodeid;
135 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
136 			    sn->sideno, dnp->side_names_key, &sn->dname,
137 			    &sn->mnum, NULL, ep)) == NULL) {
138 				if (sn->dname != NULL)
139 					Free(sn->dname);
140 				Free(sn);
141 				return (-1);
142 			}
143 
144 			/* Add to the end of the linked list */
145 			assert(*sn_next == NULL);
146 			*sn_next = sn;
147 			sn_next = &sn->next;
148 		}
149 	} else {
150 		for (i = 0; i < MD_MAXSIDES; i++) {
151 			/* Skip empty slots */
152 			if (sd->sd_nodes[i][0] == '\0')
153 				continue;
154 
155 			sn = Zalloc(sizeof (*sn));
156 			sn->sideno = i;
157 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
158 			    i+SKEW, dnp->side_names_key, &sn->dname,
159 			    &sn->mnum, NULL, ep)) == NULL) {
160 				/*
161 				 * It is possible that during the add of a
162 				 * host to have a 'missing' side as the side
163 				 * for this disk will be added later. So ignore
164 				 * the error. The 'missing' side will be added
165 				 * once the addhosts process has completed.
166 				 */
167 				if (mdissyserror(ep, ENOENT)) {
168 					mdclrerror(ep);
169 					Free(sn);
170 					continue;
171 				}
172 
173 				if (sn->dname != NULL)
174 					Free(sn->dname);
175 				Free(sn);
176 				return (-1);
177 			}
178 
179 			/* Add to the end of the linked list */
180 			assert(*sn_next == NULL);
181 			*sn_next = sn;
182 			sn_next = &sn->next;
183 		}
184 	}
185 
186 	return (0);
187 }
188 
189 static md_drive_desc *
190 rl_to_dd(
191 	mdsetname_t		*sp,
192 	md_replicalist_t	*rlp,
193 	md_error_t		*ep
194 )
195 {
196 	md_replicalist_t	*rl;
197 	md_replica_t		*r;
198 	md_drive_desc		*dd = NULL;
199 	md_drive_desc		*d;
200 	int			found;
201 	md_set_desc		*sd;
202 	daddr_t			nblks = 0;
203 
204 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
205 		return (NULL);
206 
207 	/* find the smallest existing replica */
208 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
209 		r = rl->rl_repp;
210 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
211 	}
212 
213 	if (nblks <= 0)
214 		nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
215 
216 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
217 		r = rl->rl_repp;
218 
219 		found = 0;
220 		for (d = dd; d != NULL; d = d->dd_next) {
221 			if (strcmp(r->r_namep->drivenamep->cname,
222 			    d->dd_dnp->cname) == 0) {
223 				found = 1;
224 				dd->dd_dbcnt++;
225 				break;
226 			}
227 		}
228 
229 		if (! found)
230 			(void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
231 			    1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
232 	}
233 
234 	return (dd);
235 }
236 
237 /*
238  * Exported Entry Points
239  */
240 
241 set_t
242 get_max_sets(md_error_t *ep)
243 {
244 
245 	static set_t		max_sets = 0;
246 
247 	if (max_sets == 0)
248 		if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
249 			return (0);
250 
251 	return (max_sets);
252 }
253 
254 int
255 get_max_meds(md_error_t *ep)
256 {
257 	static int		max_meds = 0;
258 
259 	if (max_meds == 0)
260 		if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
261 			return (0);
262 
263 	return (max_meds);
264 }
265 
266 side_t
267 getmyside(mdsetname_t *sp, md_error_t *ep)
268 {
269 	md_set_desc		*sd;
270 	char 			*node = NULL;
271 	side_t			sideno;
272 
273 	if (sp->setno == 0)
274 		return (0);
275 
276 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
277 		return (MD_SIDEWILD);
278 
279 	node = mynode();
280 
281 	assert(node != NULL);
282 
283 	sideno = getnodeside(node, sd);
284 
285 	if (sideno != MD_SIDEWILD)
286 		return (sideno);
287 
288 	return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
289 }
290 
291 /*
292  * get set info from name
293  */
294 md_set_record *
295 getsetbyname(char *setname, md_error_t *ep)
296 {
297 	md_set_record		*sr = NULL;
298 	md_mnset_record		*mnsr = NULL;
299 	char			*p;
300 	size_t			len;
301 
302 	/* get set info from daemon */
303 	if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
304 		return (NULL);
305 	if (sr != NULL) {
306 		/*
307 		 * Returned record could be for a multi-node set or a
308 		 * non-multi-node set.
309 		 */
310 		if (MD_MNSET_REC(sr)) {
311 			/*
312 			 * Record is for a multi-node set.  Reissue call
313 			 * to get mnset information.  Need to free
314 			 * record as if a non-multi-node set record since
315 			 * that is what clnt_getset gave us.  If in
316 			 * the daemon, don't free since this is a pointer
317 			 * into the setrecords array.
318 			 */
319 			if (! md_in_daemon) {
320 				sr->sr_flags &= ~MD_SR_MN;
321 				free_sr(sr);
322 			}
323 			if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
324 			    ep) == -1)
325 				return (NULL);
326 			if (mnsr != NULL)
327 				return ((struct md_set_record *)mnsr);
328 		} else {
329 			return (sr);
330 		}
331 	}
332 
333 	/* no such set */
334 	len = strlen(setname) + 30;
335 	p = Malloc(len);
336 	(void) snprintf(p, len, "setname \"%s\"", setname);
337 	(void) mderror(ep, MDE_NO_SET, p);
338 	Free(p);
339 	return (NULL);
340 }
341 
342 /*
343  * get set info from number
344  */
345 md_set_record *
346 getsetbynum(set_t setno, md_error_t *ep)
347 {
348 	md_set_record		*sr;
349 	md_mnset_record		*mnsr = NULL;
350 	char			buf[100];
351 
352 	if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
353 		return (NULL);
354 
355 	if (sr != NULL) {
356 		/*
357 		 * Record is for a multi-node set.  Reissue call
358 		 * to get mnset information.  Need to free
359 		 * record as if a non-multi-node set record since
360 		 * that is what clnt_getset gave us.  If in
361 		 * the daemon, don't free since this is a pointer
362 		 * into the setrecords array.
363 		 */
364 		if (MD_MNSET_REC(sr)) {
365 			/*
366 			 * Record is for a multi-node set.  Reissue call
367 			 * to get mnset information.
368 			 */
369 			if (! md_in_daemon) {
370 				sr->sr_flags &= ~MD_SR_MN;
371 				free_sr(sr);
372 			}
373 			if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
374 			    ep) == -1)
375 				return (NULL);
376 			if (mnsr != NULL)
377 				return ((struct md_set_record *)mnsr);
378 		} else {
379 			return (sr);
380 		}
381 	}
382 
383 	(void) sprintf(buf, "setno %u", setno);
384 	(void) mderror(ep, MDE_NO_SET, buf);
385 	return (NULL);
386 }
387 
388 int
389 meta_check_drive_inuse(
390 	mdsetname_t	*sp,
391 	mddrivename_t	*dnp,
392 	int		check_db,
393 	md_error_t	*ep
394 )
395 {
396 	mdnamelist_t	*nlp = NULL;
397 	mdnamelist_t	*p;
398 	int		rval = 0;
399 
400 	/* get all underlying partitions */
401 	if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
402 		return (-1);
403 
404 	/* search for drive */
405 	for (p = nlp; (p != NULL); p = p->next) {
406 		mdname_t	*np = p->namep;
407 
408 		if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
409 			rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
410 			    NULL, dnp->cname, sp->setname));
411 			break;
412 		}
413 	}
414 
415 	/* cleanup, return success */
416 	metafreenamelist(nlp);
417 	return (rval);
418 }
419 
420 /*
421  * simple check for ownership
422  */
423 int
424 meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
425 {
426 	int			ownset;
427 	md_set_desc		*sd;
428 	md_drive_desc		*dd;
429 	md_replicalist_t	*rlp = NULL;
430 	md_error_t		xep = mdnullerror;
431 
432 	if (metaislocalset(sp))
433 		return (0);
434 
435 	ownset = own_set(sp, NULL, TRUE, ep);
436 	if (! mdisok(ep))
437 		return (-1);
438 
439 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
440 		return (-1);
441 
442 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
443 	if (! mdisok(ep))
444 		return (-1);
445 
446 	/* If we have no drive descriptors, check for no ownership */
447 	if (dd == NULL) {
448 		if (ownset == MD_SETOWNER_NONE)
449 			return (0);
450 
451 		/* If ownership somehow has come to exist, we must clean up */
452 
453 		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
454 		    &xep) < 0)
455 			mdclrerror(&xep);
456 
457 		if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
458 			if (! mdisok(&xep))
459 				mdclrerror(&xep);
460 
461 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
462 			if (rel_own_bydd(sp, dd, TRUE, &xep))
463 				mdclrerror(&xep);
464 		}
465 
466 		if (halt_set(sp, &xep))
467 			mdclrerror(&xep);
468 
469 		metafreereplicalist(rlp);
470 
471 		metafreedrivedesc(&dd);
472 
473 		return (0);
474 	}
475 
476 	metafreedrivedesc(&sd->sd_drvs);
477 
478 	if (ownset == MD_SETOWNER_YES)
479 		return (0);
480 
481 	return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
482 	    sp->setname));
483 }
484 
485 /*
486  * simple check for ownership
487  */
488 int
489 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
490 {
491 	md_set_desc	*sd;
492 	md_drive_desc	*dd;
493 	int		bool;
494 
495 	if (metaislocalset(sp))
496 		return (0);
497 
498 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
499 		return (-1);
500 
501 	if (getnodeside(hostname, sd) == MD_SIDEWILD)
502 		return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
503 		    hostname, NULL, sp->setname));
504 
505 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
506 	if (! mdisok(ep))
507 		return (-1);
508 
509 	if (clnt_ownset(hostname, sp, &bool, ep) == -1)
510 		return (-1);
511 
512 	if (dd == NULL)
513 		return (0);
514 
515 	metafreedrivedesc(&sd->sd_drvs);
516 
517 	if (bool == TRUE)
518 		return (0);
519 
520 	return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
521 	    sp->setname));
522 }
523 
524 /*
525  * Function that determines if a node is in the multinode diskset
526  * membership list.  Calling node passes in node to be checked and
527  * the nodelist as returned from meta_read_nodelist.  This routine
528  * anticipates being called many times using the same diskset membership
529  * list which is why the alloc and free of the diskset membership list
530  * is left to the calling routine.
531  * Returns:
532  *	1 - if a member
533  *	0 - not a member
534  */
535 int
536 meta_is_member(
537 	char				*node_name,
538 	md_mn_nodeid_t			node_id,
539 	mndiskset_membershiplist_t	*nl
540 )
541 {
542 	mndiskset_membershiplist_t	*nl2;
543 	int				flag_check_name;
544 
545 	if (node_id != 0)
546 		flag_check_name = 0;
547 	else if (node_name != NULL)
548 		flag_check_name = 1;
549 	else
550 		return (0);
551 
552 	nl2 = nl;
553 	while (nl2) {
554 		if (flag_check_name) {
555 			/* Compare given name against name in member list */
556 			if (strcmp(nl2->msl_node_name, node_name) == 0)
557 				break;
558 		} else {
559 			/* Compare given nodeid against nodeid in member list */
560 			if (nl2->msl_node_id == node_id)
561 				break;
562 		}
563 		nl2 = nl2->next;
564 	}
565 	/* No match found in member list */
566 	if (nl2 == NULL) {
567 		return (0);
568 	}
569 	/* Return 1 if node is in member list */
570 	return (1);
571 }
572 
573 /*
574  * meta_getnext_devinfo should go to the host that
575  * has the device, to return the device name, driver name, minor num.
576  * We can take the big cheat for now, since it is a requirement
577  * that the device names and device numbers are the same, and
578  * just get the info locally.
579  *
580  * This routine is very similar to meta_getnextside_devinfo except
581  * that the specific side to be used is being passed in.
582  *
583  * Exit status:
584  *	 0 - No more side info to return
585  *	 1 - More side info's to return
586  *	-1 - An error has been detected
587  */
588 /*ARGSUSED*/
589 int
590 meta_getside_devinfo(
591 	mdsetname_t	*sp,		/* for this set */
592 	char		*bname,		/* local block name (myside) */
593 	side_t		sideno,		/* sideno */
594 	char		**ret_bname,	/* block device name of returned side */
595 	char		**ret_dname,	/* driver name of returned side */
596 	minor_t		*ret_mnum,	/* minor number of returned side */
597 	md_error_t	*ep
598 )
599 {
600 	mdname_t	*np;
601 
602 	if (ret_bname != NULL)
603 		*ret_bname = NULL;
604 	if (ret_dname != NULL)
605 		*ret_dname = NULL;
606 	if (ret_mnum != NULL)
607 		*ret_mnum = NODEV32;
608 
609 
610 	if ((np = metaname(&sp, bname, ep)) == NULL)
611 		return (-1);
612 
613 /*
614  * NOTE (future) - There will be more work here once devids are integrated
615  * into disksets.  Then the side should be used to find the correct
616  * host and the b/d names should be gotten from that host.
617  */
618 
619 	/*
620 	 * Return the side info.
621 	 */
622 	if (ret_bname != NULL)
623 		*ret_bname = Strdup(np->bname);
624 
625 	if (ret_dname != NULL) {
626 		mdcinfo_t	*cinfo;
627 
628 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
629 			return (-1);
630 
631 		*ret_dname = Strdup(cinfo->dname);
632 	}
633 
634 	if (ret_mnum != NULL)
635 		*ret_mnum = meta_getminor(np->dev);
636 
637 	return (1);
638 }
639 
640 /*
641  * Get the information on the device from the remote node using the devid
642  * of the disk.
643  *
644  * Exit status:
645  *	 0 - No more side info to return
646  *	 1 - More side info's to return
647  *	-1 - An error has been detected
648  */
649 int
650 meta_getnextside_devinfo(
651 	mdsetname_t	*sp,		/* for this set */
652 	char		*bname,		/* local block name (myside) */
653 	side_t		*sideno,	/* previous sideno & returned sideno */
654 	char		**ret_bname,	/* block device name of returned side */
655 	char		**ret_dname,	/* driver name of returned side */
656 	minor_t		*ret_mnum,	/* minor number of returned side */
657 	md_error_t	*ep
658 )
659 {
660 	md_set_desc	*sd;
661 	int		i;
662 	mdname_t	*np;
663 	mddrivename_t	*dnp;
664 	char		*devidstr = NULL;
665 	int		devidstrlen;
666 	md_dev64_t	retdev = NODEV64;
667 	char		*ret_devname = NULL;
668 	char		*ret_blkdevname = NULL;
669 	char		*ret_driver = NULL;
670 	char		*nodename;
671 	int		fd;
672 	int		ret = -1;
673 	char		*minor_name = NULL;
674 	md_mnnode_desc	*nd;
675 
676 
677 	if (ret_bname != NULL)
678 		*ret_bname = NULL;
679 	if (ret_dname != NULL)
680 		*ret_dname = NULL;
681 	if (ret_mnum != NULL)
682 		*ret_mnum = NODEV32;
683 
684 	if (metaislocalset(sp)) {
685 		/* no more sides - we are done */
686 		if (*sideno != MD_SIDEWILD)
687 			return (0);
688 
689 		/* First time through -  set up return sideno */
690 		*sideno = 0;
691 	} else {
692 
693 		/*
694 		 * Find the next sideno, starting after the one given.
695 		 */
696 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
697 			return (-1);
698 
699 		if (MD_MNSET_DESC(sd)) {
700 			nd = sd->sd_nodelist;
701 			if ((*sideno == MD_SIDEWILD) &&
702 			    (nd != (struct md_mnnode_desc *)NULL)) {
703 				*sideno = nd->nd_nodeid;
704 			} else {
705 				while (nd) {
706 					/*
707 					 * Found given sideno, now find
708 					 * next sideno, if there is one.
709 					 */
710 					if ((*sideno == nd->nd_nodeid) &&
711 					    (nd->nd_next !=
712 					    (struct md_mnnode_desc *)NULL)) {
713 						*sideno =
714 						    nd->nd_next->nd_nodeid;
715 						break;
716 					}
717 					nd = nd->nd_next;
718 				}
719 				if (nd == NULL) {
720 					return (0);
721 				}
722 			}
723 			if (*sideno == MD_SIDEWILD)
724 				return (0);
725 		} else {
726 			for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
727 				/* Find next full slot */
728 				if (sd->sd_nodes[i][0] != '\0')
729 					break;
730 
731 			/* No more sides - we are done */
732 			if (i == MD_MAXSIDES)
733 				return (0);
734 
735 			/* Set up the return sideno */
736 			*sideno = i;
737 			nodename = (char *)sd->sd_nodes[i];
738 		}
739 	}
740 
741 	/*
742 	 * Need to pass the node the devid of the disk and get it to
743 	 * send back the details of the disk from that side.
744 	 */
745 	if ((np = metaname(&sp, bname, ep)) == NULL)
746 		return (-1);
747 
748 	dnp = np->drivenamep;
749 
750 	/*
751 	 * By default, set up the parameters so that they are copied out.
752 	 */
753 	if (ret_bname != NULL)
754 		*ret_bname = Strdup(np->bname);
755 
756 	if (ret_dname != NULL) {
757 		mdcinfo_t	*cinfo;
758 
759 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
760 			return (-1);
761 
762 		*ret_dname = Strdup(cinfo->dname);
763 	}
764 
765 	if (ret_mnum != NULL)
766 		*ret_mnum = meta_getminor(np->dev);
767 
768 	/*
769 	 * Try some optimization. If this is the local set or the device
770 	 * is a metadevice then just copy the information. If the device
771 	 * does not have a devid (due to not having a minor name) then
772 	 * fall back to the pre-devid behaviour of copying the information
773 	 * on the device: this is okay because the sanity checks before this
774 	 * call would have found any issues with the device. If it's a
775 	 * multi-node diskset also just return ie. copy.
776 	 */
777 	if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
778 	    (MD_MNSET_DESC(sd)))
779 		return (1);
780 
781 	if (np->minor_name == (char *)NULL) {
782 		/*
783 		 * Have to get the minor name then. The slice should exist
784 		 * on the disk because it will have already been repartitioned
785 		 * up prior to getting to this point.
786 		 */
787 		if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
788 			(void) mdsyserror(ep, errno, np->bname);
789 			return (-1);
790 		}
791 		(void) devid_get_minor_name(fd, &minor_name);
792 		np->minor_name = Strdup(minor_name);
793 		devid_str_free(minor_name);
794 		(void) close(fd);
795 	}
796 
797 	/* allocate extra space for "/" and NULL hence +2 */
798 	devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
799 	devidstr = (char *)Malloc(devidstrlen);
800 
801 	/*
802 	 * As a minor name is supplied then the ret_devname will be
803 	 * appropriate to that minor_name and in this case it will be
804 	 * a block device ie /dev/dsk.
805 	 */
806 	(void) snprintf(devidstr, devidstrlen,
807 		"%s/%s", dnp->devid, np->minor_name);
808 
809 	ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
810 	    np->bname, &ret_devname, &ret_driver, ep);
811 
812 	Free(devidstr);
813 
814 	/*
815 	 * If the other side is not running device id in disksets,
816 	 * 'ret' is set to ENOTSUP in which case we fallback to
817 	 * the existing behaviour
818 	 */
819 	if (ret == ENOTSUP)
820 		return (1);
821 	else if (ret == -1)
822 		return (-1);
823 
824 	/*
825 	 * ret_devname comes from the rpc call and is a
826 	 * raw device name. We need to make this into a
827 	 * block device via blkname for further processing.
828 	 * Unfortunately, when our device id isn't found in
829 	 * the system, the rpc call will return a " " in
830 	 * ret_devname in which case we need to fill that in
831 	 * as ret_blkname because blkname of " " returns NULL.
832 	 */
833 	if (ret_bname != NULL && ret_devname != NULL) {
834 		ret_blkdevname = blkname(ret_devname);
835 		if (ret_blkdevname == NULL)
836 			*ret_bname = Strdup(ret_devname);
837 		else
838 			*ret_bname = Strdup(ret_blkdevname);
839 	}
840 
841 	if (ret_dname != NULL && ret_driver != NULL)
842 		*ret_dname = Strdup(ret_driver);
843 
844 	if (ret_mnum != NULL)
845 		*ret_mnum = meta_getminor(retdev);
846 
847 	return (1);
848 }
849 
850 int
851 meta_is_drive_in_anyset(
852 	mddrivename_t	*dnp,
853 	mdsetname_t	**spp,
854 	int		bypass_daemon,
855 	md_error_t 	*ep
856 )
857 {
858 	set_t		setno;
859 	mdsetname_t	*this_sp;
860 	int		is_it;
861 	set_t		max_sets;
862 
863 	if ((max_sets = get_max_sets(ep)) == 0)
864 		return (-1);
865 
866 	assert(spp != NULL);
867 	*spp = NULL;
868 
869 	for (setno = 1; setno < max_sets; setno++) {
870 		if (!bypass_daemon) {
871 			if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
872 				if (mdismddberror(ep, MDE_DB_NODB)) {
873 					mdclrerror(ep);
874 					return (0);
875 				}
876 				if (mdiserror(ep, MDE_NO_SET)) {
877 					mdclrerror(ep);
878 					continue;
879 				}
880 				return (-1);
881 			}
882 		} else
883 			this_sp = metafakesetname(setno, NULL);
884 
885 		if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
886 		    bypass_daemon, ep)) == -1) {
887 			if (mdiserror(ep, MDE_NO_SET)) {
888 				mdclrerror(ep);
889 				continue;
890 			}
891 			return (-1);
892 		}
893 		if (is_it) {
894 			*spp = this_sp;
895 			return (0);
896 		}
897 	}
898 	return (0);
899 }
900 
901 int
902 meta_is_drive_in_thisset(
903 	mdsetname_t	*sp,
904 	mddrivename_t	*dnp,
905 	int		bypass_daemon,
906 	md_error_t	*ep
907 )
908 {
909 	md_drive_desc	*dd, *p;
910 
911 	if (bypass_daemon)
912 		dd = dr2drivedesc(sp, MD_SIDEWILD,
913 		    (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
914 	else
915 		dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
916 
917 	if (dd == NULL) {
918 		if (! mdisok(ep))
919 			return (-1);
920 		return (0);
921 	}
922 
923 
924 	for (p = dd; p != NULL; p = p->dd_next)
925 		if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
926 			return (1);
927 	return (0);
928 }
929 
930 int
931 meta_set_balance(
932 	mdsetname_t		*sp,
933 	md_error_t		*ep
934 )
935 {
936 	md_set_desc		*sd;
937 	md_drive_desc		*dd, *curdd;
938 	daddr_t			dbsize;
939 	daddr_t			nblks;
940 	int			i;
941 	int			rval = 0;
942 	sigset_t		oldsigs;
943 	md_setkey_t		*cl_sk;
944 	md_error_t		xep = mdnullerror;
945 	md_mnnode_desc		*nd;
946 	int			suspend1_flag = 0;
947 
948 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
949 		return (-1);
950 
951 	dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
952 
953 	/* Make sure we own the set */
954 	if (meta_check_ownership(sp, ep) != 0)
955 		return (-1);
956 
957 	/* END CHECK CODE */
958 
959 	/*
960 	 * Get drive descriptors for the drives that are currently in the set.
961 	 */
962 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
963 
964 	if (! mdisok(ep))
965 		return (-1);
966 
967 	/* Find the minimum replica size in use is or use the default */
968 	if ((nblks = meta_db_minreplica(sp, ep)) < 0)
969 		mdclrerror(ep);
970 	else
971 		dbsize = nblks;	/* adjust replica size */
972 
973 	/* Make sure we are blocking all signals */
974 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
975 		mdclrerror(&xep);
976 
977 	/*
978 	 * Lock the set on current set members.
979 	 * For MN diskset lock_set and SUSPEND are used to protect against
980 	 * other meta* commands running on the other nodes.
981 	 */
982 	if (MD_MNSET_DESC(sd)) {
983 		nd = sd->sd_nodelist;
984 		while (nd) {
985 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
986 				nd = nd->nd_next;
987 				continue;
988 			}
989 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
990 				rval = -1;
991 				goto out;
992 			}
993 			nd = nd->nd_next;
994 		}
995 		/*
996 		 * Lock out other meta* commands by suspending
997 		 * class 1 messages across the diskset.
998 		 */
999 		nd = sd->sd_nodelist;
1000 		while (nd) {
1001 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1002 				nd = nd->nd_next;
1003 				continue;
1004 			}
1005 			if (clnt_mdcommdctl(nd->nd_nodename,
1006 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1007 			    MD_MSCF_NO_FLAGS, ep)) {
1008 				rval = -1;
1009 				goto out;
1010 			}
1011 			suspend1_flag = 1;
1012 			nd = nd->nd_next;
1013 		}
1014 	} else {
1015 		for (i = 0; i < MD_MAXSIDES; i++) {
1016 			/* Skip empty slots */
1017 			if (sd->sd_nodes[i][0] == '\0') continue;
1018 
1019 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1020 				rval = -1;
1021 				goto out;
1022 			}
1023 		}
1024 	}
1025 
1026 	/* We are not adding or deleting any drives, just balancing */
1027 	dd = NULL;
1028 
1029 	/*
1030 	 * Balance the DB's according to the list of existing drives and the
1031 	 * list of added drives.
1032 	 */
1033 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
1034 		goto out;
1035 
1036 out:
1037 	/*
1038 	 * Unlock diskset by resuming class 1 messages across the diskset.
1039 	 * Just resume all classes so that resume is the same whether
1040 	 * just one class was locked or all classes were locked.
1041 	 */
1042 	if (suspend1_flag) {
1043 		nd = sd->sd_nodelist;
1044 		while (nd) {
1045 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1046 				nd = nd->nd_next;
1047 				continue;
1048 			}
1049 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1050 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1051 				/*
1052 				 * We are here because we failed to resume
1053 				 * rpc.mdcommd.  However we potentially have
1054 				 * an error from the previous call
1055 				 * (meta_db_balance). If the previous call
1056 				 * did fail,  we capture that error and
1057 				 * generate a perror withthe string,
1058 				 * "Unable to resume...".
1059 				 * Setting rval to -1 ensures that in the
1060 				 * next iteration of the loop, ep is not
1061 				 * clobbered.
1062 				 */
1063 				if (rval == 0)
1064 					(void) mdstealerror(ep, &xep);
1065 				else
1066 					mdclrerror(&xep);
1067 				rval = -1;
1068 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1069 				    "Unable to resume rpc.mdcommd."));
1070 			}
1071 			nd = nd->nd_next;
1072 		}
1073 	}
1074 
1075 	/* Unlock the set */
1076 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1077 	if (MD_MNSET_DESC(sd)) {
1078 		nd = sd->sd_nodelist;
1079 		while (nd) {
1080 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1081 				nd = nd->nd_next;
1082 				continue;
1083 			}
1084 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1085 				if (rval == 0)
1086 					(void) mdstealerror(ep, &xep);
1087 				else
1088 					mdclrerror(&xep);
1089 				rval = -1;
1090 			}
1091 			nd = nd->nd_next;
1092 		}
1093 	} else {
1094 		for (i = 0; i < MD_MAXSIDES; i++) {
1095 			/* Skip empty slots */
1096 			if (sd->sd_nodes[i][0] == '\0')
1097 				continue;
1098 
1099 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1100 				if (rval == 0)
1101 					(void) mdstealerror(ep, &xep);
1102 				rval = -1;
1103 			}
1104 		}
1105 	}
1106 
1107 	/* release signals back to what they were on entry */
1108 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1109 		mdclrerror(&xep);
1110 
1111 	cl_set_setkey(NULL);
1112 
1113 	metaflushsetname(sp);
1114 
1115 	return (rval);
1116 }
1117 
1118 int
1119 meta_set_destroy(
1120 	mdsetname_t	*sp,
1121 	int		lock_set,
1122 	md_error_t	*ep
1123 )
1124 {
1125 	int		i;
1126 	med_rec_t	medr;
1127 	md_set_desc	*sd;
1128 	md_drive_desc	*dd, *p, *p1;
1129 	mddrivename_t	*dnp;
1130 	mdname_t	*np;
1131 	mdnamelist_t	*nlp = NULL;
1132 	int		num_users = 0;
1133 	int		has_set;
1134 	side_t		mysideno;
1135 	sigset_t	oldsigs;
1136 	md_error_t	xep = mdnullerror;
1137 	md_setkey_t	*cl_sk;
1138 	int		rval = 0;
1139 	int		delete_end = 1;
1140 
1141 	/* Make sure we are blocking all signals */
1142 	if (procsigs(TRUE, &oldsigs, ep) < 0)
1143 		return (-1);
1144 
1145 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1146 		if (! mdisok(ep))
1147 			rval = -1;
1148 		goto out;
1149 	}
1150 
1151 	/*
1152 	 * meta_set_destroy should not be called for a MN diskset.
1153 	 * This routine destroys a set without communicating this information
1154 	 * to the other nodes which would lead to an inconsistency in
1155 	 * the MN diskset.
1156 	 */
1157 	if (MD_MNSET_DESC(sd)) {
1158 		rval = -1;
1159 		goto out;
1160 	}
1161 
1162 	/* Continue if a traditional diskset */
1163 
1164 	/*
1165 	 * Check to see who has the set.  If we are not the last user of the
1166 	 * set, we will not touch the replicas.
1167 	 */
1168 	for (i = 0; i < MD_MAXSIDES; i++) {
1169 		/* Skip empty slots */
1170 		if (sd->sd_nodes[i][0] == '\0')
1171 			continue;
1172 
1173 		has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
1174 		    ep);
1175 
1176 		if (has_set < 0) {
1177 			mdclrerror(ep);
1178 		} else
1179 			num_users++;
1180 	}
1181 
1182 	if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
1183 		if (! mdisok(ep)) {
1184 			rval = -1;
1185 			goto out;
1186 		}
1187 	}
1188 
1189 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
1190 		rval = -1;
1191 		goto out;
1192 	}
1193 
1194 	if (lock_set == TRUE) {
1195 		/* Lock the set on our side */
1196 		if (clnt_lock_set(mynode(), sp, ep)) {
1197 			rval = -1;
1198 			goto out;
1199 		}
1200 	}
1201 
1202 	/*
1203 	 * A traditional diskset has no diskset stale information to send
1204 	 * since there can only be one owner node at a time.
1205 	 */
1206 	if (snarf_set(sp, FALSE, ep))
1207 		mdclrerror(ep);
1208 
1209 	if (dd != NULL) {
1210 		/*
1211 		 * Make sure that no drives are in use as parts of metadrives
1212 		 * or hot spare pools, this is one of the few error conditions
1213 		 * that will stop this routine, unless the environment has
1214 		 * META_DESTROY_SET_OK set, in which case, the operation will
1215 		 * proceed.
1216 		 */
1217 		if (getenv("META_DESTROY_SET_OK") == NULL) {
1218 			for (p = dd; p != NULL; p = p->dd_next) {
1219 				dnp = p->dd_dnp;
1220 
1221 				i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
1222 				if (i == -1) {
1223 					/* need xep - wire calls clear error */
1224 					i = metaget_setownership(sp, &xep);
1225 					if (i == -1) {
1226 						rval = -1;
1227 						goto out;
1228 					}
1229 
1230 					mysideno = getmyside(sp, &xep);
1231 
1232 					if (mysideno == MD_SIDEWILD) {
1233 						rval = -1;
1234 						goto out;
1235 					}
1236 
1237 					if (sd->sd_isown[mysideno] == FALSE)
1238 						if (halt_set(sp, &xep)) {
1239 							rval = -1;
1240 							goto out;
1241 						}
1242 
1243 					rval = -1;
1244 					goto out;
1245 				}
1246 			}
1247 		}
1248 
1249 		for (i = 0; i < MD_MAXSIDES; i++) {
1250 			/* Skip empty slots */
1251 			if (sd->sd_nodes[i][0] == '\0')
1252 				continue;
1253 
1254 			/* Skip non local nodes */
1255 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1256 				continue;
1257 
1258 			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
1259 				mdclrerror(ep);
1260 		}
1261 
1262 		/*
1263 		 * Go thru each drive and individually delete the replicas.
1264 		 * This way we can ignore individual errors.
1265 		 */
1266 		for (p = dd; p != NULL; p = p->dd_next) {
1267 			uint_t	rep_slice;
1268 
1269 			dnp = p->dd_dnp;
1270 			if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
1271 			    (((np = metaslicename(dnp, rep_slice, ep))
1272 				== NULL) &&
1273 				((np = metaslicename(dnp, MD_SLICE0, ep))
1274 				    == NULL))) {
1275 				rval = -1;
1276 				goto out;
1277 			}
1278 
1279 			if ((np = metaslicename(dnp,
1280 			    rep_slice, ep)) == NULL) {
1281 				if ((np = metaslicename(dnp,
1282 				    MD_SLICE0, ep)) == NULL) {
1283 					rval = -1;
1284 					goto out;
1285 				}
1286 				mdclrerror(ep);
1287 			}
1288 
1289 			/* Yes this is UGLY!!! */
1290 			p1 = p->dd_next;
1291 			p->dd_next = NULL;
1292 			if (rel_own_bydd(sp, p, FALSE, ep))
1293 				mdclrerror(ep);
1294 			p->dd_next = p1;
1295 
1296 			if (p->dd_dbcnt == 0)
1297 				continue;
1298 
1299 			/*
1300 			 * Skip the replica removal if we are not the last user
1301 			 */
1302 			if (num_users != 1)
1303 				continue;
1304 
1305 			nlp = NULL;
1306 			(void) metanamelist_append(&nlp, np);
1307 			if (meta_db_detach(sp, nlp,
1308 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
1309 				mdclrerror(ep);
1310 			metafreenamelist(nlp);
1311 		}
1312 	}
1313 
1314 	if (halt_set(sp, ep)) {
1315 		rval = -1;
1316 		goto out;
1317 	}
1318 
1319 	/* Setup the mediator record */
1320 	(void) memset(&medr, '\0', sizeof (med_rec_t));
1321 	medr.med_rec_mag = MED_REC_MAGIC;
1322 	medr.med_rec_rev = MED_REC_REV;
1323 	medr.med_rec_fl  = 0;
1324 	medr.med_rec_sn  = sp->setno;
1325 	(void) strcpy(medr.med_rec_snm, sp->setname);
1326 	medr.med_rec_meds = sd->sd_med;	/* structure assigment */
1327 	(void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
1328 	medr.med_rec_foff = 0;
1329 
1330 	/*
1331 	 * If we are the last remaining user, then remove the mediator hosts
1332 	 */
1333 	if (num_users == 1) {
1334 		for (i = 0; i < MED_MAX_HOSTS; i++) {
1335 			if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
1336 				SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
1337 				    SVM_TAG_MEDIATOR, sp->setno, i);
1338 			(void) memset(&medr.med_rec_meds.n_lst[i], '\0',
1339 			    sizeof (md_h_t));
1340 		}
1341 		medr.med_rec_meds.n_cnt = 0;
1342 	} else { 	/* Remove this host from the mediator node list. */
1343 		for (i = 0; i < MD_MAXSIDES; i++) {
1344 			/* Skip empty slots */
1345 			if (sd->sd_nodes[i][0] == '\0')
1346 				continue;
1347 
1348 			/* Copy non local node */
1349 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
1350 				(void) strcpy(medr.med_rec_nodes[i],
1351 				    sd->sd_nodes[i]);
1352 				continue;
1353 			}
1354 
1355 			/* Clear local node */
1356 			(void) memset(&medr.med_rec_nodes[i], '\0',
1357 			    sizeof (md_node_nm_t));
1358 		}
1359 	}
1360 
1361 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
1362 
1363 	/*
1364 	 * If the client is part of a cluster put the DCS service
1365 	 * into a deleteing state.
1366 	 */
1367 	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1368 		if (metad_isautotakebyname(sp->setname)) {
1369 			delete_end = 0;
1370 		} else {
1371 			mdclrerror(ep);
1372 			goto out;
1373 		}
1374 	}
1375 
1376 	/* Inform the mediator hosts of the new information */
1377 	for (i = 0; i < MED_MAX_HOSTS; i++) {
1378 		if (sd->sd_med.n_lst[i].a_cnt == 0)
1379 			continue;
1380 
1381 		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
1382 			mdclrerror(ep);
1383 	}
1384 
1385 	/* Delete the set locally */
1386 	for (i = 0; i < MD_MAXSIDES; i++) {
1387 		/* Skip empty slots */
1388 		if (sd->sd_nodes[i][0] == '\0')
1389 			continue;
1390 
1391 		/* Skip non local nodes */
1392 		if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1393 			continue;
1394 
1395 		if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
1396 			mdclrerror(ep);
1397 	}
1398 	if (delete_end &&
1399 	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1400 		rval = -1;
1401 
1402 out:
1403 	/* release signals back to what they were on entry */
1404 	if (procsigs(FALSE, &oldsigs, &xep) < 0) {
1405 		if (rval == 0)
1406 			(void) mdstealerror(ep, &xep);
1407 		rval = -1;
1408 	}
1409 
1410 	if (lock_set == TRUE) {
1411 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1412 		if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1413 			if (rval == 0)
1414 				(void) mdstealerror(ep, &xep);
1415 			rval = -1;
1416 		}
1417 		cl_set_setkey(NULL);
1418 	}
1419 
1420 	metaflushsetname(sp);
1421 	return (rval);
1422 }
1423 
1424 int
1425 meta_set_purge(
1426 	mdsetname_t	*sp,
1427 	int		bypass_cluster,
1428 	int		forceflg,
1429 	md_error_t	*ep
1430 )
1431 {
1432 	char		*thishost = mynode();
1433 	md_set_desc	*sd;
1434 	md_setkey_t	*cl_sk;
1435 	md_error_t	xep = mdnullerror;
1436 	int		rval = 0;
1437 	int		i, num_hosts = 0;
1438 	int		has_set = 0;
1439 	int		max_node = 0;
1440 	int		delete_end = 1;
1441 	md_mnnode_desc	*nd;
1442 
1443 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1444 		/* unable to find set description */
1445 		rval = 1;
1446 		return (rval);
1447 	}
1448 
1449 	if (MD_MNSET_DESC(sd)) {
1450 		/*
1451 		 * Get a count of the hosts in the set and also lock the set
1452 		 * on those hosts that know about it.
1453 		 */
1454 		nd = sd->sd_nodelist;
1455 		while (nd) {
1456 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1457 				nd = nd->nd_next;
1458 				continue;
1459 			}
1460 			has_set = nodehasset(sp, nd->nd_nodename,
1461 				NHS_NST_EQ, ep);
1462 
1463 			/*
1464 			 * The host is not aware of this set (has_set < 0) or
1465 			 * the set does not match (has_set == 0). This check
1466 			 * prevents the code getting confused by an apparent
1467 			 * inconsistancy in the set's state, this is in the
1468 			 * purge code so something is broken in any case and
1469 			 * this is just trying to fix the brokeness.
1470 			 */
1471 			if (has_set <= 0) {
1472 				mdclrerror(ep);
1473 				nd->nd_flags |= MD_MN_NODE_NOSET;
1474 			} else {
1475 				num_hosts++;
1476 				if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1477 					/*
1478 					 * If the force flag is set then
1479 					 * ignore any RPC failures because we
1480 					 * are only really interested with
1481 					 * the set on local node.
1482 					 */
1483 					if (forceflg && mdanyrpcerror(ep)) {
1484 						mdclrerror(ep);
1485 					} else {
1486 						/*
1487 						 * set max_node so that in the
1488 						 * unlock code nodes in the
1489 						 * set that have not been
1490 						 * locked are not unlocked.
1491 						 */
1492 						max_node = nd->nd_nodeid;
1493 						rval = 2;
1494 						goto out1;
1495 					}
1496 				}
1497 
1498 			}
1499 			nd = nd->nd_next;
1500 		}
1501 		max_node = 0;
1502 	} else {
1503 		/*
1504 		 * Get a count of the hosts in the set and also lock the set
1505 		 * on those hosts that know about it.
1506 		 */
1507 		for (i = 0; i < MD_MAXSIDES; i++) {
1508 			/* Skip empty slots */
1509 			if (sd->sd_nodes[i][0] == '\0')
1510 				continue;
1511 
1512 			has_set = nodehasset(sp, sd->sd_nodes[i],
1513 				NHS_NST_EQ, ep);
1514 
1515 			/*
1516 			 * The host is not aware of this set (has_set < 0) or
1517 			 * the set does not match (has_set == 0). This check
1518 			 * prevents the code getting confused by an apparent
1519 			 * inconsistancy in the set's state, this is in the
1520 			 * purge code so something is broken in any case and
1521 			 * this is just trying to fix the brokeness.
1522 			 */
1523 			if (has_set <= 0) {
1524 				mdclrerror(ep);
1525 				/*
1526 				 * set the node to NULL to prevent further
1527 				 * requests to this unresponsive node.
1528 				 */
1529 				sd->sd_nodes[i][0] = '\0';
1530 			} else {
1531 				num_hosts++;
1532 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1533 					/*
1534 					 * If the force flag is set then
1535 					 * ignore any RPC failures because we
1536 					 * are only really interested with
1537 					 * the set on local node.
1538 					 */
1539 					if (forceflg && mdanyrpcerror(ep)) {
1540 						mdclrerror(ep);
1541 					} else {
1542 						rval = 2;
1543 						/*
1544 						 * set max_node so that in the
1545 						 * unlock code nodes in the
1546 						 * set that have not been
1547 						 * locked are not unlocked.
1548 						 */
1549 						max_node = i;
1550 						goto out1;
1551 					}
1552 				}
1553 			}
1554 		}
1555 		max_node = i;	/* now MD_MAXSIDES */
1556 	}
1557 	if (!bypass_cluster) {
1558 		/*
1559 		 * If there is only one host associated with the
1560 		 * set then remove the set from the cluster.
1561 		 */
1562 		if (num_hosts == 1) {
1563 			if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1564 				if (metad_isautotakebyname(sp->setname)) {
1565 					delete_end = 0;
1566 				} else {
1567 					mdclrerror(ep);
1568 					rval = 3;
1569 					goto out1;
1570 				}
1571 			}
1572 		}
1573 	}
1574 
1575 	if (MD_MNSET_DESC(sd)) {
1576 		/*
1577 		 * Get a count of the hosts in the set and also lock the set
1578 		 * on those hosts that know about it.
1579 		 */
1580 		nd = sd->sd_nodelist;
1581 		while (nd) {
1582 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1583 				nd = nd->nd_next;
1584 				continue;
1585 			}
1586 			if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
1587 				/*
1588 				 * Tell the remote node to remove this node
1589 				 */
1590 				if (clnt_delhosts(nd->nd_nodename, sp, 1,
1591 					&thishost, ep) == -1) {
1592 					/*
1593 					 * If we fail to delete ourselves
1594 					 * from the remote host it does not
1595 					 * really matter because the set is
1596 					 * being "purged" from this node. The
1597 					 * set can be purged from the other
1598 					 * node at a later time.
1599 					 */
1600 					mdclrerror(ep);
1601 				}
1602 				nd = nd->nd_next;
1603 				continue;
1604 			}
1605 			/* remove the set from this host */
1606 			if (clnt_delset(nd->nd_nodename, sp, ep) == -1) {
1607 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1608 				if (!bypass_cluster && num_hosts == 1)
1609 					(void) sdssc_delete_end(sp->setname,
1610 					    SDSSC_CLEANUP);
1611 				mdclrerror(ep);
1612 				goto out1;
1613 			}
1614 			nd = nd->nd_next;
1615 		}
1616 	} else {
1617 		for (i = 0; i < MD_MAXSIDES; i++) {
1618 			/* Skip empty slots */
1619 			if (sd->sd_nodes[i][0] == '\0')
1620 				continue;
1621 			if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
1622 				/*
1623 				 * Tell the remote node to remove this node
1624 				 */
1625 				if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
1626 				    &thishost, ep) == -1) {
1627 					/*
1628 					 * If we fail to delete ourselves
1629 					 * from the remote host it does not
1630 					 * really matter because the set is
1631 					 * being "purged" from this node. The
1632 					 * set can be purged from the other
1633 					 * node at a later time.
1634 					 */
1635 					mdclrerror(ep);
1636 				}
1637 				continue;
1638 			}
1639 
1640 			/* remove the set from this host */
1641 			if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
1642 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1643 				if (!bypass_cluster && num_hosts == 1)
1644 					(void) sdssc_delete_end(sp->setname,
1645 					    SDSSC_CLEANUP);
1646 				mdclrerror(ep);
1647 				goto out1;
1648 			}
1649 		}
1650 	}
1651 
1652 	if (!bypass_cluster && num_hosts == 1) {
1653 		if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
1654 		    SDSSC_ERROR) {
1655 			rval = 4;
1656 		}
1657 	}
1658 
1659 out1:
1660 
1661 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1662 
1663 	/*
1664 	 * Remove the set lock on those nodes that had the set locked
1665 	 * max_node will either be MD_MAXSIDES or array index of the last
1666 	 * node contacted (or rather failed to contact) for traditional
1667 	 * diskset.  For a MN diskset, max_node is the node_id of the node
1668 	 * that failed the lock.
1669 	 */
1670 	if (MD_MNSET_DESC(sd)) {
1671 		nd = sd->sd_nodelist;
1672 		while (nd) {
1673 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1674 				nd = nd->nd_next;
1675 				continue;
1676 			}
1677 			if (nd->nd_nodeid == max_node)
1678 				break;
1679 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1680 				if (forceflg && mdanyrpcerror(&xep)) {
1681 					mdclrerror(&xep);
1682 					nd = nd->nd_next;
1683 					continue;
1684 				}
1685 				if (rval == 0)
1686 					(void) mdstealerror(ep, &xep);
1687 				rval = 5;
1688 			}
1689 			nd = nd->nd_next;
1690 		}
1691 	} else {
1692 		for (i = 0; i < max_node; i++) {
1693 			/* Skip empty slots */
1694 			if (sd->sd_nodes[i][0] == '\0')
1695 				continue;
1696 
1697 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1698 				if (forceflg && mdanyrpcerror(&xep)) {
1699 					mdclrerror(&xep);
1700 					continue;
1701 				}
1702 				if (rval == 0)
1703 					(void) mdstealerror(ep, &xep);
1704 				rval = 5;
1705 			}
1706 		}
1707 	}
1708 
1709 	cl_set_setkey(NULL);
1710 
1711 	return (rval);
1712 }
1713 
1714 int
1715 meta_set_query(
1716 	mdsetname_t		*sp,
1717 	mddb_dtag_lst_t		**dtlpp,
1718 	md_error_t		*ep
1719 )
1720 {
1721 	mddb_dtag_get_parm_t	dtgp;
1722 
1723 	(void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
1724 	dtgp.dtgp_setno = sp->setno;
1725 
1726 	/*CONSTCOND*/
1727 	while (1) {
1728 		if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
1729 			if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
1730 			    *dtlpp == NULL)
1731 				return (mdstealerror(ep, &dtgp.dtgp_mde));
1732 			else
1733 				break;
1734 
1735 		/*
1736 		 * Run to the end of the list
1737 		 */
1738 		for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
1739 			/* void */;
1740 
1741 		*dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
1742 
1743 		(void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
1744 		    sizeof (mddb_dtag_t));
1745 
1746 		dtgp.dtgp_dt.dt_id++;
1747 	}
1748 	return (0);
1749 }
1750 
1751 /*
1752  * return drivename get by key
1753  */
1754 mddrivename_t *
1755 metadrivename_withdrkey(
1756 	mdsetname_t	*sp,
1757 	side_t		sideno,
1758 	mdkey_t		key,
1759 	int		flags,
1760 	md_error_t	*ep
1761 )
1762 {
1763 	char		*nm;
1764 	mdname_t	*np;
1765 	mddrivename_t	*dnp;
1766 	ddi_devid_t	devidp;
1767 	md_set_desc	*sd;
1768 
1769 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1770 		return (NULL);
1771 	}
1772 
1773 	/* get namespace info */
1774 	if (MD_MNSET_DESC(sd)) {
1775 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno,
1776 		    key, ep)) == NULL)
1777 			return (NULL);
1778 	} else {
1779 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno+SKEW,
1780 		    key, ep)) == NULL)
1781 			return (NULL);
1782 	}
1783 
1784 	/* get device name */
1785 	if (flags & PRINT_FAST) {
1786 		if ((np = metaname_fast(&sp, nm, ep)) == NULL) {
1787 			Free(nm);
1788 			return (NULL);
1789 		}
1790 	} else {
1791 		if ((np = metaname(&sp, nm, ep)) == NULL) {
1792 			Free(nm);
1793 			return (NULL);
1794 		}
1795 	}
1796 	Free(nm);
1797 
1798 	/* make sure it's OK */
1799 	if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, ep) != 0))
1800 		return (NULL);
1801 
1802 	/* get drivename */
1803 	dnp = np->drivenamep;
1804 	dnp->side_names_key = key;
1805 
1806 	/*
1807 	 * Skip the following devid check if dnp is did device
1808 	 * The device id is disabled for did device due to the
1809 	 * lack of minor name support in the did driver. The following
1810 	 * devid code path can set and propagate the error and
1811 	 * eventually prevent did disks from being added to the
1812 	 * diskset under SunCluster systems
1813 	 */
1814 	if (strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) == 0) {
1815 		goto out;
1816 	}
1817 
1818 	/* Also, Skip the check if MN diskset, no devid's */
1819 	if (MD_MNSET_DESC(sd)) {
1820 		goto out;
1821 	}
1822 
1823 	/*
1824 	 * Get the devid associated with the key.
1825 	 *
1826 	 * If a devid was returned, it MUST be valid even in
1827 	 * the case where a device id has been "updated". The
1828 	 * "update" of the device id may have occured due to
1829 	 * a firmware upgrade.
1830 	 */
1831 	if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
1832 	    != NULL) {
1833 		dnp->devid = devid_str_encode(devidp, NULL);
1834 		free(devidp);
1835 	} else {
1836 		/*
1837 		 * It is okay if replica is not in devid mode
1838 		 */
1839 		if (mdissyserror(ep, MDDB_F_NODEVID)) {
1840 			mdclrerror(ep);
1841 			goto out;
1842 		}
1843 
1844 		/*
1845 		 * devid is missing so this means that we have
1846 		 * just upgraded from a configuration where
1847 		 * devid's were not used so try to add in
1848 		 * the devid and requery.
1849 		 */
1850 		if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key,
1851 		    ep) < 0)
1852 			return (NULL);
1853 		if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
1854 		    sideno+SKEW, key, ep)) == NULL)
1855 			return (NULL);
1856 		dnp->devid = devid_str_encode(devidp, NULL);
1857 		devid_free(devidp);
1858 	}
1859 
1860 out:
1861 	if (flags & MD_BYPASS_DAEMON)
1862 		return (dnp);
1863 
1864 	if (get_sidenmlist(sp, dnp, ep))
1865 		return (NULL);
1866 
1867 	/* return success */
1868 	return (dnp);
1869 }
1870 
1871 void
1872 metafreedrivedesc(md_drive_desc **dd)
1873 {
1874 	md_drive_desc	*p, *next = NULL;
1875 
1876 	for (p = *dd; p != NULL; p = next) {
1877 		next = p->dd_next;
1878 		Free(p);
1879 	}
1880 	*dd = NULL;
1881 }
1882 
1883 md_drive_desc *
1884 metaget_drivedesc(
1885 	mdsetname_t	*sp,
1886 	int		flags,
1887 	md_error_t	*ep
1888 )
1889 {
1890 	side_t		sideno = MD_SIDEWILD;
1891 
1892 	assert(! (flags & MD_BYPASS_DAEMON));
1893 
1894 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
1895 		return (NULL);
1896 
1897 	return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
1898 }
1899 
1900 md_drive_desc *
1901 metaget_drivedesc_fromnamelist(
1902 	mdsetname_t	*sp,
1903 	mdnamelist_t	*nlp,
1904 	md_error_t	*ep
1905 )
1906 {
1907 	md_set_desc		*sd;
1908 	mdnamelist_t		*p;
1909 	md_drive_desc		*dd = NULL;
1910 
1911 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1912 		return (NULL);
1913 
1914 	for (p = nlp; p != NULL; p = p->next)
1915 		(void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
1916 		    sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
1917 
1918 	return (dd);
1919 }
1920 
1921 md_drive_desc *
1922 metaget_drivedesc_sideno(
1923 	mdsetname_t *sp,
1924 	side_t sideno,
1925 	int flags,
1926 	md_error_t *ep
1927 )
1928 {
1929 	md_set_desc	*sd = NULL;
1930 
1931 	assert(! (flags & MD_BYPASS_DAEMON));
1932 
1933 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1934 		return (NULL);
1935 
1936 	if (sd->sd_drvs)
1937 		return (sd->sd_drvs);
1938 
1939 	if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
1940 		return (NULL);
1941 
1942 	return (sd->sd_drvs);
1943 }
1944 
1945 int
1946 metaget_setownership(
1947 	mdsetname_t	*sp,
1948 	md_error_t	*ep
1949 )
1950 {
1951 	md_set_desc	*sd;
1952 	int		bool;
1953 	int		i;
1954 	md_mnnode_desc	*nd;
1955 
1956 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1957 		return (-1);
1958 
1959 	if (MD_MNSET_DESC(sd)) {
1960 		nd = sd->sd_nodelist;
1961 		while (nd) {
1962 			/* If node isn't alive, can't own diskset */
1963 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1964 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1965 				nd = nd->nd_next;
1966 				continue;
1967 			}
1968 			/*
1969 			 * If can't communicate with rpc.metad, then mark
1970 			 * this node as not an owner.  That node may
1971 			 * in fact, be an owner, but without rpc.metad running
1972 			 * that node can't do much.
1973 			 */
1974 			if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
1975 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1976 			} else if (bool == TRUE) {
1977 				nd->nd_flags |= MD_MN_NODE_OWN;
1978 			} else {
1979 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1980 			}
1981 			nd = nd->nd_next;
1982 		}
1983 		return (0);
1984 	}
1985 
1986 	/* Rest of code handles traditional disksets */
1987 
1988 	for (i = 0; i < MD_MAXSIDES; i++)
1989 		sd->sd_isown[i] = 0;
1990 
1991 	if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
1992 		return (-1);
1993 
1994 	if (bool == TRUE)
1995 		sd->sd_isown[getmyside(sp, ep)] = 1;
1996 
1997 	return (0);
1998 }
1999 
2000 char *
2001 mynode(void)
2002 {
2003 	static struct utsname	myuname;
2004 	static int		done = 0;
2005 
2006 	if (! done) {
2007 		if (uname(&myuname) == -1) {
2008 			md_perror(dgettext(TEXT_DOMAIN, "uname"));
2009 			assert(0);
2010 		}
2011 		done = 1;
2012 	}
2013 	return (myuname.nodename);
2014 }
2015 
/*
 * Report whether str is one of the first cnt strings in lst.
 * Returns TRUE on an exact strcmp() match, FALSE otherwise (including
 * when cnt is not positive).
 */
int
strinlst(char *str, int cnt, char **lst)
{
	int	idx = 0;

	while (idx < cnt) {
		if (strcmp(lst[idx], str) == 0)
			return (TRUE);
		idx++;
	}

	return (FALSE);
}
2027 
2028 /*
2029  * meta_get_reserved_names
2030  *  returns an mdnamelist_t of reserved slices
2031  *  reserved slices are those that are used but don't necessarily
2032  *  show up as metadevices (ex. reserved slice for db in sets, logs)
2033  */
2034 
2035 /*ARGSUSED*/
2036 int
2037 meta_get_reserved_names(
2038 	mdsetname_t	*sp,
2039 	mdnamelist_t	**nlpp,
2040 	int		options,
2041 	md_error_t	*ep)
2042 {
2043 	int		 count		= 0;
2044 	mdname_t	*np		= NULL;
2045 	mdnamelist_t	*transnlp	= NULL;
2046 	mdnamelist_t	**tailpp 	= nlpp;
2047 	mdnamelist_t	*nlp;
2048 	md_drive_desc	*dd, *di;
2049 
2050 	if (metaislocalset(sp))
2051 		goto out;
2052 
2053 	if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
2054 		count = -1;
2055 		goto out;
2056 	}
2057 
2058 	/* db in for sets on reserved slice */
2059 	for (di = dd; di && count >= 0; di = di->dd_next) {
2060 		uint_t	rep_slice;
2061 
2062 		/*
2063 		 * Add the name struct to the end of the
2064 		 * namelist but keep a pointer to the last
2065 		 * element so that we don't incur the overhead
2066 		 * of traversing the list each time
2067 		 */
2068 		if (di->dd_dnp &&
2069 		    (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
2070 		    (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
2071 		    (tailpp = meta_namelist_append_wrapper(tailpp, np)))
2072 			count++;
2073 		else
2074 			count = -1;
2075 	}
2076 
2077 	/* now find logs */
2078 	if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
2079 		count = -1;
2080 		goto out;
2081 	}
2082 
2083 	for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
2084 		mdname_t	*transnp = nlp->namep;
2085 		md_trans_t	*transp;
2086 
2087 		if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
2088 			count = -1;
2089 			goto out;
2090 		}
2091 		if (transp->lognamep) {
2092 			/*
2093 			 * Add the name struct to the end of the
2094 			 * namelist but keep a pointer to the last
2095 			 * element so that we don't incur the overhead
2096 			 * of traversing the list each time
2097 			 */
2098 			tailpp = meta_namelist_append_wrapper(
2099 			    tailpp, transp->lognamep);
2100 		}
2101 	}
2102 out:
2103 	metafreenamelist(transnlp);
2104 	return (count);
2105 }
2106 
2107 /*
2108  * Entry point to join a node to MultiNode diskset.
2109  *
2110  * Validate host in diskset.
2111  *	- Should be in membership list from API
2112  *	- Should not already be joined into diskset.
2113  *	- Set must have drives
2114  * Assume valid configuration is stored in the set/drive/node records
2115  * in the local mddb since no node or drive can be added to the MNset
2116  * unless all drives and nodes are available.  Reconfig steps will
2117  * resync all ALIVE nodes in case of panic in critical areas.
2118  *
2119  * Lock down the set.
2120  * Verify host is a member of this diskset.
2121  * If drives exist in the configuration, load the mddbs.
2122  * Set this node to active by notifying master if one exists.
2123  * If this is the first node active in the diskset, this node
2124  * 	becomes the master.
2125  * Unlock the set.
2126  *
2127  * Mirror Resync:
2128  * If this node is the last node to join the set and clustering
2129  * isn't running, then start the 'metasync -r' type resync
2130  * on all mirrors in this diskset.
2131  * If clustering is running, this resync operation will
2132  * be handled by the reconfig steps and should NOT
2133  * be handled during a join operation.
2134  *
2135  * There are multiple return values in order to assist
2136  * the join operation of all sets in the metaset command.
2137  *
2138  * Return values:
2139  *	0  - Node successfully joined to set.
2140  *	-1 - Join attempted but failed
2141  *		- any failure from libmeta calls
2142  *		- node not in the member list
2143  *	-2 - Join not attempted since
2144  *		- this set had no drives in set
2145  *		- this node already joined to set
2146  *		- set is not a multinode set
2147  *	-3 - Node joined to STALE set.
2148  */
2149 extern int
2150 meta_set_join(
2151 	mdsetname_t	*sp,
2152 	md_error_t	*ep
2153 )
2154 {
2155 	md_set_desc		*sd;
2156 	md_drive_desc		*dd;
2157 	md_mnnode_desc		*nd, *nd2, my_nd;
2158 	int			rval = 0;
2159 	md_setkey_t		*cl_sk;
2160 	md_error_t		xep = mdnullerror;
2161 	md_error_t		ep_snarf = mdnullerror;
2162 	int			master_flag = 0;
2163 	md_mnset_record		*mas_mnsr = NULL;
2164 	int			clear_nr_flags = 0;
2165 	md_mnnode_record	*nr;
2166 	int			stale_set = 0;
2167 	int			rb_flags = 0;
2168 	int			stale_bool = FALSE;
2169 	int			suspendall_flag = 0;
2170 	int			suspend1_flag = 0;
2171 	sigset_t		oldsigs;
2172 	int			send_reinit = 0;
2173 
2174 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2175 		return (-1);
2176 	}
2177 
2178 	/* Must be a multinode diskset */
2179 	if (!MD_MNSET_DESC(sd)) {
2180 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2181 		return (-2);
2182 	}
2183 
2184 	/* Verify that the node is ALIVE (i.e. is in the API membership list) */
2185 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
2186 		(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
2187 			sd->sd_mn_mynode->nd_nodename, NULL,
2188 			sp->setname);
2189 		return (-1);
2190 	}
2191 
2192 	/* Make sure we are blocking all signals */
2193 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2194 		mdclrerror(&xep);
2195 
2196 	/*
2197 	 * Lock the set on current set members.
2198 	 * For MN diskset lock_set and SUSPEND are used to protect against
2199 	 * other meta* commands running on the other nodes.
2200 	 */
2201 	nd = sd->sd_nodelist;
2202 	while (nd) {
2203 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2204 			nd = nd->nd_next;
2205 			continue;
2206 		}
2207 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2208 			rval = -1;
2209 			goto out;
2210 		}
2211 		nd = nd->nd_next;
2212 	}
2213 
2214 	/*
2215 	 * Lock out other meta* commands by suspending
2216 	 * class 1 messages across the diskset.
2217 	 */
2218 	nd = sd->sd_nodelist;
2219 	while (nd) {
2220 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2221 			nd = nd->nd_next;
2222 			continue;
2223 		}
2224 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2225 			    sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2226 			rval = -1;
2227 			goto out;
2228 		}
2229 		suspend1_flag = 1;
2230 		nd = nd->nd_next;
2231 	}
2232 
2233 	/*
2234 	 * Verify that this host is a member (in the host list) of the set.
2235 	 */
2236 	nd = sd->sd_nodelist;
2237 	while (nd) {
2238 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2239 			break;
2240 		}
2241 		nd = nd->nd_next;
2242 	}
2243 	if (!nd) {
2244 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2245 			sd->sd_mn_mynode->nd_nodename, NULL,
2246 			sp->setname);
2247 		rval = -1;
2248 		goto out;
2249 	}
2250 
2251 	/*
2252 	 * Need to return failure if host is already 'joined'
2253 	 * into the set.  This is done so that if later the user
2254 	 * issues a command to join all sets and a failure is
2255 	 * encountered - that the resulting cleanup effort
2256 	 * (withdrawing from all sets that were joined
2257 	 * during that command) won't withdraw from this set.
2258 	 */
2259 	if (nd->nd_flags & MD_MN_NODE_OWN) {
2260 		rval = -2;
2261 		goto out2;
2262 	}
2263 
2264 	/*
2265 	 * Call metaget_setownership that calls each node in diskset and
2266 	 * marks in set descriptor if node is an owner of the set or not.
2267 	 * metaget_setownership checks to see if a node is an owner by
2268 	 * checking to see if that node's kernel has the mddb loaded.
2269 	 * If a node had panic'd during a reconfig or an
2270 	 * add/delete/join/withdraw operation, the other nodes' node
2271 	 * records may not reflect the current state of the diskset,
2272 	 * so calling metaget_setownership is the safest thing to do.
2273 	 */
2274 	if (metaget_setownership(sp, ep) == -1) {
2275 		rval = -1;
2276 		goto out;
2277 	}
2278 
2279 	/* If first active member of diskset, become the master. */
2280 	nd = sd->sd_nodelist;
2281 	while (nd) {
2282 		if (nd->nd_flags & MD_MN_NODE_OWN)
2283 			break;
2284 		nd = nd->nd_next;
2285 	}
2286 	if (nd == NULL)
2287 		master_flag = 1;
2288 
2289 	/*
2290 	 * If not first active member of diskset, then get the
2291 	 * master information from a node that is already joined
2292 	 * and set the master information for this node.  Be sure
2293 	 * that this node (the already joined node) has its own
2294 	 * join flag set.  If not, then this diskset isn't currently
2295 	 * consistent and shouldn't allow a node to join.  This diskset
2296 	 * inconsistency should only occur when a node has panic'd in
2297 	 * the set while doing a metaset operation and the sysadmin is
2298 	 * attempting to join a node into the set.  This inconsistency
2299 	 * will be fixed during a reconfig cycle which should be occurring
2300 	 * soon since a node panic'd.
2301 	 *
2302 	 * If unable to get this information from an owning node, then
2303 	 * this diskset isn't currently consistent and shouldn't
2304 	 * allow a node to join.
2305 	 */
2306 	if (!master_flag) {
2307 		/* get master information from an owner (joined) node */
2308 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
2309 		    sp->setno, &mas_mnsr, ep) == -1) {
2310 			rval = -1;
2311 			goto out;
2312 		}
2313 
2314 		/* Verify that owner (joined) node has its own JOIN flag set */
2315 		nr = mas_mnsr->sr_nodechain;
2316 		while (nr) {
2317 			if ((nd->nd_nodeid == nr->nr_nodeid) &&
2318 			    ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
2319 				(void) mddserror(ep, MDE_DS_NODENOSET,
2320 				    sp->setno, nd->nd_nodename, NULL,
2321 				    nd->nd_nodename);
2322 				free_sr((md_set_record *)mas_mnsr);
2323 				rval = -1;
2324 				goto out;
2325 			}
2326 			nr = nr->nr_next;
2327 		}
2328 
2329 		/*
2330 		 * Does master have set marked as STALE?
2331 		 * If so, need to pass this down to kernel when
2332 		 * this node snarfs the set.
2333 		 */
2334 		if (clnt_mn_is_stale(nd->nd_nodename, sp,
2335 		    &stale_bool, ep) == -1) {
2336 			rval = -1;
2337 			goto out;
2338 		}
2339 
2340 		/* set master information in my rpc.metad's set record */
2341 		if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
2342 		    mas_mnsr->sr_master_nodeid, ep)) {
2343 			free_sr((md_set_record *)mas_mnsr);
2344 			rval = -1;
2345 			goto out;
2346 		}
2347 
2348 		/* set master information in my cached set desc */
2349 		(void) strcpy(sd->sd_mn_master_nodenm,
2350 		    mas_mnsr->sr_master_nodenm);
2351 		sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
2352 		nd2 = sd->sd_nodelist;
2353 		while (nd2) {
2354 		    if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
2355 			sd->sd_mn_masternode = nd2;
2356 			break;
2357 		    }
2358 		    nd2 = nd2->nd_next;
2359 		}
2360 		free_sr((md_set_record *)mas_mnsr);
2361 
2362 		/*
2363 		 * Set the node flags in mynode's rpc.metad node records for
2364 		 * the nodes that are in the diskset.  Can use my sd
2365 		 * since earlier call to metaget_setownership set the
2366 		 * owner flags based on whether that node had snarfed
2367 		 * the MN diskset mddb.  Reconfig steps guarantee that
2368 		 * return of metaget_setownership will match the owning
2369 		 * node's owner list except in the case where a node
2370 		 * has just panic'd and in this case, a reconfig will
2371 		 * be starting immediately and the owner lists will
2372 		 * be sync'd up by the reconfig.
2373 		 *
2374 		 * Flag of SET means to take no action except to
2375 		 * set the node flags as given in the nodelist linked list.
2376 		 */
2377 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
2378 		    MD_NR_SET, NULL, ep)) {
2379 			rval = -1;
2380 			goto out;
2381 		}
2382 	}
2383 
2384 	/*
2385 	 * Read in the mddb if there are drives in the set.
2386 	 */
2387 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2388 	    ep)) == NULL) {
2389 		/* No drives in list */
2390 		if (! mdisok(ep)) {
2391 			rval = -1;
2392 			goto out;
2393 		}
2394 		rval = -2;
2395 		goto out;
2396 	}
2397 
2398 	/*
2399 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2400 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2401 	 * then change the nodelist followed by a reinit and resume.
2402 	 */
2403 	nd = sd->sd_nodelist;
2404 	while (nd) {
2405 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2406 			nd = nd->nd_next;
2407 			continue;
2408 		}
2409 
2410 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
2411 		    MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2412 			rval = -1;
2413 			goto out;
2414 		}
2415 		suspendall_flag = 1;
2416 		nd = nd->nd_next;
2417 	}
2418 
2419 	/* Set master in my set record in rpc.metad */
2420 	if (master_flag) {
2421 		if (clnt_mnsetmaster(mynode(), sp,
2422 		    sd->sd_mn_mynode->nd_nodename,
2423 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
2424 			rval = -1;
2425 			goto out;
2426 		}
2427 	}
2428 	/*
2429 	 * Causes mddbs to be loaded into the kernel.
2430 	 * Set the force flag so that replica locations can be
2431 	 * loaded into the kernel even if a mediator node was
2432 	 * unavailable.  This allows a node to join an MO
2433 	 * diskset when there are sufficient replicas available,
2434 	 * but a mediator node in unavailable.
2435 	 */
2436 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
2437 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2438 		    "Host not able to start diskset."));
2439 		rval = -1;
2440 		goto out;
2441 	}
2442 
2443 	if (! mdisok(ep)) {
2444 		rval = -1;
2445 		goto out;
2446 	}
2447 
2448 	/*
2449 	 * Set rollback flags to 1 so that halt_set is called if a failure
2450 	 * is seen after this point.  If snarf_set fails, still need to
2451 	 * call halt_set to cleanup the diskset.
2452 	 */
2453 	rb_flags = 1;
2454 
2455 	/* Starts the set */
2456 	if (snarf_set(sp, stale_bool, ep) != 0) {
2457 		if (mdismddberror(ep, MDE_DB_STALE)) {
2458 			/*
2459 			 * Don't fail join, STALE means that set has
2460 			 * < 50% mddbs.
2461 			 */
2462 			(void) mdstealerror(&ep_snarf, ep);
2463 			stale_set = 1;
2464 		} else if (mdisok(ep)) {
2465 			/* If snarf failed, but no error was set - set it */
2466 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2467 			    sp->setno, 0, NULL);
2468 				rval = -1;
2469 				goto out;
2470 		} else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
2471 			/*
2472 			 * Don't fail join if ACCOK; ACCOK means that mediator
2473 			 * provided extra vote.
2474 			 */
2475 			rval = -1;
2476 			goto out;
2477 		}
2478 	}
2479 
2480 	/* Did set really get snarfed? */
2481 	if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
2482 		if (mdisok(ep)) {
2483 			/* If snarf failed, but no error was set - set it */
2484 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2485 				sp->setno, 0, NULL);
2486 		}
2487 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2488 		    "Host not able to start diskset."));
2489 		rval = -1;
2490 		goto out;
2491 	}
2492 
2493 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2494 	send_reinit = 1;
2495 
2496 	/* If first node to enter set, setup master and clear change log */
2497 	if (master_flag) {
2498 		/* Set master in my locally cached set descriptor */
2499 		(void) strcpy(sd->sd_mn_master_nodenm,
2500 		    sd->sd_mn_mynode->nd_nodename);
2501 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
2502 		sd->sd_mn_am_i_master = 1;
2503 
2504 		/*
2505 		 * If first node to join set, then clear out change log
2506 		 * entries.  Change log entries are only needed when a
2507 		 * change of master is occurring in a diskset that has
2508 		 * multiple owners.   Since this node is the first owner
2509 		 * of the diskset, clear the entries.
2510 		 *
2511 		 * Only do this if we are in a single node non-SC3.x
2512 		 * situation.
2513 		 */
2514 		if (meta_mn_singlenode() &&
2515 			mdmn_reset_changelog(sp, ep,  MDMN_CLF_RESETLOG) != 0) {
2516 			mde_perror(ep, dgettext(TEXT_DOMAIN,
2517 			    "Unable to reset changelog."));
2518 			rval = -1;
2519 			goto out;
2520 		}
2521 	}
2522 
2523 	/* Set my locally cached flag */
2524 	sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
2525 
2526 	/*
2527 	 * Set this node's own flag on all joined nodes in the set
2528 	 * (including my node).
2529 	 */
2530 	clear_nr_flags = 1;
2531 
2532 	my_nd = *(sd->sd_mn_mynode);
2533 	my_nd.nd_next = NULL;
2534 	nd = sd->sd_nodelist;
2535 	while (nd) {
2536 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2537 			nd = nd->nd_next;
2538 			continue;
2539 		}
2540 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2541 		    MD_NR_JOIN, NULL, ep)) {
2542 			rval = -1;
2543 			goto out;
2544 		}
2545 		nd = nd->nd_next;
2546 	}
2547 
2548 out:
2549 	if (rval != NULL) {
2550 		/*
2551 		 * If rollback flag is 1, then node was joined to set.
2552 		 * Since an error occurred, withdraw node from set in
2553 		 * order to rollback to before command was run.
2554 		 * Need to preserve ep so that calling function can
2555 		 * get error information.
2556 		 */
2557 		if (rb_flags == 1) {
2558 			if (halt_set(sp, &xep)) {
2559 				mdclrerror(&xep);
2560 			}
2561 		}
2562 
2563 		/*
2564 		 * If error, reset master to INVALID.
2565 		 * Ignore error since (next) first node to successfully join
2566 		 * will set master on all nodes.
2567 		 */
2568 		(void) clnt_mnsetmaster(mynode(), sp, "",
2569 			MD_MN_INVALID_NID, &xep);
2570 		mdclrerror(&xep);
2571 		/* Reset master in my locally cached set descriptor */
2572 		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
2573 		sd->sd_mn_am_i_master = 0;
2574 
2575 		/*
2576 		 * If nr flags set on other nodes, reset them.
2577 		 */
2578 		if (clear_nr_flags) {
2579 			nd = sd->sd_nodelist;
2580 			while (nd) {
2581 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2582 					nd = nd->nd_next;
2583 					continue;
2584 				}
2585 				(void) clnt_upd_nr_flags(nd->nd_nodename, sp,
2586 					&my_nd, MD_NR_WITHDRAW, NULL, &xep);
2587 				mdclrerror(&xep);
2588 				nd = nd->nd_next;
2589 			}
2590 			/* Reset my locally cached flag */
2591 			sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
2592 		}
2593 	}
2594 
2595 	/*
2596 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2597 	 * Send reinit command to mdcommd which forces it to get
2598 	 * fresh set description.
2599 	 */
2600 	if (send_reinit) {
2601 		/* Send reinit */
2602 		nd = sd->sd_nodelist;
2603 		while (nd) {
2604 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2605 				nd = nd->nd_next;
2606 				continue;
2607 			}
2608 
2609 			/* Class is ignored for REINIT */
2610 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2611 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2612 				/*
2613 				 * We are here because we failed to resume
2614 				 * rpc.mdcommd.  However we potentially have
2615 				 * an error from the previous call
2616 				 * If the previous call did fail,  we capture
2617 				 * that error and generate a perror with
2618 				 * the string, "Unable to resume...".
2619 				 * Setting rval to -1 ensures that in the
2620 				 * next iteration of the loop, ep is not
2621 				 * clobbered.
2622 				 */
2623 				if (rval == 0)
2624 					(void) mdstealerror(ep, &xep);
2625 				else
2626 					mdclrerror(&xep);
2627 				rval = -1;
2628 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2629 				    "Unable to reinit rpc.mdcommd."));
2630 			}
2631 			nd = nd->nd_next;
2632 		}
2633 
2634 	}
2635 
2636 out2:
2637 	/*
2638 	 * Unlock diskset by resuming messages across the diskset.
2639 	 * Just resume all classes so that resume is the same whether
2640 	 * just one class was locked or all classes were locked.
2641 	 */
2642 	if ((suspend1_flag) || (suspendall_flag)) {
2643 		nd = sd->sd_nodelist;
2644 		while (nd) {
2645 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2646 				nd = nd->nd_next;
2647 				continue;
2648 			}
2649 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2650 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2651 				/*
2652 				 * We are here because we failed to resume
2653 				 * rpc.mdcommd.  However we potentially have
2654 				 * an error from the previous call
2655 				 * If the previous call did fail,  we capture
2656 				 * that error and generate a perror with
2657 				 * the string, "Unable to resume...".
2658 				 * Setting rval to -1 ensures that in the
2659 				 * next iteration of the loop, ep is not
2660 				 * clobbered.
2661 				 */
2662 				if (rval == 0)
2663 					(void) mdstealerror(ep, &xep);
2664 				else
2665 					mdclrerror(&xep);
2666 				rval = -1;
2667 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2668 				    "Unable to resume rpc.mdcommd."));
2669 			}
2670 			nd = nd->nd_next;
2671 		}
2672 		meta_ping_mnset(sp->setno);
2673 	}
2674 
2675 	/*
2676 	 * Unlock set.  This flushes the caches on the servers.
2677 	 */
2678 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2679 	nd = sd->sd_nodelist;
2680 	while (nd) {
2681 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2682 			nd = nd->nd_next;
2683 			continue;
2684 		}
2685 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2686 			if (rval == 0)
2687 				(void) mdstealerror(ep, &xep);
2688 			else
2689 				mdclrerror(&xep);
2690 			rval = -1;
2691 		}
2692 		nd = nd->nd_next;
2693 	}
2694 
2695 	/*
2696 	 * If this node is the last to join the diskset and clustering isn't
2697 	 * running, then resync the mirrors in the diskset. We have to wait
2698 	 * until all nodes are joined so that the status gets propagated to
2699 	 * all of the members of the set.
2700 	 * Ignore any error from the resync as the join function shouldn't fail
2701 	 * because the mirror resync had a problem.
2702 	 *
2703 	 * Don't start resync if set is stale.
2704 	 */
2705 	if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
2706 	    (stale_set != 1)) {
2707 		nd = sd->sd_nodelist;
2708 		while (nd) {
2709 			if (!(nd->nd_flags & MD_MN_NODE_OWN))
2710 				break;
2711 			nd = nd->nd_next;
2712 		}
2713 		/*
2714 		 * nd set to NULL means that we have no nodes in the set that
2715 		 * haven't joined. In this case we start the resync.
2716 		 */
2717 		if (nd == NULL) {
2718 			(void) meta_mirror_resync_all(sp, 0, &xep);
2719 			mdclrerror(&xep);
2720 		}
2721 	}
2722 
2723 	/* Update ABR state for all soft partitions */
2724 	(void) meta_sp_update_abr(sp, &xep);
2725 	mdclrerror(&xep);
2726 
2727 	/*
2728 	 * call metaflushsetnames to reset local cache for master and
2729 	 * node information.
2730 	 */
2731 	metaflushsetname(sp);
2732 
2733 	/* release signals back to what they were on entry */
2734 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2735 		mdclrerror(&xep);
2736 
2737 	/*
2738 	 * If no error and stale_set is set, then set ep back
2739 	 * to ep from snarf_set call and return -3.  If another error
2740 	 * occurred and rval is not 0, then that error would have
2741 	 * caused the node to be withdrawn from the set and would
2742 	 * have set ep to that error information.
2743 	 */
2744 	if ((rval == 0) && (stale_set)) {
2745 		(void) mdstealerror(ep, &ep_snarf);
2746 		return (-3);
2747 	}
2748 
2749 	return (rval);
2750 }
2751 
2752 /*
2753  * Entry point to withdraw a node from MultiNode diskset.
2754  *
2755  * Validate host in diskset.
2756  *	- Should be joined into diskset.
2757  * Assume valid configuration is stored in the set/drive/node records
2758  * in the local mddb since no node or drive can be added to the MNset
2759  * unless all drives and nodes are available.  Reconfig steps will
2760  * resync all ALIVE nodes in case of panic in critical areas.
2761  *
2762  * Lock down the set.
2763  * Verify that drives exist in configuration.
2764  * Verify host is a member of this diskset.
2765  * Verify host is an owner of the diskset (host is joined to diskset).
2766  * Only allow withdrawal of master node if master node is the only joined
2767  * in the diskset.
2768  * Halt the diskset on this node.
2769  * Reset Master on this node.
2770  * Updated node flags that this node with withdrawn.
2771  * Unlock the set.
2772  *
2773  * Return values:
2774  *	0  - Node successfully withdrew from set.
2775  *	-1 - Withdrawal attempted but failed
2776  *		- any failure from libmeta calls
2777  *		- node not in the member list
2778  *	-2 - Withdrawal not attempted since
2779  *		- this set had no drives in set
2780  *		- this node not joined to set
2781  *		- set is not a multinode set
2782  */
2783 extern int
2784 meta_set_withdraw(
2785 	mdsetname_t	*sp,
2786 	md_error_t	*ep
2787 )
2788 {
2789 	md_set_desc		*sd;
2790 	md_drive_desc		*dd = 0;
2791 	md_mnnode_desc		*nd, my_nd;
2792 	int			rval = 0;
2793 	md_setkey_t		*cl_sk;
2794 	md_error_t		xep = mdnullerror;
2795 	int			set_halted = 0;
2796 	int			suspendall_flag = 0;
2797 	int			suspend1_flag = 0;
2798 	bool_t			stale_bool = FALSE;
2799 	mddb_config_t		c;
2800 	int			node_id_list[1];
2801 	sigset_t		oldsigs;
2802 	int			send_reinit = 0;
2803 
2804 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2805 		return (-1);
2806 	}
2807 
2808 	/* Must be a multinode diskset */
2809 	if (!MD_MNSET_DESC(sd)) {
2810 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2811 		return (-1);
2812 	}
2813 
2814 	/* Make sure we are blocking all signals */
2815 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2816 		mdclrerror(&xep);
2817 
2818 	/*
2819 	 * Lock the set on current set members.
2820 	 * For MN diskset lock_set and SUSPEND are used to protect against
2821 	 * other meta* commands running on the other nodes.
2822 	 */
2823 	nd = sd->sd_nodelist;
2824 	while (nd) {
2825 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2826 			nd = nd->nd_next;
2827 			continue;
2828 		}
2829 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2830 			rval = -1;
2831 			goto out;
2832 		}
2833 		nd = nd->nd_next;
2834 	}
2835 	/*
2836 	 * Lock out other meta* commands by suspending
2837 	 * class 1 messages across the diskset.
2838 	 */
2839 	nd = sd->sd_nodelist;
2840 	while (nd) {
2841 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2842 			nd = nd->nd_next;
2843 			continue;
2844 		}
2845 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2846 			sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2847 			rval = -1;
2848 			goto out;
2849 		}
2850 		suspend1_flag = 1;
2851 		nd = nd->nd_next;
2852 	}
2853 
2854 	/* Get list of drives - needed in case of failure */
2855 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2856 	    ep)) == NULL) {
2857 		/* Error getting drives in list */
2858 		if (! mdisok(ep)) {
2859 			rval = -1;
2860 			goto out2;
2861 		}
2862 		/* no drives in list */
2863 		rval = -2;
2864 		goto out2;
2865 	}
2866 
2867 	/*
2868 	 * Verify that this host is a member (in the host list) of the set.
2869 	 */
2870 	nd = sd->sd_nodelist;
2871 	while (nd) {
2872 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2873 			break;
2874 		}
2875 		nd = nd->nd_next;
2876 	}
2877 	if (!nd) {
2878 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2879 			sd->sd_mn_mynode->nd_nodename, NULL,
2880 			sp->setname);
2881 		rval = -1;
2882 		goto out2;
2883 	}
2884 
2885 	/*
2886 	 * Call metaget_setownership that calls each node in diskset and
2887 	 * marks in set descriptor if node is an owner of the set or not.
2888 	 * metaget_setownership checks to see if a node is an owner by
2889 	 * checking to see if that node's kernel has the mddb loaded.
2890 	 * If a node had panic'd during a reconfig or an
2891 	 * add/delete/join/withdraw operation, the other nodes' node
2892 	 * records may not reflect the current state of the diskset,
2893 	 * so calling metaget_setownership is the safest thing to do.
2894 	 */
2895 	if (metaget_setownership(sp, ep) == -1) {
2896 		rval = -1;
2897 		goto out2;
2898 	}
2899 
2900 	/*
2901 	 * Verify that this node is joined
2902 	 * to diskset (i.e. is an owner of the diskset).
2903 	 */
2904 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
2905 		rval = -2;
2906 		goto out2;
2907 	}
2908 
2909 	/*
2910 	 * For a MN diskset, only withdraw master if it is
2911 	 * the only joined node.
2912 	 */
2913 	if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
2914 		nd = sd->sd_nodelist;
2915 		while (nd) {
2916 			/* Skip my node since checking for other owners */
2917 			if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
2918 				nd = nd->nd_next;
2919 				continue;
2920 			}
2921 			/* If another owner node if found, error */
2922 			if (nd->nd_flags & MD_MN_NODE_OWN) {
2923 				(void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
2924 					sp->setno,
2925 					sd->sd_mn_mynode->nd_nodename, NULL,
2926 					sp->setname);
2927 				rval = -1;
2928 				goto out2;
2929 			}
2930 			nd = nd->nd_next;
2931 		}
2932 	}
2933 
2934 	/*
2935 	 * Is current set STALE?
2936 	 */
2937 	(void) memset(&c, 0, sizeof (c));
2938 	c.c_id = 0;
2939 	c.c_setno = sp->setno;
2940 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2941 		(void) mdstealerror(ep, &c.c_mde);
2942 		rval = -1;
2943 		goto out;
2944 	}
2945 	if (c.c_flags & MDDB_C_STALE) {
2946 		stale_bool = TRUE;
2947 	}
2948 
2949 	/*
2950 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2951 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2952 	 * then change the nodelist followed by a reinit and resume.
2953 	 */
2954 	nd = sd->sd_nodelist;
2955 	while (nd) {
2956 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2957 			nd = nd->nd_next;
2958 			continue;
2959 		}
2960 
2961 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2962 		    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2963 			rval = -1;
2964 			goto out;
2965 		}
2966 		suspendall_flag = 1;
2967 		nd = nd->nd_next;
2968 	}
2969 
2970 	/*
2971 	 * Withdraw the set - halt set.
2972 	 * This will fail if any I/O is occuring to any metadevice which
2973 	 * includes a resync to a mirror metadevice.
2974 	 */
2975 	set_halted = 1;
2976 	if (halt_set(sp, ep)) {
2977 		/* Was set actually halted? */
2978 		if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
2979 			set_halted = 0;
2980 		}
2981 		rval = -1;
2982 		goto out;
2983 	}
2984 
2985 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2986 	send_reinit = 1;
2987 
2988 	/* Reset master on withdrawn node */
2989 	if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
2990 	    MD_MN_INVALID_NID, ep)) {
2991 		rval = -1;
2992 		goto out;
2993 	}
2994 
2995 	/* Mark my node as withdrawn and send to other nodes */
2996 	nd = sd->sd_nodelist;
2997 	my_nd = *(sd->sd_mn_mynode);	/* structure copy */
2998 	my_nd.nd_next = NULL;
2999 	while (nd) {
3000 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3001 			nd = nd->nd_next;
3002 			continue;
3003 		}
3004 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3005 		    MD_NR_WITHDRAW, NULL, ep)) {
3006 			rval = -1;
3007 			goto out;
3008 		}
3009 		nd = nd->nd_next;
3010 	}
3011 
3012 	/*
3013 	 * If withdrawn node is a mirror owner, reset mirror owner
3014 	 * to NULL.  If an error occurs, print a warning and continue.
3015 	 * Don't fail metaset because of mirror owner reset problem since
3016 	 * next node to grab mirror will resolve this issue.
3017 	 * Before next node grabs mirrors, metaset will show the withdrawn
3018 	 * node as owner which is why an attempt to reset the mirror owner
3019 	 * is made.
3020 	 */
3021 	node_id_list[0] = sd->sd_mn_mynode->nd_nodeid;	/* Setup my nodeid */
3022 	nd = sd->sd_nodelist;
3023 	while (nd) {
3024 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3025 			nd = nd->nd_next;
3026 			continue;
3027 		}
3028 		if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
3029 		    1, &node_id_list[0], &xep) == 01) {
3030 			mde_perror(&xep, dgettext(TEXT_DOMAIN,
3031 			    "Unable to reset mirror owner on node %s"),
3032 			    nd->nd_nodename);
3033 			mdclrerror(&xep);
3034 		}
3035 		nd = nd->nd_next;
3036 	}
3037 
3038 out:
3039 	if (rval == -1) {
3040 		/* Rejoin node - Mark node as joined and send to other nodes */
3041 		nd = sd->sd_nodelist;
3042 		my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3043 		my_nd.nd_next = NULL;
3044 		while (nd) {
3045 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3046 				nd = nd->nd_next;
3047 				continue;
3048 			}
3049 			if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3050 			    MD_NR_JOIN, NULL, &xep)) {
3051 				mdclrerror(&xep);
3052 			}
3053 			nd = nd->nd_next;
3054 		}
3055 
3056 		/* Set master on withdrawn node */
3057 		if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
3058 		    sd->sd_mn_master_nodenm,
3059 		    sd->sd_mn_master_nodeid, &xep)) {
3060 			mdclrerror(&xep);
3061 		}
3062 
3063 		/* Join set if halt_set had succeeded */
3064 		if (set_halted) {
3065 			/*
3066 			 * Causes mddbs to be loaded into the kernel.
3067 			 * Set the force flag so that replica locations can be
3068 			 * loaded into the kernel even if a mediator node was
3069 			 * unavailable.  This allows a node to join an MO
3070 			 * diskset when there are sufficient replicas available,
3071 			 * but a mediator node in unavailable.
3072 			 */
3073 			if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) {
3074 				mdclrerror(&xep);
3075 			}
3076 			/* If set previously stale - make it so at re-join */
3077 			if (snarf_set(sp, stale_bool, &xep) != 0) {
3078 				mdclrerror(&xep);
3079 				(void) halt_set(sp, &xep);
3080 				mdclrerror(&xep);
3081 			}
3082 		}
3083 	}
3084 
3085 	/*
3086 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3087 	 * Send reinit command to mdcommd which forces it to get
3088 	 * fresh set description.
3089 	 */
3090 	if (send_reinit) {
3091 		/* Send reinit */
3092 		nd = sd->sd_nodelist;
3093 		while (nd) {
3094 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3095 				nd = nd->nd_next;
3096 				continue;
3097 			}
3098 
3099 			/* Class is ignored for REINIT */
3100 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3101 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3102 				/*
3103 				 * We are here because we failed to resume
3104 				 * rpc.mdcommd.  However we potentially have
3105 				 * an error from the previous call.
3106 				 * If the previous call did fail,  we
3107 				 * capture that error and generate a perror
3108 				 * withthe string,  "Unable to resume...".
3109 				 * Setting rval to -1 ensures that in the
3110 				 * next iteration of the loop, ep is not
3111 				 * clobbered.
3112 				 */
3113 				if (rval == 0)
3114 					(void) mdstealerror(ep, &xep);
3115 				else
3116 					mdclrerror(&xep);
3117 				rval = -1;
3118 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3119 				    "Unable to reinit rpc.mdcommd."));
3120 			}
3121 			nd = nd->nd_next;
3122 		}
3123 	}
3124 
3125 out2:
3126 	/*
3127 	 * Unlock diskset by resuming messages across the diskset.
3128 	 * Just resume all classes so that resume is the same whether
3129 	 * just one class was locked or all classes were locked.
3130 	 */
3131 	if ((suspend1_flag) || (suspendall_flag)) {
3132 		nd = sd->sd_nodelist;
3133 		while (nd) {
3134 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3135 				nd = nd->nd_next;
3136 				continue;
3137 			}
3138 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3139 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
3140 				/*
3141 				 * We are here because we failed to resume
3142 				 * rpc.mdcommd.  However we potentially have
3143 				 * an error from the previous call
3144 				 * If the previous call did fail,  we capture
3145 				 * that error and generate a perror with
3146 				 * the string, "Unable to resume...".
3147 				 * Setting rval to -1 ensures that in the
3148 				 * next iteration of the loop, ep is not
3149 				 * clobbered.
3150 				 */
3151 				if (rval == 0)
3152 					(void) mdstealerror(ep, &xep);
3153 				else
3154 					mdclrerror(&xep);
3155 				rval = -1;
3156 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3157 				    "Unable to resume rpc.mdcommd."));
3158 			}
3159 			nd = nd->nd_next;
3160 		}
3161 		meta_ping_mnset(sp->setno);
3162 	}
3163 
3164 	/*
3165 	 * Unlock set.  This flushes the caches on the servers.
3166 	 */
3167 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3168 	nd = sd->sd_nodelist;
3169 	while (nd) {
3170 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3171 			nd = nd->nd_next;
3172 			continue;
3173 		}
3174 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
3175 			if (rval == 0)
3176 				(void) mdstealerror(ep, &xep);
3177 			else
3178 				mdclrerror(&xep);
3179 			rval = -1;
3180 		}
3181 		nd = nd->nd_next;
3182 	}
3183 
3184 	/*
3185 	 * call metaflushsetnames to reset local cache for master and
3186 	 * node information.
3187 	 */
3188 	metaflushsetname(sp);
3189 
3190 	/* release signals back to what they were on entry */
3191 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3192 		mdclrerror(&xep);
3193 
3194 	return (rval);
3195 
3196 }
3197 
3198 /*
3199  * Update nodelist with cluster member information.
3200  * A node not in the member list will be marked
3201  * as not ALIVE and not OWN.
3202  * A node in the member list will be marked ALIVE, but
3203  * the OWN bit will not be changed.
3204  *
3205  * If mynode isn't in the membership list, fail causing
3206  * another reconfig cycle to be started since a non-member
3207  * node shouldn't be taking part in the reconfig cycle.
3208  *
3209  * Return values:
3210  *	0 - No problem.
3211  *	1 - Any failure including RPC failure to my node.
3212  */
3213 int
3214 meta_reconfig_update_nodelist(
3215 	mdsetname_t			*sp,
3216 	mndiskset_membershiplist_t	*nl,
3217 	md_set_desc			*sd,
3218 	md_error_t			*ep
3219 )
3220 {
3221 	mndiskset_membershiplist_t	*nl2;
3222 	md_mnnode_desc			*nd;
3223 	md_error_t			xep = mdnullerror;
3224 	int				rval = 0;
3225 
3226 	/*
3227 	 * Walk through nodelist, checking to see if each
3228 	 * node is in the member list.
3229 	 * If node is not a member, reset ALIVE and OWN node flag.
3230 	 * If node is a member, set ALIVE.
3231 	 * If mynode's OWN flag gets reset, then halt the diskset on this node.
3232 	 */
3233 	nd = sd->sd_nodelist;
3234 	while (nd) {
3235 		nl2 = nl;
3236 		while (nl2) {
3237 			/* If node is in member list, set ALIVE */
3238 			if (nl2->msl_node_id == nd->nd_nodeid) {
3239 				nd->nd_flags |= MD_MN_NODE_ALIVE;
3240 				break;
3241 			} else {
3242 				nl2 = nl2->next;
3243 			}
3244 			/* node is not in member list, mark !ALIVE and !OWN */
3245 			if (nl2 == NULL) {
3246 				/* If node is mynode, then halt set if needed */
3247 				if (strcmp(mynode(), nd->nd_nodename) == 0) {
3248 					/*
3249 					 * This shouldn't happen, but just
3250 					 * in case...  Any node not in the
3251 					 * membership list should be dead and
3252 					 * not running reconfig step1.
3253 					 */
3254 					if (nd->nd_flags & MD_MN_NODE_OWN) {
3255 						if (halt_set(sp, &xep)) {
3256 							mde_perror(&xep, "");
3257 							mdclrerror(&xep);
3258 						}
3259 					}
3260 					/*
3261 					 * Return failure since this node
3262 					 * (mynode) is not in the membership
3263 					 * list, but process the rest of the
3264 					 * nodelist first so that rpc.metad
3265 					 * can be updated with the latest
3266 					 * membership information.
3267 					 */
3268 					(void) mddserror(ep,
3269 					    MDE_DS_NOTINMEMBERLIST,
3270 					    sp->setno, nd->nd_nodename, NULL,
3271 					    sp->setname);
3272 					rval = 1;
3273 				}
3274 				nd->nd_flags &= ~MD_MN_NODE_ALIVE;
3275 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3276 			}
3277 		}
3278 		nd = nd->nd_next;
3279 	}
3280 
3281 	/* Send this information to rpc.metad */
3282 	if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
3283 	    MD_NR_SET,  MNSET_IN_RECONFIG, &xep)) {
3284 		/* Return failure if can't send node flags to rpc.metad */
3285 		if (rval == 0) {
3286 			(void) mdstealerror(ep, &xep);
3287 			rval = 1;
3288 		}
3289 	}
3290 	return (rval);
3291 }
3292 
3293 /*
3294  * Choose master determines the master for a diskset.
3295  * Each node determines the master on its own and
3296  * adds this information to its local rpc.metad nodelist
3297  * and also sends it to the kernel.
3298  *
3299  * Nodelist in set descriptor (sd) is sorted in
3300  * monotonically increasing sequence of nodeid.
3301  *
3302  * Return values:
3303  *	0 - No problem.
3304  *	205 - There was an RPC problem to another node.
3305  *	-1 - There was an error.  This could be an RPC error to my node.
3306  *		This is a catastrophic failure causing node to panic.
3307  */
3308 int
3309 meta_reconfig_choose_master_for_set(
3310 	mdsetname_t	*sp,
3311 	md_set_desc	*sd,
3312 	md_error_t	*ep
3313 )
3314 {
3315 	int			is_owner;
3316 	md_mnset_record		*mnsr = NULL;
3317 	int			lowest_alive_nodeid = 0;
3318 	uint_t			master_nodeid;
3319 	md_mnnode_desc		*nd, *nd2;
3320 	md_mnnode_record	*nr;
3321 	md_drive_desc		*dd;
3322 	md_setkey_t		*cl_sk;
3323 	int			rval = 0;
3324 	md_error_t		xep = mdnullerror;
3325 	mddb_setflags_config_t	sf;
3326 
3327 	/*
3328 	 * Is current node joined to diskset?
3329 	 * Don't trust flags, really check to see if mddb is snarfed.
3330 	 */
3331 	if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
3332 		/*
3333 		 * If a node is joined to the diskset, this node checks
3334 		 * to see if the current master of the diskset is valid and
3335 		 * is still in the membership list (ALIVE) and is
3336 		 * still joined (OWN).  Need to verify if master is
3337 		 * really joined - don't trust the flags.  (Can trust
3338 		 * ALIVE since set during earlier part of reconfig cycle.)
3339 		 * If the current master is valid, still in the membership
3340 		 * list and joined, then master is not changed on this node.
3341 		 * Just return.
3342 		 *
3343 		 * Verify that nodeid is valid before accessing masternode.
3344 		 */
3345 		if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
3346 		    (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
3347 			if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
3348 			    &is_owner, ep) == -1) {
3349 				/* If RPC failure to another node return 205 */
3350 				if ((mdanyrpcerror(ep)) &&
3351 				    (sd->sd_mn_mynode->nd_nodeid !=
3352 				    sd->sd_mn_master_nodeid)) {
3353 					return (205);
3354 				} else {
3355 					/* Any other failure */
3356 					return (-1);
3357 				}
3358 			} else {
3359 				if (is_owner == TRUE) {
3360 
3361 					meta_mc_log(MC_LOG5, dgettext(
3362 					    TEXT_DOMAIN, "Set %s previous "
3363 					    "master chosen %s (%d): %s"),
3364 					    sp->setname,
3365 					    sd->sd_mn_master_nodenm,
3366 					    sd->sd_mn_master_nodeid,
3367 					    meta_print_hrtime(gethrtime() -
3368 					    start_time));
3369 
3370 					/* Previous master is ok - done */
3371 					return (0);
3372 				}
3373 			}
3374 		}
3375 
3376 		/*
3377 		 * If current master is no longer in the membership list or
3378 		 * is no longer joined, then this node uses the following
3379 		 * algorithm:
3380 		 * - node calls RPC routine clnt_ownset to get latest
3381 		 *	information on which nodes are owners of diskset.
3382 		 * 	clnt_ownset checks on each node to see if its kernel
3383 		 *	has that diskset snarfed.
3384 		 */
3385 		nd = sd->sd_nodelist;
3386 		while (nd) {
3387 			/* Don't consider node that isn't in member list */
3388 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3389 				nd = nd->nd_next;
3390 				continue;
3391 			}
3392 
3393 			if (clnt_ownset(nd->nd_nodename, sp,
3394 			    &is_owner, ep) == -1) {
3395 				/* If RPC failure to another node return 205 */
3396 				if ((mdanyrpcerror(ep)) &&
3397 				    (sd->sd_mn_mynode->nd_nodeid !=
3398 				    nd->nd_nodeid)) {
3399 					return (205);
3400 				} else {
3401 					/* Any other failure */
3402 					return (-1);
3403 				}
3404 			}
3405 
3406 			/*
3407 			 * Set owner flag for each node based on whether
3408 			 * that node really has a diskset mddb snarfed in
3409 			 * or not.
3410 			 */
3411 			if (is_owner == TRUE)
3412 				nd->nd_flags |= MD_MN_NODE_OWN;
3413 			else
3414 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3415 
3416 			nd = nd->nd_next;
3417 		}
3418 
3419 		/*
3420 		 * - node walks through nodelist looking for nodes that are
3421 		 *	owners of the diskset that are in the membership list.
3422 		 * - for each owner, node calls RPC routine clnt_getset to
3423 		 *	 see if that node has its node record set to OK.
3424 		 * - If so, master is chosen to be this owner node.
3425 		 */
3426 		nd = sd->sd_nodelist;
3427 		while (nd) {
3428 			/* Don't consider node that isn't in member list */
3429 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3430 				nd = nd->nd_next;
3431 				continue;
3432 			}
3433 
3434 			/* Don't consider a node that isn't an owner */
3435 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3436 				nd = nd->nd_next;
3437 				continue;
3438 			}
3439 
3440 			/* Does node has its own node record set to OK? */
3441 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3442 			    MD_SET_BAD, &mnsr, ep) == -1) {
3443 				/* If RPC failure to another node return 205 */
3444 				if ((mdanyrpcerror(ep)) &&
3445 				    (sd->sd_mn_mynode->nd_nodeid !=
3446 				    nd->nd_nodeid)) {
3447 					return (205);
3448 				} else {
3449 					/* Any other failure */
3450 					return (-1);
3451 				}
3452 			}
3453 			nr = mnsr->sr_nodechain;
3454 			while (nr) {
3455 				if (nd->nd_nodeid == nr->nr_nodeid) {
3456 					if (nr->nr_flags & MD_MN_NODE_OK) {
3457 						/* Found a master */
3458 						free_sr(
3459 						    (md_set_record *)mnsr);
3460 						goto found_master;
3461 					}
3462 				}
3463 				nr = nr->nr_next;
3464 			}
3465 			free_sr((md_set_record *)mnsr);
3466 			nd = nd->nd_next;
3467 		}
3468 
3469 		/*
3470 		 * - If no owner node has its own node record on its own node
3471 		 *	set to OK, then this node checks all of the non-owner
3472 		 * 	nodes that are in the membership list.
3473 		 * - for each non-owner, node calls RPC routine clnt_getset to
3474 		 *	 see if that node has its node record set to OK.
3475 		 * - If set doesn't exist, don't choose node for master.
3476 		 * - If so, master is chosen to be this non-owner node.
3477 		 *
3478 		 */
3479 		nd = sd->sd_nodelist;
3480 		while (nd) {
3481 			/* Don't consider node that isn't in member list */
3482 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3483 				nd = nd->nd_next;
3484 				continue;
3485 			}
3486 
3487 			/* Only checking non-owner nodes this time around */
3488 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3489 				nd = nd->nd_next;
3490 				continue;
3491 			}
3492 
3493 			/* Does node has its own node record set to OK? */
3494 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3495 			    MD_SET_BAD, &mnsr, ep) == -1) {
3496 				/*
3497 				 * If set doesn't exist on non-owner node,
3498 				 * don't consider this node for master.
3499 				 */
3500 				if (mdiserror(ep, MDE_NO_SET)) {
3501 					nd = nd->nd_next;
3502 					continue;
3503 				} else if ((mdanyrpcerror(ep)) &&
3504 				    (sd->sd_mn_mynode->nd_nodeid !=
3505 				    nd->nd_nodeid)) {
3506 					/* RPC failure to another node */
3507 					return (205);
3508 				} else {
3509 					/* Any other failure */
3510 					return (-1);
3511 				}
3512 			}
3513 			nr = mnsr->sr_nodechain;
3514 			while (nr) {
3515 				if (nd->nd_nodeid == nr->nr_nodeid) {
3516 					if (nr->nr_flags & MD_MN_NODE_OK) {
3517 						/* Found a master */
3518 						free_sr(
3519 						    (md_set_record *)mnsr);
3520 						goto found_master;
3521 					}
3522 				}
3523 				nr = nr->nr_next;
3524 			}
3525 			free_sr((md_set_record *)mnsr);
3526 			nd = nd->nd_next;
3527 		}
3528 
3529 		/*
3530 		 * - If no node can be found that has its own node record on
3531 		 *	its node to be set to OK, then all alive nodes
3532 		 * 	were in the process of being added to or deleted
3533 		 *	from set.  Each alive node will remove all
3534 		 *	information pertaining to this set from its node.
3535 		 *
3536 		 * If all nodes in set are ALIVE, then call sdssc end routines
3537 		 * since set was truly being initially created or destroyed.
3538 		 */
3539 		goto delete_set;
3540 	} else {
3541 
3542 		/*
3543 		 * If node is not joined to diskset, then this
3544 		 * node uses the following algorithm:
3545 		 * - If unjoined node doesn't have a node record for itself,
3546 		 *	just delete the diskset since diskset was in the
3547 		 *	process of being created.
3548 		 * - node needs to find master of diskset before
3549 		 *	reconfig cycle, if a master existed.
3550 		 * - node calls RPC routine clnt_ownset to get latest
3551 		 * 	information on which nodes are owners of diskset.
3552 		 *	clnt_ownset checks on each node to see if its
3553 		 *	kernel has that diskset snarfed.
3554 		 */
3555 
3556 		/*
3557 		 * Is my node in the set description?
3558 		 * If not, delete the set from this node.
3559 		 * sr2setdesc sets sd_mn_mynode pointer to the node
3560 		 * descriptor for this node if there was a node
3561 		 * record for this node.
3562 		 *
3563 		 */
3564 		if (sd->sd_mn_mynode == NULL) {
3565 			goto delete_set;
3566 		}
3567 
3568 		nd = sd->sd_nodelist;
3569 		while (nd) {
3570 			/* Don't consider node that isn't in member list */
3571 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3572 				nd = nd->nd_next;
3573 				continue;
3574 			}
3575 
3576 			if (clnt_ownset(nd->nd_nodename, sp,
3577 			    &is_owner, ep) == -1) {
3578 				/* If RPC failure to another node return 205 */
3579 				if ((mdanyrpcerror(ep)) &&
3580 				    (sd->sd_mn_mynode->nd_nodeid !=
3581 				    nd->nd_nodeid)) {
3582 					return (205);
3583 				} else {
3584 					/* Any other failure */
3585 					return (-1);
3586 				}
3587 			}
3588 
3589 			/*
3590 			 * Set owner flag for each node based on whether
3591 			 * that node really has a diskset mddb snarfed in
3592 			 * or not.
3593 			 */
3594 			if (is_owner == TRUE)
3595 				nd->nd_flags |= MD_MN_NODE_OWN;
3596 			else
3597 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3598 
3599 			nd = nd->nd_next;
3600 		}
3601 
3602 		/*
3603 		 * - node walks through nodelist looking for nodes that
3604 		 *	are owners of the diskset that are in
3605 		 *	the membership list.
3606 		 * - for each owner, node calls RPC routine clnt_getset to
3607 		 *	see if that node has a master set and to get the
3608 		 *	diskset description.
3609 		 * - If the owner node has a set description that doesn't
3610 		 *	include the non-joined node in the nodelist, this node
3611 		 *	removes its set description of that diskset
3612 		 *	(i.e. removes the set from its local mddbs).  This is
3613 		 *	handling the case of when a node was removed from a
3614 		 *	diskset while it was not in the cluster membership
3615 		 *	list.
3616 		 * - If that node has a master set and the master is in the
3617 		 *	membership list and is an owner, then either this was
3618 		 *	the master from before the reconfig cycle or this
3619 		 *	node has already chosen a new master - either way,
3620 		 *	the master value is valid as long as it is in the
3621 		 *	membership list and is an owner
3622 		 * - master is chosen to be owner node's master
3623 		 */
3624 		nd = sd->sd_nodelist;
3625 		while (nd) {
3626 			/* Don't consider node that isn't in member list */
3627 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3628 				nd = nd->nd_next;
3629 				continue;
3630 			}
3631 
3632 			/* Don't consider a node that isn't an owner */
3633 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3634 				nd = nd->nd_next;
3635 				continue;
3636 			}
3637 
3638 			/* Get owner node's set record */
3639 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3640 			    MD_SET_BAD, &mnsr, ep) == -1) {
3641 				/* If RPC failure to another node return 205 */
3642 				if ((mdanyrpcerror(ep)) &&
3643 				    (sd->sd_mn_mynode->nd_nodeid !=
3644 				    nd->nd_nodeid)) {
3645 					return (205);
3646 				} else {
3647 					/* Any other failure */
3648 					return (-1);
3649 				}
3650 			}
3651 
3652 			/* Is this node in the owner node's set record */
3653 			nr = mnsr->sr_nodechain;
3654 			while (nr) {
3655 				if (sd->sd_mn_mynode->nd_nodeid ==
3656 				    nr->nr_nodeid) {
3657 					break;
3658 				}
3659 				nr = nr->nr_next;
3660 			}
3661 			if (nr == NULL) {
3662 				/* my node not found - delete set */
3663 				free_sr((md_set_record *)mnsr);
3664 				goto delete_set;
3665 			}
3666 
3667 			/* Is owner's node's master valid? */
3668 			master_nodeid = mnsr->sr_master_nodeid;
3669 			free_sr((md_set_record *)mnsr);
3670 			if (master_nodeid == MD_MN_INVALID_NID) {
3671 				nd = nd->nd_next;
3672 				continue;
3673 			}
3674 
3675 			nd2 = sd->sd_nodelist;
3676 			while (nd2) {
3677 				if ((nd2->nd_nodeid == master_nodeid) &&
3678 				    (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
3679 				    (nd2->nd_flags & MD_MN_NODE_OWN)) {
3680 						nd = nd2;
3681 						goto found_master;
3682 				}
3683 				nd2 = nd2->nd_next;
3684 			}
3685 			nd = nd->nd_next;
3686 		}
3687 
3688 		/*
3689 		 * - If no owner node has a valid master, then follow
3690 		 * 	algorithm of when a node is joined to the diskset.
3691 		 * - node walks through nodelist looking for nodes that are
3692 		 *	owners of the diskset that are in the membership list.
3693 		 * - for each owner, node calls RPC routine clnt_getset to
3694 		 *	 see if that node has its node record set to OK.
3695 		 * - If so, master is chosen to be this owner node.
3696 		 */
3697 		nd = sd->sd_nodelist;
3698 		while (nd) {
3699 			/* Don't consider node that isn't in member list */
3700 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3701 				nd = nd->nd_next;
3702 				continue;
3703 			}
3704 
3705 			/* Don't consider a node that isn't an owner */
3706 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3707 				nd = nd->nd_next;
3708 				continue;
3709 			}
3710 
3711 			/* Does node has its own node record set to OK? */
3712 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3713 			    MD_SET_BAD, &mnsr, ep) == -1) {
3714 				/* If RPC failure to another node return 205 */
3715 				if ((mdanyrpcerror(ep)) &&
3716 				    (sd->sd_mn_mynode->nd_nodeid !=
3717 				    nd->nd_nodeid)) {
3718 					return (205);
3719 				} else {
3720 					/* Any other failure */
3721 					return (-1);
3722 				}
3723 			}
3724 			nr = mnsr->sr_nodechain;
3725 			while (nr) {
3726 				if (nd->nd_nodeid == nr->nr_nodeid) {
3727 					if (nr->nr_flags & MD_MN_NODE_OK) {
3728 						/* Found a master */
3729 						free_sr(
3730 						    (md_set_record *)mnsr);
3731 						goto found_master;
3732 					}
3733 				}
3734 				nr = nr->nr_next;
3735 			}
3736 			free_sr((md_set_record *)mnsr);
3737 			nd = nd->nd_next;
3738 		}
3739 
3740 		/*
3741 		 * - If no owner node has its own node record on its own node
3742 		 *	set to OK, then this node checks all of the non-owner
3743 		 *	nodes that are in the membership list.
3744 		 * - for each non-owner, node calls RPC routine clnt_getset to
3745 		 *	see if that node has its node record set to OK.
3746 		 * - If set doesn't exist, don't choose node for master.
3747 		 * - If this node doesn't exist in the nodelist on any of the
3748 		 *	non-owner nodes, this node removes its set description
3749 		 *	of that diskset (i.e. removes the set from its local
3750 		 *	mddbs). This is handling the case of when a node was
3751 		 *	removed from a diskset while it was not in the
3752 		 *	cluster membership list.
3753 		 * - If non-owner node has its node record set to OK and if
3754 		 *	this node hasn't removed this diskset (step directly
3755 		 *	before this one), then the master is chosen to be this
3756 		 *	non-owner node.
3757 		 */
3758 		nd = sd->sd_nodelist;
3759 		while (nd) {
3760 			/* Don't consider node that isn't in member list */
3761 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3762 				nd->nd_flags |= MD_MN_NODE_DEL;
3763 				nd = nd->nd_next;
3764 				continue;
3765 			}
3766 
3767 			/* Don't consider owner nodes since none are OK */
3768 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3769 				nd->nd_flags |= MD_MN_NODE_DEL;
3770 				nd = nd->nd_next;
3771 				continue;
3772 			}
3773 
3774 			/*
3775 			 * Don't need to get nodelist from my node since
3776 			 * this is where sd_nodelist was obtained.
3777 			 */
3778 			if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3779 				nd = nd->nd_next;
3780 				continue;
3781 			}
3782 
3783 			/*
3784 			 * If node has already been decided against for
3785 			 * master, then skip it.
3786 			 */
3787 			if (nd->nd_flags & MD_MN_NODE_DEL) {
3788 				nd = nd->nd_next;
3789 				continue;
3790 			}
3791 
3792 			/*
3793 			 * Does node in my nodelist have its own node
3794 			 * record marked OK on its node?  And does node
3795 			 * in my nodelist exist on all other nodes?
3796 			 * Don't want to choose a node for master unless
3797 			 * that node is marked OK on its own node and that
3798 			 * node exists on all other alive nodes.
3799 			 *
3800 			 * This is guarding against the case when several
3801 			 * nodes are down and one of the downed nodes is
3802 			 * deleted from the diskset.  When the down nodes
3803 			 * are rebooted into the cluster, you don't want
3804 			 * any node to pick the deleted node as the master.
3805 			 */
3806 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3807 			    MD_SET_BAD, &mnsr, ep) == -1) {
3808 				/*
3809 				 * If set doesn't exist on non-owner node,
3810 				 * don't consider this node for master.
3811 				 */
3812 				if (mdiserror(ep, MDE_NO_SET)) {
3813 					nd->nd_flags |= MD_MN_NODE_DEL;
3814 					nd = nd->nd_next;
3815 					continue;
3816 				} else if (mdanyrpcerror(ep)) {
3817 					/* RPC failure to another node */
3818 					return (205);
3819 				} else {
3820 					/* Any other failure */
3821 					return (-1);
3822 				}
3823 			}
3824 			/*
3825 			 * Is my node in the nodelist gotten from the other
3826 			 * node?  If not, then remove the set from my node
3827 			 * since set was deleted from my node while my node
3828 			 * was out of the cluster.
3829 			 */
3830 			nr = mnsr->sr_nodechain;
3831 			while (nr) {
3832 				if (sd->sd_mn_mynode->nd_nodeid ==
3833 				    nr->nr_nodeid) {
3834 					break;
3835 				}
3836 				nr = nr->nr_next;
3837 			}
3838 			if (nr == NULL) {
3839 				/* my node not found - delete set */
3840 				free_sr((md_set_record *)mnsr);
3841 				goto delete_set;
3842 			}
3843 
3844 			/* Is node being checked marked OK on its own node? */
3845 			nr = mnsr->sr_nodechain;
3846 			while (nr) {
3847 				if (nd->nd_nodeid == nr->nr_nodeid) {
3848 					if (!(nr->nr_flags & MD_MN_NODE_OK)) {
3849 						nd->nd_flags |= MD_MN_NODE_DEL;
3850 					}
3851 					break;
3852 				}
3853 				nr = nr->nr_next;
3854 			}
3855 			/*
3856 			 * If node being checked doesn't exist on its
3857 			 * own node - don't choose it as master.
3858 			 */
3859 			if (nr == NULL) {
3860 				nd->nd_flags |= MD_MN_NODE_DEL;
3861 			}
3862 
3863 			/*
3864 			 * Check every node in my node's nodelist against
3865 			 * the nodelist gotten from the other node.
3866 			 * If a node in my node's nodelist is not found in the
3867 			 * other node's nodelist, then set the DEL flag.
3868 			 */
3869 			nd2 = sd->sd_nodelist;
3870 			while (nd2) {
3871 				nr = mnsr->sr_nodechain;
3872 				while (nr) {
3873 					if (nd2->nd_nodeid == nr->nr_nodeid) {
3874 						break;
3875 					}
3876 					nr = nr->nr_next;
3877 				}
3878 				/* nd2 not found in other node's nodelist */
3879 				if (nr == NULL) {
3880 					nd2->nd_flags |= MD_MN_NODE_DEL;
3881 				}
3882 				nd2 = nd2->nd_next;
3883 			}
3884 
3885 			free_sr((md_set_record *)mnsr);
3886 			nd = nd->nd_next;
3887 		}
3888 
3889 		/*
3890 		 * Rescan list look for node that has not been marked DEL.
3891 		 * First node found is the master.
3892 		 */
3893 		nd = sd->sd_nodelist;
3894 		while (nd) {
3895 			if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
3896 				break;
3897 			}
3898 			nd = nd->nd_next;
3899 			continue;
3900 		}
3901 		if (nd) {
3902 			/* Found a master */
3903 			goto found_master;
3904 		}
3905 
3906 		/*
3907 		 * - If no node can be found that has its own node record on
3908 		 *	its node to be set to OK, then all alive nodes
3909 		 * 	were in the process of being added to or deleted
3910 		 *	from set.  Each alive node will remove all
3911 		 *	information pertaining to this set from its node.
3912 		 *
3913 		 * If all nodes in set are ALIVE, then call sdssc end routines
3914 		 * since set was truly being initially created or destroyed.
3915 		 */
3916 		goto delete_set;
3917 	}
3918 
3919 found_master:
3920 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3921 	    "Set %s master chosen %s (%d): %s"),
3922 	    sp->setname, nd->nd_nodename, nd->nd_nodeid,
3923 	    meta_print_hrtime(gethrtime() - start_time));
3924 
3925 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
3926 		return (-1);
3927 	}
3928 
3929 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3930 
3931 	if (clnt_mnsetmaster(mynode(), sp,
3932 	    nd->nd_nodename, nd->nd_nodeid, ep)) {
3933 		rval = -1;
3934 	} else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3935 		/* If this node is new master, set flag in this node's kernel */
3936 		(void) memset(&sf, 0, sizeof (sf));
3937 		sf.sf_setno = sp->setno;
3938 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
3939 		/* Use magic to help protect ioctl against attack. */
3940 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
3941 		sf.sf_flags = MDDB_NM_SET;
3942 
3943 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3944 		    "Setting new master flag for set %s: %s"),
3945 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
3946 
3947 		/*
3948 		 * Fail reconfig cycle if ioctl fails since it is critical
3949 		 * to set new master flag.
3950 		 */
3951 		if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
3952 		    NULL) != NULL) {
3953 			(void) mdstealerror(ep, &sf.sf_mde);
3954 			rval = -1;
3955 		}
3956 	}
3957 
3958 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
3959 		if (rval == 0) {
3960 			(void) mdstealerror(ep, &xep);
3961 			rval = -1;
3962 		}
3963 	}
3964 
3965 	cl_set_setkey(NULL);
3966 
3967 	metaflushsetname(sp);
3968 
3969 	return (rval);
3970 
3971 delete_set:
3972 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3973 	    "Master not chosen, deleting set %s: %s"),
3974 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
3975 
3976 	/*
3977 	 * Remove all set information from this node:
3978 	 *	- node records for this set
3979 	 *	- drive records for this set
3980 	 *	- set record for this set
3981 	 * (Only do this on this node since each node
3982 	 * will do it for its own local mddb.)
3983 	 *
3984 	 * If all nodes in set are ALIVE, then
3985 	 * the lowest numbered ALIVE nodeid in set
3986 	 * (irregardless of whether an owner node or not) will
3987 	 * call the DCS service to cleanup for create/delete of set.
3988 	 *   sdssc_create_end(cleanup) if set was being created or
3989 	 *   sdssc_delete_end(cleanup) if set was being deleted.
3990 	 * A node record with flag ADD denotes a set being
3991 	 * created.  A node record with flag DEL denotes a
3992 	 * set being deleted.
3993 	 */
3994 	nd = sd->sd_nodelist;
3995 	while (nd) {
3996 		/* Found a node that isn't alive */
3997 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
3998 			break;
3999 
4000 		/* Is my node the lowest numbered ALIVE node? */
4001 		if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
4002 			break;
4003 		}
4004 		nd = nd->nd_next;
4005 	}
4006 	if (nd == NULL) {
4007 		/* All nodes ALIVE and this is the lowest nodeid */
4008 		lowest_alive_nodeid = 1;
4009 	}
4010 
4011 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
4012 		return (-1);
4013 	}
4014 
4015 
4016 	/*
4017 	 * If this node had been joined, withdraw and reset master.
4018 	 *
4019 	 * This could happen if a node was being added to or removed
4020 	 * from a diskset and the node doing the add/delete operation and
4021 	 * all other nodes in the diskset have left the cluster.
4022 	 */
4023 	if (sd->sd_mn_mynode) {
4024 		nd = sd->sd_mn_mynode;
4025 		if (nd->nd_flags & MD_MN_NODE_OWN) {
4026 			if (clnt_withdrawset(mynode(), sp, ep)) {
4027 				rval = -1;
4028 				goto out;
4029 			}
4030 			if (clnt_mnsetmaster(mynode(), sp, "",
4031 			    MD_MN_INVALID_NID, ep)) {
4032 				rval = -1;
4033 				goto out;
4034 			}
4035 		}
4036 	}
4037 
4038 	/*
4039 	 * Remove side records for this node (side) from local mddb
4040 	 * (clnt_deldrvs does this) if there are drives in the set.
4041 	 *
4042 	 * Don't need to mark this node as DEL since already marked as
4043 	 * ADD or DEL (or this node would have been chosen as master).
4044 	 * Don't need to mark other node records, drive records or
4045 	 * set records as DEL.  If a panic occurs during clnt_delset,
4046 	 * these records will be deleted the next time this node
4047 	 * becomes a member and goes through the reconfig cycle.
4048 	 */
4049 	/* Get the drive descriptors for this set */
4050 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4051 	    ep)) == NULL) {
4052 		if (! mdisok(ep)) {
4053 			/*
4054 			 * Ignore and clear out any failures from
4055 			 * metaget_drivedesc since a panic could have
4056 			 * occurred when a node was partially added to a set.
4057 			 */
4058 			mdclrerror(ep);
4059 		}
4060 	} else {
4061 		if (clnt_deldrvs(mynode(), sp, dd, ep)) {
4062 			rval = -1;
4063 			goto out;
4064 		}
4065 	}
4066 
4067 	/*
4068 	 * Now, delete the set - this removes the node, drive
4069 	 * and set records from the local mddb.
4070 	 */
4071 	if (clnt_delset(mynode(), sp, ep)) {
4072 		rval = -1;
4073 		goto out;
4074 	}
4075 
4076 out:
4077 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4078 
4079 	/*
4080 	 * Ignore errors from unlock of set since set is no longer
4081 	 * known (if clnt_delset worked).
4082 	 */
4083 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4084 		mdclrerror(&xep);
4085 	}
4086 
4087 	cl_set_setkey(NULL);
4088 
4089 	metaflushsetname(sp);
4090 
4091 	/*
4092 	 * If this node is the lowest numbered nodeid then
4093 	 * call sdssc_create/delete_end depending on whether
4094 	 * this node is marked as ADD or DEL in the node record.
4095 	 */
4096 	if (lowest_alive_nodeid) {
4097 		if (nd->nd_flags & MD_MN_NODE_ADD)
4098 			sdssc_create_end(sp->setname, SDSSC_CLEANUP);
4099 		else if (nd->nd_flags & MD_MN_NODE_DEL)
4100 			sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
4101 	}
4102 
4103 	/* Finished with this set -- return */
4104 	return (rval);
4105 }
4106 
4107 /*
4108  * Reconfig step to choose a new master for all MN disksets.
4109  * Return values:
4110  *	0 - Everything is great.
4111  *	1 - This node failed to reconfig.
4112  *	205 - Cause another reconfig due to a nodelist problem
4113  *		or RPC failure to another node
4114  */
4115 int
4116 meta_reconfig_choose_master(
4117 	md_error_t	*ep
4118 )
4119 {
4120 	set_t				max_sets, setno;
4121 	int				nodecnt;
4122 	mndiskset_membershiplist_t	*nl;
4123 	md_set_desc			*sd;
4124 	mdsetname_t			*sp;
4125 	int				rval = 0;
4126 	mddb_setflags_config_t		sf;
4127 	int				start_node_delayed = 0;
4128 
4129 	if ((max_sets = get_max_sets(ep)) == 0) {
4130 		mde_perror(ep, dgettext(TEXT_DOMAIN,
4131 		    "Unable to get number of sets"));
4132 		return (1);
4133 	}
4134 
4135 	/*
4136 	 * Get membershiplist from API routine.  If there's
4137 	 * an error, return a 205 to cause another reconfig.
4138 	 */
4139 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
4140 		mde_perror(ep, "");
4141 		return (205);
4142 	}
4143 
4144 	for (setno = 1; setno < max_sets; setno++) {
4145 		if ((sp = metasetnosetname(setno, ep)) == NULL) {
4146 			if (mdiserror(ep, MDE_NO_SET)) {
4147 				/* No set for this setno - continue */
4148 				mdclrerror(ep);
4149 				continue;
4150 			} else {
4151 				/*
4152 				 * If encountered an RPC error from my node,
4153 				 * then immediately fail.
4154 				 */
4155 				if (mdanyrpcerror(ep)) {
4156 					mde_perror(ep, "");
4157 					return (1);
4158 				}
4159 				/* Can't get set information */
4160 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4161 					"Unable to get information for "
4162 					"set number %d"), setno);
4163 				mdclrerror(ep);
4164 				continue;
4165 			}
4166 		}
4167 
4168 		/* If setname is there, set desc should exist. */
4169 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4170 			/*
4171 			 * If encountered an RPC error from my node,
4172 			 * then immediately fail.
4173 			 */
4174 			if (mdanyrpcerror(ep)) {
4175 				mde_perror(ep, "");
4176 				return (1);
4177 			}
4178 			mde_perror(ep, dgettext(TEXT_DOMAIN,
4179 				"Unable to get set %s desc information"),
4180 				sp->setname);
4181 			mdclrerror(ep);
4182 			continue;
4183 		}
4184 
4185 		/* Only reconfig MN disksets */
4186 		if (!MD_MNSET_DESC(sd)) {
4187 			continue;
4188 		}
4189 
4190 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4191 		    "Begin choose master for set %s: %s"),
4192 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4193 
4194 		/* Update nodelist with member information. */
4195 		if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
4196 			/*
4197 			 * If encountered an RPC error from my node,
4198 			 * then immediately fail.
4199 			 */
4200 			if (mdanyrpcerror(ep)) {
4201 				mde_perror(ep, "");
4202 				return (1);
4203 			}
4204 			mde_perror(ep, "");
4205 			mdclrerror(ep);
4206 			continue;
4207 		}
4208 
4209 		/*
4210 		 * If all nodes in a cluster are starting, then
4211 		 * all nodes will attempt to contact all other nodes
4212 		 * to determine a master node.  This can lead to a
4213 		 * problem where node 1 is trying to contact the rpc.metad
4214 		 * node 2 and node 2 is trying to contact the rpc.metad
4215 		 * on node 1 -- and this causes the rpc call to fail
4216 		 * on both nodes and causes a new reconfig cycle.
4217 		 *
4218 		 * In order to break this problem, a newly starting node
4219 		 * will delay a small amount of time (nodeid mod 4 seconds)
4220 		 * and will then run the code to choose a master for the
4221 		 * first set.  Delay will only be done once regardless of the
4222 		 * number of sets.
4223 		 */
4224 		if (start_node_delayed == 0) {
4225 			(void) memset(&sf, 0, sizeof (sf));
4226 			sf.sf_setno = sp->setno;
4227 			sf.sf_flags = MDDB_NM_GET;
4228 			/* Use magic to help protect ioctl against attack. */
4229 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4230 			if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
4231 			    &sf.sf_mde, NULL) == 0) &&
4232 			    ((sf.sf_setflags & MD_SET_MN_START_RC) ==
4233 			    MD_SET_MN_START_RC)) {
4234 				(void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
4235 			}
4236 			start_node_delayed = 1;
4237 		}
4238 
4239 		/* Choose master for this set */
4240 		rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
4241 		if (rval == -1) {
4242 			mde_perror(ep, "");
4243 			return (1);
4244 		} else if (rval == 205) {
4245 			mde_perror(ep, "");
4246 			return (205);
4247 		}
4248 
4249 		/* Send new nodelist to rpc.mdcommd */
4250 		(void) mdmn_reinit_set(sp->setno);
4251 
4252 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4253 		    "Choose master for set %s completed: %s"),
4254 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4255 	}
4256 
4257 	/*
4258 	 * Each node turns on I/Os for all MN disksets.
4259 	 * This is to recover from the situation where the master died
4260 	 * during a previous reconfig cycle when I/Os were suspended
4261 	 * for a MN diskset.
4262 	 * If a failure occurs return a 1 which will force this node to
4263 	 * panic.  Cannot leave node in the situation where I/Os are
4264 	 * not resumed.
4265 	 */
4266 	setno = 0; /* 0 means all MN sets */
4267 	if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
4268 		mde_perror(ep, "");
4269 		return (1);
4270 	}
4271 
4272 	/* Free the nodelist */
4273 	if (nodecnt)
4274 		meta_free_nodelist(nl);
4275 
4276 	return (0);
4277 }
4278 
4279 /*
4280  * meta_mnsync_user_records will synchronize the diskset user records across
4281  * all nodes in the diskset.  The diskset user records are stored in
4282  * each node's local set mddb.
4283  *
4284  * This needs to be done even if there is no master change during the
4285  * reconfig cycle since this routine should clean up any mess left by
4286  * the untimely termination of a metaset or metadb command (due to a
4287  * node panic or to user intervention).
4288  *
4289  * Caller is the Master node.
4290  *
4291  * Returns	 0 - Success
4292  *		205 - Failure during RPC to another node
4293  *		-1 - Any other failure and ep is filled in.
4294  */
4295 int
4296 meta_mnsync_user_records(
4297 	mdsetname_t	*sp,
4298 	md_error_t	*ep
4299 )
4300 {
4301 	md_set_desc		*sd;
4302 	md_mnnode_desc		*master_nodelist, *nd, *nd2, *ndtail;
4303 	md_mnset_record		*mnsr;
4304 	md_mnsr_node_t		*master_mnsr_node = NULL, *mnsr_node = NULL;
4305 	md_mnnode_record	*nr;
4306 	md_drive_record		*dr;
4307 	int			dr_cnt, dd_cnt;
4308 	int			found_my_nr;
4309 	md_drive_desc		*dd, *dd_prev, *master_dd, *other_dd;
4310 	int			all_drives_ok;
4311 	int			rval = 0;
4312 	int			max_genid = 0;
4313 	int			num_alive_nodes, num_alive_nodes_del = 0;
4314 	int			set_locked = 0;
4315 	md_setkey_t		*cl_sk;
4316 	md_error_t		xep = mdnullerror;
4317 	char			*anode[1];
4318 	mddb_setflags_config_t	sf;
4319 
4320 	/*
4321 	 * Sync up node records first.
4322 	 * Construct a master nodelist using the nodelist from this
4323 	 * node's rpc.metad node records and then setting the state of each
4324 	 * node following these rules:
4325 	 *	- If a node record is marked OK on its node, mark it OK
4326 	 *		in the master nodelist (and later OK on all nodes)
4327 	 *		If a node record is also marked OWN on its node,
4328 	 *		mark it OWN in the master nodelist.
4329 	 *	- If a node record is not marked OK on its node, then mark
4330 	 *		it as DEL in the master list (later deleting it)
4331 	 *	- If node record doesn't exist on that node, then mark it DEL
4332 	 *		(later deleting it)
4333 	 *	- If set record doesn't exist on that node, mark node as DEL
4334 	 *	- If a node record doesn't exist on all nodes, then mark it DEL
4335 	 *	- If a node is not ALIVE, then
4336 	 *		- If that node marked DEL on any node - mark it DEL
4337 	 *			in master list but leave in nodelist
4338 	 *		- If that node is marked as ADD on any node, mark it
4339 	 *			ADD in the master list but leave in nodelist
4340 	 *		- When that node returns to the living, the DEL
4341 	 *			node record will be removed and the ADD node
4342 	 *			record may be removed if marked ADD on that
4343 	 *			node.
4344 	 * The key rule is to not remove a node from the nodelist until
4345 	 * that node record is removed from its own node.  Do not want to
4346 	 * remove a node's record from all other nodes and then have
4347 	 * that node have its own record marked OK so that a node will pick
4348 	 * a different master than the other nodes.
4349 	 *
4350 	 * Next,
4351 	 * If node is ALIVE and node record is marked DEL in master nodelist,
4352 	 * remove node from set.
4353 	 * If node is ALIVE and node record is marked OK in master nodelist,
4354 	 * mark it OK on all other nodes.
4355 	 * If node is not ALIVE and node record is marked DEL in master
4356 	 * nodelist, mark it DEL on all other nodes.
4357 	 * If node is not ALIVE and node record is marked ADD in master,
4358 	 * nodelist, mark it ADD on all other nodes.
4359 	 */
4360 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4361 		return (-1);
4362 	}
4363 	master_nodelist = sd->sd_nodelist;
4364 
4365 	/*
4366 	 * Walk through nodelist creating a master nodelist.
4367 	 */
4368 	num_alive_nodes = 0;
4369 	nd = master_nodelist;
4370 	while (nd) {
4371 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4372 			nd = nd->nd_next;
4373 			continue;
4374 		}
4375 		num_alive_nodes++;
4376 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
4377 		    MD_SET_BAD, &mnsr, ep) == -1) {
4378 			if (mdiserror(ep, MDE_NO_SET)) {
4379 				/* set doesn't exist, mark node as DEL */
4380 				nd->nd_flags &= ~MD_MN_NODE_OK;
4381 				nd->nd_flags &= ~MD_MN_NODE_ADD;
4382 				nd->nd_flags |= MD_MN_NODE_DEL;
4383 				nd->nd_flags |= MD_MN_NODE_NOSET;
4384 				nd = nd->nd_next;
4385 				continue;
4386 			} else {
4387 				/* If RPC failure to another node return 205 */
4388 				if ((mdanyrpcerror(ep)) &&
4389 				    (sd->sd_mn_mynode->nd_nodeid !=
4390 				    nd->nd_nodeid)) {
4391 					rval = 205;
4392 				} else {
4393 					/* Any other failure */
4394 					rval = -1;
4395 				}
4396 				goto out;
4397 			}
4398 		}
4399 		/* Find biggest genid in records for this diskset */
4400 		if (mnsr->sr_genid > max_genid)
4401 			max_genid = mnsr->sr_genid;
4402 
4403 		dr = mnsr->sr_drivechain;
4404 		while (dr) {
4405 			/* Find biggest genid in records for this diskset */
4406 			if (dr->dr_genid > max_genid) {
4407 				max_genid = dr->dr_genid;
4408 			}
4409 			dr = dr->dr_next;
4410 		}
4411 
4412 		found_my_nr = 0;
4413 		nr = mnsr->sr_nodechain;
4414 		/* nr is the list of node recs from nd_nodename node */
4415 		while (nr) {
4416 			/* Find biggest genid in records for this diskset */
4417 			if (nr->nr_genid > max_genid)
4418 				max_genid = nr->nr_genid;
4419 			nd2 = master_nodelist;
4420 			ndtail = NULL;
4421 			/* For each node record, is it in master list? */
4422 			while (nd2) {
4423 				if (nd2->nd_nodeid == nr->nr_nodeid)
4424 					break;
4425 				if (nd2->nd_next == NULL)
4426 					ndtail = nd2;
4427 				nd2 = nd2->nd_next;
4428 			}
4429 			/*
4430 			 * Found node record not in master list -- add it
4431 			 * to list marking it as DEL since node record
4432 			 * should exist on all nodes unless a panic occurred
4433 			 * during addition or deletion of host to diskset.
4434 			 */
4435 			if (nd2 == NULL) {
4436 				nd2 = Zalloc(sizeof (*nd2));
4437 				(void) strcpy(nd2->nd_nodename,
4438 				    nr->nr_nodename);
4439 				nd2->nd_flags = nr->nr_flags;
4440 				nd2->nd_flags |= MD_MN_NODE_DEL;
4441 				nd2->nd_nodeid = nr->nr_nodeid;
4442 				nd2->nd_next = NULL;
4443 				ndtail->nd_next = nd2;
4444 				nd2 = NULL;
4445 				nr = nr->nr_next;
4446 				continue;
4447 			}
4448 			/*
4449 			 * Is this the node record for the node that
4450 			 * we requested the set desc from?
4451 			 * If so, check if node has its own node record
4452 			 * marked OK. If marked OK, check for the OWN bit.
4453 			 */
4454 			if (nr->nr_nodeid == nd->nd_nodeid) {
4455 				found_my_nr = 1;
4456 				if (nr->nr_flags & MD_MN_NODE_OK) {
4457 					/*
4458 					 * If node record is marked OK
4459 					 * on its own node, then mark it OK
4460 					 * in the master list.  Node record
4461 					 * would have to exist on all nodes
4462 					 * in the ADD state before it could
4463 					 * be put into the OK state.
4464 					 */
4465 					nd->nd_flags |= MD_MN_NODE_OK;
4466 					nd->nd_flags &=
4467 					    ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
4468 					/*
4469 					 * Mark own in master list as marked
4470 					 * on own node.
4471 					 */
4472 					if (nr->nr_flags & MD_MN_NODE_OWN)
4473 						nd->nd_flags |= MD_MN_NODE_OWN;
4474 					else
4475 						nd->nd_flags &= ~MD_MN_NODE_OWN;
4476 				} else {
4477 					/* Otherwise, mark node as DEL */
4478 					nd->nd_flags &= ~MD_MN_NODE_OK;
4479 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4480 					nd->nd_flags |= MD_MN_NODE_DEL;
4481 				}
4482 			}
4483 			/*
4484 			 * If node is not ALIVE and marked DEL
4485 			 * on any node, make it DEL in master list.
4486 			 * If node is not ALIVE and marked ADD
4487 			 * on any node, make it ADD in master list
4488 			 * unless node record has already been marked DEL.
4489 			 */
4490 			if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
4491 				if (nr->nr_flags & MD_MN_NODE_ADD) {
4492 					if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4493 						/* If not DEL - mark it ADD */
4494 						nd->nd_flags |= MD_MN_NODE_ADD;
4495 						nd->nd_flags &= ~MD_MN_NODE_OK;
4496 					}
4497 				}
4498 				if (nr->nr_flags & MD_MN_NODE_DEL) {
4499 					nd->nd_flags |= MD_MN_NODE_DEL;
4500 					nd->nd_flags &= ~MD_MN_NODE_OK;
4501 					/* Could already be ADD - make it DEL */
4502 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4503 				}
4504 			}
4505 			nr = nr->nr_next;
4506 		}
4507 		/*
4508 		 * If a node record doesn't exist on its own node,
4509 		 * then mark node as DEL.
4510 		 */
4511 		if (found_my_nr == 0) {
4512 			nd->nd_flags &= ~MD_MN_NODE_OK;
4513 			nd->nd_flags |= MD_MN_NODE_DEL;
4514 		}
4515 
4516 		/*
4517 		 * If node is OK - put mnsr onto master_mnsr_node list for
4518 		 * later use when syncing up the drive records in the set.
4519 		 */
4520 		if (nd->nd_flags & MD_MN_NODE_OK) {
4521 			mnsr_node = Zalloc(sizeof (*mnsr_node));
4522 			mnsr_node->mmn_mnsr = mnsr;
4523 			(void) strncpy(mnsr_node->mmn_nodename,
4524 				nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
4525 			mnsr_node->mmn_next = master_mnsr_node;
4526 			master_mnsr_node = mnsr_node;
4527 		} else {
4528 			free_sr((struct md_set_record *)mnsr);
4529 		}
4530 
4531 		nd = nd->nd_next;
4532 	}
4533 
4534 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4535 	    "Master nodelist created for set %s: %s"),
4536 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4537 
4538 	/*
4539 	 * Send master nodelist to the rpc.metad on all nodes (including
4540 	 * myself) and each node will update itself.  This will set the
4541 	 * ADD and DEL flags on each node as setup in the master nodelist.
4542 	 * Don't send nodelist to node where set doesn't exist.
4543 	 */
4544 	nd = master_nodelist;
4545 	while (nd) {
4546 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4547 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4548 			nd = nd->nd_next;
4549 			continue;
4550 		}
4551 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4552 		    master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
4553 			/* If RPC failure to another node return 205 */
4554 			if ((mdanyrpcerror(ep)) &&
4555 			    (sd->sd_mn_mynode->nd_nodeid !=
4556 			    nd->nd_nodeid)) {
4557 				rval = 205;
4558 			} else {
4559 				/* Any other failure */
4560 				rval = -1;
4561 			}
4562 			goto out;
4563 		}
4564 		nd = nd->nd_next;
4565 	}
4566 
4567 	/*
4568 	 * Now, delete nodes that need to be deleted.
4569 	 */
4570 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4571 	    ep))  == NULL) {
4572 		if (! mdisok(ep)) {
4573 			rval = -1;
4574 			goto out;
4575 		}
4576 	}
4577 
4578 	/*
4579 	 * May be doing lots of RPC commands to the nodes, so lock the
4580 	 * ALIVE members of the set since most of the rpc.metad routines
4581 	 * require this for security reasons.
4582 	 */
4583 	nd = master_nodelist;
4584 	while (nd) {
4585 		/* Skip non-alive nodes and node without set */
4586 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4587 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4588 			nd = nd->nd_next;
4589 			continue;
4590 		}
4591 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
4592 			/* If RPC failure to another node return 205 */
4593 			if ((mdanyrpcerror(ep)) &&
4594 			    (sd->sd_mn_mynode->nd_nodeid !=
4595 			    nd->nd_nodeid)) {
4596 				rval = 205;
4597 			} else {
4598 				/* Any other failure */
4599 				rval = -1;
4600 			}
4601 			goto out;
4602 		}
4603 		set_locked = 1;
4604 		nd = nd->nd_next;
4605 	}
4606 
4607 	nd = master_nodelist;
4608 	while (nd) {
4609 		/* Skip non-alive nodes */
4610 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4611 			nd = nd->nd_next;
4612 			continue;
4613 		}
4614 		if (nd->nd_flags & MD_MN_NODE_DEL) {
4615 			num_alive_nodes_del++;
4616 			/*
4617 			 * Delete this node rec from all ALIVE nodes in diskset.
4618 			 */
4619 			nd2 = master_nodelist;
4620 			while (nd2) {
4621 				/* Skip non-alive nodes and node without set */
4622 				if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
4623 				    (nd2->nd_flags & MD_MN_NODE_NOSET)) {
4624 					nd2 = nd2->nd_next;
4625 					continue;
4626 				}
4627 
4628 				/* This is a node being deleted from set */
4629 				if (nd2->nd_nodeid == nd->nd_nodeid) {
4630 					/* Mark set record as DEL */
4631 					if (clnt_upd_sr_flags(nd->nd_nodename,
4632 					    sp, MD_SR_DEL, ep)) {
4633 						/* RPC failure to !my node */
4634 						if ((mdanyrpcerror(ep)) &&
4635 						    (sd->sd_mn_mynode->
4636 						    nd_nodeid
4637 						    != nd->nd_nodeid)) {
4638 							rval = 205;
4639 						} else {
4640 							/* Any other failure */
4641 							rval = -1;
4642 						}
4643 						goto out;
4644 					}
4645 					if (clnt_deldrvs(nd->nd_nodename, sp,
4646 					    dd, ep)) {
4647 						/* RPC failure to !my node */
4648 						if ((mdanyrpcerror(ep)) &&
4649 						    (sd->sd_mn_mynode->
4650 						    nd_nodeid
4651 						    != nd->nd_nodeid)) {
4652 							rval = 205;
4653 						} else {
4654 							/* Any other failure */
4655 							rval = -1;
4656 						}
4657 						goto out;
4658 					}
4659 					if (clnt_delset(nd->nd_nodename, sp,
4660 					    ep) == -1) {
4661 						/* RPC failure to !my node */
4662 						if ((mdanyrpcerror(ep)) &&
4663 						    (sd->sd_mn_mynode->
4664 						    nd_nodeid
4665 						    != nd->nd_nodeid)) {
4666 							rval = 205;
4667 						} else {
4668 							/* Any other failure */
4669 							rval = -1;
4670 						}
4671 						goto out;
4672 					}
4673 				} else {
4674 					/*
4675 					 * Delete host from sets on hosts
4676 					 * not being deleted.
4677 					 */
4678 					anode[0] = Strdup(nd->nd_nodename);
4679 					if (clnt_delhosts(nd2->nd_nodename, sp,
4680 					    1, anode, ep) == -1) {
4681 						Free(anode[0]);
4682 						/* RPC failure to !my node */
4683 						if ((mdanyrpcerror(ep)) &&
4684 						    (sd->sd_mn_mynode->
4685 						    nd_nodeid
4686 						    != nd2->nd_nodeid)) {
4687 							rval = 205;
4688 						} else {
4689 							/* Any other failure */
4690 							rval = -1;
4691 						}
4692 						goto out;
4693 					}
4694 
4695 					meta_mc_log(MC_LOG5,
4696 					    dgettext(TEXT_DOMAIN,
4697 					    "Deleted node %s (%d) on node %s "
4698 					    "from set %s: %s"),
4699 					    nd->nd_nodename, nd->nd_nodeid,
4700 					    nd2->nd_nodename,
4701 					    sp->setname,
4702 					    meta_print_hrtime(
4703 					    gethrtime() - start_time));
4704 
4705 					Free(anode[0]);
4706 				}
4707 				nd2 = nd2->nd_next;
4708 			}
4709 		}
4710 		nd = nd->nd_next;
4711 	}
4712 
4713 	nd = master_nodelist;
4714 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4715 	while (nd) {
4716 		/* Skip non-alive nodes and node without set */
4717 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4718 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4719 			nd = nd->nd_next;
4720 			continue;
4721 		}
4722 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
4723 			/* If RPC failure to another node return 205 */
4724 			if ((mdanyrpcerror(ep)) &&
4725 			    (sd->sd_mn_mynode->nd_nodeid !=
4726 			    nd->nd_nodeid)) {
4727 				rval = 205;
4728 			} else {
4729 				/* Any other failure */
4730 				rval = -1;
4731 			}
4732 			goto out;
4733 		}
4734 		nd = nd->nd_next;
4735 	}
4736 	cl_set_setkey(NULL);
4737 	set_locked = 0;
4738 
4739 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4740 	    "Nodelist syncronization complete for set %s: %s"),
4741 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4742 
4743 	metaflushsetname(sp);
4744 
4745 	/*
4746 	 * If all alive nodes have been deleted from set, just
4747 	 * return since nothing else can be done until non-alive
4748 	 * nodes (if there are any) rejoin the cluster.
4749 	 */
4750 	if (num_alive_nodes == num_alive_nodes_del) {
4751 		rval = 0;
4752 		goto out;
4753 	}
4754 
4755 	/*
4756 	 * Sync up drive records.
4757 	 *
4758 	 * If a node panic'd (or metaset command was killed) during the
4759 	 * addition or deletion of a drive to the diskset, the nodes
4760 	 * may have a different view of the drive list.  During cleanup
4761 	 * of the drive list during reconfig, a drive will be deleted
4762 	 * from the list if the master node sees that the drive has been
4763 	 * marked in the ADD state on any node or is marked in the DEL state
4764 	 * on all nodes.
4765 	 * This cleanup must occur even if all nodes in the cluster are
4766 	 * not part of the cluster so that all nodes have the same view
4767 	 * of the drivelist.
4768 	 * Then if the entire cluster goes down and comes back up, the
4769 	 * new master node could be a node that wasn't in the cluster when
4770 	 * the node was deleted.  This could lead to a situation where the
4771 	 * master node thinks that a drive is OK, but this drive isn't
4772 	 * known to the other nodes.
4773 	 * This situation can also occur during the addition of a drive
4774 	 * where a node has the drive marked OK, but the node executing the
4775 	 * metaset command enountered a failure before marking that drive OK
4776 	 * on the rest of the nodes.  If the node with the OK drive then
4777 	 * panics, then rest of the nodes will remove that drive marked ADD
4778 	 * and when the node with the OK drive rejoins the cluster, it will
4779 	 * have a drive marked OK that is unknown by the other nodes.
4780 	 *
4781 	 * There are 2 situations to consider:
4782 	 * A) Master knows about a drive that other nodes don't know about.
4783 	 * B) At least one slave node knows about a drive that the master
4784 	 *    node doesn't know about.
4785 	 *
4786 	 * To handle these situations the following steps are followed:
4787 	 * 1) Count number of drives known by this master node and the
4788 	 *    other slave nodes.
4789 	 *    If all nodes have the same number of drives and the master has
4790 	 *    all drives marked OK, then skip to step4.
4791 	 *
4792 	 * 2) If a node has less drives listed than the master, the master
4793 	 *    must get the drive descriptor list from that node so that
4794 	 *    master can determine which drive it needs to delete from that
4795 	 *    node.  Master must get the drive descriptor list since the
4796 	 *    drive record list does not contain the name of the drive, but
4797 	 *    only a key and the key can only be interprested on that other
4798 	 *    node.
4799 	 *
4800 	 * 3) The master will then create the master drive list by doing:
4801 	 *	- Master starts with drive list known by master.
4802 	 *	- Any drive marked ADD will be removed from the list.
4803 	 *	- Any drive not known by another node (from step2) will be
4804 	 *	removed from the drive list.
4805 	 *	- If a drive is marked DEL on the master, the master must
4806 	 *	verify that the drive record is marked DEL on all nodes.
4807 	 *	If any node has the drive record marked OK, mark it OK
4808 	 *	on the master.  (The reason why is described below).
4809 	 *
4810 	 * 4) The master sends out the master drive list and the slave
4811 	 *    nodes will force their drive lists to match the master
4812 	 *    drive list by deleting drives, if necessary and by changing
4813 	 *    the drive record states from ADD->OK if master has drive
4814 	 *    marked OK and slave has drive marked ADD.
4815 	 *
4816 	 * Interesting scenarios:
4817 	 *
4818 	 * 1) System has 4 nodes with node 1 as the master.  Node 3 starts
4819 	 *    to delete a drive record (drive record on node 1 is marked DEL),
4820 	 *    but is stopped when node 3 panics.  Node 1 also panics.
4821 	 *    During reconfig cycle, node 2 is picked as master and the drive
4822 	 *    record is left alone since all nodes in the cluster have it
4823 	 *    marked OK.  User now sees drive as part of diskset.
4824 	 *    Now, entire cluster is rebooted and node 1 rejoins the cluster.
4825 	 *    Node 1 is picked as the master and node 1 has drive record
4826 	 *    marked DEL.  Node 1 contacts all other nodes in the cluster
4827 	 *    and since at least one node has the drive record marked OK,
4828 	 *    the master marks the drive record OK.
4829 	 *    User continues to see the drive as part of the diskset.
4830 	 */
4831 
4832 	/* Reget set descriptor since flushed above */
4833 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4834 		rval = -1;
4835 		goto out;
4836 	}
4837 
4838 	/* Has side effect of setting sd->sd_drvs to same as master_dd */
4839 	if ((master_dd = metaget_drivedesc_sideno(sp,
4840 	    sd->sd_mn_mynode->nd_nodeid,
4841 	    (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
4842 		/* No drives in list */
4843 		if (!mdisok(ep)) {
4844 			/*
4845 			 * Can't get drive list for this node, so
4846 			 * return -1 causing this node to be removed
4847 			 * cluster config and fixed.
4848 			 */
4849 			rval = -1;
4850 			goto out;
4851 		}
4852 	}
4853 
4854 	/* Count the number of drives for all nodes */
4855 	mnsr_node = master_mnsr_node;
4856 	while (mnsr_node) {
4857 		dr_cnt = 0;
4858 		dr = mnsr_node->mmn_mnsr->sr_drivechain;
4859 		while (dr) {
4860 			dr_cnt++;
4861 			dr = dr->dr_next;
4862 		}
4863 		mnsr_node->mmn_numdrives = dr_cnt;
4864 		mnsr_node = mnsr_node->mmn_next;
4865 	}
4866 
4867 	/* Count the number of drives for the master; also check flags */
4868 	all_drives_ok = 1;
4869 	dd_cnt = 0;
4870 	dd = master_dd;
4871 	while (dd) {
4872 		dd_cnt++;
4873 		if (!(dd->dd_flags & MD_DR_OK))
4874 			all_drives_ok = 0;
4875 		dd = dd->dd_next;
4876 	}
4877 
4878 	/* If all drives are ok, do quick check against number of drives */
4879 	if (all_drives_ok) {
4880 		/* If all nodes have same number of drives, almost done */
4881 		mnsr_node = master_mnsr_node;
4882 		while (mnsr_node) {
4883 			if (mnsr_node->mmn_numdrives != dd_cnt)
4884 				break;
4885 			mnsr_node = mnsr_node->mmn_next;
4886 		}
4887 		/* All nodes have same number of drives, just send flags */
4888 		if (mnsr_node == NULL) {
4889 			goto send_drive_list;
4890 		}
4891 	}
4892 
4893 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4894 	    "Begin detailed drive synchronization for set %s: %s"),
4895 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4896 
4897 	/* Detailed check required  */
4898 	mnsr_node = master_mnsr_node;
4899 	while (mnsr_node) {
4900 		/* Does slave node have less drives than master? */
4901 		if (mnsr_node->mmn_numdrives < dd_cnt) {
4902 			/* Yes - must determine which drive is missing */
4903 			if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
4904 			    &other_dd, ep)) {
4905 				/* RPC failure to !my node */
4906 				if ((mdanyrpcerror(ep)) &&
4907 				    (strcmp(mynode(), mnsr_node->mmn_nodename)
4908 				    != 0)) {
4909 					rval = 205;
4910 				} else {
4911 					/* Any other failure */
4912 					rval = -1;
4913 				}
4914 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4915 				    "Master node %s unable to "
4916 				    "retrieve drive list from node %s"),
4917 				    mynode(), mnsr_node->mmn_nodename);
4918 				goto out;
4919 			}
4920 			mnsr_node->mmn_dd = other_dd;
4921 			dd = master_dd;
4922 			while (dd) {
4923 				if (!(dd->dd_flags & MD_DR_OK)) {
4924 					dd = dd->dd_next;
4925 					continue;
4926 				}
4927 				other_dd = mnsr_node->mmn_dd;
4928 				while (other_dd) {
4929 					/* Convert to devids, when available */
4930 					if (strcmp(other_dd->dd_dnp->cname,
4931 					    dd->dd_dnp->cname) == 0) {
4932 						break;
4933 					}
4934 					other_dd = other_dd->dd_next;
4935 				}
4936 				/*
4937 				 * dd not found on slave so mark it
4938 				 * ADD for later deletion (drives in ADD
4939 				 * state are deleted later in this routine).
4940 				 */
4941 				if (other_dd == NULL) {
4942 					dd->dd_flags = MD_DR_ADD;
4943 				}
4944 				dd = dd->dd_next;
4945 			}
4946 
4947 		}
4948 		mnsr_node = mnsr_node->mmn_next;
4949 	}
4950 
4951 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4952 	    "Drive check completed for set %s: %s"),
4953 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4954 
4955 	dd = master_dd;
4956 	dd_prev = 0;
4957 	while (dd) {
4958 		/* Remove any ADD drives from list */
4959 		if (dd->dd_flags & MD_DR_ADD) {
4960 			if (dd_prev) {
4961 				dd_prev->dd_next = dd->dd_next;
4962 				dd->dd_next = NULL;
4963 				metafreedrivedesc(&dd);
4964 				dd = dd_prev->dd_next;
4965 			} else {
4966 				/*
4967 				 * If removing drive descriptor from head
4968 				 * of linked list, also change sd->sd_drvs.
4969 				 */
4970 				master_dd = sd->sd_drvs = dd->dd_next;
4971 				dd->dd_next = NULL;
4972 				metafreedrivedesc(&dd);
4973 				dd = master_dd;
4974 			}
4975 			/* dd setup in if/else above */
4976 			continue;
4977 		}
4978 		/*
4979 		 * If drive is marked DEL, check all other nodes.
4980 		 * If drive on another node is marked OK, mark drive OK
4981 		 * in master list.  If drive is marked DEL or doesn't exist
4982 		 * on all nodes, remove drive from list.
4983 		 */
4984 		if (dd->dd_flags & MD_DR_DEL) {
4985 			mnsr_node = master_mnsr_node;
4986 			while (mnsr_node) {
4987 				if (mnsr_node->mmn_dd == NULL) {
4988 				    if (clnt_getdrivedesc(
4989 					mnsr_node->mmn_nodename, sp,
4990 					&other_dd, ep)) {
4991 					    /* RPC failure to !my node */
4992 					    if ((mdanyrpcerror(ep)) &&
4993 						(strcmp(mynode(),
4994 						mnsr_node->mmn_nodename)
4995 						!= 0)) {
4996 						    rval = 205;
4997 					    } else {
4998 						    /* Any other failure */
4999 						    rval = -1;
5000 					    }
5001 					    mde_perror(ep, dgettext(TEXT_DOMAIN,
5002 						"Master node %s unable "
5003 						"to retrieve drive list from "
5004 						"node %s"), mynode(),
5005 						mnsr_node->mmn_nodename);
5006 					    goto out;
5007 				    }
5008 				    mnsr_node->mmn_dd = other_dd;
5009 				}
5010 				other_dd = mnsr_node->mmn_dd;
5011 				while (other_dd) {
5012 					/* Found drive (OK) from other node */
5013 					if (strcmp(dd->dd_dnp->cname,
5014 					    other_dd->dd_dnp->cname)
5015 					    == 0) {
5016 						/* Drive marked OK */
5017 						if (other_dd->dd_flags &
5018 						    MD_DR_OK) {
5019 						    dd->dd_flags = MD_DR_OK;
5020 						}
5021 						break;
5022 					}
5023 					other_dd = other_dd->dd_next;
5024 				}
5025 				if (dd->dd_flags == MD_DR_OK)
5026 					break;
5027 
5028 				mnsr_node = mnsr_node->mmn_next;
5029 			}
5030 			/*
5031 			 * If no node had this drive marked OK, delete it.
5032 			 */
5033 			if (dd->dd_flags & MD_DR_DEL) {
5034 				if (dd_prev) {
5035 					dd_prev->dd_next = dd->dd_next;
5036 					dd->dd_next = NULL;
5037 					metafreedrivedesc(&dd);
5038 					dd = dd_prev->dd_next;
5039 				} else {
5040 					/*
5041 					 * If removing drive descriptor from
5042 					 * head of linked list, also change
5043 					 * sd->sd_drvs.
5044 					 */
5045 					master_dd = sd->sd_drvs = dd->dd_next;
5046 					dd->dd_next = NULL;
5047 					metafreedrivedesc(&dd);
5048 					dd = master_dd;
5049 				}
5050 				/* dd setup in if/else above */
5051 				continue;
5052 			}
5053 		}
5054 		dd_prev = dd;
5055 		dd = dd->dd_next;
5056 	}
5057 
5058 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5059 	    "Setting drive states completed for set %s: %s"),
5060 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5061 
5062 send_drive_list:
5063 	/*
5064 	 * Set genid on all drives to be the highest value seen.
5065 	 */
5066 	dd = master_dd;
5067 	while (dd) {
5068 		dd->dd_genid = max_genid;
5069 		dd = dd->dd_next;
5070 	}
5071 	/*
5072 	 * Send updated drive list to all alive nodes.
5073 	 * Will also set genid on set and node records to have same
5074 	 * as the drive records.
5075 	 */
5076 	nd = sd->sd_nodelist;
5077 	while (nd) {
5078 		/* Skip non-alive nodes */
5079 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5080 			nd = nd->nd_next;
5081 			continue;
5082 		}
5083 		if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
5084 			/* RPC failure to another node */
5085 			if ((mdanyrpcerror(ep)) &&
5086 			    (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
5087 				rval = 205;
5088 			} else {
5089 				/* Any other failure */
5090 				rval = -1;
5091 			}
5092 			goto out;
5093 		}
5094 		nd = nd->nd_next;
5095 	}
5096 
5097 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5098 	    "Sent drive list to all nodes for set %s: %s"),
5099 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5100 
5101 	/*
5102 	 * If no drive records left in set and nodes had been joined,
5103 	 * withdraw the nodes.  Always reset the master and mark
5104 	 * all nodes as withdrawn on all nodes.
5105 	 */
5106 	if (master_dd == NULL) {
5107 		/* Reset new master flag since no longer master */
5108 		(void) memset(&sf, 0, sizeof (sf));
5109 		sf.sf_setno = sp->setno;
5110 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5111 		sf.sf_flags = MDDB_NM_RESET;
5112 		/* Use magic to help protect ioctl against attack. */
5113 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5114 		/* Ignore failure, failure to reset flag isn't catastrophic */
5115 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5116 		    &sf.sf_mde, NULL);
5117 
5118 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5119 		    "Reset new master flag for " "set %s: %s"),
5120 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5121 
5122 		nd = sd->sd_nodelist;
5123 		while (nd) {
5124 			/* Skip non-alive nodes  */
5125 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5126 				nd = nd->nd_next;
5127 				continue;
5128 			}
5129 
5130 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
5131 				/* RPC failure to another node */
5132 				if ((mdanyrpcerror(ep)) &&
5133 				    (sd->sd_mn_mynode->nd_nodeid !=
5134 				    nd->nd_nodeid)) {
5135 					rval = 205;
5136 				} else {
5137 					/* Any other failure */
5138 					rval = -1;
5139 				}
5140 				goto out;
5141 			}
5142 			set_locked = 1;
5143 
5144 			/* Withdraw node from set if owner */
5145 			if ((nd->nd_flags & MD_MN_NODE_OWN) &&
5146 			    (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
5147 				/* RPC failure to another node */
5148 				if ((mdanyrpcerror(ep)) &&
5149 				    (sd->sd_mn_mynode->nd_nodeid !=
5150 				    nd->nd_nodeid)) {
5151 					rval = 205;
5152 				} else {
5153 					/* Any other failure */
5154 					rval = -1;
5155 				}
5156 				goto out;
5157 			}
5158 
5159 			/* Mark all nodes as withdrawn on this node */
5160 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5161 			    sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
5162 				/* RPC failure to another node */
5163 				if ((mdanyrpcerror(ep)) &&
5164 				    (sd->sd_mn_mynode->nd_nodeid !=
5165 				    nd->nd_nodeid)) {
5166 					rval = 205;
5167 				} else {
5168 					/* Any other failure */
5169 					rval = -1;
5170 				}
5171 				goto out;
5172 			}
5173 
5174 			/* Resets master to no-master on this node */
5175 			if (clnt_mnsetmaster(nd->nd_nodename, sp,
5176 			    "", MD_MN_INVALID_NID, ep)) {
5177 				/* RPC failure to another node */
5178 				if ((mdanyrpcerror(ep)) &&
5179 				    (sd->sd_mn_mynode->nd_nodeid !=
5180 				    nd->nd_nodeid)) {
5181 					rval = 205;
5182 				} else {
5183 					/* Any other failure */
5184 					rval = -1;
5185 				}
5186 				goto out;
5187 			}
5188 
5189 			cl_sk = cl_get_setkey(sp->setno, sp->setname);
5190 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
5191 				/* RPC failure to another node */
5192 				if ((mdanyrpcerror(ep)) &&
5193 				    (sd->sd_mn_mynode->nd_nodeid !=
5194 				    nd->nd_nodeid)) {
5195 					rval = 205;
5196 				} else {
5197 					/* Any other failure */
5198 					rval = -1;
5199 				}
5200 				goto out;
5201 			}
5202 			set_locked = 0;
5203 			nd = nd->nd_next;
5204 		}
5205 	}
5206 
5207 out:
5208 	/*
5209 	 * If got here and set is still locked, then an error has
5210 	 * occurred and master_nodelist is still valid.
5211 	 * If error is not an RPC error, then unlock.
5212 	 * If error is an RPC error, skip unlocks since this could cause
5213 	 * yet another RPC timeout if a node has failed.
5214 	 * Ignore failures in unlock since unlock is just trying to
5215 	 * clean things up.
5216 	 */
5217 	if ((set_locked) && !(mdanyrpcerror(ep))) {
5218 		nd = master_nodelist;
5219 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
5220 		while (nd) {
5221 			/* Skip non-alive nodes */
5222 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5223 				nd = nd->nd_next;
5224 				continue;
5225 			}
5226 			/*
5227 			 * If clnt_unlock fails, just break out since next
5228 			 * reconfig cycle will reset the locks anyway.
5229 			 */
5230 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
5231 				break;
5232 			}
5233 			nd = nd->nd_next;
5234 		}
5235 		cl_set_setkey(NULL);
5236 	}
5237 	/* Free master_mnsr and drive descs */
5238 	mnsr_node = master_mnsr_node;
5239 	while (mnsr_node) {
5240 		master_mnsr_node = mnsr_node->mmn_next;
5241 		free_sr((md_set_record *)mnsr_node->mmn_mnsr);
5242 		free_rem_dd(mnsr_node->mmn_dd);
5243 		Free(mnsr_node);
5244 		mnsr_node = master_mnsr_node;
5245 	}
5246 
5247 	/* Frees sd->sd_drvs (which is also master_dd) */
5248 	metaflushsetname(sp);
5249 	return (rval);
5250 }
5251 
5252 /*
5253  * meta_mnsync_diskset_mddbs
5254  * Calling node is guaranteed to be an owner node.
5255  * Calling node is the master node.
5256  *
5257  * Master node verifies that ondisk mddb format matches its incore format.
5258  * If no nodes are joined to set, remove the change log entries.
5259  * If a node is joined to set, play the change log.
5260  *
5261  * Returns	 0 - Success
5262  *		 1 - Master unable to join to set.
5263  *		205 - Failure during RPC to another node
5264  *		-1 - Any other failure and ep is filled in.
5265  *			-1 return will eventually cause node to panic
5266  *			in a SunCluster environment.
5267  */
5268 int
5269 meta_mnsync_diskset_mddbs(
5270 	mdsetname_t	*sp,
5271 	md_error_t	*ep
5272 )
5273 {
5274 	md_set_desc		*sd;
5275 	mddb_config_t		c;
5276 	md_mn_msgclass_t	class;
5277 	mddb_setflags_config_t	sf;
5278 	md_mnnode_desc		*nd, *nd2;
5279 	md_error_t		xep = mdnullerror;
5280 	int			stale_set = 0;
5281 
5282 	/* If setname is there, set desc should exist. */
5283 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5284 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5285 		    "Unable to get set %s desc information"), sp->setname);
5286 		return (-1);
5287 	}
5288 
5289 	/* Are there drives in the set? */
5290 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5291 	    ep) == NULL) {
5292 		if (! mdisok(ep)) {
5293 			return (-1);
5294 		}
5295 		/* No drives in set -- nothing to sync up */
5296 		return (0);
5297 	}
5298 
5299 	/*
5300 	 * Is master node (which is this node) joined to set?
5301 	 * If master node isn't joined (which means that no nodes
5302 	 * are joined to diskset), remove the change log entries
5303 	 * since no need to replay them - all nodes will have same
5304 	 * view of mddbs since all nodes are reading in the mddbs
5305 	 * from disk.
5306 	 * There is also no need to sync up the master and ondisk mddbs
5307 	 * since master has no incore knowledge.
5308 	 * Need to join master to set in order to flush the change
5309 	 * log entries. Don't need to block I/O during join of master
5310 	 * to set since no other nodes are joined to set and so no I/O
5311 	 * can be occurring.
5312 	 */
5313 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
5314 		/* Join master to set */
5315 		if (clnt_joinset(mynode(), sp,
5316 		    MNSET_IN_RECONFIG, ep)) {
5317 			if (mdismddberror(ep, MDE_DB_STALE)) {
5318 				/*
5319 				 * If STALE, print message and continue on.
5320 				 * Don't do any writes or reads to mddbs
5321 				 * so don't clear change log.
5322 				 */
5323 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5324 				    "Join of master node to STALE set %s"),
5325 				    sp->setname);
5326 				stale_set = 1;
5327 				mdclrerror(ep);
5328 			} else if (mdismddberror(ep, MDE_DB_ACCOK)) {
5329 				/* ACCOK means mediator provided extra vote */
5330 				mdclrerror(ep);
5331 			} else {
5332 				/*
5333 				 * If master is unable to join set, print an
5334 				 * error message.  Don't return failure or node
5335 				 * will panic during cluster reconfig cycle.
5336 				 * Also, withdraw node from set in order to
5337 				 * cleanup from failed join attempt.
5338 				 */
5339 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5340 				    "Join of master node in set %s failed"),
5341 				    sp->setname);
5342 				if (clnt_withdrawset(mynode(), sp, &xep))
5343 					mdclrerror(&xep);
5344 				return (1);
5345 			}
5346 		}
5347 		/*
5348 		 * Master node successfully joined.
5349 		 * Set local copy of flags to OWN and
5350 		 * send owner flag to rpc.metad. If not stale,
5351 		 * flush the change log.
5352 		 */
5353 		sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
5354 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
5355 		    MNSET_IN_RECONFIG, ep)) {
5356 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5357 			    "Flag update of master node join in set %s failed"),
5358 			    sp->setname);
5359 			return (-1);
5360 		}
5361 
5362 		if (!stale_set) {
5363 			if (mdmn_reset_changelog(sp, ep,
5364 			    MDMN_CLF_RESETLOG) != 0) {
5365 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5366 				    "Unable to reset changelog."));
5367 				return (-1);
5368 			}
5369 			meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5370 			    "Removed changelog entries for set %s: %s"),
5371 			    sp->setname,
5372 			    meta_print_hrtime(gethrtime() - start_time));
5373 		}
5374 		/* Reset new master flag before return */
5375 		(void) memset(&sf, 0, sizeof (sf));
5376 		sf.sf_setno = sp->setno;
5377 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5378 		sf.sf_flags = MDDB_NM_RESET;
5379 		/* Use magic to help protect ioctl against attack. */
5380 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5381 		/* Ignore failure, failure to reset flag isn't catastrophic */
5382 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5383 		    &sf.sf_mde, NULL);
5384 
5385 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5386 		    "Reset new master flag for set %s: %s"),
5387 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5388 
5389 		return (0);
5390 	}
5391 
5392 	/*
5393 	 * Is master already joined to STALE set (< 50% mddbs avail)?
5394 	 * If so, can make no config changes to mddbs so don't check or play
5395 	 * changelog and don't sync master node to ondisk mddbs.
5396 	 * To get out of the stale state all nodes must be withdrawn
5397 	 * from set.  Then as nodes are re-joined, all nodes will
5398 	 * have same view of mddbs since all nodes are reading the
5399 	 * mddbs from disk.
5400 	 */
5401 	(void) memset(&c, 0, sizeof (c));
5402 	c.c_id = 0;
5403 	c.c_setno = sp->setno;
5404 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
5405 		(void) mdstealerror(ep, &c.c_mde);
5406 		return (-1);
5407 	}
5408 	if (c.c_flags & MDDB_C_STALE) {
5409 		return (0);
5410 	}
5411 
5412 	/*
5413 	 * If this node is NOT a newly chosen master, then there's
5414 	 * nothing else to do since the change log should be empty and
5415 	 * the ondisk and incore mddbs are already consistent.
5416 	 *
5417 	 * A newly chosen master is a node that was not the master
5418 	 * at the beginning of the reconfig cycle.  If a node is a new
5419 	 * master, then the new master state is reset after the ondisk
5420 	 * and incore mddbs are consistent and the change log has
5421 	 * been replayed.
5422 	 */
5423 	(void) memset(&sf, 0, sizeof (sf));
5424 	sf.sf_setno = sp->setno;
5425 	sf.sf_flags = MDDB_NM_GET;
5426 	/* Use magic to help protect ioctl against attack. */
5427 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5428 	if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
5429 	    ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
5430 		return (0);
5431 	}
5432 
5433 	/*
5434 	 * Now, sync up incore master view to ondisk mddbs.
5435 	 * This is needed in the case where a master node
5436 	 * had made a change to the mddb, but this change
5437 	 * may not have been relayed to the slaves yet.
5438 	 * So, the new master needs to verify that the ondisk
5439 	 * mddbs match what the new master has incore -
5440 	 * if different, new master rewrites all of the mddbs.
5441 	 * Then the new master will replay the changelog and the
5442 	 * new master will then execute what the old master had
5443 	 * done.
5444 	 *
5445 	 * Block all I/Os to disks in this diskset on all nodes in
5446 	 * the diskset.  This will allow the rewriting of the mddbs
5447 	 * (if needed), to proceed in a timely manner.
5448 	 *
5449 	 * If block of I/Os fail, return a -1.
5450 	 */
5451 
5452 	nd = sd->sd_nodelist;
5453 	while (nd) {
5454 		/* Skip non-alive and non-owner nodes  */
5455 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5456 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5457 			nd = nd->nd_next;
5458 			continue;
5459 		}
5460 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5461 		    MN_SUSP_IO, ep)) {
5462 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5463 			    "Unable to suspend I/O on node %s in set %s"),
5464 			    nd->nd_nodename, sp->setname);
5465 
5466 			/*
5467 			 * Resume all other nodes that had been suspended.
5468 			 * (Reconfig return step also resumes I/Os
5469 			 * for all sets.)
5470 			 */
5471 			nd2 = sd->sd_nodelist;
5472 			while (nd2) {
5473 				/* Stop when reaching failed node */
5474 				if (nd2->nd_nodeid == nd->nd_nodeid)
5475 					break;
5476 				/* Skip non-alive and non-owner nodes  */
5477 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5478 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5479 					nd2 = nd2->nd_next;
5480 					continue;
5481 				}
5482 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5483 					sp->setno, MN_RES_IO, &xep));
5484 				nd2 = nd2->nd_next;
5485 			}
5486 
5487 			/*
5488 			 * If an RPC failure on another node, return a 205.
5489 			 * Otherwise, exit with failure.
5490 			 */
5491 			if ((mdanyrpcerror(ep)) &&
5492 			    (sd->sd_mn_mynode->nd_nodeid !=
5493 			    nd->nd_nodeid)) {
5494 				return (205);
5495 			} else {
5496 				return (-1);
5497 			}
5498 
5499 		}
5500 		nd = nd->nd_next;
5501 	}
5502 
5503 	(void) memset(&c, 0, sizeof (c));
5504 	c.c_id = 0;
5505 	c.c_setno = sp->setno;
5506 	/* Master can't sync up to ondisk mddbs?  Kick it out of cluster */
5507 	if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
5508 		return (-1);
5509 
5510 	/*
5511 	 * Resume I/Os that were suspended above.
5512 	 */
5513 	nd = sd->sd_nodelist;
5514 	while (nd) {
5515 		/* Skip non-alive and non-owner nodes  */
5516 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5517 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5518 			nd = nd->nd_next;
5519 			continue;
5520 		}
5521 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5522 		    MN_RES_IO, ep)) {
5523 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5524 			    "Unable to resume I/O on node %s in set %s"),
5525 			    nd->nd_nodename, sp->setname);
5526 
5527 			/*
5528 			 * If an RPC failure then don't do any
5529 			 * more RPC calls, since one timeout is enough
5530 			 * to endure.  If RPC failure to another node, return
5531 			 * 205.  If RPC failure to my node, return -1.
5532 			 * If not an RPC failure, continue resuming the
5533 			 * rest of the nodes and then return -1.
5534 			 */
5535 			if (mdanyrpcerror(ep)) {
5536 				if (sd->sd_mn_mynode->nd_nodeid ==
5537 				    nd->nd_nodeid) {
5538 					return (-1);
5539 				} else {
5540 					return (205);
5541 				}
5542 			}
5543 
5544 			/*
5545 			 * If not an RPC error, continue resuming rest of
5546 			 * nodes, ignoring any failures except for an
5547 			 * RPC failure which constitutes an immediate exit.
5548 			 * Start in middle of list with failing node.
5549 			 */
5550 			nd2 = nd->nd_next;
5551 			while (nd2) {
5552 				/* Skip non-alive and non-owner nodes  */
5553 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5554 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5555 					nd2 = nd2->nd_next;
5556 					continue;
5557 				}
5558 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5559 					sp->setno, MN_RES_IO, &xep));
5560 				if (mdanyrpcerror(&xep)) {
5561 					return (-1);
5562 				}
5563 				nd2 = nd2->nd_next;
5564 			}
5565 		}
5566 		nd = nd->nd_next;
5567 	}
5568 
5569 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
5570 	    "checking/writing the mddb for set %s: %s"), sp->setname,
5571 	    meta_print_hrtime(gethrtime() - start_time));
5572 
5573 	/*
5574 	 * Send (aka replay) all messages we find in the changelog.
5575 	 * Flag the messages with
5576 	 *   MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
5577 	 *   MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
5578 	 */
5579 	for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
5580 		mdmn_changelog_record_t	*lr;
5581 		md_error_t	xep = mdnullerror;
5582 		md_mn_result_t	*resultp = NULL;
5583 		int		ret;
5584 
5585 		lr = mdmn_get_changelogrec(sp->setno, class);
5586 		if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
5587 			/* no entry for this class */
5588 			continue;
5589 		}
5590 
5591 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5592 		    "replaying message ID=(%d, 0x%llx-%d)\n"),
5593 		    MSGID_ELEMS(lr->lr_msg.msg_msgid));
5594 
5595 		ret = mdmn_send_message_with_msgid(
5596 			lr->lr_msg.msg_setno,
5597 			lr->lr_msg.msg_type,
5598 			lr->lr_msg.msg_flags |  MD_MSGF_REPLAY_MSG |
5599 						MD_MSGF_OVERRIDE_SUSPEND,
5600 			lr->lr_msg.msg_event_data,
5601 			lr->lr_msg.msg_event_size,
5602 			&resultp,
5603 			&lr->lr_msg.msg_msgid,
5604 			&xep);
5605 
5606 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5607 		    "mdmn_send_message returned %d\n"), ret);
5608 
5609 		if (resultp)
5610 			free_result(resultp);
5611 	}
5612 
5613 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5614 	    "Playing changelog completed for set %s: %s"),
5615 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5616 
5617 	/*
5618 	 * Now that new master has ondisk and incore mddbs in sync, reset
5619 	 * this node's new master kernel flag (for this set).  If this node
5620 	 * re-enters another reconfig cycle before the completion of this
5621 	 * reconfig cycle, this master node won't need to check if the ondisk
5622 	 * and incore mddbs are in sync since this node won't be considered
5623 	 * a new master (since this flag is being reset here in the middle of
5624 	 * step2).  This will save time during any subsequent reconfig
5625 	 * cycles as long as this node continues to be master.
5626 	 */
5627 	(void) memset(&sf, 0, sizeof (sf));
5628 	sf.sf_setno = sp->setno;
5629 	sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5630 	sf.sf_flags = MDDB_NM_RESET;
5631 	/* Use magic to help protect ioctl against attack. */
5632 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5633 	/* Ignore failure, since failure to reset flag isn't catastrophic */
5634 	(void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
5635 
5636 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5637 	    "Reset new master flag for set %s: %s"),
5638 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5639 
5640 	return (0);
5641 }
5642 
5643 /*
5644  * meta_mnjoin_all will join all starting nodes in the diskset.
5645  * A starting node is considered to be any node that is not
5646  * an owner of the set but is a member of the cluster.
5647  * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
5648  *
5649  * Caller is the Master node.
5650  *
5651  * Returns	 0 - Success
5652  *		205 - Failure during RPC to another node
5653  *		-1 - Any other failure and ep is filled in.
5654  */
5655 int
5656 meta_mnjoin_all(
5657 	mdsetname_t	*sp,
5658 	md_error_t	*ep
5659 )
5660 {
5661 	md_set_desc		*sd;
5662 	md_mnnode_desc		*nd, *nd2;
5663 	int			rval = 0;
5664 	int			stale_flag = 0;
5665 	mddb_config_t		c;
5666 	int			susp_res_flag = 0;
5667 	md_error_t		xep = mdnullerror;
5668 
5669 	/* If setname is there, set desc should exist. */
5670 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5671 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5672 		    "Unable to get set %s desc information"), sp->setname);
5673 		return (-1);
5674 	}
5675 
5676 	/* Are there drives in the set? */
5677 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5678 	    ep) == NULL) {
5679 		if (! mdisok(ep)) {
5680 			return (-1);
5681 		}
5682 		/* No drives in set -- nothing to join */
5683 		return (0);
5684 	}
5685 
5686 	/*
5687 	 * Is set currently stale?
5688 	 */
5689 	(void) memset(&c, 0, sizeof (c));
5690 	c.c_id = 0;
5691 	c.c_setno = sp->setno;
5692 	/* Ignore failure since master node may not be joined yet */
5693 	(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
5694 	if (c.c_flags & MDDB_C_STALE) {
5695 		stale_flag = MNSET_IS_STALE;
5696 	}
5697 
5698 	/*
5699 	 * If any nodes are going to be joined to diskset, then
5700 	 * suspend I/O to all disks in diskset so that nodes can join
5701 	 * (read in mddbs) in a reasonable amount of time even under
5702 	 * high I/O load.  Don't need to do this if set is STALE since
5703 	 * no I/O can be occurring to a STALE set.
5704 	 */
5705 	if (stale_flag != MNSET_IS_STALE) {
5706 		nd = sd->sd_nodelist;
5707 		while (nd) {
5708 			/* Found a node that will be joined to diskset */
5709 			if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5710 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5711 				/* Set flag that diskset should be suspended */
5712 				susp_res_flag = 1;
5713 				break;
5714 			}
5715 			nd = nd->nd_next;
5716 		}
5717 	}
5718 
5719 	if (susp_res_flag) {
5720 		/*
5721 		 * Block all I/Os to disks in this diskset on all joined
5722 		 * nodes in the diskset.
5723 		 * If block of I/Os fails due to an RPC failure on another
5724 		 * node, return 205; otherwise, return -1.
5725 		 */
5726 		nd = sd->sd_nodelist;
5727 		while (nd) {
5728 			/* Skip non-alive and non-owner nodes  */
5729 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5730 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5731 				nd = nd->nd_next;
5732 				continue;
5733 			}
5734 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5735 			    MN_SUSP_IO, ep)) {
5736 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5737 				    "Unable to suspend I/O on node %s"
5738 				    " in set %s"), nd->nd_nodename,
5739 				    sp->setname);
5740 				/*
5741 				 * Resume other nodes that had been suspended.
5742 				 * (Reconfig return step also resumes I/Os
5743 				 * for all sets.)
5744 				 */
5745 				nd2 = sd->sd_nodelist;
5746 				while (nd2) {
5747 					/* Stop when reaching failed node */
5748 					if (nd2->nd_nodeid == nd->nd_nodeid)
5749 						break;
5750 					/* Skip non-alive/non-owner nodes  */
5751 					if ((!(nd2->nd_flags &
5752 					    MD_MN_NODE_ALIVE)) ||
5753 					    (!(nd2->nd_flags &
5754 					    MD_MN_NODE_OWN))) {
5755 						nd2 = nd2->nd_next;
5756 						continue;
5757 					}
5758 					(void) (clnt_mn_susp_res_io(
5759 					    nd2->nd_nodename, sp->setno,
5760 					    MN_RES_IO, &xep));
5761 					nd2 = nd2->nd_next;
5762 				}
5763 
5764 				/*
5765 				 * If the suspend failed due to an
5766 				 * RPC failure on another node, return
5767 				 * a 205.
5768 				 * Otherwise, exit with failure.
5769 				 * The return reconfig step will resume
5770 				 * I/Os for all disksets.
5771 				 */
5772 				if ((mdanyrpcerror(ep)) &&
5773 				    (sd->sd_mn_mynode->nd_nodeid !=
5774 				    nd->nd_nodeid)) {
5775 					return (205);
5776 				} else {
5777 					return (-1);
5778 				}
5779 			}
5780 			nd = nd->nd_next;
5781 		}
5782 	}
5783 
5784 	nd = sd->sd_nodelist;
5785 	while (nd) {
5786 		/*
5787 		 * If a node is in the membership list but isn't joined
5788 		 * to the set, try to join the node.
5789 		 */
5790 		if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5791 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5792 			if (clnt_joinset(nd->nd_nodename, sp,
5793 			    (MNSET_IN_RECONFIG | stale_flag), ep)) {
5794 				/*
5795 				 * If RPC failure to another node
5796 				 * then exit without attempting anything else.
5797 				 * (Reconfig return step will resume I/Os
5798 				 * for all sets.)
5799 				 */
5800 				if (mdanyrpcerror(ep)) {
5801 					mde_perror(ep, "");
5802 					return (205);
5803 				}
5804 				/*
5805 				 * STALE and ACCOK failures aren't true
5806 				 * failures.  STALE means that <50% mddbs
5807 				 * are available. ACCOK means that the
5808 				 * mediator provided the extra vote.
5809 				 * If a true failure, then print messasge
5810 				 * and withdraw node from set in order to
5811 				 * cleanup from failed join attempt.
5812 				 */
5813 				if ((!mdismddberror(ep, MDE_DB_STALE)) &&
5814 				    (!mdismddberror(ep, MDE_DB_ACCOK))) {
5815 					mde_perror(ep,
5816 					    "WARNING: Unable to join node %s "
5817 					    "to set %s", nd->nd_nodename,
5818 					    sp->setname);
5819 					mdclrerror(ep);
5820 					if (clnt_withdrawset(nd->nd_nodename,
5821 					    sp, &xep))
5822 						mdclrerror(&xep);
5823 					nd = nd->nd_next;
5824 					continue;
5825 				}
5826 			}
5827 			/* Set owner flag even if STALE or ACCOK */
5828 			nd->nd_flags |= MD_MN_NODE_OWN;
5829 		}
5830 		nd = nd->nd_next;
5831 	}
5832 	/*
5833 	 * Resume I/Os if suspended above.
5834 	 */
5835 	if (susp_res_flag) {
5836 		nd = sd->sd_nodelist;
5837 		while (nd) {
5838 			/*
5839 			 * Skip non-alive and non-owner nodes
5840 			 * (this list doesn't include any of
5841 			 * the nodes that were joined).
5842 			 */
5843 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5844 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5845 				nd = nd->nd_next;
5846 				continue;
5847 			}
5848 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5849 			    MN_RES_IO, ep)) {
5850 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5851 				    "Unable to resume I/O on node %s"
5852 				    " in set %s"), nd->nd_nodename,
5853 				    sp->setname);
5854 
5855 				/*
5856 				 * If an RPC failure then don't do any
5857 				 * more RPC calls, since one timeout is enough
5858 				 * to endure.  If RPC failure to another node,
5859 				 * return 205.  If RPC failure to my node,
5860 				 * return -1.
5861 				 * (Reconfig return step will resume I/Os
5862 				 * for all sets.)
5863 				 * If not an RPC failure, continue resuming the
5864 				 * rest of the nodes and then return -1.
5865 				 */
5866 				if (mdanyrpcerror(ep)) {
5867 					if (sd->sd_mn_mynode->nd_nodeid ==
5868 					    nd->nd_nodeid) {
5869 						return (-1);
5870 					} else {
5871 						return (205);
5872 					}
5873 				}
5874 
5875 				/*
5876 				 * If not an RPC error, continue resuming rest
5877 				 * of nodes, ignoring any failures except for
5878 				 * an RPC failure which constitutes an
5879 				 * immediate exit.
5880 				 * Start in middle of list with failing node.
5881 				 */
5882 				nd2 = nd->nd_next;
5883 				while (nd2) {
5884 					/* Skip non-owner nodes  */
5885 					if ((!(nd2->nd_flags &
5886 					    MD_MN_NODE_ALIVE)) ||
5887 					    (!(nd2->nd_flags &
5888 					    MD_MN_NODE_OWN))) {
5889 						nd2 = nd2->nd_next;
5890 						continue;
5891 					}
5892 					(void) (clnt_mn_susp_res_io(
5893 					    nd2->nd_nodename, sp->setno,
5894 					    MN_RES_IO, &xep));
5895 					if (mdanyrpcerror(&xep)) {
5896 						return (-1);
5897 					}
5898 					nd2 = nd2->nd_next;
5899 				}
5900 			}
5901 			nd = nd->nd_next;
5902 		}
5903 	}
5904 
5905 	nd = sd->sd_nodelist;
5906 	while (nd) {
5907 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
5908 			nd = nd->nd_next;
5909 			continue;
5910 		}
5911 		/*
5912 		 * If 1 node fails - go ahead and update the rest except
5913 		 * in the case of an RPC failure, fail immediately.
5914 		 */
5915 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5916 		    sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
5917 			/* RPC failure to another node */
5918 			if (mdanyrpcerror(ep)) {
5919 				return (205);
5920 			}
5921 			nd = nd->nd_next;
5922 			rval = -1;
5923 			continue;
5924 		}
5925 		nd = nd->nd_next;
5926 	}
5927 
5928 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5929 	    "Join of all nodes completed for set %s: %s"),
5930 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5931 
5932 	return (rval);
5933 }
5934