xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_set.c (revision d89fccd8788afe1e920f842edd883fe192a1b8fe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 /*
37  * Metadevice diskset interfaces
38  */
39 
40 #include "meta_set_prv.h"
41 #include <meta.h>
42 #include <metad.h>
43 #include <mdmn_changelog.h>
44 #include <sys/lvm/md_crc.h>
45 #include <sys/utsname.h>
46 #include <sdssc.h>
47 
48 #include <sys/sysevent/eventdefs.h>
49 #include <sys/sysevent/svm.h>
50 extern	char	*blkname(char *);
51 
52 static md_drive_desc *
53 dr2drivedesc(
54 	mdsetname_t	*sp,
55 	side_t		sideno,
56 	int		flags,
57 	md_error_t	*ep
58 )
59 {
60 	md_set_record	*sr;
61 	md_drive_record	*dr;
62 	mddrivename_t	*dnp;
63 	md_drive_desc	*dd_head = NULL;
64 	md_set_desc	*sd;
65 
66 	if (flags & MD_BYPASS_DAEMON) {
67 		if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
68 			return (NULL);
69 		sd = metaget_setdesc(sp, ep);
70 		sideno = getnodeside(mynode(), sd);
71 		sp = metafakesetname(sp->setno, sr->sr_setname);
72 	} else {
73 		if ((sr = getsetbyname(sp->setname, ep)) == NULL)
74 			return (NULL);
75 	}
76 
77 	assert(sideno != MD_SIDEWILD);
78 
79 	/*
80 	 * WARNING:
81 	 * The act of getting the dnp from the namespace means that we
82 	 * will get the devid of the disk as recorded in the namespace.
83 	 * This devid has the potential to be stale if the disk is being
84 	 * replaced via a rebind, this means that any code that relies
85 	 * on any of the dnp information should take the appropriate action
86 	 * to preserve that information. For example in the rebind code the
87 	 * devid of the new disk is saved off and then copied back in once
88 	 * the code that has called this function has completed.
89 	 */
90 	for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
91 		if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
92 		    flags, ep)) == NULL) {
93 			if (!(flags & MD_BYPASS_DAEMON))
94 				free_sr(sr);
95 			metafreedrivedesc(&dd_head);
96 			return (NULL);
97 		}
98 
99 		(void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
100 		    dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
101 	}
102 
103 	if (!(flags & MD_BYPASS_DAEMON)) {
104 		free_sr(sr);
105 	}
106 	return (dd_head);
107 }
108 
109 static int
110 get_sidenmlist(
111 	mdsetname_t	*sp,
112 	mddrivename_t	*dnp,
113 	md_error_t	*ep
114 )
115 {
116 	md_set_desc	*sd;
117 	mdsidenames_t	*sn, **sn_next;
118 	int		i;
119 
120 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
121 		return (-1);
122 
123 	metaflushsidenames(dnp);
124 	sn_next = &dnp->side_names;
125 	if (MD_MNSET_DESC(sd)) {
126 		/*
127 		 * Only get sidenames for this node since
128 		 * that is the only side information stored in
129 		 * the local mddb for a multi-node diskset.
130 		 */
131 		if (sd->sd_mn_mynode) {
132 			sn = Zalloc(sizeof (*sn));
133 			sn->sideno = sd->sd_mn_mynode->nd_nodeid;
134 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
135 			    sn->sideno, dnp->side_names_key, &sn->dname,
136 			    &sn->mnum, NULL, ep)) == NULL) {
137 				if (sn->dname != NULL)
138 					Free(sn->dname);
139 				Free(sn);
140 				return (-1);
141 			}
142 
143 			/* Add to the end of the linked list */
144 			assert(*sn_next == NULL);
145 			*sn_next = sn;
146 			sn_next = &sn->next;
147 		}
148 	} else {
149 		for (i = 0; i < MD_MAXSIDES; i++) {
150 			/* Skip empty slots */
151 			if (sd->sd_nodes[i][0] == '\0')
152 				continue;
153 
154 			sn = Zalloc(sizeof (*sn));
155 			sn->sideno = i;
156 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
157 			    i+SKEW, dnp->side_names_key, &sn->dname,
158 			    &sn->mnum, NULL, ep)) == NULL) {
159 				/*
160 				 * It is possible that during the add of a
161 				 * host to have a 'missing' side as the side
162 				 * for this disk will be added later. So ignore
163 				 * the error. The 'missing' side will be added
164 				 * once the addhosts process has completed.
165 				 */
166 				if (mdissyserror(ep, ENOENT)) {
167 					mdclrerror(ep);
168 					Free(sn);
169 					continue;
170 				}
171 
172 				if (sn->dname != NULL)
173 					Free(sn->dname);
174 				Free(sn);
175 				return (-1);
176 			}
177 
178 			/* Add to the end of the linked list */
179 			assert(*sn_next == NULL);
180 			*sn_next = sn;
181 			sn_next = &sn->next;
182 		}
183 	}
184 
185 	return (0);
186 }
187 
188 static md_drive_desc *
189 rl_to_dd(
190 	mdsetname_t		*sp,
191 	md_replicalist_t	*rlp,
192 	md_error_t		*ep
193 )
194 {
195 	md_replicalist_t	*rl;
196 	md_replica_t		*r;
197 	md_drive_desc		*dd = NULL;
198 	md_drive_desc		*d;
199 	int			found;
200 	md_set_desc		*sd;
201 	daddr_t			nblks = 0;
202 
203 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
204 		return (NULL);
205 
206 	/* find the smallest existing replica */
207 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
208 		r = rl->rl_repp;
209 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
210 	}
211 
212 	if (nblks <= 0)
213 		nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
214 
215 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
216 		r = rl->rl_repp;
217 
218 		found = 0;
219 		for (d = dd; d != NULL; d = d->dd_next) {
220 			if (strcmp(r->r_namep->drivenamep->cname,
221 			    d->dd_dnp->cname) == 0) {
222 				found = 1;
223 				dd->dd_dbcnt++;
224 				break;
225 			}
226 		}
227 
228 		if (! found)
229 			(void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
230 			    1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
231 	}
232 
233 	return (dd);
234 }
235 
236 /*
237  * Exported Entry Points
238  */
239 
240 set_t
241 get_max_sets(md_error_t *ep)
242 {
243 
244 	static set_t		max_sets = 0;
245 
246 	if (max_sets == 0)
247 		if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
248 			return (0);
249 
250 	return (max_sets);
251 }
252 
253 int
254 get_max_meds(md_error_t *ep)
255 {
256 	static int		max_meds = 0;
257 
258 	if (max_meds == 0)
259 		if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
260 			return (0);
261 
262 	return (max_meds);
263 }
264 
265 side_t
266 getmyside(mdsetname_t *sp, md_error_t *ep)
267 {
268 	md_set_desc		*sd;
269 	char 			*node = NULL;
270 	side_t			sideno;
271 
272 	if (sp->setno == 0)
273 		return (0);
274 
275 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
276 		return (MD_SIDEWILD);
277 
278 	node = mynode();
279 
280 	assert(node != NULL);
281 
282 	sideno = getnodeside(node, sd);
283 
284 	if (sideno != MD_SIDEWILD)
285 		return (sideno);
286 
287 	return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
288 }
289 
290 /*
291  * get set info from name
292  */
293 md_set_record *
294 getsetbyname(char *setname, md_error_t *ep)
295 {
296 	md_set_record		*sr = NULL;
297 	md_mnset_record		*mnsr = NULL;
298 	char			*p;
299 	size_t			len;
300 
301 	/* get set info from daemon */
302 	if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
303 		return (NULL);
304 	if (sr != NULL) {
305 		/*
306 		 * Returned record could be for a multi-node set or a
307 		 * non-multi-node set.
308 		 */
309 		if (MD_MNSET_REC(sr)) {
310 			/*
311 			 * Record is for a multi-node set.  Reissue call
312 			 * to get mnset information.  Need to free
313 			 * record as if a non-multi-node set record since
314 			 * that is what clnt_getset gave us.  If in
315 			 * the daemon, don't free since this is a pointer
316 			 * into the setrecords array.
317 			 */
318 			if (! md_in_daemon) {
319 				sr->sr_flags &= ~MD_SR_MN;
320 				free_sr(sr);
321 			}
322 			if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
323 			    ep) == -1)
324 				return (NULL);
325 			if (mnsr != NULL)
326 				return ((struct md_set_record *)mnsr);
327 		} else {
328 			return (sr);
329 		}
330 	}
331 
332 	/* no such set */
333 	len = strlen(setname) + 30;
334 	p = Malloc(len);
335 	(void) snprintf(p, len, "setname \"%s\"", setname);
336 	(void) mderror(ep, MDE_NO_SET, p);
337 	Free(p);
338 	return (NULL);
339 }
340 
341 /*
342  * get set info from number
343  */
344 md_set_record *
345 getsetbynum(set_t setno, md_error_t *ep)
346 {
347 	md_set_record		*sr;
348 	md_mnset_record		*mnsr = NULL;
349 	char			buf[100];
350 
351 	if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
352 		return (NULL);
353 
354 	if (sr != NULL) {
355 		/*
356 		 * Record is for a multi-node set.  Reissue call
357 		 * to get mnset information.  Need to free
358 		 * record as if a non-multi-node set record since
359 		 * that is what clnt_getset gave us.  If in
360 		 * the daemon, don't free since this is a pointer
361 		 * into the setrecords array.
362 		 */
363 		if (MD_MNSET_REC(sr)) {
364 			/*
365 			 * Record is for a multi-node set.  Reissue call
366 			 * to get mnset information.
367 			 */
368 			if (! md_in_daemon) {
369 				sr->sr_flags &= ~MD_SR_MN;
370 				free_sr(sr);
371 			}
372 			if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
373 			    ep) == -1)
374 				return (NULL);
375 			if (mnsr != NULL)
376 				return ((struct md_set_record *)mnsr);
377 		} else {
378 			return (sr);
379 		}
380 	}
381 
382 	(void) sprintf(buf, "setno %u", setno);
383 	(void) mderror(ep, MDE_NO_SET, buf);
384 	return (NULL);
385 }
386 
387 int
388 meta_check_drive_inuse(
389 	mdsetname_t	*sp,
390 	mddrivename_t	*dnp,
391 	int		check_db,
392 	md_error_t	*ep
393 )
394 {
395 	mdnamelist_t	*nlp = NULL;
396 	mdnamelist_t	*p;
397 	int		rval = 0;
398 
399 	/* get all underlying partitions */
400 	if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
401 		return (-1);
402 
403 	/* search for drive */
404 	for (p = nlp; (p != NULL); p = p->next) {
405 		mdname_t	*np = p->namep;
406 
407 		if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
408 			rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
409 			    NULL, dnp->cname, sp->setname));
410 			break;
411 		}
412 	}
413 
414 	/* cleanup, return success */
415 	metafreenamelist(nlp);
416 	return (rval);
417 }
418 
419 /*
420  * simple check for ownership
421  */
422 int
423 meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
424 {
425 	int			ownset;
426 	md_set_desc		*sd;
427 	md_drive_desc		*dd;
428 	md_replicalist_t	*rlp = NULL;
429 	md_error_t		xep = mdnullerror;
430 
431 	if (metaislocalset(sp))
432 		return (0);
433 
434 	ownset = own_set(sp, NULL, TRUE, ep);
435 	if (! mdisok(ep))
436 		return (-1);
437 
438 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
439 		return (-1);
440 
441 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
442 	if (! mdisok(ep))
443 		return (-1);
444 
445 	/* If we have no drive descriptors, check for no ownership */
446 	if (dd == NULL) {
447 		if (ownset == MD_SETOWNER_NONE)
448 			return (0);
449 
450 		/* If ownership somehow has come to exist, we must clean up */
451 
452 		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
453 		    &xep) < 0)
454 			mdclrerror(&xep);
455 
456 		if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
457 			if (! mdisok(&xep))
458 				mdclrerror(&xep);
459 
460 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
461 			if (rel_own_bydd(sp, dd, TRUE, &xep))
462 				mdclrerror(&xep);
463 		}
464 
465 		if (halt_set(sp, &xep))
466 			mdclrerror(&xep);
467 
468 		metafreereplicalist(rlp);
469 
470 		metafreedrivedesc(&dd);
471 
472 		return (0);
473 	}
474 
475 	metafreedrivedesc(&sd->sd_drvs);
476 
477 	if (ownset == MD_SETOWNER_YES)
478 		return (0);
479 
480 	return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
481 	    sp->setname));
482 }
483 
484 /*
485  * simple check for ownership
486  */
487 int
488 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
489 {
490 	md_set_desc	*sd;
491 	md_drive_desc	*dd;
492 	int		bool;
493 
494 	if (metaislocalset(sp))
495 		return (0);
496 
497 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
498 		return (-1);
499 
500 	if (getnodeside(hostname, sd) == MD_SIDEWILD)
501 		return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
502 		    hostname, NULL, sp->setname));
503 
504 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
505 	if (! mdisok(ep))
506 		return (-1);
507 
508 	if (clnt_ownset(hostname, sp, &bool, ep) == -1)
509 		return (-1);
510 
511 	if (dd == NULL)
512 		return (0);
513 
514 	metafreedrivedesc(&sd->sd_drvs);
515 
516 	if (bool == TRUE)
517 		return (0);
518 
519 	return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
520 	    sp->setname));
521 }
522 
523 /*
524  * Function that determines if a node is in the multinode diskset
525  * membership list.  Calling node passes in node to be checked and
526  * the nodelist as returned from meta_read_nodelist.  This routine
527  * anticipates being called many times using the same diskset membership
528  * list which is why the alloc and free of the diskset membership list
529  * is left to the calling routine.
530  * Returns:
531  *	1 - if a member
532  *	0 - not a member
533  */
534 int
535 meta_is_member(
536 	char				*node_name,
537 	md_mn_nodeid_t			node_id,
538 	mndiskset_membershiplist_t	*nl
539 )
540 {
541 	mndiskset_membershiplist_t	*nl2;
542 	int				flag_check_name;
543 
544 	if (node_id != 0)
545 		flag_check_name = 0;
546 	else if (node_name != NULL)
547 		flag_check_name = 1;
548 	else
549 		return (0);
550 
551 	nl2 = nl;
552 	while (nl2) {
553 		if (flag_check_name) {
554 			/* Compare given name against name in member list */
555 			if (strcmp(nl2->msl_node_name, node_name) == 0)
556 				break;
557 		} else {
558 			/* Compare given nodeid against nodeid in member list */
559 			if (nl2->msl_node_id == node_id)
560 				break;
561 		}
562 		nl2 = nl2->next;
563 	}
564 	/* No match found in member list */
565 	if (nl2 == NULL) {
566 		return (0);
567 	}
568 	/* Return 1 if node is in member list */
569 	return (1);
570 }
571 
572 /*
573  * meta_getnext_devinfo should go to the host that
574  * has the device, to return the device name, driver name, minor num.
575  * We can take the big cheat for now, since it is a requirement
576  * that the device names and device numbers are the same, and
577  * just get the info locally.
578  *
579  * This routine is very similar to meta_getnextside_devinfo except
580  * that the specific side to be used is being passed in.
581  *
582  * Exit status:
583  *	 0 - No more side info to return
584  *	 1 - More side info's to return
585  *	-1 - An error has been detected
586  */
587 /*ARGSUSED*/
588 int
589 meta_getside_devinfo(
590 	mdsetname_t	*sp,		/* for this set */
591 	char		*bname,		/* local block name (myside) */
592 	side_t		sideno,		/* sideno */
593 	char		**ret_bname,	/* block device name of returned side */
594 	char		**ret_dname,	/* driver name of returned side */
595 	minor_t		*ret_mnum,	/* minor number of returned side */
596 	md_error_t	*ep
597 )
598 {
599 	mdname_t	*np;
600 
601 	if (ret_bname != NULL)
602 		*ret_bname = NULL;
603 	if (ret_dname != NULL)
604 		*ret_dname = NULL;
605 	if (ret_mnum != NULL)
606 		*ret_mnum = NODEV32;
607 
608 
609 	if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL)
610 		return (-1);
611 
612 /*
613  * NOTE (future) - There will be more work here once devids are integrated
614  * into disksets.  Then the side should be used to find the correct
615  * host and the b/d names should be gotten from that host.
616  */
617 
618 	/*
619 	 * Return the side info.
620 	 */
621 	if (ret_bname != NULL)
622 		*ret_bname = Strdup(np->bname);
623 
624 	if (ret_dname != NULL) {
625 		mdcinfo_t	*cinfo;
626 
627 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
628 			return (-1);
629 
630 		*ret_dname = Strdup(cinfo->dname);
631 	}
632 
633 	if (ret_mnum != NULL)
634 		*ret_mnum = meta_getminor(np->dev);
635 
636 	return (1);
637 }
638 
639 /*
640  * Get the information on the device from the remote node using the devid
641  * of the disk.
642  *
643  * Exit status:
644  *	 0 - No more side info to return
645  *	 1 - More side info's to return
646  *	-1 - An error has been detected
647  */
648 int
649 meta_getnextside_devinfo(
650 	mdsetname_t	*sp,		/* for this set */
651 	char		*bname,		/* local block name (myside) */
652 	side_t		*sideno,	/* previous sideno & returned sideno */
653 	char		**ret_bname,	/* block device name of returned side */
654 	char		**ret_dname,	/* driver name of returned side */
655 	minor_t		*ret_mnum,	/* minor number of returned side */
656 	md_error_t	*ep
657 )
658 {
659 	md_set_desc	*sd;
660 	int		i;
661 	mdname_t	*np;
662 	mddrivename_t	*dnp;
663 	char		*devidstr = NULL;
664 	int		devidstrlen;
665 	md_dev64_t	retdev = NODEV64;
666 	char		*ret_devname = NULL;
667 	char		*ret_blkdevname = NULL;
668 	char		*ret_driver = NULL;
669 	char		*nodename;
670 	int		fd;
671 	int		ret = -1;
672 	char		*minor_name = NULL;
673 	md_mnnode_desc	*nd;
674 
675 
676 	if (ret_bname != NULL)
677 		*ret_bname = NULL;
678 	if (ret_dname != NULL)
679 		*ret_dname = NULL;
680 	if (ret_mnum != NULL)
681 		*ret_mnum = NODEV32;
682 
683 	if (metaislocalset(sp)) {
684 		/* no more sides - we are done */
685 		if (*sideno != MD_SIDEWILD)
686 			return (0);
687 
688 		/* First time through -  set up return sideno */
689 		*sideno = 0;
690 	} else {
691 
692 		/*
693 		 * Find the next sideno, starting after the one given.
694 		 */
695 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
696 			return (-1);
697 
698 		if (MD_MNSET_DESC(sd)) {
699 			nd = sd->sd_nodelist;
700 			if ((*sideno == MD_SIDEWILD) &&
701 			    (nd != (struct md_mnnode_desc *)NULL)) {
702 				*sideno = nd->nd_nodeid;
703 			} else {
704 				while (nd) {
705 					/*
706 					 * Found given sideno, now find
707 					 * next sideno, if there is one.
708 					 */
709 					if ((*sideno == nd->nd_nodeid) &&
710 					    (nd->nd_next !=
711 					    (struct md_mnnode_desc *)NULL)) {
712 						*sideno =
713 						    nd->nd_next->nd_nodeid;
714 						break;
715 					}
716 					nd = nd->nd_next;
717 				}
718 				if (nd == NULL) {
719 					return (0);
720 				}
721 			}
722 			if (*sideno == MD_SIDEWILD)
723 				return (0);
724 		} else {
725 			for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
726 				/* Find next full slot */
727 				if (sd->sd_nodes[i][0] != '\0')
728 					break;
729 
730 			/* No more sides - we are done */
731 			if (i == MD_MAXSIDES)
732 				return (0);
733 
734 			/* Set up the return sideno */
735 			*sideno = i;
736 			nodename = (char *)sd->sd_nodes[i];
737 		}
738 	}
739 
740 	/*
741 	 * Need to pass the node the devid of the disk and get it to
742 	 * send back the details of the disk from that side.
743 	 */
744 	if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL)
745 		return (-1);
746 
747 	dnp = np->drivenamep;
748 
749 	/*
750 	 * By default, set up the parameters so that they are copied out.
751 	 */
752 	if (ret_bname != NULL)
753 		*ret_bname = Strdup(np->bname);
754 
755 	if (ret_dname != NULL) {
756 		mdcinfo_t	*cinfo;
757 
758 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
759 			return (-1);
760 
761 		*ret_dname = Strdup(cinfo->dname);
762 	}
763 
764 	if (ret_mnum != NULL)
765 		*ret_mnum = meta_getminor(np->dev);
766 
767 	/*
768 	 * Try some optimization. If this is the local set or the device
769 	 * is a metadevice then just copy the information. If the device
770 	 * does not have a devid (due to not having a minor name) then
771 	 * fall back to the pre-devid behaviour of copying the information
772 	 * on the device: this is okay because the sanity checks before this
773 	 * call would have found any issues with the device. If it's a
774 	 * multi-node diskset also just return ie. copy.
775 	 */
776 	if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
777 	    (MD_MNSET_DESC(sd)))
778 		return (1);
779 
780 	if (np->minor_name == (char *)NULL) {
781 		/*
782 		 * Have to get the minor name then. The slice should exist
783 		 * on the disk because it will have already been repartitioned
784 		 * up prior to getting to this point.
785 		 */
786 		if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
787 			(void) mdsyserror(ep, errno, np->bname);
788 			return (-1);
789 		}
790 		(void) devid_get_minor_name(fd, &minor_name);
791 		np->minor_name = Strdup(minor_name);
792 		devid_str_free(minor_name);
793 		(void) close(fd);
794 	}
795 
796 	/* allocate extra space for "/" and NULL hence +2 */
797 	devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
798 	devidstr = (char *)Malloc(devidstrlen);
799 
800 	/*
801 	 * As a minor name is supplied then the ret_devname will be
802 	 * appropriate to that minor_name and in this case it will be
803 	 * a block device ie /dev/dsk.
804 	 */
805 	(void) snprintf(devidstr, devidstrlen,
806 		"%s/%s", dnp->devid, np->minor_name);
807 
808 	ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
809 	    np->bname, &ret_devname, &ret_driver, ep);
810 
811 	Free(devidstr);
812 
813 	/*
814 	 * If the other side is not running device id in disksets,
815 	 * 'ret' is set to ENOTSUP in which case we fallback to
816 	 * the existing behaviour
817 	 */
818 	if (ret == ENOTSUP)
819 		return (1);
820 	else if (ret == -1)
821 		return (-1);
822 
823 	/*
824 	 * ret_devname comes from the rpc call and is a
825 	 * raw device name. We need to make this into a
826 	 * block device via blkname for further processing.
827 	 * Unfortunately, when our device id isn't found in
828 	 * the system, the rpc call will return a " " in
829 	 * ret_devname in which case we need to fill that in
830 	 * as ret_blkname because blkname of " " returns NULL.
831 	 */
832 	if (ret_bname != NULL && ret_devname != NULL) {
833 		ret_blkdevname = blkname(ret_devname);
834 		if (ret_blkdevname == NULL)
835 			*ret_bname = Strdup(ret_devname);
836 		else
837 			*ret_bname = Strdup(ret_blkdevname);
838 	}
839 
840 	if (ret_dname != NULL && ret_driver != NULL)
841 		*ret_dname = Strdup(ret_driver);
842 
843 	if (ret_mnum != NULL)
844 		*ret_mnum = meta_getminor(retdev);
845 
846 	return (1);
847 }
848 
849 int
850 meta_is_drive_in_anyset(
851 	mddrivename_t	*dnp,
852 	mdsetname_t	**spp,
853 	int		bypass_daemon,
854 	md_error_t 	*ep
855 )
856 {
857 	set_t		setno;
858 	mdsetname_t	*this_sp;
859 	int		is_it;
860 	set_t		max_sets;
861 
862 	if ((max_sets = get_max_sets(ep)) == 0)
863 		return (-1);
864 
865 	assert(spp != NULL);
866 	*spp = NULL;
867 
868 	for (setno = 1; setno < max_sets; setno++) {
869 		if (!bypass_daemon) {
870 			if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
871 				if (mdismddberror(ep, MDE_DB_NODB)) {
872 					mdclrerror(ep);
873 					return (0);
874 				}
875 				if (mdiserror(ep, MDE_NO_SET)) {
876 					mdclrerror(ep);
877 					continue;
878 				}
879 				return (-1);
880 			}
881 		} else
882 			this_sp = metafakesetname(setno, NULL);
883 
884 		if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
885 		    bypass_daemon, ep)) == -1) {
886 			if (mdiserror(ep, MDE_NO_SET)) {
887 				mdclrerror(ep);
888 				continue;
889 			}
890 			return (-1);
891 		}
892 		if (is_it) {
893 			*spp = this_sp;
894 			return (0);
895 		}
896 	}
897 	return (0);
898 }
899 
900 int
901 meta_is_drive_in_thisset(
902 	mdsetname_t	*sp,
903 	mddrivename_t	*dnp,
904 	int		bypass_daemon,
905 	md_error_t	*ep
906 )
907 {
908 	md_drive_desc	*dd, *p;
909 
910 	if (bypass_daemon)
911 		dd = dr2drivedesc(sp, MD_SIDEWILD,
912 		    (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
913 	else
914 		dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
915 
916 	if (dd == NULL) {
917 		if (! mdisok(ep))
918 			return (-1);
919 		return (0);
920 	}
921 
922 
923 	for (p = dd; p != NULL; p = p->dd_next)
924 		if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
925 			return (1);
926 	return (0);
927 }
928 
929 int
930 meta_set_balance(
931 	mdsetname_t		*sp,
932 	md_error_t		*ep
933 )
934 {
935 	md_set_desc		*sd;
936 	md_drive_desc		*dd, *curdd;
937 	daddr_t			dbsize;
938 	daddr_t			nblks;
939 	int			i;
940 	int			rval = 0;
941 	sigset_t		oldsigs;
942 	md_setkey_t		*cl_sk;
943 	md_error_t		xep = mdnullerror;
944 	md_mnnode_desc		*nd;
945 	int			suspend1_flag = 0;
946 
947 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
948 		return (-1);
949 
950 	dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
951 
952 	/* Make sure we own the set */
953 	if (meta_check_ownership(sp, ep) != 0)
954 		return (-1);
955 
956 	/* END CHECK CODE */
957 
958 	/*
959 	 * Get drive descriptors for the drives that are currently in the set.
960 	 */
961 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
962 
963 	if (! mdisok(ep))
964 		return (-1);
965 
966 	/* Find the minimum replica size in use is or use the default */
967 	if ((nblks = meta_db_minreplica(sp, ep)) < 0)
968 		mdclrerror(ep);
969 	else
970 		dbsize = nblks;	/* adjust replica size */
971 
972 	/* Make sure we are blocking all signals */
973 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
974 		mdclrerror(&xep);
975 
976 	/*
977 	 * Lock the set on current set members.
978 	 * For MN diskset lock_set and SUSPEND are used to protect against
979 	 * other meta* commands running on the other nodes.
980 	 */
981 	if (MD_MNSET_DESC(sd)) {
982 		nd = sd->sd_nodelist;
983 		while (nd) {
984 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
985 				nd = nd->nd_next;
986 				continue;
987 			}
988 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
989 				rval = -1;
990 				goto out;
991 			}
992 			nd = nd->nd_next;
993 		}
994 		/*
995 		 * Lock out other meta* commands by suspending
996 		 * class 1 messages across the diskset.
997 		 */
998 		nd = sd->sd_nodelist;
999 		while (nd) {
1000 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1001 				nd = nd->nd_next;
1002 				continue;
1003 			}
1004 			if (clnt_mdcommdctl(nd->nd_nodename,
1005 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1006 			    MD_MSCF_NO_FLAGS, ep)) {
1007 				rval = -1;
1008 				goto out;
1009 			}
1010 			suspend1_flag = 1;
1011 			nd = nd->nd_next;
1012 		}
1013 	} else {
1014 		for (i = 0; i < MD_MAXSIDES; i++) {
1015 			/* Skip empty slots */
1016 			if (sd->sd_nodes[i][0] == '\0') continue;
1017 
1018 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1019 				rval = -1;
1020 				goto out;
1021 			}
1022 		}
1023 	}
1024 
1025 	/* We are not adding or deleting any drives, just balancing */
1026 	dd = NULL;
1027 
1028 	/*
1029 	 * Balance the DB's according to the list of existing drives and the
1030 	 * list of added drives.
1031 	 */
1032 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
1033 		goto out;
1034 
1035 out:
1036 	/*
1037 	 * Unlock diskset by resuming class 1 messages across the diskset.
1038 	 * Just resume all classes so that resume is the same whether
1039 	 * just one class was locked or all classes were locked.
1040 	 */
1041 	if (suspend1_flag) {
1042 		nd = sd->sd_nodelist;
1043 		while (nd) {
1044 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1045 				nd = nd->nd_next;
1046 				continue;
1047 			}
1048 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1049 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1050 				/*
1051 				 * We are here because we failed to resume
1052 				 * rpc.mdcommd.  However we potentially have
1053 				 * an error from the previous call
1054 				 * (meta_db_balance). If the previous call
1055 				 * did fail,  we capture that error and
1056 				 * generate a perror withthe string,
1057 				 * "Unable to resume...".
1058 				 * Setting rval to -1 ensures that in the
1059 				 * next iteration of the loop, ep is not
1060 				 * clobbered.
1061 				 */
1062 				if (rval == 0)
1063 					(void) mdstealerror(ep, &xep);
1064 				else
1065 					mdclrerror(&xep);
1066 				rval = -1;
1067 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1068 				    "Unable to resume rpc.mdcommd."));
1069 			}
1070 			nd = nd->nd_next;
1071 		}
1072 	}
1073 
1074 	/* Unlock the set */
1075 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1076 	if (MD_MNSET_DESC(sd)) {
1077 		nd = sd->sd_nodelist;
1078 		while (nd) {
1079 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1080 				nd = nd->nd_next;
1081 				continue;
1082 			}
1083 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1084 				if (rval == 0)
1085 					(void) mdstealerror(ep, &xep);
1086 				else
1087 					mdclrerror(&xep);
1088 				rval = -1;
1089 			}
1090 			nd = nd->nd_next;
1091 		}
1092 	} else {
1093 		for (i = 0; i < MD_MAXSIDES; i++) {
1094 			/* Skip empty slots */
1095 			if (sd->sd_nodes[i][0] == '\0')
1096 				continue;
1097 
1098 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1099 				if (rval == 0)
1100 					(void) mdstealerror(ep, &xep);
1101 				rval = -1;
1102 			}
1103 		}
1104 	}
1105 
1106 	/* release signals back to what they were on entry */
1107 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1108 		mdclrerror(&xep);
1109 
1110 	cl_set_setkey(NULL);
1111 
1112 	metaflushsetname(sp);
1113 
1114 	return (rval);
1115 }
1116 
1117 int
1118 meta_set_destroy(
1119 	mdsetname_t	*sp,
1120 	int		lock_set,
1121 	md_error_t	*ep
1122 )
1123 {
1124 	int		i;
1125 	med_rec_t	medr;
1126 	md_set_desc	*sd;
1127 	md_drive_desc	*dd, *p, *p1;
1128 	mddrivename_t	*dnp;
1129 	mdname_t	*np;
1130 	mdnamelist_t	*nlp = NULL;
1131 	int		num_users = 0;
1132 	int		has_set;
1133 	side_t		mysideno;
1134 	sigset_t	oldsigs;
1135 	md_error_t	xep = mdnullerror;
1136 	md_setkey_t	*cl_sk;
1137 	int		rval = 0;
1138 	int		delete_end = 1;
1139 
1140 	/* Make sure we are blocking all signals */
1141 	if (procsigs(TRUE, &oldsigs, ep) < 0)
1142 		return (-1);
1143 
1144 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1145 		if (! mdisok(ep))
1146 			rval = -1;
1147 		goto out;
1148 	}
1149 
1150 	/*
1151 	 * meta_set_destroy should not be called for a MN diskset.
1152 	 * This routine destroys a set without communicating this information
1153 	 * to the other nodes which would lead to an inconsistency in
1154 	 * the MN diskset.
1155 	 */
1156 	if (MD_MNSET_DESC(sd)) {
1157 		rval = -1;
1158 		goto out;
1159 	}
1160 
1161 	/* Continue if a traditional diskset */
1162 
1163 	/*
1164 	 * Check to see who has the set.  If we are not the last user of the
1165 	 * set, we will not touch the replicas.
1166 	 */
1167 	for (i = 0; i < MD_MAXSIDES; i++) {
1168 		/* Skip empty slots */
1169 		if (sd->sd_nodes[i][0] == '\0')
1170 			continue;
1171 
1172 		has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
1173 		    ep);
1174 
1175 		if (has_set < 0) {
1176 			mdclrerror(ep);
1177 		} else
1178 			num_users++;
1179 	}
1180 
1181 	if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
1182 		if (! mdisok(ep)) {
1183 			rval = -1;
1184 			goto out;
1185 		}
1186 	}
1187 
1188 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
1189 		rval = -1;
1190 		goto out;
1191 	}
1192 
1193 	if (lock_set == TRUE) {
1194 		/* Lock the set on our side */
1195 		if (clnt_lock_set(mynode(), sp, ep)) {
1196 			rval = -1;
1197 			goto out;
1198 		}
1199 	}
1200 
1201 	/*
1202 	 * A traditional diskset has no diskset stale information to send
1203 	 * since there can only be one owner node at a time.
1204 	 */
1205 	if (snarf_set(sp, FALSE, ep))
1206 		mdclrerror(ep);
1207 
1208 	if (dd != NULL) {
1209 		/*
1210 		 * Make sure that no drives are in use as parts of metadrives
1211 		 * or hot spare pools, this is one of the few error conditions
1212 		 * that will stop this routine, unless the environment has
1213 		 * META_DESTROY_SET_OK set, in which case, the operation will
1214 		 * proceed.
1215 		 */
1216 		if (getenv("META_DESTROY_SET_OK") == NULL) {
1217 			for (p = dd; p != NULL; p = p->dd_next) {
1218 				dnp = p->dd_dnp;
1219 
1220 				i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
1221 				if (i == -1) {
1222 					/* need xep - wire calls clear error */
1223 					i = metaget_setownership(sp, &xep);
1224 					if (i == -1) {
1225 						rval = -1;
1226 						goto out;
1227 					}
1228 
1229 					mysideno = getmyside(sp, &xep);
1230 
1231 					if (mysideno == MD_SIDEWILD) {
1232 						rval = -1;
1233 						goto out;
1234 					}
1235 
1236 					if (sd->sd_isown[mysideno] == FALSE)
1237 						if (halt_set(sp, &xep)) {
1238 							rval = -1;
1239 							goto out;
1240 						}
1241 
1242 					rval = -1;
1243 					goto out;
1244 				}
1245 			}
1246 		}
1247 
1248 		for (i = 0; i < MD_MAXSIDES; i++) {
1249 			/* Skip empty slots */
1250 			if (sd->sd_nodes[i][0] == '\0')
1251 				continue;
1252 
1253 			/* Skip non local nodes */
1254 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1255 				continue;
1256 
1257 			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
1258 				mdclrerror(ep);
1259 		}
1260 
1261 		/*
1262 		 * Go thru each drive and individually delete the replicas.
1263 		 * This way we can ignore individual errors.
1264 		 */
1265 		for (p = dd; p != NULL; p = p->dd_next) {
1266 			uint_t	rep_slice;
1267 
1268 			dnp = p->dd_dnp;
1269 			if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
1270 			    (((np = metaslicename(dnp, rep_slice, ep))
1271 				== NULL) &&
1272 				((np = metaslicename(dnp, MD_SLICE0, ep))
1273 				    == NULL))) {
1274 				rval = -1;
1275 				goto out;
1276 			}
1277 
1278 			if ((np = metaslicename(dnp,
1279 			    rep_slice, ep)) == NULL) {
1280 				if ((np = metaslicename(dnp,
1281 				    MD_SLICE0, ep)) == NULL) {
1282 					rval = -1;
1283 					goto out;
1284 				}
1285 				mdclrerror(ep);
1286 			}
1287 
1288 			/* Yes this is UGLY!!! */
1289 			p1 = p->dd_next;
1290 			p->dd_next = NULL;
1291 			if (rel_own_bydd(sp, p, FALSE, ep))
1292 				mdclrerror(ep);
1293 			p->dd_next = p1;
1294 
1295 			if (p->dd_dbcnt == 0)
1296 				continue;
1297 
1298 			/*
1299 			 * Skip the replica removal if we are not the last user
1300 			 */
1301 			if (num_users != 1)
1302 				continue;
1303 
1304 			nlp = NULL;
1305 			(void) metanamelist_append(&nlp, np);
1306 			if (meta_db_detach(sp, nlp,
1307 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
1308 				mdclrerror(ep);
1309 			metafreenamelist(nlp);
1310 		}
1311 	}
1312 
1313 	if (halt_set(sp, ep)) {
1314 		rval = -1;
1315 		goto out;
1316 	}
1317 
1318 	/* Setup the mediator record */
1319 	(void) memset(&medr, '\0', sizeof (med_rec_t));
1320 	medr.med_rec_mag = MED_REC_MAGIC;
1321 	medr.med_rec_rev = MED_REC_REV;
1322 	medr.med_rec_fl  = 0;
1323 	medr.med_rec_sn  = sp->setno;
1324 	(void) strcpy(medr.med_rec_snm, sp->setname);
1325 	medr.med_rec_meds = sd->sd_med;	/* structure assigment */
1326 	(void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
1327 	medr.med_rec_foff = 0;
1328 
1329 	/*
1330 	 * If we are the last remaining user, then remove the mediator hosts
1331 	 */
1332 	if (num_users == 1) {
1333 		for (i = 0; i < MED_MAX_HOSTS; i++) {
1334 			if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
1335 				SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
1336 				    SVM_TAG_MEDIATOR, sp->setno, i);
1337 			(void) memset(&medr.med_rec_meds.n_lst[i], '\0',
1338 			    sizeof (md_h_t));
1339 		}
1340 		medr.med_rec_meds.n_cnt = 0;
1341 	} else { 	/* Remove this host from the mediator node list. */
1342 		for (i = 0; i < MD_MAXSIDES; i++) {
1343 			/* Skip empty slots */
1344 			if (sd->sd_nodes[i][0] == '\0')
1345 				continue;
1346 
1347 			/* Copy non local node */
1348 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
1349 				(void) strcpy(medr.med_rec_nodes[i],
1350 				    sd->sd_nodes[i]);
1351 				continue;
1352 			}
1353 
1354 			/* Clear local node */
1355 			(void) memset(&medr.med_rec_nodes[i], '\0',
1356 			    sizeof (md_node_nm_t));
1357 		}
1358 	}
1359 
1360 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
1361 
1362 	/*
1363 	 * If the client is part of a cluster put the DCS service
1364 	 * into a deleteing state.
1365 	 */
1366 	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1367 		if (metad_isautotakebyname(sp->setname)) {
1368 			delete_end = 0;
1369 		} else {
1370 			mdclrerror(ep);
1371 			goto out;
1372 		}
1373 	}
1374 
1375 	/* Inform the mediator hosts of the new information */
1376 	for (i = 0; i < MED_MAX_HOSTS; i++) {
1377 		if (sd->sd_med.n_lst[i].a_cnt == 0)
1378 			continue;
1379 
1380 		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
1381 			mdclrerror(ep);
1382 	}
1383 
1384 	/* Delete the set locally */
1385 	for (i = 0; i < MD_MAXSIDES; i++) {
1386 		/* Skip empty slots */
1387 		if (sd->sd_nodes[i][0] == '\0')
1388 			continue;
1389 
1390 		/* Skip non local nodes */
1391 		if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1392 			continue;
1393 
1394 		if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
1395 			mdclrerror(ep);
1396 	}
1397 	if (delete_end &&
1398 	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1399 		rval = -1;
1400 
1401 out:
1402 	/* release signals back to what they were on entry */
1403 	if (procsigs(FALSE, &oldsigs, &xep) < 0) {
1404 		if (rval == 0)
1405 			(void) mdstealerror(ep, &xep);
1406 		rval = -1;
1407 	}
1408 
1409 	if (lock_set == TRUE) {
1410 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1411 		if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1412 			if (rval == 0)
1413 				(void) mdstealerror(ep, &xep);
1414 			rval = -1;
1415 		}
1416 		cl_set_setkey(NULL);
1417 	}
1418 
1419 	metaflushsetname(sp);
1420 	return (rval);
1421 }
1422 
1423 int
1424 meta_set_purge(
1425 	mdsetname_t	*sp,
1426 	int		bypass_cluster,
1427 	int		forceflg,
1428 	md_error_t	*ep
1429 )
1430 {
1431 	char		*thishost = mynode();
1432 	md_set_desc	*sd;
1433 	md_setkey_t	*cl_sk;
1434 	md_error_t	xep = mdnullerror;
1435 	int		rval = 0;
1436 	int		i, num_hosts = 0;
1437 	int		has_set = 0;
1438 	int		max_node = 0;
1439 	int		delete_end = 1;
1440 	md_mnnode_desc	*nd;
1441 
1442 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1443 		/* unable to find set description */
1444 		rval = 1;
1445 		return (rval);
1446 	}
1447 
1448 	if (MD_MNSET_DESC(sd)) {
1449 		/*
1450 		 * Get a count of the hosts in the set and also lock the set
1451 		 * on those hosts that know about it.
1452 		 */
1453 		nd = sd->sd_nodelist;
1454 		while (nd) {
1455 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1456 				nd = nd->nd_next;
1457 				continue;
1458 			}
1459 			has_set = nodehasset(sp, nd->nd_nodename,
1460 				NHS_NST_EQ, ep);
1461 
1462 			/*
1463 			 * The host is not aware of this set (has_set < 0) or
1464 			 * the set does not match (has_set == 0). This check
1465 			 * prevents the code getting confused by an apparent
1466 			 * inconsistancy in the set's state, this is in the
1467 			 * purge code so something is broken in any case and
1468 			 * this is just trying to fix the brokeness.
1469 			 */
1470 			if (has_set <= 0) {
1471 				mdclrerror(ep);
1472 				nd->nd_flags |= MD_MN_NODE_NOSET;
1473 			} else {
1474 				num_hosts++;
1475 				if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1476 					/*
1477 					 * If the force flag is set then
1478 					 * ignore any RPC failures because we
1479 					 * are only really interested with
1480 					 * the set on local node.
1481 					 */
1482 					if (forceflg && mdanyrpcerror(ep)) {
1483 						mdclrerror(ep);
1484 					} else {
1485 						/*
1486 						 * set max_node so that in the
1487 						 * unlock code nodes in the
1488 						 * set that have not been
1489 						 * locked are not unlocked.
1490 						 */
1491 						max_node = nd->nd_nodeid;
1492 						rval = 2;
1493 						goto out1;
1494 					}
1495 				}
1496 
1497 			}
1498 			nd = nd->nd_next;
1499 		}
1500 		max_node = 0;
1501 	} else {
1502 		/*
1503 		 * Get a count of the hosts in the set and also lock the set
1504 		 * on those hosts that know about it.
1505 		 */
1506 		for (i = 0; i < MD_MAXSIDES; i++) {
1507 			/* Skip empty slots */
1508 			if (sd->sd_nodes[i][0] == '\0')
1509 				continue;
1510 
1511 			has_set = nodehasset(sp, sd->sd_nodes[i],
1512 				NHS_NST_EQ, ep);
1513 
1514 			/*
1515 			 * The host is not aware of this set (has_set < 0) or
1516 			 * the set does not match (has_set == 0). This check
1517 			 * prevents the code getting confused by an apparent
1518 			 * inconsistancy in the set's state, this is in the
1519 			 * purge code so something is broken in any case and
1520 			 * this is just trying to fix the brokeness.
1521 			 */
1522 			if (has_set <= 0) {
1523 				mdclrerror(ep);
1524 				/*
1525 				 * set the node to NULL to prevent further
1526 				 * requests to this unresponsive node.
1527 				 */
1528 				sd->sd_nodes[i][0] = '\0';
1529 			} else {
1530 				num_hosts++;
1531 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1532 					/*
1533 					 * If the force flag is set then
1534 					 * ignore any RPC failures because we
1535 					 * are only really interested with
1536 					 * the set on local node.
1537 					 */
1538 					if (forceflg && mdanyrpcerror(ep)) {
1539 						mdclrerror(ep);
1540 					} else {
1541 						rval = 2;
1542 						/*
1543 						 * set max_node so that in the
1544 						 * unlock code nodes in the
1545 						 * set that have not been
1546 						 * locked are not unlocked.
1547 						 */
1548 						max_node = i;
1549 						goto out1;
1550 					}
1551 				}
1552 			}
1553 		}
1554 		max_node = i;	/* now MD_MAXSIDES */
1555 	}
1556 	if (!bypass_cluster) {
1557 		/*
1558 		 * If there is only one host associated with the
1559 		 * set then remove the set from the cluster.
1560 		 */
1561 		if (num_hosts == 1) {
1562 			if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1563 				if (metad_isautotakebyname(sp->setname)) {
1564 					delete_end = 0;
1565 				} else {
1566 					mdclrerror(ep);
1567 					rval = 3;
1568 					goto out1;
1569 				}
1570 			}
1571 		}
1572 	}
1573 
1574 	if (MD_MNSET_DESC(sd)) {
1575 		/*
1576 		 * Get a count of the hosts in the set and also lock the set
1577 		 * on those hosts that know about it.
1578 		 */
1579 		nd = sd->sd_nodelist;
1580 		while (nd) {
1581 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1582 				nd = nd->nd_next;
1583 				continue;
1584 			}
1585 			if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
1586 				/*
1587 				 * Tell the remote node to remove this node
1588 				 */
1589 				if (clnt_delhosts(nd->nd_nodename, sp, 1,
1590 					&thishost, ep) == -1) {
1591 					/*
1592 					 * If we fail to delete ourselves
1593 					 * from the remote host it does not
1594 					 * really matter because the set is
1595 					 * being "purged" from this node. The
1596 					 * set can be purged from the other
1597 					 * node at a later time.
1598 					 */
1599 					mdclrerror(ep);
1600 				}
1601 				nd = nd->nd_next;
1602 				continue;
1603 			}
1604 			/* remove the set from this host */
1605 			if (clnt_delset(nd->nd_nodename, sp, ep) == -1) {
1606 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1607 				if (!bypass_cluster && num_hosts == 1)
1608 					(void) sdssc_delete_end(sp->setname,
1609 					    SDSSC_CLEANUP);
1610 				mdclrerror(ep);
1611 				goto out1;
1612 			}
1613 			nd = nd->nd_next;
1614 		}
1615 	} else {
1616 		for (i = 0; i < MD_MAXSIDES; i++) {
1617 			/* Skip empty slots */
1618 			if (sd->sd_nodes[i][0] == '\0')
1619 				continue;
1620 			if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
1621 				/*
1622 				 * Tell the remote node to remove this node
1623 				 */
1624 				if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
1625 				    &thishost, ep) == -1) {
1626 					/*
1627 					 * If we fail to delete ourselves
1628 					 * from the remote host it does not
1629 					 * really matter because the set is
1630 					 * being "purged" from this node. The
1631 					 * set can be purged from the other
1632 					 * node at a later time.
1633 					 */
1634 					mdclrerror(ep);
1635 				}
1636 				continue;
1637 			}
1638 
1639 			/* remove the set from this host */
1640 			if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
1641 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1642 				if (!bypass_cluster && num_hosts == 1)
1643 					(void) sdssc_delete_end(sp->setname,
1644 					    SDSSC_CLEANUP);
1645 				mdclrerror(ep);
1646 				goto out1;
1647 			}
1648 		}
1649 	}
1650 
1651 	if (!bypass_cluster && num_hosts == 1) {
1652 		if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
1653 		    SDSSC_ERROR) {
1654 			rval = 4;
1655 		}
1656 	}
1657 
1658 out1:
1659 
1660 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1661 
1662 	/*
1663 	 * Remove the set lock on those nodes that had the set locked
1664 	 * max_node will either be MD_MAXSIDES or array index of the last
1665 	 * node contacted (or rather failed to contact) for traditional
1666 	 * diskset.  For a MN diskset, max_node is the node_id of the node
1667 	 * that failed the lock.
1668 	 */
1669 	if (MD_MNSET_DESC(sd)) {
1670 		nd = sd->sd_nodelist;
1671 		while (nd) {
1672 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1673 				nd = nd->nd_next;
1674 				continue;
1675 			}
1676 			if (nd->nd_nodeid == max_node)
1677 				break;
1678 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1679 				if (forceflg && mdanyrpcerror(&xep)) {
1680 					mdclrerror(&xep);
1681 					nd = nd->nd_next;
1682 					continue;
1683 				}
1684 				if (rval == 0)
1685 					(void) mdstealerror(ep, &xep);
1686 				rval = 5;
1687 			}
1688 			nd = nd->nd_next;
1689 		}
1690 	} else {
1691 		for (i = 0; i < max_node; i++) {
1692 			/* Skip empty slots */
1693 			if (sd->sd_nodes[i][0] == '\0')
1694 				continue;
1695 
1696 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1697 				if (forceflg && mdanyrpcerror(&xep)) {
1698 					mdclrerror(&xep);
1699 					continue;
1700 				}
1701 				if (rval == 0)
1702 					(void) mdstealerror(ep, &xep);
1703 				rval = 5;
1704 			}
1705 		}
1706 	}
1707 
1708 	cl_set_setkey(NULL);
1709 
1710 	return (rval);
1711 }
1712 
1713 int
1714 meta_set_query(
1715 	mdsetname_t		*sp,
1716 	mddb_dtag_lst_t		**dtlpp,
1717 	md_error_t		*ep
1718 )
1719 {
1720 	mddb_dtag_get_parm_t	dtgp;
1721 
1722 	(void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
1723 	dtgp.dtgp_setno = sp->setno;
1724 
1725 	/*CONSTCOND*/
1726 	while (1) {
1727 		if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
1728 			if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
1729 			    *dtlpp == NULL)
1730 				return (mdstealerror(ep, &dtgp.dtgp_mde));
1731 			else
1732 				break;
1733 
1734 		/*
1735 		 * Run to the end of the list
1736 		 */
1737 		for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
1738 			/* void */;
1739 
1740 		*dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
1741 
1742 		(void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
1743 		    sizeof (mddb_dtag_t));
1744 
1745 		dtgp.dtgp_dt.dt_id++;
1746 	}
1747 	return (0);
1748 }
1749 
1750 /*
1751  * return drivename get by key
1752  */
1753 mddrivename_t *
1754 metadrivename_withdrkey(
1755 	mdsetname_t	*sp,
1756 	side_t		sideno,
1757 	mdkey_t		key,
1758 	int		flags,
1759 	md_error_t	*ep
1760 )
1761 {
1762 	char		*nm;
1763 	mdname_t	*np;
1764 	mddrivename_t	*dnp;
1765 	ddi_devid_t	devidp;
1766 	md_set_desc	*sd;
1767 
1768 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1769 		return (NULL);
1770 	}
1771 
1772 	/* get namespace info */
1773 	if (MD_MNSET_DESC(sd)) {
1774 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno,
1775 		    key, ep)) == NULL)
1776 			return (NULL);
1777 	} else {
1778 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno+SKEW,
1779 		    key, ep)) == NULL)
1780 			return (NULL);
1781 	}
1782 
1783 	/* get device name */
1784 	if (flags & PRINT_FAST) {
1785 		if ((np = metaname_fast(&sp, nm, LOGICAL_DEVICE, ep)) == NULL) {
1786 			Free(nm);
1787 			return (NULL);
1788 		}
1789 	} else {
1790 		if ((np = metaname(&sp, nm, LOGICAL_DEVICE, ep)) == NULL) {
1791 			Free(nm);
1792 			return (NULL);
1793 		}
1794 	}
1795 	Free(nm);
1796 
1797 	/* make sure it's OK */
1798 	if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, ep) != 0))
1799 		return (NULL);
1800 
1801 	/* get drivename */
1802 	dnp = np->drivenamep;
1803 	dnp->side_names_key = key;
1804 
1805 	/*
1806 	 * Skip the following devid check if dnp is did device
1807 	 * The device id is disabled for did device due to the
1808 	 * lack of minor name support in the did driver. The following
1809 	 * devid code path can set and propagate the error and
1810 	 * eventually prevent did disks from being added to the
1811 	 * diskset under SunCluster systems
1812 	 */
1813 	if (strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) == 0) {
1814 		goto out;
1815 	}
1816 
1817 	/* Also, Skip the check if MN diskset, no devid's */
1818 	if (MD_MNSET_DESC(sd)) {
1819 		goto out;
1820 	}
1821 
1822 	/*
1823 	 * Get the devid associated with the key.
1824 	 *
1825 	 * If a devid was returned, it MUST be valid even in
1826 	 * the case where a device id has been "updated". The
1827 	 * "update" of the device id may have occured due to
1828 	 * a firmware upgrade.
1829 	 */
1830 	if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
1831 	    != NULL) {
1832 		dnp->devid = devid_str_encode(devidp, NULL);
1833 		free(devidp);
1834 	} else {
1835 		/*
1836 		 * It is okay if replica is not in devid mode
1837 		 */
1838 		if (mdissyserror(ep, MDDB_F_NODEVID)) {
1839 			mdclrerror(ep);
1840 			goto out;
1841 		}
1842 
1843 		/*
1844 		 * devid is missing so this means that we have
1845 		 * just upgraded from a configuration where
1846 		 * devid's were not used so try to add in
1847 		 * the devid and requery.
1848 		 */
1849 		if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key,
1850 		    ep) < 0)
1851 			return (NULL);
1852 		if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
1853 		    sideno+SKEW, key, ep)) == NULL)
1854 			return (NULL);
1855 		dnp->devid = devid_str_encode(devidp, NULL);
1856 		devid_free(devidp);
1857 	}
1858 
1859 out:
1860 	if (flags & MD_BYPASS_DAEMON)
1861 		return (dnp);
1862 
1863 	if (get_sidenmlist(sp, dnp, ep))
1864 		return (NULL);
1865 
1866 	/* return success */
1867 	return (dnp);
1868 }
1869 
1870 void
1871 metafreedrivedesc(md_drive_desc **dd)
1872 {
1873 	md_drive_desc	*p, *next = NULL;
1874 
1875 	for (p = *dd; p != NULL; p = next) {
1876 		next = p->dd_next;
1877 		Free(p);
1878 	}
1879 	*dd = NULL;
1880 }
1881 
1882 md_drive_desc *
1883 metaget_drivedesc(
1884 	mdsetname_t	*sp,
1885 	int		flags,
1886 	md_error_t	*ep
1887 )
1888 {
1889 	side_t		sideno = MD_SIDEWILD;
1890 
1891 	assert(! (flags & MD_BYPASS_DAEMON));
1892 
1893 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
1894 		return (NULL);
1895 
1896 	return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
1897 }
1898 
1899 md_drive_desc *
1900 metaget_drivedesc_fromnamelist(
1901 	mdsetname_t	*sp,
1902 	mdnamelist_t	*nlp,
1903 	md_error_t	*ep
1904 )
1905 {
1906 	md_set_desc		*sd;
1907 	mdnamelist_t		*p;
1908 	md_drive_desc		*dd = NULL;
1909 
1910 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1911 		return (NULL);
1912 
1913 	for (p = nlp; p != NULL; p = p->next)
1914 		(void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
1915 		    sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
1916 
1917 	return (dd);
1918 }
1919 
1920 md_drive_desc *
1921 metaget_drivedesc_sideno(
1922 	mdsetname_t *sp,
1923 	side_t sideno,
1924 	int flags,
1925 	md_error_t *ep
1926 )
1927 {
1928 	md_set_desc	*sd = NULL;
1929 
1930 	assert(! (flags & MD_BYPASS_DAEMON));
1931 
1932 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1933 		return (NULL);
1934 
1935 	if (sd->sd_drvs)
1936 		return (sd->sd_drvs);
1937 
1938 	if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
1939 		return (NULL);
1940 
1941 	return (sd->sd_drvs);
1942 }
1943 
1944 int
1945 metaget_setownership(
1946 	mdsetname_t	*sp,
1947 	md_error_t	*ep
1948 )
1949 {
1950 	md_set_desc	*sd;
1951 	int		bool;
1952 	int		i;
1953 	md_mnnode_desc	*nd;
1954 
1955 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1956 		return (-1);
1957 
1958 	if (MD_MNSET_DESC(sd)) {
1959 		nd = sd->sd_nodelist;
1960 		while (nd) {
1961 			/* If node isn't alive, can't own diskset */
1962 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1963 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1964 				nd = nd->nd_next;
1965 				continue;
1966 			}
1967 			/*
1968 			 * If can't communicate with rpc.metad, then mark
1969 			 * this node as not an owner.  That node may
1970 			 * in fact, be an owner, but without rpc.metad running
1971 			 * that node can't do much.
1972 			 */
1973 			if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
1974 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1975 			} else if (bool == TRUE) {
1976 				nd->nd_flags |= MD_MN_NODE_OWN;
1977 			} else {
1978 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1979 			}
1980 			nd = nd->nd_next;
1981 		}
1982 		return (0);
1983 	}
1984 
1985 	/* Rest of code handles traditional disksets */
1986 
1987 	for (i = 0; i < MD_MAXSIDES; i++)
1988 		sd->sd_isown[i] = 0;
1989 
1990 	if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
1991 		return (-1);
1992 
1993 	if (bool == TRUE)
1994 		sd->sd_isown[getmyside(sp, ep)] = 1;
1995 
1996 	return (0);
1997 }
1998 
1999 char *
2000 mynode(void)
2001 {
2002 	static struct utsname	myuname;
2003 	static int		done = 0;
2004 
2005 	if (! done) {
2006 		if (uname(&myuname) == -1) {
2007 			md_perror(dgettext(TEXT_DOMAIN, "uname"));
2008 			assert(0);
2009 		}
2010 		done = 1;
2011 	}
2012 	return (myuname.nodename);
2013 }
2014 
/*
 * Return TRUE if "str" compares equal (strcmp) to one of the first "cnt"
 * strings in "lst", FALSE otherwise.  A "cnt" of zero or less always
 * yields FALSE.
 */
int
strinlst(char *str, int cnt, char **lst)
{
	char	**cur = lst;
	int	remaining = cnt;

	while (remaining-- > 0) {
		if (strcmp(*cur++, str) == 0)
			return (TRUE);
	}

	return (FALSE);
}
2026 
2027 /*
2028  * meta_get_reserved_names
2029  *  returns an mdnamelist_t of reserved slices
2030  *  reserved slices are those that are used but don't necessarily
2031  *  show up as metadevices (ex. reserved slice for db in sets, logs)
2032  */
2033 
2034 /*ARGSUSED*/
2035 int
2036 meta_get_reserved_names(
2037 	mdsetname_t	*sp,
2038 	mdnamelist_t	**nlpp,
2039 	int		options,
2040 	md_error_t	*ep)
2041 {
2042 	int		 count		= 0;
2043 	mdname_t	*np		= NULL;
2044 	mdnamelist_t	*transnlp	= NULL;
2045 	mdnamelist_t	**tailpp 	= nlpp;
2046 	mdnamelist_t	*nlp;
2047 	md_drive_desc	*dd, *di;
2048 
2049 	if (metaislocalset(sp))
2050 		goto out;
2051 
2052 	if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
2053 		count = -1;
2054 		goto out;
2055 	}
2056 
2057 	/* db in for sets on reserved slice */
2058 	for (di = dd; di && count >= 0; di = di->dd_next) {
2059 		uint_t	rep_slice;
2060 
2061 		/*
2062 		 * Add the name struct to the end of the
2063 		 * namelist but keep a pointer to the last
2064 		 * element so that we don't incur the overhead
2065 		 * of traversing the list each time
2066 		 */
2067 		if (di->dd_dnp &&
2068 		    (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
2069 		    (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
2070 		    (tailpp = meta_namelist_append_wrapper(tailpp, np)))
2071 			count++;
2072 		else
2073 			count = -1;
2074 	}
2075 
2076 	/* now find logs */
2077 	if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
2078 		count = -1;
2079 		goto out;
2080 	}
2081 
2082 	for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
2083 		mdname_t	*transnp = nlp->namep;
2084 		md_trans_t	*transp;
2085 
2086 		if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
2087 			count = -1;
2088 			goto out;
2089 		}
2090 		if (transp->lognamep) {
2091 			/*
2092 			 * Add the name struct to the end of the
2093 			 * namelist but keep a pointer to the last
2094 			 * element so that we don't incur the overhead
2095 			 * of traversing the list each time
2096 			 */
2097 			tailpp = meta_namelist_append_wrapper(
2098 			    tailpp, transp->lognamep);
2099 		}
2100 	}
2101 out:
2102 	metafreenamelist(transnlp);
2103 	return (count);
2104 }
2105 
2106 /*
2107  * Entry point to join a node to MultiNode diskset.
2108  *
2109  * Validate host in diskset.
2110  *	- Should be in membership list from API
2111  *	- Should not already be joined into diskset.
2112  *	- Set must have drives
2113  * Assume valid configuration is stored in the set/drive/node records
2114  * in the local mddb since no node or drive can be added to the MNset
2115  * unless all drives and nodes are available.  Reconfig steps will
2116  * resync all ALIVE nodes in case of panic in critical areas.
2117  *
2118  * Lock down the set.
2119  * Verify host is a member of this diskset.
2120  * If drives exist in the configuration, load the mddbs.
2121  * Set this node to active by notifying master if one exists.
2122  * If this is the first node active in the diskset, this node
2123  * 	becomes the master.
2124  * Unlock the set.
2125  *
2126  * Mirror Resync:
2127  * If this node is the last node to join the set and clustering
2128  * isn't running, then start the 'metasync -r' type resync
2129  * on all mirrors in this diskset.
2130  * If clustering is running, this resync operation will
2131  * be handled by the reconfig steps and should NOT
2132  * be handled during a join operation.
2133  *
2134  * There are multiple return values in order to assist
2135  * the join operation of all sets in the metaset command.
2136  *
2137  * Return values:
2138  *	0  - Node successfully joined to set.
2139  *	-1 - Join attempted but failed
2140  *		- any failure from libmeta calls
2141  *		- node not in the member list
2142  *	-2 - Join not attempted since
2143  *		- this set had no drives in set
2144  *		- this node already joined to set
2145  *		- set is not a multinode set
2146  *	-3 - Node joined to STALE set.
2147  */
2148 extern int
2149 meta_set_join(
2150 	mdsetname_t	*sp,
2151 	md_error_t	*ep
2152 )
2153 {
2154 	md_set_desc		*sd;
2155 	md_drive_desc		*dd;
2156 	md_mnnode_desc		*nd, *nd2, my_nd;
2157 	int			rval = 0;
2158 	md_setkey_t		*cl_sk;
2159 	md_error_t		xep = mdnullerror;
2160 	md_error_t		ep_snarf = mdnullerror;
2161 	int			master_flag = 0;
2162 	md_mnset_record		*mas_mnsr = NULL;
2163 	int			clear_nr_flags = 0;
2164 	md_mnnode_record	*nr;
2165 	int			stale_set = 0;
2166 	int			rb_flags = 0;
2167 	int			stale_bool = FALSE;
2168 	int			suspendall_flag = 0;
2169 	int			suspend1_flag = 0;
2170 	sigset_t		oldsigs;
2171 	int			send_reinit = 0;
2172 
2173 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2174 		return (-1);
2175 	}
2176 
2177 	/* Must be a multinode diskset */
2178 	if (!MD_MNSET_DESC(sd)) {
2179 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2180 		return (-2);
2181 	}
2182 
2183 	/* Verify that the node is ALIVE (i.e. is in the API membership list) */
2184 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
2185 		(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
2186 			sd->sd_mn_mynode->nd_nodename, NULL,
2187 			sp->setname);
2188 		return (-1);
2189 	}
2190 
2191 	/* Make sure we are blocking all signals */
2192 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2193 		mdclrerror(&xep);
2194 
2195 	/*
2196 	 * Lock the set on current set members.
2197 	 * For MN diskset lock_set and SUSPEND are used to protect against
2198 	 * other meta* commands running on the other nodes.
2199 	 */
2200 	nd = sd->sd_nodelist;
2201 	while (nd) {
2202 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2203 			nd = nd->nd_next;
2204 			continue;
2205 		}
2206 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2207 			rval = -1;
2208 			goto out;
2209 		}
2210 		nd = nd->nd_next;
2211 	}
2212 
2213 	/*
2214 	 * Lock out other meta* commands by suspending
2215 	 * class 1 messages across the diskset.
2216 	 */
2217 	nd = sd->sd_nodelist;
2218 	while (nd) {
2219 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2220 			nd = nd->nd_next;
2221 			continue;
2222 		}
2223 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2224 			    sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2225 			rval = -1;
2226 			goto out;
2227 		}
2228 		suspend1_flag = 1;
2229 		nd = nd->nd_next;
2230 	}
2231 
2232 	/*
2233 	 * Verify that this host is a member (in the host list) of the set.
2234 	 */
2235 	nd = sd->sd_nodelist;
2236 	while (nd) {
2237 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2238 			break;
2239 		}
2240 		nd = nd->nd_next;
2241 	}
2242 	if (!nd) {
2243 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2244 			sd->sd_mn_mynode->nd_nodename, NULL,
2245 			sp->setname);
2246 		rval = -1;
2247 		goto out;
2248 	}
2249 
2250 	/*
2251 	 * Need to return failure if host is already 'joined'
2252 	 * into the set.  This is done so that if later the user
2253 	 * issues a command to join all sets and a failure is
2254 	 * encountered - that the resulting cleanup effort
2255 	 * (withdrawing from all sets that were joined
2256 	 * during that command) won't withdraw from this set.
2257 	 */
2258 	if (nd->nd_flags & MD_MN_NODE_OWN) {
2259 		rval = -2;
2260 		goto out2;
2261 	}
2262 
2263 	/*
2264 	 * Call metaget_setownership that calls each node in diskset and
2265 	 * marks in set descriptor if node is an owner of the set or not.
2266 	 * metaget_setownership checks to see if a node is an owner by
2267 	 * checking to see if that node's kernel has the mddb loaded.
2268 	 * If a node had panic'd during a reconfig or an
2269 	 * add/delete/join/withdraw operation, the other nodes' node
2270 	 * records may not reflect the current state of the diskset,
2271 	 * so calling metaget_setownership is the safest thing to do.
2272 	 */
2273 	if (metaget_setownership(sp, ep) == -1) {
2274 		rval = -1;
2275 		goto out;
2276 	}
2277 
2278 	/* If first active member of diskset, become the master. */
2279 	nd = sd->sd_nodelist;
2280 	while (nd) {
2281 		if (nd->nd_flags & MD_MN_NODE_OWN)
2282 			break;
2283 		nd = nd->nd_next;
2284 	}
2285 	if (nd == NULL)
2286 		master_flag = 1;
2287 
2288 	/*
2289 	 * If not first active member of diskset, then get the
2290 	 * master information from a node that is already joined
2291 	 * and set the master information for this node.  Be sure
2292 	 * that this node (the already joined node) has its own
2293 	 * join flag set.  If not, then this diskset isn't currently
2294 	 * consistent and shouldn't allow a node to join.  This diskset
2295 	 * inconsistency should only occur when a node has panic'd in
2296 	 * the set while doing a metaset operation and the sysadmin is
2297 	 * attempting to join a node into the set.  This inconsistency
2298 	 * will be fixed during a reconfig cycle which should be occurring
2299 	 * soon since a node panic'd.
2300 	 *
2301 	 * If unable to get this information from an owning node, then
2302 	 * this diskset isn't currently consistent and shouldn't
2303 	 * allow a node to join.
2304 	 */
2305 	if (!master_flag) {
2306 		/* get master information from an owner (joined) node */
2307 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
2308 		    sp->setno, &mas_mnsr, ep) == -1) {
2309 			rval = -1;
2310 			goto out;
2311 		}
2312 
2313 		/* Verify that owner (joined) node has its own JOIN flag set */
2314 		nr = mas_mnsr->sr_nodechain;
2315 		while (nr) {
2316 			if ((nd->nd_nodeid == nr->nr_nodeid) &&
2317 			    ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
2318 				(void) mddserror(ep, MDE_DS_NODENOSET,
2319 				    sp->setno, nd->nd_nodename, NULL,
2320 				    nd->nd_nodename);
2321 				free_sr((md_set_record *)mas_mnsr);
2322 				rval = -1;
2323 				goto out;
2324 			}
2325 			nr = nr->nr_next;
2326 		}
2327 
2328 		/*
2329 		 * Does master have set marked as STALE?
2330 		 * If so, need to pass this down to kernel when
2331 		 * this node snarfs the set.
2332 		 */
2333 		if (clnt_mn_is_stale(nd->nd_nodename, sp,
2334 		    &stale_bool, ep) == -1) {
2335 			rval = -1;
2336 			goto out;
2337 		}
2338 
2339 		/* set master information in my rpc.metad's set record */
2340 		if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
2341 		    mas_mnsr->sr_master_nodeid, ep)) {
2342 			free_sr((md_set_record *)mas_mnsr);
2343 			rval = -1;
2344 			goto out;
2345 		}
2346 
2347 		/* set master information in my cached set desc */
2348 		(void) strcpy(sd->sd_mn_master_nodenm,
2349 		    mas_mnsr->sr_master_nodenm);
2350 		sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
2351 		nd2 = sd->sd_nodelist;
2352 		while (nd2) {
2353 		    if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
2354 			sd->sd_mn_masternode = nd2;
2355 			break;
2356 		    }
2357 		    nd2 = nd2->nd_next;
2358 		}
2359 		free_sr((md_set_record *)mas_mnsr);
2360 
2361 		/*
2362 		 * Set the node flags in mynode's rpc.metad node records for
2363 		 * the nodes that are in the diskset.  Can use my sd
2364 		 * since earlier call to metaget_setownership set the
2365 		 * owner flags based on whether that node had snarfed
2366 		 * the MN diskset mddb.  Reconfig steps guarantee that
2367 		 * return of metaget_setownership will match the owning
2368 		 * node's owner list except in the case where a node
2369 		 * has just panic'd and in this case, a reconfig will
2370 		 * be starting immediately and the owner lists will
2371 		 * be sync'd up by the reconfig.
2372 		 *
2373 		 * Flag of SET means to take no action except to
2374 		 * set the node flags as given in the nodelist linked list.
2375 		 */
2376 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
2377 		    MD_NR_SET, NULL, ep)) {
2378 			rval = -1;
2379 			goto out;
2380 		}
2381 	}
2382 
2383 	/*
2384 	 * Read in the mddb if there are drives in the set.
2385 	 */
2386 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2387 	    ep)) == NULL) {
2388 		/* No drives in list */
2389 		if (! mdisok(ep)) {
2390 			rval = -1;
2391 			goto out;
2392 		}
2393 		rval = -2;
2394 		goto out;
2395 	}
2396 
2397 	/*
2398 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2399 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2400 	 * then change the nodelist followed by a reinit and resume.
2401 	 */
2402 	nd = sd->sd_nodelist;
2403 	while (nd) {
2404 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2405 			nd = nd->nd_next;
2406 			continue;
2407 		}
2408 
2409 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
2410 		    MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2411 			rval = -1;
2412 			goto out;
2413 		}
2414 		suspendall_flag = 1;
2415 		nd = nd->nd_next;
2416 	}
2417 
2418 	/* Set master in my set record in rpc.metad */
2419 	if (master_flag) {
2420 		if (clnt_mnsetmaster(mynode(), sp,
2421 		    sd->sd_mn_mynode->nd_nodename,
2422 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
2423 			rval = -1;
2424 			goto out;
2425 		}
2426 	}
2427 	/*
2428 	 * Causes mddbs to be loaded into the kernel.
2429 	 * Set the force flag so that replica locations can be
2430 	 * loaded into the kernel even if a mediator node was
2431 	 * unavailable.  This allows a node to join an MO
2432 	 * diskset when there are sufficient replicas available,
2433 	 * but a mediator node in unavailable.
2434 	 */
2435 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
2436 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2437 		    "Host not able to start diskset."));
2438 		rval = -1;
2439 		goto out;
2440 	}
2441 
2442 	if (! mdisok(ep)) {
2443 		rval = -1;
2444 		goto out;
2445 	}
2446 
2447 	/*
2448 	 * Set rollback flags to 1 so that halt_set is called if a failure
2449 	 * is seen after this point.  If snarf_set fails, still need to
2450 	 * call halt_set to cleanup the diskset.
2451 	 */
2452 	rb_flags = 1;
2453 
2454 	/* Starts the set */
2455 	if (snarf_set(sp, stale_bool, ep) != 0) {
2456 		if (mdismddberror(ep, MDE_DB_STALE)) {
2457 			/*
2458 			 * Don't fail join, STALE means that set has
2459 			 * < 50% mddbs.
2460 			 */
2461 			(void) mdstealerror(&ep_snarf, ep);
2462 			stale_set = 1;
2463 		} else if (mdisok(ep)) {
2464 			/* If snarf failed, but no error was set - set it */
2465 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2466 			    sp->setno, 0, NULL);
2467 				rval = -1;
2468 				goto out;
2469 		} else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
2470 			/*
2471 			 * Don't fail join if ACCOK; ACCOK means that mediator
2472 			 * provided extra vote.
2473 			 */
2474 			rval = -1;
2475 			goto out;
2476 		}
2477 	}
2478 
2479 	/* Did set really get snarfed? */
2480 	if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
2481 		if (mdisok(ep)) {
2482 			/* If snarf failed, but no error was set - set it */
2483 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2484 				sp->setno, 0, NULL);
2485 		}
2486 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2487 		    "Host not able to start diskset."));
2488 		rval = -1;
2489 		goto out;
2490 	}
2491 
2492 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2493 	send_reinit = 1;
2494 
2495 	/* If first node to enter set, setup master and clear change log */
2496 	if (master_flag) {
2497 		/* Set master in my locally cached set descriptor */
2498 		(void) strcpy(sd->sd_mn_master_nodenm,
2499 		    sd->sd_mn_mynode->nd_nodename);
2500 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
2501 		sd->sd_mn_am_i_master = 1;
2502 
2503 		/*
2504 		 * If first node to join set, then clear out change log
2505 		 * entries.  Change log entries are only needed when a
2506 		 * change of master is occurring in a diskset that has
2507 		 * multiple owners.   Since this node is the first owner
2508 		 * of the diskset, clear the entries.
2509 		 *
2510 		 * Only do this if we are in a single node non-SC3.x
2511 		 * situation.
2512 		 */
2513 		if (meta_mn_singlenode() &&
2514 			mdmn_reset_changelog(sp, ep,  MDMN_CLF_RESETLOG) != 0) {
2515 			mde_perror(ep, dgettext(TEXT_DOMAIN,
2516 			    "Unable to reset changelog."));
2517 			rval = -1;
2518 			goto out;
2519 		}
2520 	}
2521 
2522 	/* Set my locally cached flag */
2523 	sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
2524 
2525 	/*
2526 	 * Set this node's own flag on all joined nodes in the set
2527 	 * (including my node).
2528 	 */
2529 	clear_nr_flags = 1;
2530 
2531 	my_nd = *(sd->sd_mn_mynode);
2532 	my_nd.nd_next = NULL;
2533 	nd = sd->sd_nodelist;
2534 	while (nd) {
2535 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2536 			nd = nd->nd_next;
2537 			continue;
2538 		}
2539 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2540 		    MD_NR_JOIN, NULL, ep)) {
2541 			rval = -1;
2542 			goto out;
2543 		}
2544 		nd = nd->nd_next;
2545 	}
2546 
2547 out:
2548 	if (rval != NULL) {
2549 		/*
2550 		 * If rollback flag is 1, then node was joined to set.
2551 		 * Since an error occurred, withdraw node from set in
2552 		 * order to rollback to before command was run.
2553 		 * Need to preserve ep so that calling function can
2554 		 * get error information.
2555 		 */
2556 		if (rb_flags == 1) {
2557 			if (halt_set(sp, &xep)) {
2558 				mdclrerror(&xep);
2559 			}
2560 		}
2561 
2562 		/*
2563 		 * If error, reset master to INVALID.
2564 		 * Ignore error since (next) first node to successfully join
2565 		 * will set master on all nodes.
2566 		 */
2567 		(void) clnt_mnsetmaster(mynode(), sp, "",
2568 			MD_MN_INVALID_NID, &xep);
2569 		mdclrerror(&xep);
2570 		/* Reset master in my locally cached set descriptor */
2571 		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
2572 		sd->sd_mn_am_i_master = 0;
2573 
2574 		/*
2575 		 * If nr flags set on other nodes, reset them.
2576 		 */
2577 		if (clear_nr_flags) {
2578 			nd = sd->sd_nodelist;
2579 			while (nd) {
2580 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2581 					nd = nd->nd_next;
2582 					continue;
2583 				}
2584 				(void) clnt_upd_nr_flags(nd->nd_nodename, sp,
2585 					&my_nd, MD_NR_WITHDRAW, NULL, &xep);
2586 				mdclrerror(&xep);
2587 				nd = nd->nd_next;
2588 			}
2589 			/* Reset my locally cached flag */
2590 			sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
2591 		}
2592 	}
2593 
2594 	/*
2595 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2596 	 * Send reinit command to mdcommd which forces it to get
2597 	 * fresh set description.
2598 	 */
2599 	if (send_reinit) {
2600 		/* Send reinit */
2601 		nd = sd->sd_nodelist;
2602 		while (nd) {
2603 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2604 				nd = nd->nd_next;
2605 				continue;
2606 			}
2607 
2608 			/* Class is ignored for REINIT */
2609 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2610 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2611 				/*
2612 				 * We are here because we failed to resume
2613 				 * rpc.mdcommd.  However we potentially have
2614 				 * an error from the previous call
2615 				 * If the previous call did fail,  we capture
2616 				 * that error and generate a perror with
2617 				 * the string, "Unable to resume...".
2618 				 * Setting rval to -1 ensures that in the
2619 				 * next iteration of the loop, ep is not
2620 				 * clobbered.
2621 				 */
2622 				if (rval == 0)
2623 					(void) mdstealerror(ep, &xep);
2624 				else
2625 					mdclrerror(&xep);
2626 				rval = -1;
2627 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2628 				    "Unable to reinit rpc.mdcommd."));
2629 			}
2630 			nd = nd->nd_next;
2631 		}
2632 
2633 	}
2634 
2635 out2:
2636 	/*
2637 	 * Unlock diskset by resuming messages across the diskset.
2638 	 * Just resume all classes so that resume is the same whether
2639 	 * just one class was locked or all classes were locked.
2640 	 */
2641 	if ((suspend1_flag) || (suspendall_flag)) {
2642 		nd = sd->sd_nodelist;
2643 		while (nd) {
2644 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2645 				nd = nd->nd_next;
2646 				continue;
2647 			}
2648 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2649 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2650 				/*
2651 				 * We are here because we failed to resume
2652 				 * rpc.mdcommd.  However we potentially have
2653 				 * an error from the previous call
2654 				 * If the previous call did fail,  we capture
2655 				 * that error and generate a perror with
2656 				 * the string, "Unable to resume...".
2657 				 * Setting rval to -1 ensures that in the
2658 				 * next iteration of the loop, ep is not
2659 				 * clobbered.
2660 				 */
2661 				if (rval == 0)
2662 					(void) mdstealerror(ep, &xep);
2663 				else
2664 					mdclrerror(&xep);
2665 				rval = -1;
2666 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2667 				    "Unable to resume rpc.mdcommd."));
2668 			}
2669 			nd = nd->nd_next;
2670 		}
2671 		meta_ping_mnset(sp->setno);
2672 	}
2673 
2674 	/*
2675 	 * Unlock set.  This flushes the caches on the servers.
2676 	 */
2677 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2678 	nd = sd->sd_nodelist;
2679 	while (nd) {
2680 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2681 			nd = nd->nd_next;
2682 			continue;
2683 		}
2684 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2685 			if (rval == 0)
2686 				(void) mdstealerror(ep, &xep);
2687 			else
2688 				mdclrerror(&xep);
2689 			rval = -1;
2690 		}
2691 		nd = nd->nd_next;
2692 	}
2693 
2694 	/*
2695 	 * If this node is the last to join the diskset and clustering isn't
2696 	 * running, then resync the mirrors in the diskset. We have to wait
2697 	 * until all nodes are joined so that the status gets propagated to
2698 	 * all of the members of the set.
2699 	 * Ignore any error from the resync as the join function shouldn't fail
2700 	 * because the mirror resync had a problem.
2701 	 *
2702 	 * Don't start resync if set is stale.
2703 	 */
2704 	if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
2705 	    (stale_set != 1)) {
2706 		nd = sd->sd_nodelist;
2707 		while (nd) {
2708 			if (!(nd->nd_flags & MD_MN_NODE_OWN))
2709 				break;
2710 			nd = nd->nd_next;
2711 		}
2712 		/*
2713 		 * nd set to NULL means that we have no nodes in the set that
2714 		 * haven't joined. In this case we start the resync.
2715 		 */
2716 		if (nd == NULL) {
2717 			(void) meta_mirror_resync_all(sp, 0, &xep);
2718 			mdclrerror(&xep);
2719 		}
2720 	}
2721 
2722 	/* Update ABR state for all soft partitions */
2723 	(void) meta_sp_update_abr(sp, &xep);
2724 	mdclrerror(&xep);
2725 
2726 	/*
2727 	 * call metaflushsetnames to reset local cache for master and
2728 	 * node information.
2729 	 */
2730 	metaflushsetname(sp);
2731 
2732 	/* release signals back to what they were on entry */
2733 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2734 		mdclrerror(&xep);
2735 
2736 	/*
2737 	 * If no error and stale_set is set, then set ep back
2738 	 * to ep from snarf_set call and return -3.  If another error
2739 	 * occurred and rval is not 0, then that error would have
2740 	 * caused the node to be withdrawn from the set and would
2741 	 * have set ep to that error information.
2742 	 */
2743 	if ((rval == 0) && (stale_set)) {
2744 		(void) mdstealerror(ep, &ep_snarf);
2745 		return (-3);
2746 	}
2747 
2748 	return (rval);
2749 }
2750 
2751 /*
2752  * Entry point to withdraw a node from MultiNode diskset.
2753  *
2754  * Validate host in diskset.
2755  *	- Should be joined into diskset.
2756  * Assume valid configuration is stored in the set/drive/node records
2757  * in the local mddb since no node or drive can be added to the MNset
2758  * unless all drives and nodes are available.  Reconfig steps will
2759  * resync all ALIVE nodes in case of panic in critical areas.
2760  *
2761  * Lock down the set.
2762  * Verify that drives exist in configuration.
2763  * Verify host is a member of this diskset.
2764  * Verify host is an owner of the diskset (host is joined to diskset).
2765  * Only allow withdrawal of master node if master node is the only joined
2766  * in the diskset.
2767  * Halt the diskset on this node.
2768  * Reset Master on this node.
2769  * Updated node flags that this node with withdrawn.
2770  * Unlock the set.
2771  *
2772  * Return values:
2773  *	0  - Node successfully withdrew from set.
2774  *	-1 - Withdrawal attempted but failed
2775  *		- any failure from libmeta calls
2776  *		- node not in the member list
2777  *	-2 - Withdrawal not attempted since
2778  *		- this set had no drives in set
2779  *		- this node not joined to set
2780  *		- set is not a multinode set
2781  */
2782 extern int
2783 meta_set_withdraw(
2784 	mdsetname_t	*sp,
2785 	md_error_t	*ep
2786 )
2787 {
2788 	md_set_desc		*sd;
2789 	md_drive_desc		*dd = 0;
2790 	md_mnnode_desc		*nd, my_nd;
2791 	int			rval = 0;
2792 	md_setkey_t		*cl_sk;
2793 	md_error_t		xep = mdnullerror;
2794 	int			set_halted = 0;
2795 	int			suspendall_flag = 0;
2796 	int			suspend1_flag = 0;
2797 	bool_t			stale_bool = FALSE;
2798 	mddb_config_t		c;
2799 	int			node_id_list[1];
2800 	sigset_t		oldsigs;
2801 	int			send_reinit = 0;
2802 
2803 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2804 		return (-1);
2805 	}
2806 
2807 	/* Must be a multinode diskset */
2808 	if (!MD_MNSET_DESC(sd)) {
2809 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2810 		return (-1);
2811 	}
2812 
2813 	/* Make sure we are blocking all signals */
2814 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2815 		mdclrerror(&xep);
2816 
2817 	/*
2818 	 * Lock the set on current set members.
2819 	 * For MN diskset lock_set and SUSPEND are used to protect against
2820 	 * other meta* commands running on the other nodes.
2821 	 */
2822 	nd = sd->sd_nodelist;
2823 	while (nd) {
2824 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2825 			nd = nd->nd_next;
2826 			continue;
2827 		}
2828 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2829 			rval = -1;
2830 			goto out;
2831 		}
2832 		nd = nd->nd_next;
2833 	}
2834 	/*
2835 	 * Lock out other meta* commands by suspending
2836 	 * class 1 messages across the diskset.
2837 	 */
2838 	nd = sd->sd_nodelist;
2839 	while (nd) {
2840 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2841 			nd = nd->nd_next;
2842 			continue;
2843 		}
2844 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2845 			sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2846 			rval = -1;
2847 			goto out;
2848 		}
2849 		suspend1_flag = 1;
2850 		nd = nd->nd_next;
2851 	}
2852 
2853 	/* Get list of drives - needed in case of failure */
2854 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2855 	    ep)) == NULL) {
2856 		/* Error getting drives in list */
2857 		if (! mdisok(ep)) {
2858 			rval = -1;
2859 			goto out2;
2860 		}
2861 		/* no drives in list */
2862 		rval = -2;
2863 		goto out2;
2864 	}
2865 
2866 	/*
2867 	 * Verify that this host is a member (in the host list) of the set.
2868 	 */
2869 	nd = sd->sd_nodelist;
2870 	while (nd) {
2871 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2872 			break;
2873 		}
2874 		nd = nd->nd_next;
2875 	}
2876 	if (!nd) {
2877 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2878 			sd->sd_mn_mynode->nd_nodename, NULL,
2879 			sp->setname);
2880 		rval = -1;
2881 		goto out2;
2882 	}
2883 
2884 	/*
2885 	 * Call metaget_setownership that calls each node in diskset and
2886 	 * marks in set descriptor if node is an owner of the set or not.
2887 	 * metaget_setownership checks to see if a node is an owner by
2888 	 * checking to see if that node's kernel has the mddb loaded.
2889 	 * If a node had panic'd during a reconfig or an
2890 	 * add/delete/join/withdraw operation, the other nodes' node
2891 	 * records may not reflect the current state of the diskset,
2892 	 * so calling metaget_setownership is the safest thing to do.
2893 	 */
2894 	if (metaget_setownership(sp, ep) == -1) {
2895 		rval = -1;
2896 		goto out2;
2897 	}
2898 
2899 	/*
2900 	 * Verify that this node is joined
2901 	 * to diskset (i.e. is an owner of the diskset).
2902 	 */
2903 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
2904 		rval = -2;
2905 		goto out2;
2906 	}
2907 
2908 	/*
2909 	 * For a MN diskset, only withdraw master if it is
2910 	 * the only joined node.
2911 	 */
2912 	if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
2913 		nd = sd->sd_nodelist;
2914 		while (nd) {
2915 			/* Skip my node since checking for other owners */
2916 			if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
2917 				nd = nd->nd_next;
2918 				continue;
2919 			}
2920 			/* If another owner node if found, error */
2921 			if (nd->nd_flags & MD_MN_NODE_OWN) {
2922 				(void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
2923 					sp->setno,
2924 					sd->sd_mn_mynode->nd_nodename, NULL,
2925 					sp->setname);
2926 				rval = -1;
2927 				goto out2;
2928 			}
2929 			nd = nd->nd_next;
2930 		}
2931 	}
2932 
2933 	/*
2934 	 * Is current set STALE?
2935 	 */
2936 	(void) memset(&c, 0, sizeof (c));
2937 	c.c_id = 0;
2938 	c.c_setno = sp->setno;
2939 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2940 		(void) mdstealerror(ep, &c.c_mde);
2941 		rval = -1;
2942 		goto out;
2943 	}
2944 	if (c.c_flags & MDDB_C_STALE) {
2945 		stale_bool = TRUE;
2946 	}
2947 
2948 	/*
2949 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2950 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2951 	 * then change the nodelist followed by a reinit and resume.
2952 	 */
2953 	nd = sd->sd_nodelist;
2954 	while (nd) {
2955 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2956 			nd = nd->nd_next;
2957 			continue;
2958 		}
2959 
2960 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2961 		    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2962 			rval = -1;
2963 			goto out;
2964 		}
2965 		suspendall_flag = 1;
2966 		nd = nd->nd_next;
2967 	}
2968 
2969 	/*
2970 	 * Withdraw the set - halt set.
2971 	 * This will fail if any I/O is occuring to any metadevice which
2972 	 * includes a resync to a mirror metadevice.
2973 	 */
2974 	set_halted = 1;
2975 	if (halt_set(sp, ep)) {
2976 		/* Was set actually halted? */
2977 		if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
2978 			set_halted = 0;
2979 		}
2980 		rval = -1;
2981 		goto out;
2982 	}
2983 
2984 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2985 	send_reinit = 1;
2986 
2987 	/* Reset master on withdrawn node */
2988 	if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
2989 	    MD_MN_INVALID_NID, ep)) {
2990 		rval = -1;
2991 		goto out;
2992 	}
2993 
2994 	/* Mark my node as withdrawn and send to other nodes */
2995 	nd = sd->sd_nodelist;
2996 	my_nd = *(sd->sd_mn_mynode);	/* structure copy */
2997 	my_nd.nd_next = NULL;
2998 	while (nd) {
2999 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3000 			nd = nd->nd_next;
3001 			continue;
3002 		}
3003 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3004 		    MD_NR_WITHDRAW, NULL, ep)) {
3005 			rval = -1;
3006 			goto out;
3007 		}
3008 		nd = nd->nd_next;
3009 	}
3010 
3011 	/*
3012 	 * If withdrawn node is a mirror owner, reset mirror owner
3013 	 * to NULL.  If an error occurs, print a warning and continue.
3014 	 * Don't fail metaset because of mirror owner reset problem since
3015 	 * next node to grab mirror will resolve this issue.
3016 	 * Before next node grabs mirrors, metaset will show the withdrawn
3017 	 * node as owner which is why an attempt to reset the mirror owner
3018 	 * is made.
3019 	 */
3020 	node_id_list[0] = sd->sd_mn_mynode->nd_nodeid;	/* Setup my nodeid */
3021 	nd = sd->sd_nodelist;
3022 	while (nd) {
3023 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3024 			nd = nd->nd_next;
3025 			continue;
3026 		}
3027 		if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
3028 		    1, &node_id_list[0], &xep) == 01) {
3029 			mde_perror(&xep, dgettext(TEXT_DOMAIN,
3030 			    "Unable to reset mirror owner on node %s"),
3031 			    nd->nd_nodename);
3032 			mdclrerror(&xep);
3033 		}
3034 		nd = nd->nd_next;
3035 	}
3036 
3037 out:
3038 	if (rval == -1) {
3039 		/* Rejoin node - Mark node as joined and send to other nodes */
3040 		nd = sd->sd_nodelist;
3041 		my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3042 		my_nd.nd_next = NULL;
3043 		while (nd) {
3044 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3045 				nd = nd->nd_next;
3046 				continue;
3047 			}
3048 			if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3049 			    MD_NR_JOIN, NULL, &xep)) {
3050 				mdclrerror(&xep);
3051 			}
3052 			nd = nd->nd_next;
3053 		}
3054 
3055 		/* Set master on withdrawn node */
3056 		if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
3057 		    sd->sd_mn_master_nodenm,
3058 		    sd->sd_mn_master_nodeid, &xep)) {
3059 			mdclrerror(&xep);
3060 		}
3061 
3062 		/* Join set if halt_set had succeeded */
3063 		if (set_halted) {
3064 			/*
3065 			 * Causes mddbs to be loaded into the kernel.
3066 			 * Set the force flag so that replica locations can be
3067 			 * loaded into the kernel even if a mediator node was
3068 			 * unavailable.  This allows a node to join an MO
3069 			 * diskset when there are sufficient replicas available,
3070 			 * but a mediator node in unavailable.
3071 			 */
3072 			if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) {
3073 				mdclrerror(&xep);
3074 			}
3075 			/* If set previously stale - make it so at re-join */
3076 			if (snarf_set(sp, stale_bool, &xep) != 0) {
3077 				mdclrerror(&xep);
3078 				(void) halt_set(sp, &xep);
3079 				mdclrerror(&xep);
3080 			}
3081 		}
3082 	}
3083 
3084 	/*
3085 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3086 	 * Send reinit command to mdcommd which forces it to get
3087 	 * fresh set description.
3088 	 */
3089 	if (send_reinit) {
3090 		/* Send reinit */
3091 		nd = sd->sd_nodelist;
3092 		while (nd) {
3093 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3094 				nd = nd->nd_next;
3095 				continue;
3096 			}
3097 
3098 			/* Class is ignored for REINIT */
3099 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3100 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3101 				/*
3102 				 * We are here because we failed to resume
3103 				 * rpc.mdcommd.  However we potentially have
3104 				 * an error from the previous call.
3105 				 * If the previous call did fail,  we
3106 				 * capture that error and generate a perror
3107 				 * withthe string,  "Unable to resume...".
3108 				 * Setting rval to -1 ensures that in the
3109 				 * next iteration of the loop, ep is not
3110 				 * clobbered.
3111 				 */
3112 				if (rval == 0)
3113 					(void) mdstealerror(ep, &xep);
3114 				else
3115 					mdclrerror(&xep);
3116 				rval = -1;
3117 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3118 				    "Unable to reinit rpc.mdcommd."));
3119 			}
3120 			nd = nd->nd_next;
3121 		}
3122 	}
3123 
3124 out2:
3125 	/*
3126 	 * Unlock diskset by resuming messages across the diskset.
3127 	 * Just resume all classes so that resume is the same whether
3128 	 * just one class was locked or all classes were locked.
3129 	 */
3130 	if ((suspend1_flag) || (suspendall_flag)) {
3131 		nd = sd->sd_nodelist;
3132 		while (nd) {
3133 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3134 				nd = nd->nd_next;
3135 				continue;
3136 			}
3137 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3138 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
3139 				/*
3140 				 * We are here because we failed to resume
3141 				 * rpc.mdcommd.  However we potentially have
3142 				 * an error from the previous call
3143 				 * If the previous call did fail,  we capture
3144 				 * that error and generate a perror with
3145 				 * the string, "Unable to resume...".
3146 				 * Setting rval to -1 ensures that in the
3147 				 * next iteration of the loop, ep is not
3148 				 * clobbered.
3149 				 */
3150 				if (rval == 0)
3151 					(void) mdstealerror(ep, &xep);
3152 				else
3153 					mdclrerror(&xep);
3154 				rval = -1;
3155 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3156 				    "Unable to resume rpc.mdcommd."));
3157 			}
3158 			nd = nd->nd_next;
3159 		}
3160 		meta_ping_mnset(sp->setno);
3161 	}
3162 
3163 	/*
3164 	 * Unlock set.  This flushes the caches on the servers.
3165 	 */
3166 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3167 	nd = sd->sd_nodelist;
3168 	while (nd) {
3169 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3170 			nd = nd->nd_next;
3171 			continue;
3172 		}
3173 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
3174 			if (rval == 0)
3175 				(void) mdstealerror(ep, &xep);
3176 			else
3177 				mdclrerror(&xep);
3178 			rval = -1;
3179 		}
3180 		nd = nd->nd_next;
3181 	}
3182 
3183 	/*
3184 	 * call metaflushsetnames to reset local cache for master and
3185 	 * node information.
3186 	 */
3187 	metaflushsetname(sp);
3188 
3189 	/* release signals back to what they were on entry */
3190 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3191 		mdclrerror(&xep);
3192 
3193 	return (rval);
3194 
3195 }
3196 
3197 /*
3198  * Update nodelist with cluster member information.
3199  * A node not in the member list will be marked
3200  * as not ALIVE and not OWN.
3201  * A node in the member list will be marked ALIVE, but
3202  * the OWN bit will not be changed.
3203  *
3204  * If mynode isn't in the membership list, fail causing
3205  * another reconfig cycle to be started since a non-member
3206  * node shouldn't be taking part in the reconfig cycle.
3207  *
3208  * Return values:
3209  *	0 - No problem.
3210  *	1 - Any failure including RPC failure to my node.
3211  */
3212 int
3213 meta_reconfig_update_nodelist(
3214 	mdsetname_t			*sp,
3215 	mndiskset_membershiplist_t	*nl,
3216 	md_set_desc			*sd,
3217 	md_error_t			*ep
3218 )
3219 {
3220 	mndiskset_membershiplist_t	*nl2;
3221 	md_mnnode_desc			*nd;
3222 	md_error_t			xep = mdnullerror;
3223 	int				rval = 0;
3224 
3225 	/*
3226 	 * Walk through nodelist, checking to see if each
3227 	 * node is in the member list.
3228 	 * If node is not a member, reset ALIVE and OWN node flag.
3229 	 * If node is a member, set ALIVE.
3230 	 * If mynode's OWN flag gets reset, then halt the diskset on this node.
3231 	 */
3232 	nd = sd->sd_nodelist;
3233 	while (nd) {
3234 		nl2 = nl;
3235 		while (nl2) {
3236 			/* If node is in member list, set ALIVE */
3237 			if (nl2->msl_node_id == nd->nd_nodeid) {
3238 				nd->nd_flags |= MD_MN_NODE_ALIVE;
3239 				break;
3240 			} else {
3241 				nl2 = nl2->next;
3242 			}
3243 			/* node is not in member list, mark !ALIVE and !OWN */
3244 			if (nl2 == NULL) {
3245 				/* If node is mynode, then halt set if needed */
3246 				if (strcmp(mynode(), nd->nd_nodename) == 0) {
3247 					/*
3248 					 * This shouldn't happen, but just
3249 					 * in case...  Any node not in the
3250 					 * membership list should be dead and
3251 					 * not running reconfig step1.
3252 					 */
3253 					if (nd->nd_flags & MD_MN_NODE_OWN) {
3254 						if (halt_set(sp, &xep)) {
3255 							mde_perror(&xep, "");
3256 							mdclrerror(&xep);
3257 						}
3258 					}
3259 					/*
3260 					 * Return failure since this node
3261 					 * (mynode) is not in the membership
3262 					 * list, but process the rest of the
3263 					 * nodelist first so that rpc.metad
3264 					 * can be updated with the latest
3265 					 * membership information.
3266 					 */
3267 					(void) mddserror(ep,
3268 					    MDE_DS_NOTINMEMBERLIST,
3269 					    sp->setno, nd->nd_nodename, NULL,
3270 					    sp->setname);
3271 					rval = 1;
3272 				}
3273 				nd->nd_flags &= ~MD_MN_NODE_ALIVE;
3274 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3275 			}
3276 		}
3277 		nd = nd->nd_next;
3278 	}
3279 
3280 	/* Send this information to rpc.metad */
3281 	if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
3282 	    MD_NR_SET,  MNSET_IN_RECONFIG, &xep)) {
3283 		/* Return failure if can't send node flags to rpc.metad */
3284 		if (rval == 0) {
3285 			(void) mdstealerror(ep, &xep);
3286 			rval = 1;
3287 		}
3288 	}
3289 	return (rval);
3290 }
3291 
3292 /*
3293  * Choose master determines the master for a diskset.
3294  * Each node determines the master on its own and
3295  * adds this information to its local rpc.metad nodelist
3296  * and also sends it to the kernel.
3297  *
3298  * Nodelist in set descriptor (sd) is sorted in
3299  * monotonically increasing sequence of nodeid.
3300  *
3301  * Return values:
3302  *	0 - No problem.
3303  *	205 - There was an RPC problem to another node.
3304  *	-1 - There was an error.  This could be an RPC error to my node.
3305  *		This is a catastrophic failure causing node to panic.
3306  */
3307 int
3308 meta_reconfig_choose_master_for_set(
3309 	mdsetname_t	*sp,
3310 	md_set_desc	*sd,
3311 	md_error_t	*ep
3312 )
3313 {
3314 	int			is_owner;
3315 	md_mnset_record		*mnsr = NULL;
3316 	int			lowest_alive_nodeid = 0;
3317 	uint_t			master_nodeid;
3318 	md_mnnode_desc		*nd, *nd2;
3319 	md_mnnode_record	*nr;
3320 	md_drive_desc		*dd;
3321 	md_setkey_t		*cl_sk;
3322 	int			rval = 0;
3323 	md_error_t		xep = mdnullerror;
3324 	mddb_setflags_config_t	sf;
3325 
3326 	/*
3327 	 * Is current node joined to diskset?
3328 	 * Don't trust flags, really check to see if mddb is snarfed.
3329 	 */
3330 	if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
3331 		/*
3332 		 * If a node is joined to the diskset, this node checks
3333 		 * to see if the current master of the diskset is valid and
3334 		 * is still in the membership list (ALIVE) and is
3335 		 * still joined (OWN).  Need to verify if master is
3336 		 * really joined - don't trust the flags.  (Can trust
3337 		 * ALIVE since set during earlier part of reconfig cycle.)
3338 		 * If the current master is valid, still in the membership
3339 		 * list and joined, then master is not changed on this node.
3340 		 * Just return.
3341 		 *
3342 		 * Verify that nodeid is valid before accessing masternode.
3343 		 */
3344 		if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
3345 		    (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
3346 			if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
3347 			    &is_owner, ep) == -1) {
3348 				/* If RPC failure to another node return 205 */
3349 				if ((mdanyrpcerror(ep)) &&
3350 				    (sd->sd_mn_mynode->nd_nodeid !=
3351 				    sd->sd_mn_master_nodeid)) {
3352 					return (205);
3353 				} else {
3354 					/* Any other failure */
3355 					return (-1);
3356 				}
3357 			} else {
3358 				if (is_owner == TRUE) {
3359 
3360 					meta_mc_log(MC_LOG5, dgettext(
3361 					    TEXT_DOMAIN, "Set %s previous "
3362 					    "master chosen %s (%d): %s"),
3363 					    sp->setname,
3364 					    sd->sd_mn_master_nodenm,
3365 					    sd->sd_mn_master_nodeid,
3366 					    meta_print_hrtime(gethrtime() -
3367 					    start_time));
3368 
3369 					/* Previous master is ok - done */
3370 					return (0);
3371 				}
3372 			}
3373 		}
3374 
3375 		/*
3376 		 * If current master is no longer in the membership list or
3377 		 * is no longer joined, then this node uses the following
3378 		 * algorithm:
3379 		 * - node calls RPC routine clnt_ownset to get latest
3380 		 *	information on which nodes are owners of diskset.
3381 		 * 	clnt_ownset checks on each node to see if its kernel
3382 		 *	has that diskset snarfed.
3383 		 */
3384 		nd = sd->sd_nodelist;
3385 		while (nd) {
3386 			/* Don't consider node that isn't in member list */
3387 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3388 				nd = nd->nd_next;
3389 				continue;
3390 			}
3391 
3392 			if (clnt_ownset(nd->nd_nodename, sp,
3393 			    &is_owner, ep) == -1) {
3394 				/* If RPC failure to another node return 205 */
3395 				if ((mdanyrpcerror(ep)) &&
3396 				    (sd->sd_mn_mynode->nd_nodeid !=
3397 				    nd->nd_nodeid)) {
3398 					return (205);
3399 				} else {
3400 					/* Any other failure */
3401 					return (-1);
3402 				}
3403 			}
3404 
3405 			/*
3406 			 * Set owner flag for each node based on whether
3407 			 * that node really has a diskset mddb snarfed in
3408 			 * or not.
3409 			 */
3410 			if (is_owner == TRUE)
3411 				nd->nd_flags |= MD_MN_NODE_OWN;
3412 			else
3413 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3414 
3415 			nd = nd->nd_next;
3416 		}
3417 
3418 		/*
3419 		 * - node walks through nodelist looking for nodes that are
3420 		 *	owners of the diskset that are in the membership list.
3421 		 * - for each owner, node calls RPC routine clnt_getset to
3422 		 *	 see if that node has its node record set to OK.
3423 		 * - If so, master is chosen to be this owner node.
3424 		 */
3425 		nd = sd->sd_nodelist;
3426 		while (nd) {
3427 			/* Don't consider node that isn't in member list */
3428 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3429 				nd = nd->nd_next;
3430 				continue;
3431 			}
3432 
3433 			/* Don't consider a node that isn't an owner */
3434 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3435 				nd = nd->nd_next;
3436 				continue;
3437 			}
3438 
3439 			/* Does node has its own node record set to OK? */
3440 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3441 			    MD_SET_BAD, &mnsr, ep) == -1) {
3442 				/* If RPC failure to another node return 205 */
3443 				if ((mdanyrpcerror(ep)) &&
3444 				    (sd->sd_mn_mynode->nd_nodeid !=
3445 				    nd->nd_nodeid)) {
3446 					return (205);
3447 				} else {
3448 					/* Any other failure */
3449 					return (-1);
3450 				}
3451 			}
3452 			nr = mnsr->sr_nodechain;
3453 			while (nr) {
3454 				if (nd->nd_nodeid == nr->nr_nodeid) {
3455 					if (nr->nr_flags & MD_MN_NODE_OK) {
3456 						/* Found a master */
3457 						free_sr(
3458 						    (md_set_record *)mnsr);
3459 						goto found_master;
3460 					}
3461 				}
3462 				nr = nr->nr_next;
3463 			}
3464 			free_sr((md_set_record *)mnsr);
3465 			nd = nd->nd_next;
3466 		}
3467 
3468 		/*
3469 		 * - If no owner node has its own node record on its own node
3470 		 *	set to OK, then this node checks all of the non-owner
3471 		 * 	nodes that are in the membership list.
3472 		 * - for each non-owner, node calls RPC routine clnt_getset to
3473 		 *	 see if that node has its node record set to OK.
3474 		 * - If set doesn't exist, don't choose node for master.
3475 		 * - If so, master is chosen to be this non-owner node.
3476 		 *
3477 		 */
3478 		nd = sd->sd_nodelist;
3479 		while (nd) {
3480 			/* Don't consider node that isn't in member list */
3481 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3482 				nd = nd->nd_next;
3483 				continue;
3484 			}
3485 
3486 			/* Only checking non-owner nodes this time around */
3487 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3488 				nd = nd->nd_next;
3489 				continue;
3490 			}
3491 
3492 			/* Does node has its own node record set to OK? */
3493 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3494 			    MD_SET_BAD, &mnsr, ep) == -1) {
3495 				/*
3496 				 * If set doesn't exist on non-owner node,
3497 				 * don't consider this node for master.
3498 				 */
3499 				if (mdiserror(ep, MDE_NO_SET)) {
3500 					nd = nd->nd_next;
3501 					continue;
3502 				} else if ((mdanyrpcerror(ep)) &&
3503 				    (sd->sd_mn_mynode->nd_nodeid !=
3504 				    nd->nd_nodeid)) {
3505 					/* RPC failure to another node */
3506 					return (205);
3507 				} else {
3508 					/* Any other failure */
3509 					return (-1);
3510 				}
3511 			}
3512 			nr = mnsr->sr_nodechain;
3513 			while (nr) {
3514 				if (nd->nd_nodeid == nr->nr_nodeid) {
3515 					if (nr->nr_flags & MD_MN_NODE_OK) {
3516 						/* Found a master */
3517 						free_sr(
3518 						    (md_set_record *)mnsr);
3519 						goto found_master;
3520 					}
3521 				}
3522 				nr = nr->nr_next;
3523 			}
3524 			free_sr((md_set_record *)mnsr);
3525 			nd = nd->nd_next;
3526 		}
3527 
3528 		/*
3529 		 * - If no node can be found that has its own node record on
3530 		 *	its node to be set to OK, then all alive nodes
3531 		 * 	were in the process of being added to or deleted
3532 		 *	from set.  Each alive node will remove all
3533 		 *	information pertaining to this set from its node.
3534 		 *
3535 		 * If all nodes in set are ALIVE, then call sdssc end routines
3536 		 * since set was truly being initially created or destroyed.
3537 		 */
3538 		goto delete_set;
3539 	} else {
3540 
3541 		/*
3542 		 * If node is not joined to diskset, then this
3543 		 * node uses the following algorithm:
3544 		 * - If unjoined node doesn't have a node record for itself,
3545 		 *	just delete the diskset since diskset was in the
3546 		 *	process of being created.
3547 		 * - node needs to find master of diskset before
3548 		 *	reconfig cycle, if a master existed.
3549 		 * - node calls RPC routine clnt_ownset to get latest
3550 		 * 	information on which nodes are owners of diskset.
3551 		 *	clnt_ownset checks on each node to see if its
3552 		 *	kernel has that diskset snarfed.
3553 		 */
3554 
3555 		/*
3556 		 * Is my node in the set description?
3557 		 * If not, delete the set from this node.
3558 		 * sr2setdesc sets sd_mn_mynode pointer to the node
3559 		 * descriptor for this node if there was a node
3560 		 * record for this node.
3561 		 *
3562 		 */
3563 		if (sd->sd_mn_mynode == NULL) {
3564 			goto delete_set;
3565 		}
3566 
3567 		nd = sd->sd_nodelist;
3568 		while (nd) {
3569 			/* Don't consider node that isn't in member list */
3570 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3571 				nd = nd->nd_next;
3572 				continue;
3573 			}
3574 
3575 			if (clnt_ownset(nd->nd_nodename, sp,
3576 			    &is_owner, ep) == -1) {
3577 				/* If RPC failure to another node return 205 */
3578 				if ((mdanyrpcerror(ep)) &&
3579 				    (sd->sd_mn_mynode->nd_nodeid !=
3580 				    nd->nd_nodeid)) {
3581 					return (205);
3582 				} else {
3583 					/* Any other failure */
3584 					return (-1);
3585 				}
3586 			}
3587 
3588 			/*
3589 			 * Set owner flag for each node based on whether
3590 			 * that node really has a diskset mddb snarfed in
3591 			 * or not.
3592 			 */
3593 			if (is_owner == TRUE)
3594 				nd->nd_flags |= MD_MN_NODE_OWN;
3595 			else
3596 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3597 
3598 			nd = nd->nd_next;
3599 		}
3600 
3601 		/*
3602 		 * - node walks through nodelist looking for nodes that
3603 		 *	are owners of the diskset that are in
3604 		 *	the membership list.
3605 		 * - for each owner, node calls RPC routine clnt_getset to
3606 		 *	see if that node has a master set and to get the
3607 		 *	diskset description.
3608 		 * - If the owner node has a set description that doesn't
3609 		 *	include the non-joined node in the nodelist, this node
3610 		 *	removes its set description of that diskset
3611 		 *	(i.e. removes the set from its local mddbs).  This is
3612 		 *	handling the case of when a node was removed from a
3613 		 *	diskset while it was not in the cluster membership
3614 		 *	list.
3615 		 * - If that node has a master set and the master is in the
3616 		 *	membership list and is an owner, then either this was
3617 		 *	the master from before the reconfig cycle or this
3618 		 *	node has already chosen a new master - either way,
3619 		 *	the master value is valid as long as it is in the
3620 		 *	membership list and is an owner
3621 		 * - master is chosen to be owner node's master
3622 		 */
3623 		nd = sd->sd_nodelist;
3624 		while (nd) {
3625 			/* Don't consider node that isn't in member list */
3626 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3627 				nd = nd->nd_next;
3628 				continue;
3629 			}
3630 
3631 			/* Don't consider a node that isn't an owner */
3632 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3633 				nd = nd->nd_next;
3634 				continue;
3635 			}
3636 
3637 			/* Get owner node's set record */
3638 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3639 			    MD_SET_BAD, &mnsr, ep) == -1) {
3640 				/* If RPC failure to another node return 205 */
3641 				if ((mdanyrpcerror(ep)) &&
3642 				    (sd->sd_mn_mynode->nd_nodeid !=
3643 				    nd->nd_nodeid)) {
3644 					return (205);
3645 				} else {
3646 					/* Any other failure */
3647 					return (-1);
3648 				}
3649 			}
3650 
3651 			/* Is this node in the owner node's set record */
3652 			nr = mnsr->sr_nodechain;
3653 			while (nr) {
3654 				if (sd->sd_mn_mynode->nd_nodeid ==
3655 				    nr->nr_nodeid) {
3656 					break;
3657 				}
3658 				nr = nr->nr_next;
3659 			}
3660 			if (nr == NULL) {
3661 				/* my node not found - delete set */
3662 				free_sr((md_set_record *)mnsr);
3663 				goto delete_set;
3664 			}
3665 
3666 			/* Is owner's node's master valid? */
3667 			master_nodeid = mnsr->sr_master_nodeid;
3668 			free_sr((md_set_record *)mnsr);
3669 			if (master_nodeid == MD_MN_INVALID_NID) {
3670 				nd = nd->nd_next;
3671 				continue;
3672 			}
3673 
3674 			nd2 = sd->sd_nodelist;
3675 			while (nd2) {
3676 				if ((nd2->nd_nodeid == master_nodeid) &&
3677 				    (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
3678 				    (nd2->nd_flags & MD_MN_NODE_OWN)) {
3679 						nd = nd2;
3680 						goto found_master;
3681 				}
3682 				nd2 = nd2->nd_next;
3683 			}
3684 			nd = nd->nd_next;
3685 		}
3686 
3687 		/*
3688 		 * - If no owner node has a valid master, then follow
3689 		 * 	algorithm of when a node is joined to the diskset.
3690 		 * - node walks through nodelist looking for nodes that are
3691 		 *	owners of the diskset that are in the membership list.
3692 		 * - for each owner, node calls RPC routine clnt_getset to
3693 		 *	 see if that node has its node record set to OK.
3694 		 * - If so, master is chosen to be this owner node.
3695 		 */
3696 		nd = sd->sd_nodelist;
3697 		while (nd) {
3698 			/* Don't consider node that isn't in member list */
3699 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3700 				nd = nd->nd_next;
3701 				continue;
3702 			}
3703 
3704 			/* Don't consider a node that isn't an owner */
3705 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3706 				nd = nd->nd_next;
3707 				continue;
3708 			}
3709 
3710 			/* Does node has its own node record set to OK? */
3711 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3712 			    MD_SET_BAD, &mnsr, ep) == -1) {
3713 				/* If RPC failure to another node return 205 */
3714 				if ((mdanyrpcerror(ep)) &&
3715 				    (sd->sd_mn_mynode->nd_nodeid !=
3716 				    nd->nd_nodeid)) {
3717 					return (205);
3718 				} else {
3719 					/* Any other failure */
3720 					return (-1);
3721 				}
3722 			}
3723 			nr = mnsr->sr_nodechain;
3724 			while (nr) {
3725 				if (nd->nd_nodeid == nr->nr_nodeid) {
3726 					if (nr->nr_flags & MD_MN_NODE_OK) {
3727 						/* Found a master */
3728 						free_sr(
3729 						    (md_set_record *)mnsr);
3730 						goto found_master;
3731 					}
3732 				}
3733 				nr = nr->nr_next;
3734 			}
3735 			free_sr((md_set_record *)mnsr);
3736 			nd = nd->nd_next;
3737 		}
3738 
3739 		/*
3740 		 * - If no owner node has its own node record on its own node
3741 		 *	set to OK, then this node checks all of the non-owner
3742 		 *	nodes that are in the membership list.
3743 		 * - for each non-owner, node calls RPC routine clnt_getset to
3744 		 *	see if that node has its node record set to OK.
3745 		 * - If set doesn't exist, don't choose node for master.
3746 		 * - If this node doesn't exist in the nodelist on any of the
3747 		 *	non-owner nodes, this node removes its set description
3748 		 *	of that diskset (i.e. removes the set from its local
3749 		 *	mddbs). This is handling the case of when a node was
3750 		 *	removed from a diskset while it was not in the
3751 		 *	cluster membership list.
3752 		 * - If non-owner node has its node record set to OK and if
3753 		 *	this node hasn't removed this diskset (step directly
3754 		 *	before this one), then the master is chosen to be this
3755 		 *	non-owner node.
3756 		 */
3757 		nd = sd->sd_nodelist;
3758 		while (nd) {
3759 			/* Don't consider node that isn't in member list */
3760 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3761 				nd->nd_flags |= MD_MN_NODE_DEL;
3762 				nd = nd->nd_next;
3763 				continue;
3764 			}
3765 
3766 			/* Don't consider owner nodes since none are OK */
3767 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3768 				nd->nd_flags |= MD_MN_NODE_DEL;
3769 				nd = nd->nd_next;
3770 				continue;
3771 			}
3772 
3773 			/*
3774 			 * Don't need to get nodelist from my node since
3775 			 * this is where sd_nodelist was obtained.
3776 			 */
3777 			if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3778 				nd = nd->nd_next;
3779 				continue;
3780 			}
3781 
3782 			/*
3783 			 * If node has already been decided against for
3784 			 * master, then skip it.
3785 			 */
3786 			if (nd->nd_flags & MD_MN_NODE_DEL) {
3787 				nd = nd->nd_next;
3788 				continue;
3789 			}
3790 
3791 			/*
3792 			 * Does node in my nodelist have its own node
3793 			 * record marked OK on its node?  And does node
3794 			 * in my nodelist exist on all other nodes?
3795 			 * Don't want to choose a node for master unless
3796 			 * that node is marked OK on its own node and that
3797 			 * node exists on all other alive nodes.
3798 			 *
3799 			 * This is guarding against the case when several
3800 			 * nodes are down and one of the downed nodes is
3801 			 * deleted from the diskset.  When the down nodes
3802 			 * are rebooted into the cluster, you don't want
3803 			 * any node to pick the deleted node as the master.
3804 			 */
3805 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3806 			    MD_SET_BAD, &mnsr, ep) == -1) {
3807 				/*
3808 				 * If set doesn't exist on non-owner node,
3809 				 * don't consider this node for master.
3810 				 */
3811 				if (mdiserror(ep, MDE_NO_SET)) {
3812 					nd->nd_flags |= MD_MN_NODE_DEL;
3813 					nd = nd->nd_next;
3814 					continue;
3815 				} else if (mdanyrpcerror(ep)) {
3816 					/* RPC failure to another node */
3817 					return (205);
3818 				} else {
3819 					/* Any other failure */
3820 					return (-1);
3821 				}
3822 			}
3823 			/*
3824 			 * Is my node in the nodelist gotten from the other
3825 			 * node?  If not, then remove the set from my node
3826 			 * since set was deleted from my node while my node
3827 			 * was out of the cluster.
3828 			 */
3829 			nr = mnsr->sr_nodechain;
3830 			while (nr) {
3831 				if (sd->sd_mn_mynode->nd_nodeid ==
3832 				    nr->nr_nodeid) {
3833 					break;
3834 				}
3835 				nr = nr->nr_next;
3836 			}
3837 			if (nr == NULL) {
3838 				/* my node not found - delete set */
3839 				free_sr((md_set_record *)mnsr);
3840 				goto delete_set;
3841 			}
3842 
3843 			/* Is node being checked marked OK on its own node? */
3844 			nr = mnsr->sr_nodechain;
3845 			while (nr) {
3846 				if (nd->nd_nodeid == nr->nr_nodeid) {
3847 					if (!(nr->nr_flags & MD_MN_NODE_OK)) {
3848 						nd->nd_flags |= MD_MN_NODE_DEL;
3849 					}
3850 					break;
3851 				}
3852 				nr = nr->nr_next;
3853 			}
3854 			/*
3855 			 * If node being checked doesn't exist on its
3856 			 * own node - don't choose it as master.
3857 			 */
3858 			if (nr == NULL) {
3859 				nd->nd_flags |= MD_MN_NODE_DEL;
3860 			}
3861 
3862 			/*
3863 			 * Check every node in my node's nodelist against
3864 			 * the nodelist gotten from the other node.
3865 			 * If a node in my node's nodelist is not found in the
3866 			 * other node's nodelist, then set the DEL flag.
3867 			 */
3868 			nd2 = sd->sd_nodelist;
3869 			while (nd2) {
3870 				nr = mnsr->sr_nodechain;
3871 				while (nr) {
3872 					if (nd2->nd_nodeid == nr->nr_nodeid) {
3873 						break;
3874 					}
3875 					nr = nr->nr_next;
3876 				}
3877 				/* nd2 not found in other node's nodelist */
3878 				if (nr == NULL) {
3879 					nd2->nd_flags |= MD_MN_NODE_DEL;
3880 				}
3881 				nd2 = nd2->nd_next;
3882 			}
3883 
3884 			free_sr((md_set_record *)mnsr);
3885 			nd = nd->nd_next;
3886 		}
3887 
3888 		/*
3889 		 * Rescan list look for node that has not been marked DEL.
3890 		 * First node found is the master.
3891 		 */
3892 		nd = sd->sd_nodelist;
3893 		while (nd) {
3894 			if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
3895 				break;
3896 			}
3897 			nd = nd->nd_next;
3898 			continue;
3899 		}
3900 		if (nd) {
3901 			/* Found a master */
3902 			goto found_master;
3903 		}
3904 
3905 		/*
3906 		 * - If no node can be found that has its own node record on
3907 		 *	its node to be set to OK, then all alive nodes
3908 		 * 	were in the process of being added to or deleted
3909 		 *	from set.  Each alive node will remove all
3910 		 *	information pertaining to this set from its node.
3911 		 *
3912 		 * If all nodes in set are ALIVE, then call sdssc end routines
3913 		 * since set was truly being initially created or destroyed.
3914 		 */
3915 		goto delete_set;
3916 	}
3917 
3918 found_master:
3919 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3920 	    "Set %s master chosen %s (%d): %s"),
3921 	    sp->setname, nd->nd_nodename, nd->nd_nodeid,
3922 	    meta_print_hrtime(gethrtime() - start_time));
3923 
3924 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
3925 		return (-1);
3926 	}
3927 
3928 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3929 
3930 	if (clnt_mnsetmaster(mynode(), sp,
3931 	    nd->nd_nodename, nd->nd_nodeid, ep)) {
3932 		rval = -1;
3933 	} else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3934 		/* If this node is new master, set flag in this node's kernel */
3935 		(void) memset(&sf, 0, sizeof (sf));
3936 		sf.sf_setno = sp->setno;
3937 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
3938 		/* Use magic to help protect ioctl against attack. */
3939 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
3940 		sf.sf_flags = MDDB_NM_SET;
3941 
3942 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3943 		    "Setting new master flag for set %s: %s"),
3944 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
3945 
3946 		/*
3947 		 * Fail reconfig cycle if ioctl fails since it is critical
3948 		 * to set new master flag.
3949 		 */
3950 		if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
3951 		    NULL) != NULL) {
3952 			(void) mdstealerror(ep, &sf.sf_mde);
3953 			rval = -1;
3954 		}
3955 	}
3956 
3957 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
3958 		if (rval == 0) {
3959 			(void) mdstealerror(ep, &xep);
3960 			rval = -1;
3961 		}
3962 	}
3963 
3964 	cl_set_setkey(NULL);
3965 
3966 	metaflushsetname(sp);
3967 
3968 	return (rval);
3969 
3970 delete_set:
3971 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3972 	    "Master not chosen, deleting set %s: %s"),
3973 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
3974 
3975 	/*
3976 	 * Remove all set information from this node:
3977 	 *	- node records for this set
3978 	 *	- drive records for this set
3979 	 *	- set record for this set
3980 	 * (Only do this on this node since each node
3981 	 * will do it for its own local mddb.)
3982 	 *
3983 	 * If all nodes in set are ALIVE, then
3984 	 * the lowest numbered ALIVE nodeid in set
3985 	 * (irregardless of whether an owner node or not) will
3986 	 * call the DCS service to cleanup for create/delete of set.
3987 	 *   sdssc_create_end(cleanup) if set was being created or
3988 	 *   sdssc_delete_end(cleanup) if set was being deleted.
3989 	 * A node record with flag ADD denotes a set being
3990 	 * created.  A node record with flag DEL denotes a
3991 	 * set being deleted.
3992 	 */
3993 	nd = sd->sd_nodelist;
3994 	while (nd) {
3995 		/* Found a node that isn't alive */
3996 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
3997 			break;
3998 
3999 		/* Is my node the lowest numbered ALIVE node? */
4000 		if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
4001 			break;
4002 		}
4003 		nd = nd->nd_next;
4004 	}
4005 	if (nd == NULL) {
4006 		/* All nodes ALIVE and this is the lowest nodeid */
4007 		lowest_alive_nodeid = 1;
4008 	}
4009 
4010 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
4011 		return (-1);
4012 	}
4013 
4014 
4015 	/*
4016 	 * If this node had been joined, withdraw and reset master.
4017 	 *
4018 	 * This could happen if a node was being added to or removed
4019 	 * from a diskset and the node doing the add/delete operation and
4020 	 * all other nodes in the diskset have left the cluster.
4021 	 */
4022 	if (sd->sd_mn_mynode) {
4023 		nd = sd->sd_mn_mynode;
4024 		if (nd->nd_flags & MD_MN_NODE_OWN) {
4025 			if (clnt_withdrawset(mynode(), sp, ep)) {
4026 				rval = -1;
4027 				goto out;
4028 			}
4029 			if (clnt_mnsetmaster(mynode(), sp, "",
4030 			    MD_MN_INVALID_NID, ep)) {
4031 				rval = -1;
4032 				goto out;
4033 			}
4034 		}
4035 	}
4036 
4037 	/*
4038 	 * Remove side records for this node (side) from local mddb
4039 	 * (clnt_deldrvs does this) if there are drives in the set.
4040 	 *
4041 	 * Don't need to mark this node as DEL since already marked as
4042 	 * ADD or DEL (or this node would have been chosen as master).
4043 	 * Don't need to mark other node records, drive records or
4044 	 * set records as DEL.  If a panic occurs during clnt_delset,
4045 	 * these records will be deleted the next time this node
4046 	 * becomes a member and goes through the reconfig cycle.
4047 	 */
4048 	/* Get the drive descriptors for this set */
4049 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4050 	    ep)) == NULL) {
4051 		if (! mdisok(ep)) {
4052 			/*
4053 			 * Ignore and clear out any failures from
4054 			 * metaget_drivedesc since a panic could have
4055 			 * occurred when a node was partially added to a set.
4056 			 */
4057 			mdclrerror(ep);
4058 		}
4059 	} else {
4060 		if (clnt_deldrvs(mynode(), sp, dd, ep)) {
4061 			rval = -1;
4062 			goto out;
4063 		}
4064 	}
4065 
4066 	/*
4067 	 * Now, delete the set - this removes the node, drive
4068 	 * and set records from the local mddb.
4069 	 */
4070 	if (clnt_delset(mynode(), sp, ep)) {
4071 		rval = -1;
4072 		goto out;
4073 	}
4074 
4075 out:
4076 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4077 
4078 	/*
4079 	 * Ignore errors from unlock of set since set is no longer
4080 	 * known (if clnt_delset worked).
4081 	 */
4082 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4083 		mdclrerror(&xep);
4084 	}
4085 
4086 	cl_set_setkey(NULL);
4087 
4088 	metaflushsetname(sp);
4089 
4090 	/*
4091 	 * If this node is the lowest numbered nodeid then
4092 	 * call sdssc_create/delete_end depending on whether
4093 	 * this node is marked as ADD or DEL in the node record.
4094 	 */
4095 	if (lowest_alive_nodeid) {
4096 		if (nd->nd_flags & MD_MN_NODE_ADD)
4097 			sdssc_create_end(sp->setname, SDSSC_CLEANUP);
4098 		else if (nd->nd_flags & MD_MN_NODE_DEL)
4099 			sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
4100 	}
4101 
4102 	/* Finished with this set -- return */
4103 	return (rval);
4104 }
4105 
4106 /*
4107  * Reconfig step to choose a new master for all MN disksets.
4108  * Return values:
4109  *	0 - Everything is great.
4110  *	1 - This node failed to reconfig.
4111  *	205 - Cause another reconfig due to a nodelist problem
4112  *		or RPC failure to another node
4113  */
4114 int
4115 meta_reconfig_choose_master(
4116 	md_error_t	*ep
4117 )
4118 {
4119 	set_t				max_sets, setno;
4120 	int				nodecnt;
4121 	mndiskset_membershiplist_t	*nl;
4122 	md_set_desc			*sd;
4123 	mdsetname_t			*sp;
4124 	int				rval = 0;
4125 	mddb_setflags_config_t		sf;
4126 	int				start_node_delayed = 0;
4127 
4128 	if ((max_sets = get_max_sets(ep)) == 0) {
4129 		mde_perror(ep, dgettext(TEXT_DOMAIN,
4130 		    "Unable to get number of sets"));
4131 		return (1);
4132 	}
4133 
4134 	/*
4135 	 * Get membershiplist from API routine.  If there's
4136 	 * an error, return a 205 to cause another reconfig.
4137 	 */
4138 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
4139 		mde_perror(ep, "");
4140 		return (205);
4141 	}
4142 
4143 	for (setno = 1; setno < max_sets; setno++) {
4144 		if ((sp = metasetnosetname(setno, ep)) == NULL) {
4145 			if (mdiserror(ep, MDE_NO_SET)) {
4146 				/* No set for this setno - continue */
4147 				mdclrerror(ep);
4148 				continue;
4149 			} else {
4150 				/*
4151 				 * If encountered an RPC error from my node,
4152 				 * then immediately fail.
4153 				 */
4154 				if (mdanyrpcerror(ep)) {
4155 					mde_perror(ep, "");
4156 					return (1);
4157 				}
4158 				/* Can't get set information */
4159 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4160 					"Unable to get information for "
4161 					"set number %d"), setno);
4162 				mdclrerror(ep);
4163 				continue;
4164 			}
4165 		}
4166 
4167 		/* If setname is there, set desc should exist. */
4168 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4169 			/*
4170 			 * If encountered an RPC error from my node,
4171 			 * then immediately fail.
4172 			 */
4173 			if (mdanyrpcerror(ep)) {
4174 				mde_perror(ep, "");
4175 				return (1);
4176 			}
4177 			mde_perror(ep, dgettext(TEXT_DOMAIN,
4178 				"Unable to get set %s desc information"),
4179 				sp->setname);
4180 			mdclrerror(ep);
4181 			continue;
4182 		}
4183 
4184 		/* Only reconfig MN disksets */
4185 		if (!MD_MNSET_DESC(sd)) {
4186 			continue;
4187 		}
4188 
4189 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4190 		    "Begin choose master for set %s: %s"),
4191 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4192 
4193 		/* Update nodelist with member information. */
4194 		if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
4195 			/*
4196 			 * If encountered an RPC error from my node,
4197 			 * then immediately fail.
4198 			 */
4199 			if (mdanyrpcerror(ep)) {
4200 				mde_perror(ep, "");
4201 				return (1);
4202 			}
4203 			mde_perror(ep, "");
4204 			mdclrerror(ep);
4205 			continue;
4206 		}
4207 
4208 		/*
4209 		 * If all nodes in a cluster are starting, then
4210 		 * all nodes will attempt to contact all other nodes
4211 		 * to determine a master node.  This can lead to a
4212 		 * problem where node 1 is trying to contact the rpc.metad
4213 		 * node 2 and node 2 is trying to contact the rpc.metad
4214 		 * on node 1 -- and this causes the rpc call to fail
4215 		 * on both nodes and causes a new reconfig cycle.
4216 		 *
4217 		 * In order to break this problem, a newly starting node
4218 		 * will delay a small amount of time (nodeid mod 4 seconds)
4219 		 * and will then run the code to choose a master for the
4220 		 * first set.  Delay will only be done once regardless of the
4221 		 * number of sets.
4222 		 */
4223 		if (start_node_delayed == 0) {
4224 			(void) memset(&sf, 0, sizeof (sf));
4225 			sf.sf_setno = sp->setno;
4226 			sf.sf_flags = MDDB_NM_GET;
4227 			/* Use magic to help protect ioctl against attack. */
4228 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4229 			if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
4230 			    &sf.sf_mde, NULL) == 0) &&
4231 			    ((sf.sf_setflags & MD_SET_MN_START_RC) ==
4232 			    MD_SET_MN_START_RC)) {
4233 				(void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
4234 			}
4235 			start_node_delayed = 1;
4236 		}
4237 
4238 		/* Choose master for this set */
4239 		rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
4240 		if (rval == -1) {
4241 			mde_perror(ep, "");
4242 			return (1);
4243 		} else if (rval == 205) {
4244 			mde_perror(ep, "");
4245 			return (205);
4246 		}
4247 
4248 		/* Send new nodelist to rpc.mdcommd */
4249 		(void) mdmn_reinit_set(sp->setno);
4250 
4251 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4252 		    "Choose master for set %s completed: %s"),
4253 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4254 	}
4255 
4256 	/*
4257 	 * Each node turns on I/Os for all MN disksets.
4258 	 * This is to recover from the situation where the master died
4259 	 * during a previous reconfig cycle when I/Os were suspended
4260 	 * for a MN diskset.
4261 	 * If a failure occurs return a 1 which will force this node to
4262 	 * panic.  Cannot leave node in the situation where I/Os are
4263 	 * not resumed.
4264 	 */
4265 	setno = 0; /* 0 means all MN sets */
4266 	if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
4267 		mde_perror(ep, "");
4268 		return (1);
4269 	}
4270 
4271 	/* Free the nodelist */
4272 	if (nodecnt)
4273 		meta_free_nodelist(nl);
4274 
4275 	return (0);
4276 }
4277 
4278 /*
4279  * meta_mnsync_user_records will synchronize the diskset user records across
4280  * all nodes in the diskset.  The diskset user records are stored in
4281  * each node's local set mddb.
4282  *
4283  * This needs to be done even if there is no master change during the
4284  * reconfig cycle since this routine should clean up any mess left by
4285  * the untimely termination of a metaset or metadb command (due to a
4286  * node panic or to user intervention).
4287  *
4288  * Caller is the Master node.
4289  *
4290  * Returns	 0 - Success
4291  *		205 - Failure during RPC to another node
4292  *		-1 - Any other failure and ep is filled in.
4293  */
4294 int
4295 meta_mnsync_user_records(
4296 	mdsetname_t	*sp,
4297 	md_error_t	*ep
4298 )
4299 {
4300 	md_set_desc		*sd;
4301 	md_mnnode_desc		*master_nodelist, *nd, *nd2, *ndtail;
4302 	md_mnset_record		*mnsr;
4303 	md_mnsr_node_t		*master_mnsr_node = NULL, *mnsr_node = NULL;
4304 	md_mnnode_record	*nr;
4305 	md_drive_record		*dr;
4306 	int			dr_cnt, dd_cnt;
4307 	int			found_my_nr;
4308 	md_drive_desc		*dd, *dd_prev, *master_dd, *other_dd;
4309 	int			all_drives_ok;
4310 	int			rval = 0;
4311 	int			max_genid = 0;
4312 	int			num_alive_nodes, num_alive_nodes_del = 0;
4313 	int			set_locked = 0;
4314 	md_setkey_t		*cl_sk;
4315 	md_error_t		xep = mdnullerror;
4316 	char			*anode[1];
4317 	mddb_setflags_config_t	sf;
4318 
4319 	/*
4320 	 * Sync up node records first.
4321 	 * Construct a master nodelist using the nodelist from this
4322 	 * node's rpc.metad node records and then setting the state of each
4323 	 * node following these rules:
4324 	 *	- If a node record is marked OK on its node, mark it OK
4325 	 *		in the master nodelist (and later OK on all nodes)
4326 	 *		If a node record is also marked OWN on its node,
4327 	 *		mark it OWN in the master nodelist.
4328 	 *	- If a node record is not marked OK on its node, then mark
4329 	 *		it as DEL in the master list (later deleting it)
4330 	 *	- If node record doesn't exist on that node, then mark it DEL
4331 	 *		(later deleting it)
4332 	 *	- If set record doesn't exist on that node, mark node as DEL
4333 	 *	- If a node record doesn't exist on all nodes, then mark it DEL
4334 	 *	- If a node is not ALIVE, then
4335 	 *		- If that node marked DEL on any node - mark it DEL
4336 	 *			in master list but leave in nodelist
4337 	 *		- If that node is marked as ADD on any node, mark it
4338 	 *			ADD in the master list but leave in nodelist
4339 	 *		- When that node returns to the living, the DEL
4340 	 *			node record will be removed and the ADD node
4341 	 *			record may be removed if marked ADD on that
4342 	 *			node.
4343 	 * The key rule is to not remove a node from the nodelist until
4344 	 * that node record is removed from its own node.  Do not want to
4345 	 * remove a node's record from all other nodes and then have
4346 	 * that node have its own record marked OK so that a node will pick
4347 	 * a different master than the other nodes.
4348 	 *
4349 	 * Next,
4350 	 * If node is ALIVE and node record is marked DEL in master nodelist,
4351 	 * remove node from set.
4352 	 * If node is ALIVE and node record is marked OK in master nodelist,
4353 	 * mark it OK on all other nodes.
4354 	 * If node is not ALIVE and node record is marked DEL in master
4355 	 * nodelist, mark it DEL on all other nodes.
4356 	 * If node is not ALIVE and node record is marked ADD in master,
4357 	 * nodelist, mark it ADD on all other nodes.
4358 	 */
4359 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4360 		return (-1);
4361 	}
4362 	master_nodelist = sd->sd_nodelist;
4363 
4364 	/*
4365 	 * Walk through nodelist creating a master nodelist.
4366 	 */
4367 	num_alive_nodes = 0;
4368 	nd = master_nodelist;
4369 	while (nd) {
4370 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4371 			nd = nd->nd_next;
4372 			continue;
4373 		}
4374 		num_alive_nodes++;
4375 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
4376 		    MD_SET_BAD, &mnsr, ep) == -1) {
4377 			if (mdiserror(ep, MDE_NO_SET)) {
4378 				/* set doesn't exist, mark node as DEL */
4379 				nd->nd_flags &= ~MD_MN_NODE_OK;
4380 				nd->nd_flags &= ~MD_MN_NODE_ADD;
4381 				nd->nd_flags |= MD_MN_NODE_DEL;
4382 				nd->nd_flags |= MD_MN_NODE_NOSET;
4383 				nd = nd->nd_next;
4384 				continue;
4385 			} else {
4386 				/* If RPC failure to another node return 205 */
4387 				if ((mdanyrpcerror(ep)) &&
4388 				    (sd->sd_mn_mynode->nd_nodeid !=
4389 				    nd->nd_nodeid)) {
4390 					rval = 205;
4391 				} else {
4392 					/* Any other failure */
4393 					rval = -1;
4394 				}
4395 				goto out;
4396 			}
4397 		}
4398 		/* Find biggest genid in records for this diskset */
4399 		if (mnsr->sr_genid > max_genid)
4400 			max_genid = mnsr->sr_genid;
4401 
4402 		dr = mnsr->sr_drivechain;
4403 		while (dr) {
4404 			/* Find biggest genid in records for this diskset */
4405 			if (dr->dr_genid > max_genid) {
4406 				max_genid = dr->dr_genid;
4407 			}
4408 			dr = dr->dr_next;
4409 		}
4410 
4411 		found_my_nr = 0;
4412 		nr = mnsr->sr_nodechain;
4413 		/* nr is the list of node recs from nd_nodename node */
4414 		while (nr) {
4415 			/* Find biggest genid in records for this diskset */
4416 			if (nr->nr_genid > max_genid)
4417 				max_genid = nr->nr_genid;
4418 			nd2 = master_nodelist;
4419 			ndtail = NULL;
4420 			/* For each node record, is it in master list? */
4421 			while (nd2) {
4422 				if (nd2->nd_nodeid == nr->nr_nodeid)
4423 					break;
4424 				if (nd2->nd_next == NULL)
4425 					ndtail = nd2;
4426 				nd2 = nd2->nd_next;
4427 			}
4428 			/*
4429 			 * Found node record not in master list -- add it
4430 			 * to list marking it as DEL since node record
4431 			 * should exist on all nodes unless a panic occurred
4432 			 * during addition or deletion of host to diskset.
4433 			 */
4434 			if (nd2 == NULL) {
4435 				nd2 = Zalloc(sizeof (*nd2));
4436 				(void) strcpy(nd2->nd_nodename,
4437 				    nr->nr_nodename);
4438 				nd2->nd_flags = nr->nr_flags;
4439 				nd2->nd_flags |= MD_MN_NODE_DEL;
4440 				nd2->nd_nodeid = nr->nr_nodeid;
4441 				nd2->nd_next = NULL;
4442 				ndtail->nd_next = nd2;
4443 				nd2 = NULL;
4444 				nr = nr->nr_next;
4445 				continue;
4446 			}
4447 			/*
4448 			 * Is this the node record for the node that
4449 			 * we requested the set desc from?
4450 			 * If so, check if node has its own node record
4451 			 * marked OK. If marked OK, check for the OWN bit.
4452 			 */
4453 			if (nr->nr_nodeid == nd->nd_nodeid) {
4454 				found_my_nr = 1;
4455 				if (nr->nr_flags & MD_MN_NODE_OK) {
4456 					/*
4457 					 * If node record is marked OK
4458 					 * on its own node, then mark it OK
4459 					 * in the master list.  Node record
4460 					 * would have to exist on all nodes
4461 					 * in the ADD state before it could
4462 					 * be put into the OK state.
4463 					 */
4464 					nd->nd_flags |= MD_MN_NODE_OK;
4465 					nd->nd_flags &=
4466 					    ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
4467 					/*
4468 					 * Mark own in master list as marked
4469 					 * on own node.
4470 					 */
4471 					if (nr->nr_flags & MD_MN_NODE_OWN)
4472 						nd->nd_flags |= MD_MN_NODE_OWN;
4473 					else
4474 						nd->nd_flags &= ~MD_MN_NODE_OWN;
4475 				} else {
4476 					/* Otherwise, mark node as DEL */
4477 					nd->nd_flags &= ~MD_MN_NODE_OK;
4478 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4479 					nd->nd_flags |= MD_MN_NODE_DEL;
4480 				}
4481 			}
4482 			/*
4483 			 * If node is not ALIVE and marked DEL
4484 			 * on any node, make it DEL in master list.
4485 			 * If node is not ALIVE and marked ADD
4486 			 * on any node, make it ADD in master list
4487 			 * unless node record has already been marked DEL.
4488 			 */
4489 			if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
4490 				if (nr->nr_flags & MD_MN_NODE_ADD) {
4491 					if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4492 						/* If not DEL - mark it ADD */
4493 						nd->nd_flags |= MD_MN_NODE_ADD;
4494 						nd->nd_flags &= ~MD_MN_NODE_OK;
4495 					}
4496 				}
4497 				if (nr->nr_flags & MD_MN_NODE_DEL) {
4498 					nd->nd_flags |= MD_MN_NODE_DEL;
4499 					nd->nd_flags &= ~MD_MN_NODE_OK;
4500 					/* Could already be ADD - make it DEL */
4501 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4502 				}
4503 			}
4504 			nr = nr->nr_next;
4505 		}
4506 		/*
4507 		 * If a node record doesn't exist on its own node,
4508 		 * then mark node as DEL.
4509 		 */
4510 		if (found_my_nr == 0) {
4511 			nd->nd_flags &= ~MD_MN_NODE_OK;
4512 			nd->nd_flags |= MD_MN_NODE_DEL;
4513 		}
4514 
4515 		/*
4516 		 * If node is OK - put mnsr onto master_mnsr_node list for
4517 		 * later use when syncing up the drive records in the set.
4518 		 */
4519 		if (nd->nd_flags & MD_MN_NODE_OK) {
4520 			mnsr_node = Zalloc(sizeof (*mnsr_node));
4521 			mnsr_node->mmn_mnsr = mnsr;
4522 			(void) strncpy(mnsr_node->mmn_nodename,
4523 				nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
4524 			mnsr_node->mmn_next = master_mnsr_node;
4525 			master_mnsr_node = mnsr_node;
4526 		} else {
4527 			free_sr((struct md_set_record *)mnsr);
4528 		}
4529 
4530 		nd = nd->nd_next;
4531 	}
4532 
4533 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4534 	    "Master nodelist created for set %s: %s"),
4535 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4536 
4537 	/*
4538 	 * Send master nodelist to the rpc.metad on all nodes (including
4539 	 * myself) and each node will update itself.  This will set the
4540 	 * ADD and DEL flags on each node as setup in the master nodelist.
4541 	 * Don't send nodelist to node where set doesn't exist.
4542 	 */
4543 	nd = master_nodelist;
4544 	while (nd) {
4545 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4546 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4547 			nd = nd->nd_next;
4548 			continue;
4549 		}
4550 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4551 		    master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
4552 			/* If RPC failure to another node return 205 */
4553 			if ((mdanyrpcerror(ep)) &&
4554 			    (sd->sd_mn_mynode->nd_nodeid !=
4555 			    nd->nd_nodeid)) {
4556 				rval = 205;
4557 			} else {
4558 				/* Any other failure */
4559 				rval = -1;
4560 			}
4561 			goto out;
4562 		}
4563 		nd = nd->nd_next;
4564 	}
4565 
4566 	/*
4567 	 * Now, delete nodes that need to be deleted.
4568 	 */
4569 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4570 	    ep))  == NULL) {
4571 		if (! mdisok(ep)) {
4572 			rval = -1;
4573 			goto out;
4574 		}
4575 	}
4576 
4577 	/*
4578 	 * May be doing lots of RPC commands to the nodes, so lock the
4579 	 * ALIVE members of the set since most of the rpc.metad routines
4580 	 * require this for security reasons.
4581 	 */
4582 	nd = master_nodelist;
4583 	while (nd) {
4584 		/* Skip non-alive nodes and node without set */
4585 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4586 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4587 			nd = nd->nd_next;
4588 			continue;
4589 		}
4590 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
4591 			/* If RPC failure to another node return 205 */
4592 			if ((mdanyrpcerror(ep)) &&
4593 			    (sd->sd_mn_mynode->nd_nodeid !=
4594 			    nd->nd_nodeid)) {
4595 				rval = 205;
4596 			} else {
4597 				/* Any other failure */
4598 				rval = -1;
4599 			}
4600 			goto out;
4601 		}
4602 		set_locked = 1;
4603 		nd = nd->nd_next;
4604 	}
4605 
4606 	nd = master_nodelist;
4607 	while (nd) {
4608 		/* Skip non-alive nodes */
4609 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4610 			nd = nd->nd_next;
4611 			continue;
4612 		}
4613 		if (nd->nd_flags & MD_MN_NODE_DEL) {
4614 			num_alive_nodes_del++;
4615 			/*
4616 			 * Delete this node rec from all ALIVE nodes in diskset.
4617 			 */
4618 			nd2 = master_nodelist;
4619 			while (nd2) {
4620 				/* Skip non-alive nodes and node without set */
4621 				if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
4622 				    (nd2->nd_flags & MD_MN_NODE_NOSET)) {
4623 					nd2 = nd2->nd_next;
4624 					continue;
4625 				}
4626 
4627 				/* This is a node being deleted from set */
4628 				if (nd2->nd_nodeid == nd->nd_nodeid) {
4629 					/* Mark set record as DEL */
4630 					if (clnt_upd_sr_flags(nd->nd_nodename,
4631 					    sp, MD_SR_DEL, ep)) {
4632 						/* RPC failure to !my node */
4633 						if ((mdanyrpcerror(ep)) &&
4634 						    (sd->sd_mn_mynode->
4635 						    nd_nodeid
4636 						    != nd->nd_nodeid)) {
4637 							rval = 205;
4638 						} else {
4639 							/* Any other failure */
4640 							rval = -1;
4641 						}
4642 						goto out;
4643 					}
4644 					if (clnt_deldrvs(nd->nd_nodename, sp,
4645 					    dd, ep)) {
4646 						/* RPC failure to !my node */
4647 						if ((mdanyrpcerror(ep)) &&
4648 						    (sd->sd_mn_mynode->
4649 						    nd_nodeid
4650 						    != nd->nd_nodeid)) {
4651 							rval = 205;
4652 						} else {
4653 							/* Any other failure */
4654 							rval = -1;
4655 						}
4656 						goto out;
4657 					}
4658 					if (clnt_delset(nd->nd_nodename, sp,
4659 					    ep) == -1) {
4660 						/* RPC failure to !my node */
4661 						if ((mdanyrpcerror(ep)) &&
4662 						    (sd->sd_mn_mynode->
4663 						    nd_nodeid
4664 						    != nd->nd_nodeid)) {
4665 							rval = 205;
4666 						} else {
4667 							/* Any other failure */
4668 							rval = -1;
4669 						}
4670 						goto out;
4671 					}
4672 				} else {
4673 					/*
4674 					 * Delete host from sets on hosts
4675 					 * not being deleted.
4676 					 */
4677 					anode[0] = Strdup(nd->nd_nodename);
4678 					if (clnt_delhosts(nd2->nd_nodename, sp,
4679 					    1, anode, ep) == -1) {
4680 						Free(anode[0]);
4681 						/* RPC failure to !my node */
4682 						if ((mdanyrpcerror(ep)) &&
4683 						    (sd->sd_mn_mynode->
4684 						    nd_nodeid
4685 						    != nd2->nd_nodeid)) {
4686 							rval = 205;
4687 						} else {
4688 							/* Any other failure */
4689 							rval = -1;
4690 						}
4691 						goto out;
4692 					}
4693 
4694 					meta_mc_log(MC_LOG5,
4695 					    dgettext(TEXT_DOMAIN,
4696 					    "Deleted node %s (%d) on node %s "
4697 					    "from set %s: %s"),
4698 					    nd->nd_nodename, nd->nd_nodeid,
4699 					    nd2->nd_nodename,
4700 					    sp->setname,
4701 					    meta_print_hrtime(
4702 					    gethrtime() - start_time));
4703 
4704 					Free(anode[0]);
4705 				}
4706 				nd2 = nd2->nd_next;
4707 			}
4708 		}
4709 		nd = nd->nd_next;
4710 	}
4711 
4712 	nd = master_nodelist;
4713 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4714 	while (nd) {
4715 		/* Skip non-alive nodes and node without set */
4716 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4717 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4718 			nd = nd->nd_next;
4719 			continue;
4720 		}
4721 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
4722 			/* If RPC failure to another node return 205 */
4723 			if ((mdanyrpcerror(ep)) &&
4724 			    (sd->sd_mn_mynode->nd_nodeid !=
4725 			    nd->nd_nodeid)) {
4726 				rval = 205;
4727 			} else {
4728 				/* Any other failure */
4729 				rval = -1;
4730 			}
4731 			goto out;
4732 		}
4733 		nd = nd->nd_next;
4734 	}
4735 	cl_set_setkey(NULL);
4736 	set_locked = 0;
4737 
4738 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4739 	    "Nodelist syncronization complete for set %s: %s"),
4740 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4741 
4742 	metaflushsetname(sp);
4743 
4744 	/*
4745 	 * If all alive nodes have been deleted from set, just
4746 	 * return since nothing else can be done until non-alive
4747 	 * nodes (if there are any) rejoin the cluster.
4748 	 */
4749 	if (num_alive_nodes == num_alive_nodes_del) {
4750 		rval = 0;
4751 		goto out;
4752 	}
4753 
4754 	/*
4755 	 * Sync up drive records.
4756 	 *
4757 	 * If a node panic'd (or metaset command was killed) during the
4758 	 * addition or deletion of a drive to the diskset, the nodes
4759 	 * may have a different view of the drive list.  During cleanup
4760 	 * of the drive list during reconfig, a drive will be deleted
4761 	 * from the list if the master node sees that the drive has been
4762 	 * marked in the ADD state on any node or is marked in the DEL state
4763 	 * on all nodes.
4764 	 * This cleanup must occur even if all nodes in the cluster are
4765 	 * not part of the cluster so that all nodes have the same view
4766 	 * of the drivelist.
4767 	 * Then if the entire cluster goes down and comes back up, the
4768 	 * new master node could be a node that wasn't in the cluster when
4769 	 * the node was deleted.  This could lead to a situation where the
4770 	 * master node thinks that a drive is OK, but this drive isn't
4771 	 * known to the other nodes.
4772 	 * This situation can also occur during the addition of a drive
4773 	 * where a node has the drive marked OK, but the node executing the
4774 	 * metaset command enountered a failure before marking that drive OK
4775 	 * on the rest of the nodes.  If the node with the OK drive then
4776 	 * panics, then rest of the nodes will remove that drive marked ADD
4777 	 * and when the node with the OK drive rejoins the cluster, it will
4778 	 * have a drive marked OK that is unknown by the other nodes.
4779 	 *
4780 	 * There are 2 situations to consider:
4781 	 * A) Master knows about a drive that other nodes don't know about.
4782 	 * B) At least one slave node knows about a drive that the master
4783 	 *    node doesn't know about.
4784 	 *
4785 	 * To handle these situations the following steps are followed:
4786 	 * 1) Count number of drives known by this master node and the
4787 	 *    other slave nodes.
4788 	 *    If all nodes have the same number of drives and the master has
4789 	 *    all drives marked OK, then skip to step4.
4790 	 *
4791 	 * 2) If a node has less drives listed than the master, the master
4792 	 *    must get the drive descriptor list from that node so that
4793 	 *    master can determine which drive it needs to delete from that
4794 	 *    node.  Master must get the drive descriptor list since the
4795 	 *    drive record list does not contain the name of the drive, but
4796 	 *    only a key and the key can only be interprested on that other
4797 	 *    node.
4798 	 *
4799 	 * 3) The master will then create the master drive list by doing:
4800 	 *	- Master starts with drive list known by master.
4801 	 *	- Any drive marked ADD will be removed from the list.
4802 	 *	- Any drive not known by another node (from step2) will be
4803 	 *	removed from the drive list.
4804 	 *	- If a drive is marked DEL on the master, the master must
4805 	 *	verify that the drive record is marked DEL on all nodes.
4806 	 *	If any node has the drive record marked OK, mark it OK
4807 	 *	on the master.  (The reason why is described below).
4808 	 *
4809 	 * 4) The master sends out the master drive list and the slave
4810 	 *    nodes will force their drive lists to match the master
4811 	 *    drive list by deleting drives, if necessary and by changing
4812 	 *    the drive record states from ADD->OK if master has drive
4813 	 *    marked OK and slave has drive marked ADD.
4814 	 *
4815 	 * Interesting scenarios:
4816 	 *
4817 	 * 1) System has 4 nodes with node 1 as the master.  Node 3 starts
4818 	 *    to delete a drive record (drive record on node 1 is marked DEL),
4819 	 *    but is stopped when node 3 panics.  Node 1 also panics.
4820 	 *    During reconfig cycle, node 2 is picked as master and the drive
4821 	 *    record is left alone since all nodes in the cluster have it
4822 	 *    marked OK.  User now sees drive as part of diskset.
4823 	 *    Now, entire cluster is rebooted and node 1 rejoins the cluster.
4824 	 *    Node 1 is picked as the master and node 1 has drive record
4825 	 *    marked DEL.  Node 1 contacts all other nodes in the cluster
4826 	 *    and since at least one node has the drive record marked OK,
4827 	 *    the master marks the drive record OK.
4828 	 *    User continues to see the drive as part of the diskset.
4829 	 */
4830 
4831 	/* Reget set descriptor since flushed above */
4832 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4833 		rval = -1;
4834 		goto out;
4835 	}
4836 
4837 	/* Has side effect of setting sd->sd_drvs to same as master_dd */
4838 	if ((master_dd = metaget_drivedesc_sideno(sp,
4839 	    sd->sd_mn_mynode->nd_nodeid,
4840 	    (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
4841 		/* No drives in list */
4842 		if (!mdisok(ep)) {
4843 			/*
4844 			 * Can't get drive list for this node, so
4845 			 * return -1 causing this node to be removed
4846 			 * cluster config and fixed.
4847 			 */
4848 			rval = -1;
4849 			goto out;
4850 		}
4851 	}
4852 
4853 	/* Count the number of drives for all nodes */
4854 	mnsr_node = master_mnsr_node;
4855 	while (mnsr_node) {
4856 		dr_cnt = 0;
4857 		dr = mnsr_node->mmn_mnsr->sr_drivechain;
4858 		while (dr) {
4859 			dr_cnt++;
4860 			dr = dr->dr_next;
4861 		}
4862 		mnsr_node->mmn_numdrives = dr_cnt;
4863 		mnsr_node = mnsr_node->mmn_next;
4864 	}
4865 
4866 	/* Count the number of drives for the master; also check flags */
4867 	all_drives_ok = 1;
4868 	dd_cnt = 0;
4869 	dd = master_dd;
4870 	while (dd) {
4871 		dd_cnt++;
4872 		if (!(dd->dd_flags & MD_DR_OK))
4873 			all_drives_ok = 0;
4874 		dd = dd->dd_next;
4875 	}
4876 
4877 	/* If all drives are ok, do quick check against number of drives */
4878 	if (all_drives_ok) {
4879 		/* If all nodes have same number of drives, almost done */
4880 		mnsr_node = master_mnsr_node;
4881 		while (mnsr_node) {
4882 			if (mnsr_node->mmn_numdrives != dd_cnt)
4883 				break;
4884 			mnsr_node = mnsr_node->mmn_next;
4885 		}
4886 		/* All nodes have same number of drives, just send flags */
4887 		if (mnsr_node == NULL) {
4888 			goto send_drive_list;
4889 		}
4890 	}
4891 
4892 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4893 	    "Begin detailed drive synchronization for set %s: %s"),
4894 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4895 
4896 	/* Detailed check required  */
4897 	mnsr_node = master_mnsr_node;
4898 	while (mnsr_node) {
4899 		/* Does slave node have less drives than master? */
4900 		if (mnsr_node->mmn_numdrives < dd_cnt) {
4901 			/* Yes - must determine which drive is missing */
4902 			if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
4903 			    &other_dd, ep)) {
4904 				/* RPC failure to !my node */
4905 				if ((mdanyrpcerror(ep)) &&
4906 				    (strcmp(mynode(), mnsr_node->mmn_nodename)
4907 				    != 0)) {
4908 					rval = 205;
4909 				} else {
4910 					/* Any other failure */
4911 					rval = -1;
4912 				}
4913 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4914 				    "Master node %s unable to "
4915 				    "retrieve drive list from node %s"),
4916 				    mynode(), mnsr_node->mmn_nodename);
4917 				goto out;
4918 			}
4919 			mnsr_node->mmn_dd = other_dd;
4920 			dd = master_dd;
4921 			while (dd) {
4922 				if (!(dd->dd_flags & MD_DR_OK)) {
4923 					dd = dd->dd_next;
4924 					continue;
4925 				}
4926 				other_dd = mnsr_node->mmn_dd;
4927 				while (other_dd) {
4928 					/* Convert to devids, when available */
4929 					if (strcmp(other_dd->dd_dnp->cname,
4930 					    dd->dd_dnp->cname) == 0) {
4931 						break;
4932 					}
4933 					other_dd = other_dd->dd_next;
4934 				}
4935 				/*
4936 				 * dd not found on slave so mark it
4937 				 * ADD for later deletion (drives in ADD
4938 				 * state are deleted later in this routine).
4939 				 */
4940 				if (other_dd == NULL) {
4941 					dd->dd_flags = MD_DR_ADD;
4942 				}
4943 				dd = dd->dd_next;
4944 			}
4945 
4946 		}
4947 		mnsr_node = mnsr_node->mmn_next;
4948 	}
4949 
4950 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4951 	    "Drive check completed for set %s: %s"),
4952 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4953 
4954 	dd = master_dd;
4955 	dd_prev = 0;
4956 	while (dd) {
4957 		/* Remove any ADD drives from list */
4958 		if (dd->dd_flags & MD_DR_ADD) {
4959 			if (dd_prev) {
4960 				dd_prev->dd_next = dd->dd_next;
4961 				dd->dd_next = NULL;
4962 				metafreedrivedesc(&dd);
4963 				dd = dd_prev->dd_next;
4964 			} else {
4965 				/*
4966 				 * If removing drive descriptor from head
4967 				 * of linked list, also change sd->sd_drvs.
4968 				 */
4969 				master_dd = sd->sd_drvs = dd->dd_next;
4970 				dd->dd_next = NULL;
4971 				metafreedrivedesc(&dd);
4972 				dd = master_dd;
4973 			}
4974 			/* dd setup in if/else above */
4975 			continue;
4976 		}
4977 		/*
4978 		 * If drive is marked DEL, check all other nodes.
4979 		 * If drive on another node is marked OK, mark drive OK
4980 		 * in master list.  If drive is marked DEL or doesn't exist
4981 		 * on all nodes, remove drive from list.
4982 		 */
4983 		if (dd->dd_flags & MD_DR_DEL) {
4984 			mnsr_node = master_mnsr_node;
4985 			while (mnsr_node) {
4986 				if (mnsr_node->mmn_dd == NULL) {
4987 				    if (clnt_getdrivedesc(
4988 					mnsr_node->mmn_nodename, sp,
4989 					&other_dd, ep)) {
4990 					    /* RPC failure to !my node */
4991 					    if ((mdanyrpcerror(ep)) &&
4992 						(strcmp(mynode(),
4993 						mnsr_node->mmn_nodename)
4994 						!= 0)) {
4995 						    rval = 205;
4996 					    } else {
4997 						    /* Any other failure */
4998 						    rval = -1;
4999 					    }
5000 					    mde_perror(ep, dgettext(TEXT_DOMAIN,
5001 						"Master node %s unable "
5002 						"to retrieve drive list from "
5003 						"node %s"), mynode(),
5004 						mnsr_node->mmn_nodename);
5005 					    goto out;
5006 				    }
5007 				    mnsr_node->mmn_dd = other_dd;
5008 				}
5009 				other_dd = mnsr_node->mmn_dd;
5010 				while (other_dd) {
5011 					/* Found drive (OK) from other node */
5012 					if (strcmp(dd->dd_dnp->cname,
5013 					    other_dd->dd_dnp->cname)
5014 					    == 0) {
5015 						/* Drive marked OK */
5016 						if (other_dd->dd_flags &
5017 						    MD_DR_OK) {
5018 						    dd->dd_flags = MD_DR_OK;
5019 						}
5020 						break;
5021 					}
5022 					other_dd = other_dd->dd_next;
5023 				}
5024 				if (dd->dd_flags == MD_DR_OK)
5025 					break;
5026 
5027 				mnsr_node = mnsr_node->mmn_next;
5028 			}
5029 			/*
5030 			 * If no node had this drive marked OK, delete it.
5031 			 */
5032 			if (dd->dd_flags & MD_DR_DEL) {
5033 				if (dd_prev) {
5034 					dd_prev->dd_next = dd->dd_next;
5035 					dd->dd_next = NULL;
5036 					metafreedrivedesc(&dd);
5037 					dd = dd_prev->dd_next;
5038 				} else {
5039 					/*
5040 					 * If removing drive descriptor from
5041 					 * head of linked list, also change
5042 					 * sd->sd_drvs.
5043 					 */
5044 					master_dd = sd->sd_drvs = dd->dd_next;
5045 					dd->dd_next = NULL;
5046 					metafreedrivedesc(&dd);
5047 					dd = master_dd;
5048 				}
5049 				/* dd setup in if/else above */
5050 				continue;
5051 			}
5052 		}
5053 		dd_prev = dd;
5054 		dd = dd->dd_next;
5055 	}
5056 
5057 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5058 	    "Setting drive states completed for set %s: %s"),
5059 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5060 
5061 send_drive_list:
5062 	/*
5063 	 * Set genid on all drives to be the highest value seen.
5064 	 */
5065 	dd = master_dd;
5066 	while (dd) {
5067 		dd->dd_genid = max_genid;
5068 		dd = dd->dd_next;
5069 	}
5070 	/*
5071 	 * Send updated drive list to all alive nodes.
5072 	 * Will also set genid on set and node records to have same
5073 	 * as the drive records.
5074 	 */
5075 	nd = sd->sd_nodelist;
5076 	while (nd) {
5077 		/* Skip non-alive nodes */
5078 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5079 			nd = nd->nd_next;
5080 			continue;
5081 		}
5082 		if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
5083 			/* RPC failure to another node */
5084 			if ((mdanyrpcerror(ep)) &&
5085 			    (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
5086 				rval = 205;
5087 			} else {
5088 				/* Any other failure */
5089 				rval = -1;
5090 			}
5091 			goto out;
5092 		}
5093 		nd = nd->nd_next;
5094 	}
5095 
5096 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5097 	    "Sent drive list to all nodes for set %s: %s"),
5098 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5099 
5100 	/*
5101 	 * If no drive records left in set and nodes had been joined,
5102 	 * withdraw the nodes.  Always reset the master and mark
5103 	 * all nodes as withdrawn on all nodes.
5104 	 */
5105 	if (master_dd == NULL) {
5106 		/* Reset new master flag since no longer master */
5107 		(void) memset(&sf, 0, sizeof (sf));
5108 		sf.sf_setno = sp->setno;
5109 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5110 		sf.sf_flags = MDDB_NM_RESET;
5111 		/* Use magic to help protect ioctl against attack. */
5112 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5113 		/* Ignore failure, failure to reset flag isn't catastrophic */
5114 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5115 		    &sf.sf_mde, NULL);
5116 
5117 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5118 		    "Reset new master flag for " "set %s: %s"),
5119 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5120 
5121 		nd = sd->sd_nodelist;
5122 		while (nd) {
5123 			/* Skip non-alive nodes  */
5124 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5125 				nd = nd->nd_next;
5126 				continue;
5127 			}
5128 
5129 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
5130 				/* RPC failure to another node */
5131 				if ((mdanyrpcerror(ep)) &&
5132 				    (sd->sd_mn_mynode->nd_nodeid !=
5133 				    nd->nd_nodeid)) {
5134 					rval = 205;
5135 				} else {
5136 					/* Any other failure */
5137 					rval = -1;
5138 				}
5139 				goto out;
5140 			}
5141 			set_locked = 1;
5142 
5143 			/* Withdraw node from set if owner */
5144 			if ((nd->nd_flags & MD_MN_NODE_OWN) &&
5145 			    (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
5146 				/* RPC failure to another node */
5147 				if ((mdanyrpcerror(ep)) &&
5148 				    (sd->sd_mn_mynode->nd_nodeid !=
5149 				    nd->nd_nodeid)) {
5150 					rval = 205;
5151 				} else {
5152 					/* Any other failure */
5153 					rval = -1;
5154 				}
5155 				goto out;
5156 			}
5157 
5158 			/* Mark all nodes as withdrawn on this node */
5159 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5160 			    sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
5161 				/* RPC failure to another node */
5162 				if ((mdanyrpcerror(ep)) &&
5163 				    (sd->sd_mn_mynode->nd_nodeid !=
5164 				    nd->nd_nodeid)) {
5165 					rval = 205;
5166 				} else {
5167 					/* Any other failure */
5168 					rval = -1;
5169 				}
5170 				goto out;
5171 			}
5172 
5173 			/* Resets master to no-master on this node */
5174 			if (clnt_mnsetmaster(nd->nd_nodename, sp,
5175 			    "", MD_MN_INVALID_NID, ep)) {
5176 				/* RPC failure to another node */
5177 				if ((mdanyrpcerror(ep)) &&
5178 				    (sd->sd_mn_mynode->nd_nodeid !=
5179 				    nd->nd_nodeid)) {
5180 					rval = 205;
5181 				} else {
5182 					/* Any other failure */
5183 					rval = -1;
5184 				}
5185 				goto out;
5186 			}
5187 
5188 			cl_sk = cl_get_setkey(sp->setno, sp->setname);
5189 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
5190 				/* RPC failure to another node */
5191 				if ((mdanyrpcerror(ep)) &&
5192 				    (sd->sd_mn_mynode->nd_nodeid !=
5193 				    nd->nd_nodeid)) {
5194 					rval = 205;
5195 				} else {
5196 					/* Any other failure */
5197 					rval = -1;
5198 				}
5199 				goto out;
5200 			}
5201 			set_locked = 0;
5202 			nd = nd->nd_next;
5203 		}
5204 	}
5205 
5206 out:
5207 	/*
5208 	 * If got here and set is still locked, then an error has
5209 	 * occurred and master_nodelist is still valid.
5210 	 * If error is not an RPC error, then unlock.
5211 	 * If error is an RPC error, skip unlocks since this could cause
5212 	 * yet another RPC timeout if a node has failed.
5213 	 * Ignore failures in unlock since unlock is just trying to
5214 	 * clean things up.
5215 	 */
5216 	if ((set_locked) && !(mdanyrpcerror(ep))) {
5217 		nd = master_nodelist;
5218 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
5219 		while (nd) {
5220 			/* Skip non-alive nodes */
5221 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5222 				nd = nd->nd_next;
5223 				continue;
5224 			}
5225 			/*
5226 			 * If clnt_unlock fails, just break out since next
5227 			 * reconfig cycle will reset the locks anyway.
5228 			 */
5229 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
5230 				break;
5231 			}
5232 			nd = nd->nd_next;
5233 		}
5234 		cl_set_setkey(NULL);
5235 	}
5236 	/* Free master_mnsr and drive descs */
5237 	mnsr_node = master_mnsr_node;
5238 	while (mnsr_node) {
5239 		master_mnsr_node = mnsr_node->mmn_next;
5240 		free_sr((md_set_record *)mnsr_node->mmn_mnsr);
5241 		free_rem_dd(mnsr_node->mmn_dd);
5242 		Free(mnsr_node);
5243 		mnsr_node = master_mnsr_node;
5244 	}
5245 
5246 	/* Frees sd->sd_drvs (which is also master_dd) */
5247 	metaflushsetname(sp);
5248 	return (rval);
5249 }
5250 
5251 /*
5252  * meta_mnsync_diskset_mddbs
5253  * Calling node is guaranteed to be an owner node.
5254  * Calling node is the master node.
5255  *
5256  * Master node verifies that ondisk mddb format matches its incore format.
5257  * If no nodes are joined to set, remove the change log entries.
5258  * If a node is joined to set, play the change log.
5259  *
5260  * Returns	 0 - Success
5261  *		 1 - Master unable to join to set.
5262  *		205 - Failure during RPC to another node
5263  *		-1 - Any other failure and ep is filled in.
5264  *			-1 return will eventually cause node to panic
5265  *			in a SunCluster environment.
5266  */
5267 int
5268 meta_mnsync_diskset_mddbs(
5269 	mdsetname_t	*sp,
5270 	md_error_t	*ep
5271 )
5272 {
5273 	md_set_desc		*sd;
5274 	mddb_config_t		c;
5275 	md_mn_msgclass_t	class;
5276 	mddb_setflags_config_t	sf;
5277 	md_mnnode_desc		*nd, *nd2;
5278 	md_error_t		xep = mdnullerror;
5279 	int			stale_set = 0;
5280 
5281 	/* If setname is there, set desc should exist. */
5282 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5283 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5284 		    "Unable to get set %s desc information"), sp->setname);
5285 		return (-1);
5286 	}
5287 
5288 	/* Are there drives in the set? */
5289 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5290 	    ep) == NULL) {
5291 		if (! mdisok(ep)) {
5292 			return (-1);
5293 		}
5294 		/* No drives in set -- nothing to sync up */
5295 		return (0);
5296 	}
5297 
5298 	/*
5299 	 * Is master node (which is this node) joined to set?
5300 	 * If master node isn't joined (which means that no nodes
5301 	 * are joined to diskset), remove the change log entries
5302 	 * since no need to replay them - all nodes will have same
5303 	 * view of mddbs since all nodes are reading in the mddbs
5304 	 * from disk.
5305 	 * There is also no need to sync up the master and ondisk mddbs
5306 	 * since master has no incore knowledge.
5307 	 * Need to join master to set in order to flush the change
5308 	 * log entries. Don't need to block I/O during join of master
5309 	 * to set since no other nodes are joined to set and so no I/O
5310 	 * can be occurring.
5311 	 */
5312 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
5313 		/* Join master to set */
5314 		if (clnt_joinset(mynode(), sp,
5315 		    MNSET_IN_RECONFIG, ep)) {
5316 			if (mdismddberror(ep, MDE_DB_STALE)) {
5317 				/*
5318 				 * If STALE, print message and continue on.
5319 				 * Don't do any writes or reads to mddbs
5320 				 * so don't clear change log.
5321 				 */
5322 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5323 				    "Join of master node to STALE set %s"),
5324 				    sp->setname);
5325 				stale_set = 1;
5326 				mdclrerror(ep);
5327 			} else if (mdismddberror(ep, MDE_DB_ACCOK)) {
5328 				/* ACCOK means mediator provided extra vote */
5329 				mdclrerror(ep);
5330 			} else {
5331 				/*
5332 				 * If master is unable to join set, print an
5333 				 * error message.  Don't return failure or node
5334 				 * will panic during cluster reconfig cycle.
5335 				 * Also, withdraw node from set in order to
5336 				 * cleanup from failed join attempt.
5337 				 */
5338 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5339 				    "Join of master node in set %s failed"),
5340 				    sp->setname);
5341 				if (clnt_withdrawset(mynode(), sp, &xep))
5342 					mdclrerror(&xep);
5343 				return (1);
5344 			}
5345 		}
5346 		/*
5347 		 * Master node successfully joined.
5348 		 * Set local copy of flags to OWN and
5349 		 * send owner flag to rpc.metad. If not stale,
5350 		 * flush the change log.
5351 		 */
5352 		sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
5353 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
5354 		    MNSET_IN_RECONFIG, ep)) {
5355 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5356 			    "Flag update of master node join in set %s failed"),
5357 			    sp->setname);
5358 			return (-1);
5359 		}
5360 
5361 		if (!stale_set) {
5362 			if (mdmn_reset_changelog(sp, ep,
5363 			    MDMN_CLF_RESETLOG) != 0) {
5364 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5365 				    "Unable to reset changelog."));
5366 				return (-1);
5367 			}
5368 			meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5369 			    "Removed changelog entries for set %s: %s"),
5370 			    sp->setname,
5371 			    meta_print_hrtime(gethrtime() - start_time));
5372 		}
5373 		/* Reset new master flag before return */
5374 		(void) memset(&sf, 0, sizeof (sf));
5375 		sf.sf_setno = sp->setno;
5376 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5377 		sf.sf_flags = MDDB_NM_RESET;
5378 		/* Use magic to help protect ioctl against attack. */
5379 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5380 		/* Ignore failure, failure to reset flag isn't catastrophic */
5381 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5382 		    &sf.sf_mde, NULL);
5383 
5384 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5385 		    "Reset new master flag for set %s: %s"),
5386 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5387 
5388 		return (0);
5389 	}
5390 
5391 	/*
5392 	 * Is master already joined to STALE set (< 50% mddbs avail)?
5393 	 * If so, can make no config changes to mddbs so don't check or play
5394 	 * changelog and don't sync master node to ondisk mddbs.
5395 	 * To get out of the stale state all nodes must be withdrawn
5396 	 * from set.  Then as nodes are re-joined, all nodes will
5397 	 * have same view of mddbs since all nodes are reading the
5398 	 * mddbs from disk.
5399 	 */
5400 	(void) memset(&c, 0, sizeof (c));
5401 	c.c_id = 0;
5402 	c.c_setno = sp->setno;
5403 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
5404 		(void) mdstealerror(ep, &c.c_mde);
5405 		return (-1);
5406 	}
5407 	if (c.c_flags & MDDB_C_STALE) {
5408 		return (0);
5409 	}
5410 
5411 	/*
5412 	 * If this node is NOT a newly chosen master, then there's
5413 	 * nothing else to do since the change log should be empty and
5414 	 * the ondisk and incore mddbs are already consistent.
5415 	 *
5416 	 * A newly chosen master is a node that was not the master
5417 	 * at the beginning of the reconfig cycle.  If a node is a new
5418 	 * master, then the new master state is reset after the ondisk
5419 	 * and incore mddbs are consistent and the change log has
5420 	 * been replayed.
5421 	 */
5422 	(void) memset(&sf, 0, sizeof (sf));
5423 	sf.sf_setno = sp->setno;
5424 	sf.sf_flags = MDDB_NM_GET;
5425 	/* Use magic to help protect ioctl against attack. */
5426 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5427 	if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
5428 	    ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
5429 		return (0);
5430 	}
5431 
5432 	/*
5433 	 * Now, sync up incore master view to ondisk mddbs.
5434 	 * This is needed in the case where a master node
5435 	 * had made a change to the mddb, but this change
5436 	 * may not have been relayed to the slaves yet.
5437 	 * So, the new master needs to verify that the ondisk
5438 	 * mddbs match what the new master has incore -
5439 	 * if different, new master rewrites all of the mddbs.
5440 	 * Then the new master will replay the changelog and the
5441 	 * new master will then execute what the old master had
5442 	 * done.
5443 	 *
5444 	 * Block all I/Os to disks in this diskset on all nodes in
5445 	 * the diskset.  This will allow the rewriting of the mddbs
5446 	 * (if needed), to proceed in a timely manner.
5447 	 *
5448 	 * If block of I/Os fail, return a -1.
5449 	 */
5450 
5451 	nd = sd->sd_nodelist;
5452 	while (nd) {
5453 		/* Skip non-alive and non-owner nodes  */
5454 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5455 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5456 			nd = nd->nd_next;
5457 			continue;
5458 		}
5459 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5460 		    MN_SUSP_IO, ep)) {
5461 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5462 			    "Unable to suspend I/O on node %s in set %s"),
5463 			    nd->nd_nodename, sp->setname);
5464 
5465 			/*
5466 			 * Resume all other nodes that had been suspended.
5467 			 * (Reconfig return step also resumes I/Os
5468 			 * for all sets.)
5469 			 */
5470 			nd2 = sd->sd_nodelist;
5471 			while (nd2) {
5472 				/* Stop when reaching failed node */
5473 				if (nd2->nd_nodeid == nd->nd_nodeid)
5474 					break;
5475 				/* Skip non-alive and non-owner nodes  */
5476 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5477 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5478 					nd2 = nd2->nd_next;
5479 					continue;
5480 				}
5481 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5482 					sp->setno, MN_RES_IO, &xep));
5483 				nd2 = nd2->nd_next;
5484 			}
5485 
5486 			/*
5487 			 * If an RPC failure on another node, return a 205.
5488 			 * Otherwise, exit with failure.
5489 			 */
5490 			if ((mdanyrpcerror(ep)) &&
5491 			    (sd->sd_mn_mynode->nd_nodeid !=
5492 			    nd->nd_nodeid)) {
5493 				return (205);
5494 			} else {
5495 				return (-1);
5496 			}
5497 
5498 		}
5499 		nd = nd->nd_next;
5500 	}
5501 
5502 	(void) memset(&c, 0, sizeof (c));
5503 	c.c_id = 0;
5504 	c.c_setno = sp->setno;
5505 	/* Master can't sync up to ondisk mddbs?  Kick it out of cluster */
5506 	if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
5507 		return (-1);
5508 
5509 	/*
5510 	 * Resume I/Os that were suspended above.
5511 	 */
5512 	nd = sd->sd_nodelist;
5513 	while (nd) {
5514 		/* Skip non-alive and non-owner nodes  */
5515 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5516 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5517 			nd = nd->nd_next;
5518 			continue;
5519 		}
5520 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5521 		    MN_RES_IO, ep)) {
5522 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5523 			    "Unable to resume I/O on node %s in set %s"),
5524 			    nd->nd_nodename, sp->setname);
5525 
5526 			/*
5527 			 * If an RPC failure then don't do any
5528 			 * more RPC calls, since one timeout is enough
5529 			 * to endure.  If RPC failure to another node, return
5530 			 * 205.  If RPC failure to my node, return -1.
5531 			 * If not an RPC failure, continue resuming the
5532 			 * rest of the nodes and then return -1.
5533 			 */
5534 			if (mdanyrpcerror(ep)) {
5535 				if (sd->sd_mn_mynode->nd_nodeid ==
5536 				    nd->nd_nodeid) {
5537 					return (-1);
5538 				} else {
5539 					return (205);
5540 				}
5541 			}
5542 
5543 			/*
5544 			 * If not an RPC error, continue resuming rest of
5545 			 * nodes, ignoring any failures except for an
5546 			 * RPC failure which constitutes an immediate exit.
5547 			 * Start in middle of list with failing node.
5548 			 */
5549 			nd2 = nd->nd_next;
5550 			while (nd2) {
5551 				/* Skip non-alive and non-owner nodes  */
5552 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5553 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5554 					nd2 = nd2->nd_next;
5555 					continue;
5556 				}
5557 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5558 					sp->setno, MN_RES_IO, &xep));
5559 				if (mdanyrpcerror(&xep)) {
5560 					return (-1);
5561 				}
5562 				nd2 = nd2->nd_next;
5563 			}
5564 		}
5565 		nd = nd->nd_next;
5566 	}
5567 
5568 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
5569 	    "checking/writing the mddb for set %s: %s"), sp->setname,
5570 	    meta_print_hrtime(gethrtime() - start_time));
5571 
5572 	/*
5573 	 * Send (aka replay) all messages we find in the changelog.
5574 	 * Flag the messages with
5575 	 *   MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
5576 	 *   MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
5577 	 */
5578 	for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
5579 		mdmn_changelog_record_t	*lr;
5580 		md_error_t	xep = mdnullerror;
5581 		md_mn_result_t	*resultp = NULL;
5582 		int		ret;
5583 
5584 		lr = mdmn_get_changelogrec(sp->setno, class);
5585 		if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
5586 			/* no entry for this class */
5587 			continue;
5588 		}
5589 
5590 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5591 		    "replaying message ID=(%d, 0x%llx-%d)\n"),
5592 		    MSGID_ELEMS(lr->lr_msg.msg_msgid));
5593 
5594 		ret = mdmn_send_message_with_msgid(
5595 			lr->lr_msg.msg_setno,
5596 			lr->lr_msg.msg_type,
5597 			lr->lr_msg.msg_flags |  MD_MSGF_REPLAY_MSG |
5598 						MD_MSGF_OVERRIDE_SUSPEND,
5599 			lr->lr_msg.msg_event_data,
5600 			lr->lr_msg.msg_event_size,
5601 			&resultp,
5602 			&lr->lr_msg.msg_msgid,
5603 			&xep);
5604 
5605 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5606 		    "mdmn_send_message returned %d\n"), ret);
5607 
5608 		if (resultp)
5609 			free_result(resultp);
5610 	}
5611 
5612 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5613 	    "Playing changelog completed for set %s: %s"),
5614 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5615 
5616 	/*
5617 	 * Now that new master has ondisk and incore mddbs in sync, reset
5618 	 * this node's new master kernel flag (for this set).  If this node
5619 	 * re-enters another reconfig cycle before the completion of this
5620 	 * reconfig cycle, this master node won't need to check if the ondisk
5621 	 * and incore mddbs are in sync since this node won't be considered
5622 	 * a new master (since this flag is being reset here in the middle of
5623 	 * step2).  This will save time during any subsequent reconfig
5624 	 * cycles as long as this node continues to be master.
5625 	 */
5626 	(void) memset(&sf, 0, sizeof (sf));
5627 	sf.sf_setno = sp->setno;
5628 	sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5629 	sf.sf_flags = MDDB_NM_RESET;
5630 	/* Use magic to help protect ioctl against attack. */
5631 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5632 	/* Ignore failure, since failure to reset flag isn't catastrophic */
5633 	(void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
5634 
5635 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5636 	    "Reset new master flag for set %s: %s"),
5637 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5638 
5639 	return (0);
5640 }
5641 
5642 /*
5643  * meta_mnjoin_all will join all starting nodes in the diskset.
5644  * A starting node is considered to be any node that is not
5645  * an owner of the set but is a member of the cluster.
5646  * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
5647  *
5648  * Caller is the Master node.
5649  *
5650  * Returns	 0 - Success
5651  *		205 - Failure during RPC to another node
5652  *		-1 - Any other failure and ep is filled in.
5653  */
5654 int
5655 meta_mnjoin_all(
5656 	mdsetname_t	*sp,
5657 	md_error_t	*ep
5658 )
5659 {
5660 	md_set_desc		*sd;
5661 	md_mnnode_desc		*nd, *nd2;
5662 	int			rval = 0;
5663 	int			stale_flag = 0;
5664 	mddb_config_t		c;
5665 	int			susp_res_flag = 0;
5666 	md_error_t		xep = mdnullerror;
5667 
5668 	/* If setname is there, set desc should exist. */
5669 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5670 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5671 		    "Unable to get set %s desc information"), sp->setname);
5672 		return (-1);
5673 	}
5674 
5675 	/* Are there drives in the set? */
5676 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5677 	    ep) == NULL) {
5678 		if (! mdisok(ep)) {
5679 			return (-1);
5680 		}
5681 		/* No drives in set -- nothing to join */
5682 		return (0);
5683 	}
5684 
5685 	/*
5686 	 * Is set currently stale?
5687 	 */
5688 	(void) memset(&c, 0, sizeof (c));
5689 	c.c_id = 0;
5690 	c.c_setno = sp->setno;
5691 	/* Ignore failure since master node may not be joined yet */
5692 	(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
5693 	if (c.c_flags & MDDB_C_STALE) {
5694 		stale_flag = MNSET_IS_STALE;
5695 	}
5696 
5697 	/*
5698 	 * If any nodes are going to be joined to diskset, then
5699 	 * suspend I/O to all disks in diskset so that nodes can join
5700 	 * (read in mddbs) in a reasonable amount of time even under
5701 	 * high I/O load.  Don't need to do this if set is STALE since
5702 	 * no I/O can be occurring to a STALE set.
5703 	 */
5704 	if (stale_flag != MNSET_IS_STALE) {
5705 		nd = sd->sd_nodelist;
5706 		while (nd) {
5707 			/* Found a node that will be joined to diskset */
5708 			if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5709 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5710 				/* Set flag that diskset should be suspended */
5711 				susp_res_flag = 1;
5712 				break;
5713 			}
5714 			nd = nd->nd_next;
5715 		}
5716 	}
5717 
5718 	if (susp_res_flag) {
5719 		/*
5720 		 * Block all I/Os to disks in this diskset on all joined
5721 		 * nodes in the diskset.
5722 		 * If block of I/Os fails due to an RPC failure on another
5723 		 * node, return 205; otherwise, return -1.
5724 		 */
5725 		nd = sd->sd_nodelist;
5726 		while (nd) {
5727 			/* Skip non-alive and non-owner nodes  */
5728 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5729 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5730 				nd = nd->nd_next;
5731 				continue;
5732 			}
5733 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5734 			    MN_SUSP_IO, ep)) {
5735 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5736 				    "Unable to suspend I/O on node %s"
5737 				    " in set %s"), nd->nd_nodename,
5738 				    sp->setname);
5739 				/*
5740 				 * Resume other nodes that had been suspended.
5741 				 * (Reconfig return step also resumes I/Os
5742 				 * for all sets.)
5743 				 */
5744 				nd2 = sd->sd_nodelist;
5745 				while (nd2) {
5746 					/* Stop when reaching failed node */
5747 					if (nd2->nd_nodeid == nd->nd_nodeid)
5748 						break;
5749 					/* Skip non-alive/non-owner nodes  */
5750 					if ((!(nd2->nd_flags &
5751 					    MD_MN_NODE_ALIVE)) ||
5752 					    (!(nd2->nd_flags &
5753 					    MD_MN_NODE_OWN))) {
5754 						nd2 = nd2->nd_next;
5755 						continue;
5756 					}
5757 					(void) (clnt_mn_susp_res_io(
5758 					    nd2->nd_nodename, sp->setno,
5759 					    MN_RES_IO, &xep));
5760 					nd2 = nd2->nd_next;
5761 				}
5762 
5763 				/*
5764 				 * If the suspend failed due to an
5765 				 * RPC failure on another node, return
5766 				 * a 205.
5767 				 * Otherwise, exit with failure.
5768 				 * The return reconfig step will resume
5769 				 * I/Os for all disksets.
5770 				 */
5771 				if ((mdanyrpcerror(ep)) &&
5772 				    (sd->sd_mn_mynode->nd_nodeid !=
5773 				    nd->nd_nodeid)) {
5774 					return (205);
5775 				} else {
5776 					return (-1);
5777 				}
5778 			}
5779 			nd = nd->nd_next;
5780 		}
5781 	}
5782 
5783 	nd = sd->sd_nodelist;
5784 	while (nd) {
5785 		/*
5786 		 * If a node is in the membership list but isn't joined
5787 		 * to the set, try to join the node.
5788 		 */
5789 		if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5790 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5791 			if (clnt_joinset(nd->nd_nodename, sp,
5792 			    (MNSET_IN_RECONFIG | stale_flag), ep)) {
5793 				/*
5794 				 * If RPC failure to another node
5795 				 * then exit without attempting anything else.
5796 				 * (Reconfig return step will resume I/Os
5797 				 * for all sets.)
5798 				 */
5799 				if (mdanyrpcerror(ep)) {
5800 					mde_perror(ep, "");
5801 					return (205);
5802 				}
5803 				/*
5804 				 * STALE and ACCOK failures aren't true
5805 				 * failures.  STALE means that <50% mddbs
5806 				 * are available. ACCOK means that the
5807 				 * mediator provided the extra vote.
5808 				 * If a true failure, then print messasge
5809 				 * and withdraw node from set in order to
5810 				 * cleanup from failed join attempt.
5811 				 */
5812 				if ((!mdismddberror(ep, MDE_DB_STALE)) &&
5813 				    (!mdismddberror(ep, MDE_DB_ACCOK))) {
5814 					mde_perror(ep,
5815 					    "WARNING: Unable to join node %s "
5816 					    "to set %s", nd->nd_nodename,
5817 					    sp->setname);
5818 					mdclrerror(ep);
5819 					if (clnt_withdrawset(nd->nd_nodename,
5820 					    sp, &xep))
5821 						mdclrerror(&xep);
5822 					nd = nd->nd_next;
5823 					continue;
5824 				}
5825 			}
5826 			/* Set owner flag even if STALE or ACCOK */
5827 			nd->nd_flags |= MD_MN_NODE_OWN;
5828 		}
5829 		nd = nd->nd_next;
5830 	}
5831 	/*
5832 	 * Resume I/Os if suspended above.
5833 	 */
5834 	if (susp_res_flag) {
5835 		nd = sd->sd_nodelist;
5836 		while (nd) {
5837 			/*
5838 			 * Skip non-alive and non-owner nodes
5839 			 * (this list doesn't include any of
5840 			 * the nodes that were joined).
5841 			 */
5842 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5843 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5844 				nd = nd->nd_next;
5845 				continue;
5846 			}
5847 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5848 			    MN_RES_IO, ep)) {
5849 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5850 				    "Unable to resume I/O on node %s"
5851 				    " in set %s"), nd->nd_nodename,
5852 				    sp->setname);
5853 
5854 				/*
5855 				 * If an RPC failure then don't do any
5856 				 * more RPC calls, since one timeout is enough
5857 				 * to endure.  If RPC failure to another node,
5858 				 * return 205.  If RPC failure to my node,
5859 				 * return -1.
5860 				 * (Reconfig return step will resume I/Os
5861 				 * for all sets.)
5862 				 * If not an RPC failure, continue resuming the
5863 				 * rest of the nodes and then return -1.
5864 				 */
5865 				if (mdanyrpcerror(ep)) {
5866 					if (sd->sd_mn_mynode->nd_nodeid ==
5867 					    nd->nd_nodeid) {
5868 						return (-1);
5869 					} else {
5870 						return (205);
5871 					}
5872 				}
5873 
5874 				/*
5875 				 * If not an RPC error, continue resuming rest
5876 				 * of nodes, ignoring any failures except for
5877 				 * an RPC failure which constitutes an
5878 				 * immediate exit.
5879 				 * Start in middle of list with failing node.
5880 				 */
5881 				nd2 = nd->nd_next;
5882 				while (nd2) {
5883 					/* Skip non-owner nodes  */
5884 					if ((!(nd2->nd_flags &
5885 					    MD_MN_NODE_ALIVE)) ||
5886 					    (!(nd2->nd_flags &
5887 					    MD_MN_NODE_OWN))) {
5888 						nd2 = nd2->nd_next;
5889 						continue;
5890 					}
5891 					(void) (clnt_mn_susp_res_io(
5892 					    nd2->nd_nodename, sp->setno,
5893 					    MN_RES_IO, &xep));
5894 					if (mdanyrpcerror(&xep)) {
5895 						return (-1);
5896 					}
5897 					nd2 = nd2->nd_next;
5898 				}
5899 			}
5900 			nd = nd->nd_next;
5901 		}
5902 	}
5903 
5904 	nd = sd->sd_nodelist;
5905 	while (nd) {
5906 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
5907 			nd = nd->nd_next;
5908 			continue;
5909 		}
5910 		/*
5911 		 * If 1 node fails - go ahead and update the rest except
5912 		 * in the case of an RPC failure, fail immediately.
5913 		 */
5914 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5915 		    sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
5916 			/* RPC failure to another node */
5917 			if (mdanyrpcerror(ep)) {
5918 				return (205);
5919 			}
5920 			nd = nd->nd_next;
5921 			rval = -1;
5922 			continue;
5923 		}
5924 		nd = nd->nd_next;
5925 	}
5926 
5927 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5928 	    "Join of all nodes completed for set %s: %s"),
5929 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5930 
5931 	return (rval);
5932 }
5933