xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_set.c (revision 55553f719b521a0bb4deab6efc944cd30c1a56aa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 /*
37  * Metadevice diskset interfaces
38  */
39 
40 #include "meta_set_prv.h"
41 #include <meta.h>
42 #include <metad.h>
43 #include <mdmn_changelog.h>
44 #include <sys/lvm/md_crc.h>
45 #include <sys/utsname.h>
46 #include <sdssc.h>
47 
48 #include <sys/sysevent/eventdefs.h>
49 #include <sys/sysevent/svm.h>
50 extern	char	*blkname(char *);
51 
52 static md_drive_desc *
53 dr2drivedesc(
54 	mdsetname_t	*sp,
55 	side_t		sideno,
56 	int		flags,
57 	md_error_t	*ep
58 )
59 {
60 	md_set_record	*sr;
61 	md_drive_record	*dr;
62 	mddrivename_t	*dnp;
63 	md_drive_desc	*dd_head = NULL;
64 	md_set_desc	*sd;
65 
66 	if (flags & MD_BYPASS_DAEMON) {
67 		if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
68 			return (NULL);
69 		sd = metaget_setdesc(sp, ep);
70 		sideno = getnodeside(mynode(), sd);
71 		sp = metafakesetname(sp->setno, sr->sr_setname);
72 	} else {
73 		if ((sr = getsetbyname(sp->setname, ep)) == NULL)
74 			return (NULL);
75 	}
76 
77 	assert(sideno != MD_SIDEWILD);
78 
79 	/*
80 	 * WARNING:
81 	 * The act of getting the dnp from the namespace means that we
82 	 * will get the devid of the disk as recorded in the namespace.
83 	 * This devid has the potential to be stale if the disk is being
84 	 * replaced via a rebind, this means that any code that relies
85 	 * on any of the dnp information should take the appropriate action
86 	 * to preserve that information. For example in the rebind code the
87 	 * devid of the new disk is saved off and then copied back in once
88 	 * the code that has called this function has completed.
89 	 */
90 	for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
91 		if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
92 		    flags, ep)) == NULL) {
93 			if (!(flags & MD_BYPASS_DAEMON))
94 				free_sr(sr);
95 			metafreedrivedesc(&dd_head);
96 			return (NULL);
97 		}
98 
99 		(void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
100 		    dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
101 	}
102 
103 	if (!(flags & MD_BYPASS_DAEMON)) {
104 		free_sr(sr);
105 	}
106 	return (dd_head);
107 }
108 
109 static int
110 get_sidenmlist(
111 	mdsetname_t	*sp,
112 	mddrivename_t	*dnp,
113 	md_error_t	*ep
114 )
115 {
116 	md_set_desc	*sd;
117 	mdsidenames_t	*sn, **sn_next;
118 	int		i;
119 
120 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
121 		return (-1);
122 
123 	metaflushsidenames(dnp);
124 	sn_next = &dnp->side_names;
125 	if (MD_MNSET_DESC(sd)) {
126 		/*
127 		 * Only get sidenames for this node since
128 		 * that is the only side information stored in
129 		 * the local mddb for a multi-node diskset.
130 		 */
131 		if (sd->sd_mn_mynode) {
132 			sn = Zalloc(sizeof (*sn));
133 			sn->sideno = sd->sd_mn_mynode->nd_nodeid;
134 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
135 			    sn->sideno, dnp->side_names_key, &sn->dname,
136 			    &sn->mnum, NULL, ep)) == NULL) {
137 				if (sn->dname != NULL)
138 					Free(sn->dname);
139 				Free(sn);
140 				return (-1);
141 			}
142 
143 			/* Add to the end of the linked list */
144 			assert(*sn_next == NULL);
145 			*sn_next = sn;
146 			sn_next = &sn->next;
147 		}
148 	} else {
149 		for (i = 0; i < MD_MAXSIDES; i++) {
150 			/* Skip empty slots */
151 			if (sd->sd_nodes[i][0] == '\0')
152 				continue;
153 
154 			sn = Zalloc(sizeof (*sn));
155 			sn->sideno = i;
156 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
157 			    i+SKEW, dnp->side_names_key, &sn->dname,
158 			    &sn->mnum, NULL, ep)) == NULL) {
159 				/*
160 				 * It is possible that during the add of a
161 				 * host to have a 'missing' side as the side
162 				 * for this disk will be added later. So ignore
163 				 * the error. The 'missing' side will be added
164 				 * once the addhosts process has completed.
165 				 */
166 				if (mdissyserror(ep, ENOENT)) {
167 					mdclrerror(ep);
168 					Free(sn);
169 					continue;
170 				}
171 
172 				if (sn->dname != NULL)
173 					Free(sn->dname);
174 				Free(sn);
175 				return (-1);
176 			}
177 
178 			/* Add to the end of the linked list */
179 			assert(*sn_next == NULL);
180 			*sn_next = sn;
181 			sn_next = &sn->next;
182 		}
183 	}
184 
185 	return (0);
186 }
187 
188 static md_drive_desc *
189 rl_to_dd(
190 	mdsetname_t		*sp,
191 	md_replicalist_t	*rlp,
192 	md_error_t		*ep
193 )
194 {
195 	md_replicalist_t	*rl;
196 	md_replica_t		*r;
197 	md_drive_desc		*dd = NULL;
198 	md_drive_desc		*d;
199 	int			found;
200 	md_set_desc		*sd;
201 	daddr_t			nblks = 0;
202 
203 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
204 		return (NULL);
205 
206 	/* find the smallest existing replica */
207 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
208 		r = rl->rl_repp;
209 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
210 	}
211 
212 	if (nblks <= 0)
213 		nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
214 
215 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
216 		r = rl->rl_repp;
217 
218 		found = 0;
219 		for (d = dd; d != NULL; d = d->dd_next) {
220 			if (strcmp(r->r_namep->drivenamep->cname,
221 			    d->dd_dnp->cname) == 0) {
222 				found = 1;
223 				dd->dd_dbcnt++;
224 				break;
225 			}
226 		}
227 
228 		if (! found)
229 			(void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
230 			    1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
231 	}
232 
233 	return (dd);
234 }
235 
236 /*
237  * Exported Entry Points
238  */
239 
240 set_t
241 get_max_sets(md_error_t *ep)
242 {
243 
244 	static set_t		max_sets = 0;
245 
246 	if (max_sets == 0)
247 		if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
248 			return (0);
249 
250 	return (max_sets);
251 }
252 
253 int
254 get_max_meds(md_error_t *ep)
255 {
256 	static int		max_meds = 0;
257 
258 	if (max_meds == 0)
259 		if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
260 			return (0);
261 
262 	return (max_meds);
263 }
264 
265 side_t
266 getmyside(mdsetname_t *sp, md_error_t *ep)
267 {
268 	md_set_desc		*sd;
269 	char 			*node = NULL;
270 	side_t			sideno;
271 
272 	if (sp->setno == 0)
273 		return (0);
274 
275 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
276 		return (MD_SIDEWILD);
277 
278 	node = mynode();
279 
280 	assert(node != NULL);
281 
282 	sideno = getnodeside(node, sd);
283 
284 	if (sideno != MD_SIDEWILD)
285 		return (sideno);
286 
287 	return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
288 }
289 
290 /*
291  * get set info from name
292  */
293 md_set_record *
294 getsetbyname(char *setname, md_error_t *ep)
295 {
296 	md_set_record		*sr = NULL;
297 	md_mnset_record		*mnsr = NULL;
298 	char			*p;
299 	size_t			len;
300 
301 	/* get set info from daemon */
302 	if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
303 		return (NULL);
304 	if (sr != NULL) {
305 		/*
306 		 * Returned record could be for a multi-node set or a
307 		 * non-multi-node set.
308 		 */
309 		if (MD_MNSET_REC(sr)) {
310 			/*
311 			 * Record is for a multi-node set.  Reissue call
312 			 * to get mnset information.  Need to free
313 			 * record as if a non-multi-node set record since
314 			 * that is what clnt_getset gave us.  If in
315 			 * the daemon, don't free since this is a pointer
316 			 * into the setrecords array.
317 			 */
318 			if (! md_in_daemon) {
319 				sr->sr_flags &= ~MD_SR_MN;
320 				free_sr(sr);
321 			}
322 			if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
323 			    ep) == -1)
324 				return (NULL);
325 			if (mnsr != NULL)
326 				return ((struct md_set_record *)mnsr);
327 		} else {
328 			return (sr);
329 		}
330 	}
331 
332 	/* no such set */
333 	len = strlen(setname) + 30;
334 	p = Malloc(len);
335 	(void) snprintf(p, len, "setname \"%s\"", setname);
336 	(void) mderror(ep, MDE_NO_SET, p);
337 	Free(p);
338 	return (NULL);
339 }
340 
341 /*
342  * get set info from number
343  */
344 md_set_record *
345 getsetbynum(set_t setno, md_error_t *ep)
346 {
347 	md_set_record		*sr;
348 	md_mnset_record		*mnsr = NULL;
349 	char			buf[100];
350 
351 	if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
352 		return (NULL);
353 
354 	if (sr != NULL) {
355 		/*
356 		 * Record is for a multi-node set.  Reissue call
357 		 * to get mnset information.  Need to free
358 		 * record as if a non-multi-node set record since
359 		 * that is what clnt_getset gave us.  If in
360 		 * the daemon, don't free since this is a pointer
361 		 * into the setrecords array.
362 		 */
363 		if (MD_MNSET_REC(sr)) {
364 			/*
365 			 * Record is for a multi-node set.  Reissue call
366 			 * to get mnset information.
367 			 */
368 			if (! md_in_daemon) {
369 				sr->sr_flags &= ~MD_SR_MN;
370 				free_sr(sr);
371 			}
372 			if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
373 			    ep) == -1)
374 				return (NULL);
375 			if (mnsr != NULL)
376 				return ((struct md_set_record *)mnsr);
377 		} else {
378 			return (sr);
379 		}
380 	}
381 
382 	(void) sprintf(buf, "setno %u", setno);
383 	(void) mderror(ep, MDE_NO_SET, buf);
384 	return (NULL);
385 }
386 
387 int
388 meta_check_drive_inuse(
389 	mdsetname_t	*sp,
390 	mddrivename_t	*dnp,
391 	int		check_db,
392 	md_error_t	*ep
393 )
394 {
395 	mdnamelist_t	*nlp = NULL;
396 	mdnamelist_t	*p;
397 	int		rval = 0;
398 
399 	/* get all underlying partitions */
400 	if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
401 		return (-1);
402 
403 	/* search for drive */
404 	for (p = nlp; (p != NULL); p = p->next) {
405 		mdname_t	*np = p->namep;
406 
407 		if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
408 			rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
409 			    NULL, dnp->cname, sp->setname));
410 			break;
411 		}
412 	}
413 
414 	/* cleanup, return success */
415 	metafreenamelist(nlp);
416 	return (rval);
417 }
418 
419 /*
420  * simple check for ownership
421  */
422 int
423 meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
424 {
425 	int			ownset;
426 	md_set_desc		*sd;
427 	md_drive_desc		*dd;
428 	md_replicalist_t	*rlp = NULL;
429 	md_error_t		xep = mdnullerror;
430 
431 	if (metaislocalset(sp))
432 		return (0);
433 
434 	ownset = own_set(sp, NULL, TRUE, ep);
435 	if (! mdisok(ep))
436 		return (-1);
437 
438 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
439 		return (-1);
440 
441 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
442 	if (! mdisok(ep))
443 		return (-1);
444 
445 	/* If we have no drive descriptors, check for no ownership */
446 	if (dd == NULL) {
447 		if (ownset == MD_SETOWNER_NONE)
448 			return (0);
449 
450 		/* If ownership somehow has come to exist, we must clean up */
451 
452 		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
453 		    &xep) < 0)
454 			mdclrerror(&xep);
455 
456 		if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
457 			if (! mdisok(&xep))
458 				mdclrerror(&xep);
459 
460 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
461 			if (rel_own_bydd(sp, dd, TRUE, &xep))
462 				mdclrerror(&xep);
463 		}
464 
465 		if (halt_set(sp, &xep))
466 			mdclrerror(&xep);
467 
468 		metafreereplicalist(rlp);
469 
470 		metafreedrivedesc(&dd);
471 
472 		return (0);
473 	}
474 
475 	metafreedrivedesc(&sd->sd_drvs);
476 
477 	if (ownset == MD_SETOWNER_YES)
478 		return (0);
479 
480 	return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
481 	    sp->setname));
482 }
483 
484 /*
485  * simple check for ownership
486  */
487 int
488 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
489 {
490 	md_set_desc	*sd;
491 	md_drive_desc	*dd;
492 	int		bool;
493 
494 	if (metaislocalset(sp))
495 		return (0);
496 
497 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
498 		return (-1);
499 
500 	if (getnodeside(hostname, sd) == MD_SIDEWILD)
501 		return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
502 		    hostname, NULL, sp->setname));
503 
504 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
505 	if (! mdisok(ep))
506 		return (-1);
507 
508 	if (clnt_ownset(hostname, sp, &bool, ep) == -1)
509 		return (-1);
510 
511 	if (dd == NULL)
512 		return (0);
513 
514 	metafreedrivedesc(&sd->sd_drvs);
515 
516 	if (bool == TRUE)
517 		return (0);
518 
519 	return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
520 	    sp->setname));
521 }
522 
523 /*
524  * Function that determines if a node is in the multinode diskset
525  * membership list.  Calling node passes in node to be checked and
526  * the nodelist as returned from meta_read_nodelist.  This routine
527  * anticipates being called many times using the same diskset membership
528  * list which is why the alloc and free of the diskset membership list
529  * is left to the calling routine.
530  * Returns:
531  *	1 - if a member
532  *	0 - not a member
533  */
534 int
535 meta_is_member(
536 	char				*node_name,
537 	md_mn_nodeid_t			node_id,
538 	mndiskset_membershiplist_t	*nl
539 )
540 {
541 	mndiskset_membershiplist_t	*nl2;
542 	int				flag_check_name;
543 
544 	if (node_id != 0)
545 		flag_check_name = 0;
546 	else if (node_name != NULL)
547 		flag_check_name = 1;
548 	else
549 		return (0);
550 
551 	nl2 = nl;
552 	while (nl2) {
553 		if (flag_check_name) {
554 			/* Compare given name against name in member list */
555 			if (strcmp(nl2->msl_node_name, node_name) == 0)
556 				break;
557 		} else {
558 			/* Compare given nodeid against nodeid in member list */
559 			if (nl2->msl_node_id == node_id)
560 				break;
561 		}
562 		nl2 = nl2->next;
563 	}
564 	/* No match found in member list */
565 	if (nl2 == NULL) {
566 		return (0);
567 	}
568 	/* Return 1 if node is in member list */
569 	return (1);
570 }
571 
572 /*
573  * meta_getnext_devinfo should go to the host that
574  * has the device, to return the device name, driver name, minor num.
575  * We can take the big cheat for now, since it is a requirement
576  * that the device names and device numbers are the same, and
577  * just get the info locally.
578  *
579  * This routine is very similar to meta_getnextside_devinfo except
580  * that the specific side to be used is being passed in.
581  *
582  * Exit status:
583  *	 0 - No more side info to return
584  *	 1 - More side info's to return
585  *	-1 - An error has been detected
586  */
587 /*ARGSUSED*/
588 int
589 meta_getside_devinfo(
590 	mdsetname_t	*sp,		/* for this set */
591 	char		*bname,		/* local block name (myside) */
592 	side_t		sideno,		/* sideno */
593 	char		**ret_bname,	/* block device name of returned side */
594 	char		**ret_dname,	/* driver name of returned side */
595 	minor_t		*ret_mnum,	/* minor number of returned side */
596 	md_error_t	*ep
597 )
598 {
599 	mdname_t	*np;
600 
601 	if (ret_bname != NULL)
602 		*ret_bname = NULL;
603 	if (ret_dname != NULL)
604 		*ret_dname = NULL;
605 	if (ret_mnum != NULL)
606 		*ret_mnum = NODEV32;
607 
608 
609 	if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL)
610 		return (-1);
611 
612 /*
613  * NOTE (future) - There will be more work here once devids are integrated
614  * into disksets.  Then the side should be used to find the correct
615  * host and the b/d names should be gotten from that host.
616  */
617 
618 	/*
619 	 * Return the side info.
620 	 */
621 	if (ret_bname != NULL)
622 		*ret_bname = Strdup(np->bname);
623 
624 	if (ret_dname != NULL) {
625 		mdcinfo_t	*cinfo;
626 
627 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
628 			return (-1);
629 
630 		*ret_dname = Strdup(cinfo->dname);
631 	}
632 
633 	if (ret_mnum != NULL)
634 		*ret_mnum = meta_getminor(np->dev);
635 
636 	return (1);
637 }
638 
639 /*
640  * Get the information on the device from the remote node using the devid
641  * of the disk.
642  *
643  * Exit status:
644  *	 0 - No more side info to return
645  *	 1 - More side info's to return
646  *	-1 - An error has been detected
647  */
648 int
649 meta_getnextside_devinfo(
650 	mdsetname_t	*sp,		/* for this set */
651 	char		*bname,		/* local block name (myside) */
652 	side_t		*sideno,	/* previous sideno & returned sideno */
653 	char		**ret_bname,	/* block device name of returned side */
654 	char		**ret_dname,	/* driver name of returned side */
655 	minor_t		*ret_mnum,	/* minor number of returned side */
656 	md_error_t	*ep
657 )
658 {
659 	md_set_desc	*sd;
660 	int		i;
661 	mdname_t	*np;
662 	mddrivename_t	*dnp;
663 	char		*devidstr = NULL;
664 	int		devidstrlen;
665 	md_dev64_t	retdev = NODEV64;
666 	char		*ret_devname = NULL;
667 	char		*ret_blkdevname = NULL;
668 	char		*ret_driver = NULL;
669 	char		*nodename;
670 	int		fd;
671 	int		ret = -1;
672 	char		*minor_name = NULL;
673 	md_mnnode_desc	*nd;
674 
675 
676 	if (ret_bname != NULL)
677 		*ret_bname = NULL;
678 	if (ret_dname != NULL)
679 		*ret_dname = NULL;
680 	if (ret_mnum != NULL)
681 		*ret_mnum = NODEV32;
682 
683 	if (metaislocalset(sp)) {
684 		/* no more sides - we are done */
685 		if (*sideno != MD_SIDEWILD)
686 			return (0);
687 
688 		/* First time through -  set up return sideno */
689 		*sideno = 0;
690 	} else {
691 
692 		/*
693 		 * Find the next sideno, starting after the one given.
694 		 */
695 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
696 			return (-1);
697 
698 		if (MD_MNSET_DESC(sd)) {
699 			nd = sd->sd_nodelist;
700 			if ((*sideno == MD_SIDEWILD) &&
701 			    (nd != (struct md_mnnode_desc *)NULL)) {
702 				*sideno = nd->nd_nodeid;
703 			} else {
704 				while (nd) {
705 					/*
706 					 * Found given sideno, now find
707 					 * next sideno, if there is one.
708 					 */
709 					if ((*sideno == nd->nd_nodeid) &&
710 					    (nd->nd_next !=
711 					    (struct md_mnnode_desc *)NULL)) {
712 						*sideno =
713 						    nd->nd_next->nd_nodeid;
714 						break;
715 					}
716 					nd = nd->nd_next;
717 				}
718 				if (nd == NULL) {
719 					return (0);
720 				}
721 			}
722 			if (*sideno == MD_SIDEWILD)
723 				return (0);
724 		} else {
725 			for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
726 				/* Find next full slot */
727 				if (sd->sd_nodes[i][0] != '\0')
728 					break;
729 
730 			/* No more sides - we are done */
731 			if (i == MD_MAXSIDES)
732 				return (0);
733 
734 			/* Set up the return sideno */
735 			*sideno = i;
736 			nodename = (char *)sd->sd_nodes[i];
737 		}
738 	}
739 
740 	/*
741 	 * Need to pass the node the devid of the disk and get it to
742 	 * send back the details of the disk from that side.
743 	 */
744 	if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL)
745 		return (-1);
746 
747 	dnp = np->drivenamep;
748 
749 	/*
750 	 * By default, set up the parameters so that they are copied out.
751 	 */
752 	if (ret_bname != NULL)
753 		*ret_bname = Strdup(np->bname);
754 
755 	if (ret_dname != NULL) {
756 		mdcinfo_t	*cinfo;
757 
758 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
759 			return (-1);
760 
761 		*ret_dname = Strdup(cinfo->dname);
762 	}
763 
764 	if (ret_mnum != NULL)
765 		*ret_mnum = meta_getminor(np->dev);
766 
767 	/*
768 	 * Try some optimization. If this is the local set or the device
769 	 * is a metadevice then just copy the information. If the device
770 	 * does not have a devid (due to not having a minor name) then
771 	 * fall back to the pre-devid behaviour of copying the information
772 	 * on the device: this is okay because the sanity checks before this
773 	 * call would have found any issues with the device. If it's a
774 	 * multi-node diskset also just return ie. copy.
775 	 */
776 	if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
777 	    (MD_MNSET_DESC(sd)))
778 		return (1);
779 
780 	if (np->minor_name == (char *)NULL) {
781 		/*
782 		 * Have to get the minor name then. The slice should exist
783 		 * on the disk because it will have already been repartitioned
784 		 * up prior to getting to this point.
785 		 */
786 		if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
787 			(void) mdsyserror(ep, errno, np->bname);
788 			return (-1);
789 		}
790 		(void) devid_get_minor_name(fd, &minor_name);
791 		np->minor_name = Strdup(minor_name);
792 		devid_str_free(minor_name);
793 		(void) close(fd);
794 	}
795 
796 	/* allocate extra space for "/" and NULL hence +2 */
797 	devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
798 	devidstr = (char *)Malloc(devidstrlen);
799 
800 	/*
801 	 * As a minor name is supplied then the ret_devname will be
802 	 * appropriate to that minor_name and in this case it will be
803 	 * a block device ie /dev/dsk.
804 	 */
805 	(void) snprintf(devidstr, devidstrlen,
806 	    "%s/%s", dnp->devid, np->minor_name);
807 
808 	ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
809 	    np->bname, &ret_devname, &ret_driver, ep);
810 
811 	Free(devidstr);
812 
813 	/*
814 	 * If the other side is not running device id in disksets,
815 	 * 'ret' is set to ENOTSUP in which case we fallback to
816 	 * the existing behaviour
817 	 */
818 	if (ret == ENOTSUP)
819 		return (1);
820 	else if (ret == -1)
821 		return (-1);
822 
823 	/*
824 	 * ret_devname comes from the rpc call and is a
825 	 * raw device name. We need to make this into a
826 	 * block device via blkname for further processing.
827 	 * Unfortunately, when our device id isn't found in
828 	 * the system, the rpc call will return a " " in
829 	 * ret_devname in which case we need to fill that in
830 	 * as ret_blkname because blkname of " " returns NULL.
831 	 */
832 	if (ret_bname != NULL && ret_devname != NULL) {
833 		ret_blkdevname = blkname(ret_devname);
834 		if (ret_blkdevname == NULL)
835 			*ret_bname = Strdup(ret_devname);
836 		else
837 			*ret_bname = Strdup(ret_blkdevname);
838 	}
839 
840 	if (ret_dname != NULL && ret_driver != NULL)
841 		*ret_dname = Strdup(ret_driver);
842 
843 	if (ret_mnum != NULL)
844 		*ret_mnum = meta_getminor(retdev);
845 
846 	return (1);
847 }
848 
849 int
850 meta_is_drive_in_anyset(
851 	mddrivename_t	*dnp,
852 	mdsetname_t	**spp,
853 	int		bypass_daemon,
854 	md_error_t 	*ep
855 )
856 {
857 	set_t		setno;
858 	mdsetname_t	*this_sp;
859 	int		is_it;
860 	set_t		max_sets;
861 
862 	if ((max_sets = get_max_sets(ep)) == 0)
863 		return (-1);
864 
865 	assert(spp != NULL);
866 	*spp = NULL;
867 
868 	for (setno = 1; setno < max_sets; setno++) {
869 		if (!bypass_daemon) {
870 			if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
871 				if (mdismddberror(ep, MDE_DB_NODB)) {
872 					mdclrerror(ep);
873 					return (0);
874 				}
875 				if (mdiserror(ep, MDE_NO_SET)) {
876 					mdclrerror(ep);
877 					continue;
878 				}
879 				return (-1);
880 			}
881 		} else
882 			this_sp = metafakesetname(setno, NULL);
883 
884 		if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
885 		    bypass_daemon, ep)) == -1) {
886 			if (mdiserror(ep, MDE_NO_SET)) {
887 				mdclrerror(ep);
888 				continue;
889 			}
890 			return (-1);
891 		}
892 		if (is_it) {
893 			*spp = this_sp;
894 			return (0);
895 		}
896 	}
897 	return (0);
898 }
899 
900 int
901 meta_is_drive_in_thisset(
902 	mdsetname_t	*sp,
903 	mddrivename_t	*dnp,
904 	int		bypass_daemon,
905 	md_error_t	*ep
906 )
907 {
908 	md_drive_desc	*dd, *p;
909 
910 	if (bypass_daemon)
911 		dd = dr2drivedesc(sp, MD_SIDEWILD,
912 		    (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
913 	else
914 		dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
915 
916 	if (dd == NULL) {
917 		if (! mdisok(ep))
918 			return (-1);
919 		return (0);
920 	}
921 
922 
923 	for (p = dd; p != NULL; p = p->dd_next)
924 		if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
925 			return (1);
926 	return (0);
927 }
928 
929 /*
930  * Check to see if devid is in use in any diskset.
931  * This is used in the case when a partial diskset is being imported
932  * to make sure that the unvailable drive isn't already in use in an
933  * already imported partial diskset.  Can't check on the cname since the
934  * unavailable disk's cname is from the previous system and may collide
935  * with a cname on this system.
936  * Return values:
937  *	1: devid has been found in a diskset
938  *	0: devid not found in any diskset
939  */
940 int
941 meta_is_devid_in_anyset(
942 	void		*devid,
943 	mdsetname_t	**spp,
944 	md_error_t 	*ep
945 )
946 {
947 	set_t		setno;
948 	mdsetname_t	*this_sp;
949 	int		is_it;
950 	set_t		max_sets;
951 
952 	if ((max_sets = get_max_sets(ep)) == 0)
953 		return (-1);
954 
955 	assert(spp != NULL);
956 	*spp = NULL;
957 
958 	for (setno = 1; setno < max_sets; setno++) {
959 		if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
960 			if (mdismddberror(ep, MDE_DB_NODB)) {
961 				mdclrerror(ep);
962 				return (0);
963 			}
964 			if (mdiserror(ep, MDE_NO_SET)) {
965 				mdclrerror(ep);
966 				continue;
967 			}
968 			return (-1);
969 		}
970 
971 		if ((is_it = meta_is_devid_in_thisset(this_sp,
972 		    devid, ep)) == -1) {
973 			if (mdiserror(ep, MDE_NO_SET)) {
974 				mdclrerror(ep);
975 				continue;
976 			}
977 			return (-1);
978 		}
979 		if (is_it) {
980 			*spp = this_sp;
981 			return (0);
982 		}
983 	}
984 	return (0);
985 }
986 
987 int
988 meta_is_devid_in_thisset(
989 	mdsetname_t	*sp,
990 	void		*devid,
991 	md_error_t	*ep
992 )
993 {
994 	md_drive_desc	*dd, *p;
995 	ddi_devid_t	dd_devid;
996 
997 	dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
998 	if (dd == NULL) {
999 		if (! mdisok(ep))
1000 			return (-1);
1001 		return (0);
1002 	}
1003 
1004 	for (p = dd; p != NULL; p = p->dd_next) {
1005 		if (p->dd_dnp->devid == NULL)
1006 			continue;
1007 		(void) devid_str_decode(p->dd_dnp->devid,
1008 		    &dd_devid, NULL);
1009 		if (dd_devid == NULL)
1010 			continue;
1011 		if (devid_compare(devid, dd_devid) == 0) {
1012 			devid_free(dd_devid);
1013 			return (1);
1014 		}
1015 		devid_free(dd_devid);
1016 	}
1017 	return (0);
1018 }
1019 
1020 int
1021 meta_set_balance(
1022 	mdsetname_t		*sp,
1023 	md_error_t		*ep
1024 )
1025 {
1026 	md_set_desc		*sd;
1027 	md_drive_desc		*dd, *curdd;
1028 	daddr_t			dbsize;
1029 	daddr_t			nblks;
1030 	int			i;
1031 	int			rval = 0;
1032 	sigset_t		oldsigs;
1033 	md_setkey_t		*cl_sk;
1034 	md_error_t		xep = mdnullerror;
1035 	md_mnnode_desc		*nd;
1036 	int			suspend1_flag = 0;
1037 
1038 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1039 		return (-1);
1040 
1041 	dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
1042 
1043 	/* Make sure we own the set */
1044 	if (meta_check_ownership(sp, ep) != 0)
1045 		return (-1);
1046 
1047 	/* END CHECK CODE */
1048 
1049 	/*
1050 	 * Get drive descriptors for the drives that are currently in the set.
1051 	 */
1052 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
1053 
1054 	if (! mdisok(ep))
1055 		return (-1);
1056 
1057 	/* Find the minimum replica size in use is or use the default */
1058 	if ((nblks = meta_db_minreplica(sp, ep)) < 0)
1059 		mdclrerror(ep);
1060 	else
1061 		dbsize = nblks;	/* adjust replica size */
1062 
1063 	/* Make sure we are blocking all signals */
1064 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
1065 		mdclrerror(&xep);
1066 
1067 	/*
1068 	 * Lock the set on current set members.
1069 	 * For MN diskset lock_set and SUSPEND are used to protect against
1070 	 * other meta* commands running on the other nodes.
1071 	 */
1072 	if (MD_MNSET_DESC(sd)) {
1073 		nd = sd->sd_nodelist;
1074 		while (nd) {
1075 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1076 				nd = nd->nd_next;
1077 				continue;
1078 			}
1079 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1080 				rval = -1;
1081 				goto out;
1082 			}
1083 			nd = nd->nd_next;
1084 		}
1085 		/*
1086 		 * Lock out other meta* commands by suspending
1087 		 * class 1 messages across the diskset.
1088 		 */
1089 		nd = sd->sd_nodelist;
1090 		while (nd) {
1091 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1092 				nd = nd->nd_next;
1093 				continue;
1094 			}
1095 			if (clnt_mdcommdctl(nd->nd_nodename,
1096 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1097 			    MD_MSCF_NO_FLAGS, ep)) {
1098 				rval = -1;
1099 				goto out;
1100 			}
1101 			suspend1_flag = 1;
1102 			nd = nd->nd_next;
1103 		}
1104 	} else {
1105 		for (i = 0; i < MD_MAXSIDES; i++) {
1106 			/* Skip empty slots */
1107 			if (sd->sd_nodes[i][0] == '\0') continue;
1108 
1109 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1110 				rval = -1;
1111 				goto out;
1112 			}
1113 		}
1114 	}
1115 
1116 	/* We are not adding or deleting any drives, just balancing */
1117 	dd = NULL;
1118 
1119 	/*
1120 	 * Balance the DB's according to the list of existing drives and the
1121 	 * list of added drives.
1122 	 */
1123 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
1124 		goto out;
1125 
1126 out:
1127 	/*
1128 	 * Unlock diskset by resuming class 1 messages across the diskset.
1129 	 * Just resume all classes so that resume is the same whether
1130 	 * just one class was locked or all classes were locked.
1131 	 */
1132 	if (suspend1_flag) {
1133 		nd = sd->sd_nodelist;
1134 		while (nd) {
1135 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1136 				nd = nd->nd_next;
1137 				continue;
1138 			}
1139 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1140 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1141 				/*
1142 				 * We are here because we failed to resume
1143 				 * rpc.mdcommd.  However we potentially have
1144 				 * an error from the previous call
1145 				 * (meta_db_balance). If the previous call
1146 				 * did fail,  we capture that error and
1147 				 * generate a perror withthe string,
1148 				 * "Unable to resume...".
1149 				 * Setting rval to -1 ensures that in the
1150 				 * next iteration of the loop, ep is not
1151 				 * clobbered.
1152 				 */
1153 				if (rval == 0)
1154 					(void) mdstealerror(ep, &xep);
1155 				else
1156 					mdclrerror(&xep);
1157 				rval = -1;
1158 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1159 				    "Unable to resume rpc.mdcommd."));
1160 			}
1161 			nd = nd->nd_next;
1162 		}
1163 	}
1164 
1165 	/* Unlock the set */
1166 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1167 	if (MD_MNSET_DESC(sd)) {
1168 		nd = sd->sd_nodelist;
1169 		while (nd) {
1170 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1171 				nd = nd->nd_next;
1172 				continue;
1173 			}
1174 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1175 				if (rval == 0)
1176 					(void) mdstealerror(ep, &xep);
1177 				else
1178 					mdclrerror(&xep);
1179 				rval = -1;
1180 			}
1181 			nd = nd->nd_next;
1182 		}
1183 	} else {
1184 		for (i = 0; i < MD_MAXSIDES; i++) {
1185 			/* Skip empty slots */
1186 			if (sd->sd_nodes[i][0] == '\0')
1187 				continue;
1188 
1189 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1190 				if (rval == 0)
1191 					(void) mdstealerror(ep, &xep);
1192 				rval = -1;
1193 			}
1194 		}
1195 	}
1196 
1197 	/* release signals back to what they were on entry */
1198 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1199 		mdclrerror(&xep);
1200 
1201 	cl_set_setkey(NULL);
1202 
1203 	metaflushsetname(sp);
1204 
1205 	return (rval);
1206 }
1207 
1208 int
1209 meta_set_destroy(
1210 	mdsetname_t	*sp,
1211 	int		lock_set,
1212 	md_error_t	*ep
1213 )
1214 {
1215 	int		i;
1216 	med_rec_t	medr;
1217 	md_set_desc	*sd;
1218 	md_drive_desc	*dd, *p, *p1;
1219 	mddrivename_t	*dnp;
1220 	mdname_t	*np;
1221 	mdnamelist_t	*nlp = NULL;
1222 	int		num_users = 0;
1223 	int		has_set;
1224 	side_t		mysideno;
1225 	sigset_t	oldsigs;
1226 	md_error_t	xep = mdnullerror;
1227 	md_setkey_t	*cl_sk;
1228 	int		rval = 0;
1229 	int		delete_end = 1;
1230 
1231 	/* Make sure we are blocking all signals */
1232 	if (procsigs(TRUE, &oldsigs, ep) < 0)
1233 		return (-1);
1234 
1235 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1236 		if (! mdisok(ep))
1237 			rval = -1;
1238 		goto out;
1239 	}
1240 
1241 	/*
1242 	 * meta_set_destroy should not be called for a MN diskset.
1243 	 * This routine destroys a set without communicating this information
1244 	 * to the other nodes which would lead to an inconsistency in
1245 	 * the MN diskset.
1246 	 */
1247 	if (MD_MNSET_DESC(sd)) {
1248 		rval = -1;
1249 		goto out;
1250 	}
1251 
1252 	/* Continue if a traditional diskset */
1253 
1254 	/*
1255 	 * Check to see who has the set.  If we are not the last user of the
1256 	 * set, we will not touch the replicas.
1257 	 */
1258 	for (i = 0; i < MD_MAXSIDES; i++) {
1259 		/* Skip empty slots */
1260 		if (sd->sd_nodes[i][0] == '\0')
1261 			continue;
1262 
1263 		has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
1264 		    ep);
1265 
1266 		if (has_set < 0) {
1267 			mdclrerror(ep);
1268 		} else
1269 			num_users++;
1270 	}
1271 
1272 	if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
1273 		if (! mdisok(ep)) {
1274 			rval = -1;
1275 			goto out;
1276 		}
1277 	}
1278 
1279 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
1280 		rval = -1;
1281 		goto out;
1282 	}
1283 
1284 	if (lock_set == TRUE) {
1285 		/* Lock the set on our side */
1286 		if (clnt_lock_set(mynode(), sp, ep)) {
1287 			rval = -1;
1288 			goto out;
1289 		}
1290 	}
1291 
1292 	/*
1293 	 * A traditional diskset has no diskset stale information to send
1294 	 * since there can only be one owner node at a time.
1295 	 */
1296 	if (snarf_set(sp, FALSE, ep))
1297 		mdclrerror(ep);
1298 
1299 	if (dd != NULL) {
1300 		/*
1301 		 * Make sure that no drives are in use as parts of metadrives
1302 		 * or hot spare pools, this is one of the few error conditions
1303 		 * that will stop this routine, unless the environment has
1304 		 * META_DESTROY_SET_OK set, in which case, the operation will
1305 		 * proceed.
1306 		 */
1307 		if (getenv("META_DESTROY_SET_OK") == NULL) {
1308 			for (p = dd; p != NULL; p = p->dd_next) {
1309 				dnp = p->dd_dnp;
1310 
1311 				i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
1312 				if (i == -1) {
1313 					/* need xep - wire calls clear error */
1314 					i = metaget_setownership(sp, &xep);
1315 					if (i == -1) {
1316 						rval = -1;
1317 						goto out;
1318 					}
1319 
1320 					mysideno = getmyside(sp, &xep);
1321 
1322 					if (mysideno == MD_SIDEWILD) {
1323 						rval = -1;
1324 						goto out;
1325 					}
1326 
1327 					if (sd->sd_isown[mysideno] == FALSE)
1328 						if (halt_set(sp, &xep)) {
1329 							rval = -1;
1330 							goto out;
1331 						}
1332 
1333 					rval = -1;
1334 					goto out;
1335 				}
1336 			}
1337 		}
1338 
1339 		for (i = 0; i < MD_MAXSIDES; i++) {
1340 			/* Skip empty slots */
1341 			if (sd->sd_nodes[i][0] == '\0')
1342 				continue;
1343 
1344 			/* Skip non local nodes */
1345 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1346 				continue;
1347 
1348 			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
1349 				mdclrerror(ep);
1350 		}
1351 
1352 		/*
1353 		 * Go thru each drive and individually delete the replicas.
1354 		 * This way we can ignore individual errors.
1355 		 */
1356 		for (p = dd; p != NULL; p = p->dd_next) {
1357 			uint_t	rep_slice;
1358 
1359 			dnp = p->dd_dnp;
1360 			if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
1361 			    (((np = metaslicename(dnp, rep_slice, ep))
1362 			    == NULL) &&
1363 			    ((np = metaslicename(dnp, MD_SLICE0, ep))
1364 			    == NULL))) {
1365 				rval = -1;
1366 				goto out;
1367 			}
1368 
1369 			if ((np = metaslicename(dnp,
1370 			    rep_slice, ep)) == NULL) {
1371 				if ((np = metaslicename(dnp,
1372 				    MD_SLICE0, ep)) == NULL) {
1373 					rval = -1;
1374 					goto out;
1375 				}
1376 				mdclrerror(ep);
1377 			}
1378 
1379 			/* Yes this is UGLY!!! */
1380 			p1 = p->dd_next;
1381 			p->dd_next = NULL;
1382 			if (rel_own_bydd(sp, p, FALSE, ep))
1383 				mdclrerror(ep);
1384 			p->dd_next = p1;
1385 
1386 			if (p->dd_dbcnt == 0)
1387 				continue;
1388 
1389 			/*
1390 			 * Skip the replica removal if we are not the last user
1391 			 */
1392 			if (num_users != 1)
1393 				continue;
1394 
1395 			nlp = NULL;
1396 			(void) metanamelist_append(&nlp, np);
1397 			if (meta_db_detach(sp, nlp,
1398 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
1399 				mdclrerror(ep);
1400 			metafreenamelist(nlp);
1401 		}
1402 	}
1403 
1404 	if (halt_set(sp, ep)) {
1405 		rval = -1;
1406 		goto out;
1407 	}
1408 
1409 	/* Setup the mediator record */
1410 	(void) memset(&medr, '\0', sizeof (med_rec_t));
1411 	medr.med_rec_mag = MED_REC_MAGIC;
1412 	medr.med_rec_rev = MED_REC_REV;
1413 	medr.med_rec_fl  = 0;
1414 	medr.med_rec_sn  = sp->setno;
1415 	(void) strcpy(medr.med_rec_snm, sp->setname);
1416 	medr.med_rec_meds = sd->sd_med;	/* structure assigment */
1417 	(void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
1418 	medr.med_rec_foff = 0;
1419 
1420 	/*
1421 	 * If we are the last remaining user, then remove the mediator hosts
1422 	 */
1423 	if (num_users == 1) {
1424 		for (i = 0; i < MED_MAX_HOSTS; i++) {
1425 			if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
1426 				SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
1427 				    SVM_TAG_MEDIATOR, sp->setno, i);
1428 			(void) memset(&medr.med_rec_meds.n_lst[i], '\0',
1429 			    sizeof (md_h_t));
1430 		}
1431 		medr.med_rec_meds.n_cnt = 0;
1432 	} else { 	/* Remove this host from the mediator node list. */
1433 		for (i = 0; i < MD_MAXSIDES; i++) {
1434 			/* Skip empty slots */
1435 			if (sd->sd_nodes[i][0] == '\0')
1436 				continue;
1437 
1438 			/* Copy non local node */
1439 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
1440 				(void) strcpy(medr.med_rec_nodes[i],
1441 				    sd->sd_nodes[i]);
1442 				continue;
1443 			}
1444 
1445 			/* Clear local node */
1446 			(void) memset(&medr.med_rec_nodes[i], '\0',
1447 			    sizeof (md_node_nm_t));
1448 		}
1449 	}
1450 
1451 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
1452 
1453 	/*
1454 	 * If the client is part of a cluster put the DCS service
1455 	 * into a deleteing state.
1456 	 */
1457 	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1458 		if (metad_isautotakebyname(sp->setname)) {
1459 			delete_end = 0;
1460 		} else {
1461 			mdclrerror(ep);
1462 			goto out;
1463 		}
1464 	}
1465 
1466 	/* Inform the mediator hosts of the new information */
1467 	for (i = 0; i < MED_MAX_HOSTS; i++) {
1468 		if (sd->sd_med.n_lst[i].a_cnt == 0)
1469 			continue;
1470 
1471 		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
1472 			mdclrerror(ep);
1473 	}
1474 
1475 	/* Delete the set locally */
1476 	for (i = 0; i < MD_MAXSIDES; i++) {
1477 		/* Skip empty slots */
1478 		if (sd->sd_nodes[i][0] == '\0')
1479 			continue;
1480 
1481 		/* Skip non local nodes */
1482 		if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1483 			continue;
1484 
1485 		if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
1486 			mdclrerror(ep);
1487 	}
1488 	if (delete_end &&
1489 	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1490 		rval = -1;
1491 
1492 out:
1493 	/* release signals back to what they were on entry */
1494 	if (procsigs(FALSE, &oldsigs, &xep) < 0) {
1495 		if (rval == 0)
1496 			(void) mdstealerror(ep, &xep);
1497 		rval = -1;
1498 	}
1499 
1500 	if (lock_set == TRUE) {
1501 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1502 		if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1503 			if (rval == 0)
1504 				(void) mdstealerror(ep, &xep);
1505 			rval = -1;
1506 		}
1507 		cl_set_setkey(NULL);
1508 	}
1509 
1510 	metaflushsetname(sp);
1511 	return (rval);
1512 }
1513 
1514 int
1515 meta_set_purge(
1516 	mdsetname_t	*sp,
1517 	int		bypass_cluster,
1518 	int		forceflg,
1519 	md_error_t	*ep
1520 )
1521 {
1522 	char		*thishost = mynode();
1523 	md_set_desc	*sd;
1524 	md_setkey_t	*cl_sk;
1525 	md_error_t	xep = mdnullerror;
1526 	int		rval = 0;
1527 	int		i, num_hosts = 0;
1528 	int		has_set = 0;
1529 	int		max_node = 0;
1530 	int		delete_end = 1;
1531 	md_mnnode_desc	*nd;
1532 
1533 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1534 		/* unable to find set description */
1535 		rval = 1;
1536 		return (rval);
1537 	}
1538 
1539 	if (MD_MNSET_DESC(sd)) {
1540 		/*
1541 		 * Get a count of the hosts in the set and also lock the set
1542 		 * on those hosts that know about it.
1543 		 */
1544 		nd = sd->sd_nodelist;
1545 		while (nd) {
1546 			/*
1547 			 * Only deal with those nodes that are members of
1548 			 * the set (MD_MN_NODE_ALIVE) or the node on which
1549 			 * the purge is being run. We must lock the set
1550 			 * on the purging node because the delset call
1551 			 * requires the lock to be set.
1552 			 */
1553 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE) &&
1554 			    nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
1555 				nd = nd->nd_next;
1556 				continue;
1557 			}
1558 			has_set = nodehasset(sp, nd->nd_nodename,
1559 			    NHS_NST_EQ, ep);
1560 
1561 			/*
1562 			 * The host is not aware of this set (has_set < 0) or
1563 			 * the set does not match (has_set == 0). This check
1564 			 * prevents the code getting confused by an apparent
1565 			 * inconsistancy in the set's state, this is in the
1566 			 * purge code so something is broken in any case and
1567 			 * this is just trying to fix the brokeness.
1568 			 */
1569 			if (has_set <= 0) {
1570 				mdclrerror(ep);
1571 				nd->nd_flags |= MD_MN_NODE_NOSET;
1572 			} else {
1573 				num_hosts++;
1574 				if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1575 					/*
1576 					 * If the force flag is set then
1577 					 * ignore any RPC failures because we
1578 					 * are only really interested with
1579 					 * the set on local node.
1580 					 */
1581 					if (forceflg && mdanyrpcerror(ep)) {
1582 						mdclrerror(ep);
1583 					} else {
1584 						/*
1585 						 * set max_node so that in the
1586 						 * unlock code nodes in the
1587 						 * set that have not been
1588 						 * locked are not unlocked.
1589 						 */
1590 						max_node = nd->nd_nodeid;
1591 						rval = 2;
1592 						goto out1;
1593 					}
1594 				}
1595 
1596 			}
1597 			nd = nd->nd_next;
1598 		}
1599 		max_node = 0;
1600 	} else {
1601 		/*
1602 		 * Get a count of the hosts in the set and also lock the set
1603 		 * on those hosts that know about it.
1604 		 */
1605 		for (i = 0; i < MD_MAXSIDES; i++) {
1606 			/* Skip empty slots */
1607 			if (sd->sd_nodes[i][0] == '\0')
1608 				continue;
1609 
1610 			has_set = nodehasset(sp, sd->sd_nodes[i],
1611 			    NHS_NST_EQ, ep);
1612 
1613 			/*
1614 			 * The host is not aware of this set (has_set < 0) or
1615 			 * the set does not match (has_set == 0). This check
1616 			 * prevents the code getting confused by an apparent
1617 			 * inconsistancy in the set's state, this is in the
1618 			 * purge code so something is broken in any case and
1619 			 * this is just trying to fix the brokeness.
1620 			 */
1621 			if (has_set <= 0) {
1622 				mdclrerror(ep);
1623 				/*
1624 				 * set the node to NULL to prevent further
1625 				 * requests to this unresponsive node.
1626 				 */
1627 				sd->sd_nodes[i][0] = '\0';
1628 			} else {
1629 				num_hosts++;
1630 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1631 					/*
1632 					 * If the force flag is set then
1633 					 * ignore any RPC failures because we
1634 					 * are only really interested with
1635 					 * the set on local node.
1636 					 */
1637 					if (forceflg && mdanyrpcerror(ep)) {
1638 						mdclrerror(ep);
1639 					} else {
1640 						rval = 2;
1641 						/*
1642 						 * set max_node so that in the
1643 						 * unlock code nodes in the
1644 						 * set that have not been
1645 						 * locked are not unlocked.
1646 						 */
1647 						max_node = i;
1648 						goto out1;
1649 					}
1650 				}
1651 			}
1652 		}
1653 		max_node = i;	/* now MD_MAXSIDES */
1654 	}
1655 	if (!bypass_cluster) {
1656 		/*
1657 		 * If there is only one host associated with the
1658 		 * set then remove the set from the cluster.
1659 		 */
1660 		if (num_hosts == 1) {
1661 			if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1662 				if (metad_isautotakebyname(sp->setname)) {
1663 					delete_end = 0;
1664 				} else {
1665 					mdclrerror(ep);
1666 					rval = 3;
1667 					goto out1;
1668 				}
1669 			}
1670 		}
1671 	}
1672 
1673 	if (MD_MNSET_DESC(sd)) {
1674 		nd = sd->sd_nodelist;
1675 		while (nd) {
1676 			if (nd->nd_nodeid == sd->sd_mn_mynode->nd_nodeid) {
1677 				/*
1678 				 * This is the node on which the purge is
1679 				 * being run. We do not care if it is
1680 				 * alive or not, just want to get rid of
1681 				 * the set.
1682 				 */
1683 				if (clnt_delset(nd->nd_nodename, sp,
1684 				    ep) == -1) {
1685 					md_perror(dgettext(TEXT_DOMAIN,
1686 					    "delset"));
1687 					if (!bypass_cluster && num_hosts == 1)
1688 						(void) sdssc_delete_end(
1689 						    sp->setname, SDSSC_CLEANUP);
1690 					mdclrerror(ep);
1691 					goto out1;
1692 				}
1693 				nd = nd->nd_next;
1694 				continue;
1695 			}
1696 
1697 			/*
1698 			 * Only contact those nodes that are members of
1699 			 * the set.
1700 			 */
1701 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1702 				nd = nd->nd_next;
1703 				continue;
1704 			}
1705 
1706 			/*
1707 			 * Tell the remote node to remove this node
1708 			 */
1709 			if (clnt_delhosts(nd->nd_nodename, sp, 1, &thishost,
1710 			    ep) == -1) {
1711 				/*
1712 				 * If we fail to delete ourselves
1713 				 * from the remote host it does not
1714 				 * really matter because the set is
1715 				 * being "purged" from this node. The
1716 				 * set can be purged from the other
1717 				 * node at a later time.
1718 				 */
1719 				mdclrerror(ep);
1720 			}
1721 			nd = nd->nd_next;
1722 		}
1723 	} else {
1724 		for (i = 0; i < MD_MAXSIDES; i++) {
1725 			/* Skip empty slots */
1726 			if (sd->sd_nodes[i][0] == '\0')
1727 				continue;
1728 			if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
1729 				/*
1730 				 * Tell the remote node to remove this node
1731 				 */
1732 				if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
1733 				    &thishost, ep) == -1) {
1734 					/*
1735 					 * If we fail to delete ourselves
1736 					 * from the remote host it does not
1737 					 * really matter because the set is
1738 					 * being "purged" from this node. The
1739 					 * set can be purged from the other
1740 					 * node at a later time.
1741 					 */
1742 					mdclrerror(ep);
1743 				}
1744 				continue;
1745 			}
1746 
1747 			/* remove the set from this host */
1748 			if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
1749 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1750 				if (!bypass_cluster && num_hosts == 1)
1751 					(void) sdssc_delete_end(sp->setname,
1752 					    SDSSC_CLEANUP);
1753 				mdclrerror(ep);
1754 				goto out1;
1755 			}
1756 		}
1757 	}
1758 
1759 	if (!bypass_cluster && num_hosts == 1) {
1760 		if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
1761 		    SDSSC_ERROR) {
1762 			rval = 4;
1763 		}
1764 	}
1765 
1766 out1:
1767 
1768 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1769 
1770 	/*
1771 	 * Remove the set lock on those nodes that had the set locked
1772 	 * max_node will either be MD_MAXSIDES or array index of the last
1773 	 * node contacted (or rather failed to contact) for traditional
1774 	 * diskset.  For a MN diskset, max_node is the node_id of the node
1775 	 * that failed the lock.
1776 	 */
1777 	if (MD_MNSET_DESC(sd)) {
1778 		nd = sd->sd_nodelist;
1779 		while (nd) {
1780 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1781 				nd = nd->nd_next;
1782 				continue;
1783 			}
1784 			if (nd->nd_nodeid == max_node)
1785 				break;
1786 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1787 				if (forceflg && mdanyrpcerror(&xep)) {
1788 					mdclrerror(&xep);
1789 					nd = nd->nd_next;
1790 					continue;
1791 				}
1792 				if (rval == 0)
1793 					(void) mdstealerror(ep, &xep);
1794 				rval = 5;
1795 			}
1796 			nd = nd->nd_next;
1797 		}
1798 	} else {
1799 		for (i = 0; i < max_node; i++) {
1800 			/* Skip empty slots */
1801 			if (sd->sd_nodes[i][0] == '\0')
1802 				continue;
1803 
1804 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1805 				if (forceflg && mdanyrpcerror(&xep)) {
1806 					mdclrerror(&xep);
1807 					continue;
1808 				}
1809 				if (rval == 0)
1810 					(void) mdstealerror(ep, &xep);
1811 				rval = 5;
1812 			}
1813 		}
1814 	}
1815 
1816 	cl_set_setkey(NULL);
1817 
1818 	return (rval);
1819 }
1820 
1821 int
1822 meta_set_query(
1823 	mdsetname_t		*sp,
1824 	mddb_dtag_lst_t		**dtlpp,
1825 	md_error_t		*ep
1826 )
1827 {
1828 	mddb_dtag_get_parm_t	dtgp;
1829 
1830 	(void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
1831 	dtgp.dtgp_setno = sp->setno;
1832 
1833 	/*CONSTCOND*/
1834 	while (1) {
1835 		if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
1836 			if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
1837 			    *dtlpp == NULL)
1838 				return (mdstealerror(ep, &dtgp.dtgp_mde));
1839 			else
1840 				break;
1841 
1842 		/*
1843 		 * Run to the end of the list
1844 		 */
1845 		for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
1846 			/* void */;
1847 
1848 		*dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
1849 
1850 		(void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
1851 		    sizeof (mddb_dtag_t));
1852 
1853 		dtgp.dtgp_dt.dt_id++;
1854 	}
1855 	return (0);
1856 }
1857 
1858 /*
1859  * return drivename get by key
1860  */
1861 mddrivename_t *
1862 metadrivename_withdrkey(
1863 	mdsetname_t	*sp,
1864 	side_t		sideno,
1865 	mdkey_t		key,
1866 	int		flags,
1867 	md_error_t	*ep
1868 )
1869 {
1870 	char		*nm;
1871 	mdname_t	*np;
1872 	mddrivename_t	*dnp;
1873 	ddi_devid_t	devidp;
1874 	md_set_desc	*sd;
1875 
1876 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1877 		return (NULL);
1878 	}
1879 
1880 
1881 	/*
1882 	 * Get the devid associated with the key.
1883 	 *
1884 	 * If a devid was returned, it MUST be valid even in
1885 	 * the case where a device id has been "updated". The
1886 	 * "update" of the device id may have occured due to
1887 	 * a firmware upgrade.
1888 	 */
1889 	if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
1890 	    != NULL) {
1891 		/*
1892 		 * Look for the correct dnp using the devid for comparison.
1893 		 */
1894 		dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
1895 		free(devidp);
1896 		dnp->side_names_key = key;
1897 	} else {
1898 		/*
1899 		 * We didn't get a devid. We'll try for a dnp using the
1900 		 * name. If we have a MN diskset or if the dnp is a did
1901 		 * device, we're done because then we don't have devids.
1902 		 * Otherwise we'll try to set the devid
1903 		 * and get the dnp via devid again.
1904 		 * We also need to clear the ep structure. When the
1905 		 * above call to meta_getdidbykey returned a null, it
1906 		 * also put an error code into ep. In this case, the null
1907 		 * return is actually OK and any errors can be ignored. The
1908 		 * reason it is OK is because this could be a MN set or
1909 		 * we could  be running without devids (ex cluster).
1910 		 */
1911 		mdclrerror(ep);
1912 
1913 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key,
1914 		    ep)) == NULL)
1915 			return (NULL);
1916 		/* get device name */
1917 		if (flags & PRINT_FAST) {
1918 			if ((np = metaname_fast(&sp, nm,
1919 			    LOGICAL_DEVICE, ep)) == NULL) {
1920 				Free(nm);
1921 				return (NULL);
1922 			}
1923 		} else {
1924 			if ((np = metaname(&sp, nm, LOGICAL_DEVICE,
1925 			    ep)) == NULL) {
1926 				Free(nm);
1927 				return (NULL);
1928 			}
1929 		}
1930 		Free(nm);
1931 		/* make sure it's OK */
1932 		if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np,
1933 		    ep) != 0))
1934 			return (NULL);
1935 
1936 		/* get drivename */
1937 		dnp = np->drivenamep;
1938 		dnp->side_names_key = key;
1939 		/*
1940 		 * Skip the devid set/check for the following cases:
1941 		 * 1) If MN diskset, there are no devid's
1942 		 * 2) if dnp is did device
1943 		 * The device id is disabled for did device due to the
1944 		 * lack of minor name support in the did driver. The following
1945 		 * devid code path can set and propagate the error and
1946 		 * eventually prevent did disks from being added to the
1947 		 * diskset under SunCluster systems
1948 		 *
1949 		 * Note that this code can be called through rpc.mdcommd.
1950 		 * sdssc_version cannot be used because the library won't
1951 		 * be bound.
1952 		 */
1953 		if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/"))
1954 		    == 0) || (MD_MNSET_DESC(sd)))
1955 			goto out;
1956 
1957 		/*
1958 		 * It is okay if replica is not in devid mode
1959 		 */
1960 		if (mdissyserror(ep, MDDB_F_NODEVID)) {
1961 			mdclrerror(ep);
1962 			goto out;
1963 		}
1964 
1965 		/*
1966 		 * We're not MN or did devices but
1967 		 * devid is missing so this means that we have
1968 		 * just upgraded from a configuration where
1969 		 * devid's were not used so try to add in
1970 		 * the devid and requery. If the devid still isn't there,
1971 		 * that's OK. dnp->devid will be null as it is in any
1972 		 * configuration with no devids.
1973 		 */
1974 		if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, ep) < 0)
1975 			return (NULL);
1976 		if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
1977 		    sideno+SKEW, key, ep)) != NULL) {
1978 			/*
1979 			 * Found a devid so look for the dnp using the
1980 			 * devid as the search mechanism.
1981 			 */
1982 			dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
1983 			free(devidp);
1984 			dnp->side_names_key = key;
1985 		}
1986 	}
1987 
1988 
1989 
1990 out:
1991 	if (flags & MD_BYPASS_DAEMON)
1992 		return (dnp);
1993 
1994 	if (get_sidenmlist(sp, dnp, ep))
1995 		return (NULL);
1996 
1997 	/* return success */
1998 	return (dnp);
1999 }
2000 
2001 void
2002 metafreedrivedesc(md_drive_desc **dd)
2003 {
2004 	md_drive_desc	*p, *next = NULL;
2005 
2006 	for (p = *dd; p != NULL; p = next) {
2007 		next = p->dd_next;
2008 		Free(p);
2009 	}
2010 	*dd = NULL;
2011 }
2012 
2013 md_drive_desc *
2014 metaget_drivedesc(
2015 	mdsetname_t	*sp,
2016 	int		flags,
2017 	md_error_t	*ep
2018 )
2019 {
2020 	side_t		sideno = MD_SIDEWILD;
2021 
2022 	assert(! (flags & MD_BYPASS_DAEMON));
2023 
2024 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
2025 		return (NULL);
2026 
2027 	return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
2028 }
2029 
2030 md_drive_desc *
2031 metaget_drivedesc_fromnamelist(
2032 	mdsetname_t	*sp,
2033 	mdnamelist_t	*nlp,
2034 	md_error_t	*ep
2035 )
2036 {
2037 	md_set_desc		*sd;
2038 	mdnamelist_t		*p;
2039 	md_drive_desc		*dd = NULL;
2040 
2041 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2042 		return (NULL);
2043 
2044 	for (p = nlp; p != NULL; p = p->next)
2045 		(void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
2046 		    sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
2047 
2048 	return (dd);
2049 }
2050 
2051 md_drive_desc *
2052 metaget_drivedesc_sideno(
2053 	mdsetname_t *sp,
2054 	side_t sideno,
2055 	int flags,
2056 	md_error_t *ep
2057 )
2058 {
2059 	md_set_desc	*sd = NULL;
2060 
2061 	assert(! (flags & MD_BYPASS_DAEMON));
2062 
2063 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2064 		return (NULL);
2065 
2066 	if (sd->sd_drvs)
2067 		return (sd->sd_drvs);
2068 
2069 	if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
2070 		return (NULL);
2071 
2072 	return (sd->sd_drvs);
2073 }
2074 
2075 int
2076 metaget_setownership(
2077 	mdsetname_t	*sp,
2078 	md_error_t	*ep
2079 )
2080 {
2081 	md_set_desc	*sd;
2082 	int		bool;
2083 	int		i;
2084 	md_mnnode_desc	*nd;
2085 
2086 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2087 		return (-1);
2088 
2089 	if (MD_MNSET_DESC(sd)) {
2090 		nd = sd->sd_nodelist;
2091 		while (nd) {
2092 			/* If node isn't alive, can't own diskset */
2093 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2094 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2095 				nd = nd->nd_next;
2096 				continue;
2097 			}
2098 			/*
2099 			 * If can't communicate with rpc.metad, then mark
2100 			 * this node as not an owner.  That node may
2101 			 * in fact, be an owner, but without rpc.metad running
2102 			 * that node can't do much.
2103 			 */
2104 			if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
2105 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2106 			} else if (bool == TRUE) {
2107 				nd->nd_flags |= MD_MN_NODE_OWN;
2108 			} else {
2109 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2110 			}
2111 			nd = nd->nd_next;
2112 		}
2113 		return (0);
2114 	}
2115 
2116 	/* Rest of code handles traditional disksets */
2117 
2118 	for (i = 0; i < MD_MAXSIDES; i++)
2119 		sd->sd_isown[i] = 0;
2120 
2121 	if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
2122 		return (-1);
2123 
2124 	if (bool == TRUE)
2125 		sd->sd_isown[getmyside(sp, ep)] = 1;
2126 
2127 	return (0);
2128 }
2129 
2130 char *
2131 mynode(void)
2132 {
2133 	static struct utsname	myuname;
2134 	static int		done = 0;
2135 
2136 	if (! done) {
2137 		if (uname(&myuname) == -1) {
2138 			md_perror(dgettext(TEXT_DOMAIN, "uname"));
2139 			assert(0);
2140 		}
2141 		done = 1;
2142 	}
2143 	return (myuname.nodename);
2144 }
2145 
/*
 * Tell whether the string 'str' is one of the first 'cnt' entries of
 * the string array 'lst'.
 *
 * Returns TRUE on an exact (strcmp) match, FALSE otherwise, including
 * when cnt is zero or negative.
 */
int
strinlst(char *str, int cnt, char **lst)
{
	char	**cur = lst;
	int	left = cnt;

	while (left-- > 0) {
		if (strcmp(*cur++, str) == 0)
			return (TRUE);
	}

	return (FALSE);
}
2157 
2158 /*
2159  * meta_get_reserved_names
2160  *  returns an mdnamelist_t of reserved slices
2161  *  reserved slices are those that are used but don't necessarily
2162  *  show up as metadevices (ex. reserved slice for db in sets, logs)
2163  */
2164 
2165 /*ARGSUSED*/
2166 int
2167 meta_get_reserved_names(
2168 	mdsetname_t	*sp,
2169 	mdnamelist_t	**nlpp,
2170 	int		options,
2171 	md_error_t	*ep)
2172 {
2173 	int		 count		= 0;
2174 	mdname_t	*np		= NULL;
2175 	mdnamelist_t	*transnlp	= NULL;
2176 	mdnamelist_t	**tailpp 	= nlpp;
2177 	mdnamelist_t	*nlp;
2178 	md_drive_desc	*dd, *di;
2179 
2180 	if (metaislocalset(sp))
2181 		goto out;
2182 
2183 	if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
2184 		count = -1;
2185 		goto out;
2186 	}
2187 
2188 	/* db in for sets on reserved slice */
2189 	for (di = dd; di && count >= 0; di = di->dd_next) {
2190 		uint_t	rep_slice;
2191 
2192 		/*
2193 		 * Add the name struct to the end of the
2194 		 * namelist but keep a pointer to the last
2195 		 * element so that we don't incur the overhead
2196 		 * of traversing the list each time
2197 		 */
2198 		if (di->dd_dnp &&
2199 		    (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
2200 		    (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
2201 		    (tailpp = meta_namelist_append_wrapper(tailpp, np)))
2202 			count++;
2203 		else
2204 			count = -1;
2205 	}
2206 
2207 	/* now find logs */
2208 	if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
2209 		count = -1;
2210 		goto out;
2211 	}
2212 
2213 	for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
2214 		mdname_t	*transnp = nlp->namep;
2215 		md_trans_t	*transp;
2216 
2217 		if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
2218 			count = -1;
2219 			goto out;
2220 		}
2221 		if (transp->lognamep) {
2222 			/*
2223 			 * Add the name struct to the end of the
2224 			 * namelist but keep a pointer to the last
2225 			 * element so that we don't incur the overhead
2226 			 * of traversing the list each time
2227 			 */
2228 			tailpp = meta_namelist_append_wrapper(
2229 			    tailpp, transp->lognamep);
2230 		}
2231 	}
2232 out:
2233 	metafreenamelist(transnlp);
2234 	return (count);
2235 }
2236 
2237 /*
2238  * Entry point to join a node to MultiNode diskset.
2239  *
2240  * Validate host in diskset.
2241  *	- Should be in membership list from API
2242  *	- Should not already be joined into diskset.
2243  *	- Set must have drives
2244  * Assume valid configuration is stored in the set/drive/node records
2245  * in the local mddb since no node or drive can be added to the MNset
2246  * unless all drives and nodes are available.  Reconfig steps will
2247  * resync all ALIVE nodes in case of panic in critical areas.
2248  *
2249  * Lock down the set.
2250  * Verify host is a member of this diskset.
2251  * If drives exist in the configuration, load the mddbs.
2252  * Set this node to active by notifying master if one exists.
2253  * If this is the first node active in the diskset, this node
2254  * 	becomes the master.
2255  * Unlock the set.
2256  *
2257  * Mirror Resync:
2258  * If this node is the last node to join the set and clustering
2259  * isn't running, then start the 'metasync -r' type resync
2260  * on all mirrors in this diskset.
2261  * If clustering is running, this resync operation will
2262  * be handled by the reconfig steps and should NOT
2263  * be handled during a join operation.
2264  *
2265  * There are multiple return values in order to assist
2266  * the join operation of all sets in the metaset command.
2267  *
2268  * Return values:
2269  *	0  - Node successfully joined to set.
2270  *	-1 - Join attempted but failed
2271  *		- any failure from libmeta calls
2272  *		- node not in the member list
2273  *	-2 - Join not attempted since
2274  *		- this set had no drives in set
2275  *		- this node already joined to set
2276  *		- set is not a multinode set
2277  *	-3 - Node joined to STALE set.
2278  */
2279 extern int
2280 meta_set_join(
2281 	mdsetname_t	*sp,
2282 	md_error_t	*ep
2283 )
2284 {
2285 	md_set_desc		*sd;
2286 	md_drive_desc		*dd;
2287 	md_mnnode_desc		*nd, *nd2, my_nd;
2288 	int			rval = 0;
2289 	md_setkey_t		*cl_sk;
2290 	md_error_t		xep = mdnullerror;
2291 	md_error_t		ep_snarf = mdnullerror;
2292 	int			master_flag = 0;
2293 	md_mnset_record		*mas_mnsr = NULL;
2294 	int			clear_nr_flags = 0;
2295 	md_mnnode_record	*nr;
2296 	int			stale_set = 0;
2297 	int			rb_flags = 0;
2298 	int			stale_bool = FALSE;
2299 	int			suspendall_flag = 0;
2300 	int			suspend1_flag = 0;
2301 	sigset_t		oldsigs;
2302 	int			send_reinit = 0;
2303 
2304 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2305 		return (-1);
2306 	}
2307 
2308 	/* Must be a multinode diskset */
2309 	if (!MD_MNSET_DESC(sd)) {
2310 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2311 		return (-2);
2312 	}
2313 
2314 	/* Verify that the node is ALIVE (i.e. is in the API membership list) */
2315 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
2316 		(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
2317 		    sd->sd_mn_mynode->nd_nodename, NULL, sp->setname);
2318 		return (-1);
2319 	}
2320 
2321 	/* Make sure we are blocking all signals */
2322 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2323 		mdclrerror(&xep);
2324 
2325 	/*
2326 	 * Lock the set on current set members.
2327 	 * For MN diskset lock_set and SUSPEND are used to protect against
2328 	 * other meta* commands running on the other nodes.
2329 	 */
2330 	nd = sd->sd_nodelist;
2331 	while (nd) {
2332 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2333 			nd = nd->nd_next;
2334 			continue;
2335 		}
2336 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2337 			rval = -1;
2338 			goto out;
2339 		}
2340 		nd = nd->nd_next;
2341 	}
2342 
2343 	/*
2344 	 * Lock out other meta* commands by suspending
2345 	 * class 1 messages across the diskset.
2346 	 */
2347 	nd = sd->sd_nodelist;
2348 	while (nd) {
2349 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2350 			nd = nd->nd_next;
2351 			continue;
2352 		}
2353 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2354 		    sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2355 			rval = -1;
2356 			goto out;
2357 		}
2358 		suspend1_flag = 1;
2359 		nd = nd->nd_next;
2360 	}
2361 
2362 	/*
2363 	 * Verify that this host is a member (in the host list) of the set.
2364 	 */
2365 	nd = sd->sd_nodelist;
2366 	while (nd) {
2367 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2368 			break;
2369 		}
2370 		nd = nd->nd_next;
2371 	}
2372 	if (!nd) {
2373 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2374 		    sd->sd_mn_mynode->nd_nodename, NULL,
2375 		    sp->setname);
2376 		rval = -1;
2377 		goto out;
2378 	}
2379 
2380 	/*
2381 	 * Need to return failure if host is already 'joined'
2382 	 * into the set.  This is done so that if later the user
2383 	 * issues a command to join all sets and a failure is
2384 	 * encountered - that the resulting cleanup effort
2385 	 * (withdrawing from all sets that were joined
2386 	 * during that command) won't withdraw from this set.
2387 	 */
2388 	if (nd->nd_flags & MD_MN_NODE_OWN) {
2389 		rval = -2;
2390 		goto out2;
2391 	}
2392 
2393 	/*
2394 	 * Call metaget_setownership that calls each node in diskset and
2395 	 * marks in set descriptor if node is an owner of the set or not.
2396 	 * metaget_setownership checks to see if a node is an owner by
2397 	 * checking to see if that node's kernel has the mddb loaded.
2398 	 * If a node had panic'd during a reconfig or an
2399 	 * add/delete/join/withdraw operation, the other nodes' node
2400 	 * records may not reflect the current state of the diskset,
2401 	 * so calling metaget_setownership is the safest thing to do.
2402 	 */
2403 	if (metaget_setownership(sp, ep) == -1) {
2404 		rval = -1;
2405 		goto out;
2406 	}
2407 
2408 	/* If first active member of diskset, become the master. */
2409 	nd = sd->sd_nodelist;
2410 	while (nd) {
2411 		if (nd->nd_flags & MD_MN_NODE_OWN)
2412 			break;
2413 		nd = nd->nd_next;
2414 	}
2415 	if (nd == NULL)
2416 		master_flag = 1;
2417 
2418 	/*
2419 	 * If not first active member of diskset, then get the
2420 	 * master information from a node that is already joined
2421 	 * and set the master information for this node.  Be sure
2422 	 * that this node (the already joined node) has its own
2423 	 * join flag set.  If not, then this diskset isn't currently
2424 	 * consistent and shouldn't allow a node to join.  This diskset
2425 	 * inconsistency should only occur when a node has panic'd in
2426 	 * the set while doing a metaset operation and the sysadmin is
2427 	 * attempting to join a node into the set.  This inconsistency
2428 	 * will be fixed during a reconfig cycle which should be occurring
2429 	 * soon since a node panic'd.
2430 	 *
2431 	 * If unable to get this information from an owning node, then
2432 	 * this diskset isn't currently consistent and shouldn't
2433 	 * allow a node to join.
2434 	 */
2435 	if (!master_flag) {
2436 		/* get master information from an owner (joined) node */
2437 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
2438 		    sp->setno, &mas_mnsr, ep) == -1) {
2439 			rval = -1;
2440 			goto out;
2441 		}
2442 
2443 		/* Verify that owner (joined) node has its own JOIN flag set */
2444 		nr = mas_mnsr->sr_nodechain;
2445 		while (nr) {
2446 			if ((nd->nd_nodeid == nr->nr_nodeid) &&
2447 			    ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
2448 				(void) mddserror(ep, MDE_DS_NODENOSET,
2449 				    sp->setno, nd->nd_nodename, NULL,
2450 				    nd->nd_nodename);
2451 				free_sr((md_set_record *)mas_mnsr);
2452 				rval = -1;
2453 				goto out;
2454 			}
2455 			nr = nr->nr_next;
2456 		}
2457 
2458 		/*
2459 		 * Does master have set marked as STALE?
2460 		 * If so, need to pass this down to kernel when
2461 		 * this node snarfs the set.
2462 		 */
2463 		if (clnt_mn_is_stale(nd->nd_nodename, sp,
2464 		    &stale_bool, ep) == -1) {
2465 			rval = -1;
2466 			goto out;
2467 		}
2468 
2469 		/* set master information in my rpc.metad's set record */
2470 		if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
2471 		    mas_mnsr->sr_master_nodeid, ep)) {
2472 			free_sr((md_set_record *)mas_mnsr);
2473 			rval = -1;
2474 			goto out;
2475 		}
2476 
2477 		/* set master information in my cached set desc */
2478 		(void) strcpy(sd->sd_mn_master_nodenm,
2479 		    mas_mnsr->sr_master_nodenm);
2480 		sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
2481 		nd2 = sd->sd_nodelist;
2482 		while (nd2) {
2483 			if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
2484 				sd->sd_mn_masternode = nd2;
2485 				break;
2486 			}
2487 			nd2 = nd2->nd_next;
2488 		}
2489 		free_sr((md_set_record *)mas_mnsr);
2490 
2491 		/*
2492 		 * Set the node flags in mynode's rpc.metad node records for
2493 		 * the nodes that are in the diskset.  Can use my sd
2494 		 * since earlier call to metaget_setownership set the
2495 		 * owner flags based on whether that node had snarfed
2496 		 * the MN diskset mddb.  Reconfig steps guarantee that
2497 		 * return of metaget_setownership will match the owning
2498 		 * node's owner list except in the case where a node
2499 		 * has just panic'd and in this case, a reconfig will
2500 		 * be starting immediately and the owner lists will
2501 		 * be sync'd up by the reconfig.
2502 		 *
2503 		 * Flag of SET means to take no action except to
2504 		 * set the node flags as given in the nodelist linked list.
2505 		 */
2506 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
2507 		    MD_NR_SET, NULL, ep)) {
2508 			rval = -1;
2509 			goto out;
2510 		}
2511 	}
2512 
2513 	/*
2514 	 * Read in the mddb if there are drives in the set.
2515 	 */
2516 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2517 	    ep)) == NULL) {
2518 		/* No drives in list */
2519 		if (! mdisok(ep)) {
2520 			rval = -1;
2521 			goto out;
2522 		}
2523 		rval = -2;
2524 		goto out;
2525 	}
2526 
2527 	/*
2528 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2529 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2530 	 * then change the nodelist followed by a reinit and resume.
2531 	 */
2532 	nd = sd->sd_nodelist;
2533 	while (nd) {
2534 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2535 			nd = nd->nd_next;
2536 			continue;
2537 		}
2538 
2539 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
2540 		    MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2541 			rval = -1;
2542 			goto out;
2543 		}
2544 		suspendall_flag = 1;
2545 		nd = nd->nd_next;
2546 	}
2547 
2548 	/* Set master in my set record in rpc.metad */
2549 	if (master_flag) {
2550 		if (clnt_mnsetmaster(mynode(), sp,
2551 		    sd->sd_mn_mynode->nd_nodename,
2552 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
2553 			rval = -1;
2554 			goto out;
2555 		}
2556 	}
2557 	/*
2558 	 * Causes mddbs to be loaded into the kernel.
2559 	 * Set the force flag so that replica locations can be
2560 	 * loaded into the kernel even if a mediator node was
2561 	 * unavailable.  This allows a node to join an MO
2562 	 * diskset when there are sufficient replicas available,
2563 	 * but a mediator node in unavailable.
2564 	 */
2565 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
2566 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2567 		    "Host not able to start diskset."));
2568 		rval = -1;
2569 		goto out;
2570 	}
2571 
2572 	if (! mdisok(ep)) {
2573 		rval = -1;
2574 		goto out;
2575 	}
2576 
2577 	/*
2578 	 * Set rollback flags to 1 so that halt_set is called if a failure
2579 	 * is seen after this point.  If snarf_set fails, still need to
2580 	 * call halt_set to cleanup the diskset.
2581 	 */
2582 	rb_flags = 1;
2583 
2584 	/* Starts the set */
2585 	if (snarf_set(sp, stale_bool, ep) != 0) {
2586 		if (mdismddberror(ep, MDE_DB_STALE)) {
2587 			/*
2588 			 * Don't fail join, STALE means that set has
2589 			 * < 50% mddbs.
2590 			 */
2591 			(void) mdstealerror(&ep_snarf, ep);
2592 			stale_set = 1;
2593 		} else if (mdisok(ep)) {
2594 			/* If snarf failed, but no error was set - set it */
2595 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2596 			    sp->setno, 0, NULL);
2597 				rval = -1;
2598 				goto out;
2599 		} else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
2600 			/*
2601 			 * Don't fail join if ACCOK; ACCOK means that mediator
2602 			 * provided extra vote.
2603 			 */
2604 			rval = -1;
2605 			goto out;
2606 		}
2607 	}
2608 
2609 	/* Did set really get snarfed? */
2610 	if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
2611 		if (mdisok(ep)) {
2612 			/* If snarf failed, but no error was set - set it */
2613 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2614 			    sp->setno, 0, NULL);
2615 		}
2616 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2617 		    "Host not able to start diskset."));
2618 		rval = -1;
2619 		goto out;
2620 	}
2621 
2622 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2623 	send_reinit = 1;
2624 
2625 	/* If first node to enter set, setup master and clear change log */
2626 	if (master_flag) {
2627 		/* Set master in my locally cached set descriptor */
2628 		(void) strcpy(sd->sd_mn_master_nodenm,
2629 		    sd->sd_mn_mynode->nd_nodename);
2630 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
2631 		sd->sd_mn_am_i_master = 1;
2632 
2633 		/*
2634 		 * If first node to join set, then clear out change log
2635 		 * entries.  Change log entries are only needed when a
2636 		 * change of master is occurring in a diskset that has
2637 		 * multiple owners.   Since this node is the first owner
2638 		 * of the diskset, clear the entries.
2639 		 *
2640 		 * Only do this if we are in a single node non-SC3.x
2641 		 * situation.
2642 		 */
2643 		if (meta_mn_singlenode() &&
2644 		    mdmn_reset_changelog(sp, ep,  MDMN_CLF_RESETLOG) != 0) {
2645 			mde_perror(ep, dgettext(TEXT_DOMAIN,
2646 			    "Unable to reset changelog."));
2647 			rval = -1;
2648 			goto out;
2649 		}
2650 	}
2651 
2652 	/* Set my locally cached flag */
2653 	sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
2654 
2655 	/*
2656 	 * Set this node's own flag on all joined nodes in the set
2657 	 * (including my node).
2658 	 */
2659 	clear_nr_flags = 1;
2660 
2661 	my_nd = *(sd->sd_mn_mynode);
2662 	my_nd.nd_next = NULL;
2663 	nd = sd->sd_nodelist;
2664 	while (nd) {
2665 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2666 			nd = nd->nd_next;
2667 			continue;
2668 		}
2669 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2670 		    MD_NR_JOIN, NULL, ep)) {
2671 			rval = -1;
2672 			goto out;
2673 		}
2674 		nd = nd->nd_next;
2675 	}
2676 
2677 out:
2678 	if (rval != NULL) {
2679 		/*
2680 		 * If rollback flag is 1, then node was joined to set.
2681 		 * Since an error occurred, withdraw node from set in
2682 		 * order to rollback to before command was run.
2683 		 * Need to preserve ep so that calling function can
2684 		 * get error information.
2685 		 */
2686 		if (rb_flags == 1) {
2687 			if (halt_set(sp, &xep)) {
2688 				mdclrerror(&xep);
2689 			}
2690 		}
2691 
2692 		/*
2693 		 * If error, reset master to INVALID.
2694 		 * Ignore error since (next) first node to successfully join
2695 		 * will set master on all nodes.
2696 		 */
2697 		(void) clnt_mnsetmaster(mynode(), sp, "",
2698 		    MD_MN_INVALID_NID, &xep);
2699 		mdclrerror(&xep);
2700 		/* Reset master in my locally cached set descriptor */
2701 		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
2702 		sd->sd_mn_am_i_master = 0;
2703 
2704 		/*
2705 		 * If nr flags set on other nodes, reset them.
2706 		 */
2707 		if (clear_nr_flags) {
2708 			nd = sd->sd_nodelist;
2709 			while (nd) {
2710 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2711 					nd = nd->nd_next;
2712 					continue;
2713 				}
2714 				(void) clnt_upd_nr_flags(nd->nd_nodename, sp,
2715 				    &my_nd, MD_NR_WITHDRAW, NULL, &xep);
2716 				mdclrerror(&xep);
2717 				nd = nd->nd_next;
2718 			}
2719 			/* Reset my locally cached flag */
2720 			sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
2721 		}
2722 	}
2723 
2724 	/*
2725 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2726 	 * Send reinit command to mdcommd which forces it to get
2727 	 * fresh set description.
2728 	 */
2729 	if (send_reinit) {
2730 		/* Send reinit */
2731 		nd = sd->sd_nodelist;
2732 		while (nd) {
2733 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2734 				nd = nd->nd_next;
2735 				continue;
2736 			}
2737 
2738 			/* Class is ignored for REINIT */
2739 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2740 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2741 				/*
2742 				 * We are here because we failed to resume
2743 				 * rpc.mdcommd.  However we potentially have
2744 				 * an error from the previous call
2745 				 * If the previous call did fail,  we capture
2746 				 * that error and generate a perror with
2747 				 * the string, "Unable to resume...".
2748 				 * Setting rval to -1 ensures that in the
2749 				 * next iteration of the loop, ep is not
2750 				 * clobbered.
2751 				 */
2752 				if (rval == 0)
2753 					(void) mdstealerror(ep, &xep);
2754 				else
2755 					mdclrerror(&xep);
2756 				rval = -1;
2757 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2758 				    "Unable to reinit rpc.mdcommd."));
2759 			}
2760 			nd = nd->nd_next;
2761 		}
2762 
2763 	}
2764 
2765 out2:
2766 	/*
2767 	 * Unlock diskset by resuming messages across the diskset.
2768 	 * Just resume all classes so that resume is the same whether
2769 	 * just one class was locked or all classes were locked.
2770 	 */
2771 	if ((suspend1_flag) || (suspendall_flag)) {
2772 		nd = sd->sd_nodelist;
2773 		while (nd) {
2774 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2775 				nd = nd->nd_next;
2776 				continue;
2777 			}
2778 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2779 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2780 				/*
2781 				 * We are here because we failed to resume
2782 				 * rpc.mdcommd.  However we potentially have
2783 				 * an error from the previous call
2784 				 * If the previous call did fail,  we capture
2785 				 * that error and generate a perror with
2786 				 * the string, "Unable to resume...".
2787 				 * Setting rval to -1 ensures that in the
2788 				 * next iteration of the loop, ep is not
2789 				 * clobbered.
2790 				 */
2791 				if (rval == 0)
2792 					(void) mdstealerror(ep, &xep);
2793 				else
2794 					mdclrerror(&xep);
2795 				rval = -1;
2796 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2797 				    "Unable to resume rpc.mdcommd."));
2798 			}
2799 			nd = nd->nd_next;
2800 		}
2801 		meta_ping_mnset(sp->setno);
2802 	}
2803 
2804 	/*
2805 	 * Unlock set.  This flushes the caches on the servers.
2806 	 */
2807 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2808 	nd = sd->sd_nodelist;
2809 	while (nd) {
2810 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2811 			nd = nd->nd_next;
2812 			continue;
2813 		}
2814 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2815 			if (rval == 0)
2816 				(void) mdstealerror(ep, &xep);
2817 			else
2818 				mdclrerror(&xep);
2819 			rval = -1;
2820 		}
2821 		nd = nd->nd_next;
2822 	}
2823 
2824 	/*
2825 	 * If this node is the last to join the diskset and clustering isn't
2826 	 * running, then resync the mirrors in the diskset. We have to wait
2827 	 * until all nodes are joined so that the status gets propagated to
2828 	 * all of the members of the set.
2829 	 * Ignore any error from the resync as the join function shouldn't fail
2830 	 * because the mirror resync had a problem.
2831 	 *
2832 	 * Don't start resync if set is stale.
2833 	 */
2834 	if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
2835 	    (stale_set != 1)) {
2836 		nd = sd->sd_nodelist;
2837 		while (nd) {
2838 			if (!(nd->nd_flags & MD_MN_NODE_OWN))
2839 				break;
2840 			nd = nd->nd_next;
2841 		}
2842 		/*
2843 		 * nd set to NULL means that we have no nodes in the set that
2844 		 * haven't joined. In this case we start the resync.
2845 		 */
2846 		if (nd == NULL) {
2847 			(void) meta_mirror_resync_all(sp, 0, &xep);
2848 			mdclrerror(&xep);
2849 		}
2850 	}
2851 
2852 	/* Update ABR state for all soft partitions */
2853 	(void) meta_sp_update_abr(sp, &xep);
2854 	mdclrerror(&xep);
2855 
2856 	/*
2857 	 * call metaflushsetnames to reset local cache for master and
2858 	 * node information.
2859 	 */
2860 	metaflushsetname(sp);
2861 
2862 	/* release signals back to what they were on entry */
2863 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2864 		mdclrerror(&xep);
2865 
2866 	/*
2867 	 * If no error and stale_set is set, then set ep back
2868 	 * to ep from snarf_set call and return -3.  If another error
2869 	 * occurred and rval is not 0, then that error would have
2870 	 * caused the node to be withdrawn from the set and would
2871 	 * have set ep to that error information.
2872 	 */
2873 	if ((rval == 0) && (stale_set)) {
2874 		(void) mdstealerror(ep, &ep_snarf);
2875 		return (-3);
2876 	}
2877 
2878 	return (rval);
2879 }
2880 
2881 /*
2882  * Entry point to withdraw a node from MultiNode diskset.
2883  *
2884  * Validate host in diskset.
2885  *	- Should be joined into diskset.
2886  * Assume valid configuration is stored in the set/drive/node records
2887  * in the local mddb since no node or drive can be added to the MNset
2888  * unless all drives and nodes are available.  Reconfig steps will
2889  * resync all ALIVE nodes in case of panic in critical areas.
2890  *
2891  * Lock down the set.
2892  * Verify that drives exist in configuration.
2893  * Verify host is a member of this diskset.
2894  * Verify host is an owner of the diskset (host is joined to diskset).
2895  * Only allow withdrawal of master node if master node is the only node
2896  * joined in the diskset.
2897  * Halt the diskset on this node.
2898  * Reset Master on this node.
2899  * Update node flags to show that this node has withdrawn.
2900  * Unlock the set.
2901  *
2902  * Return values:
2903  *	0  - Node successfully withdrew from set.
2904  *	-1 - Withdrawal failed
2905  *		- any failure from libmeta calls
2906  *		- node not in the member list
2907  *		- set is not a multinode set
2908  *	-2 - Withdrawal not attempted since
2909  *		- this set had no drives in set
2910  *		- this node not joined to set
2911  */
2912 extern int
2913 meta_set_withdraw(
2914 	mdsetname_t	*sp,
2915 	md_error_t	*ep
2916 )
2917 {
2918 	md_set_desc		*sd;
2919 	md_drive_desc		*dd = 0;
2920 	md_mnnode_desc		*nd, my_nd;
2921 	int			rval = 0;
2922 	md_setkey_t		*cl_sk;
2923 	md_error_t		xep = mdnullerror;
2924 	int			set_halted = 0;
2925 	int			suspendall_flag = 0;
2926 	int			suspend1_flag = 0;
2927 	bool_t			stale_bool = FALSE;
2928 	mddb_config_t		c;
2929 	int			node_id_list[1];
2930 	sigset_t		oldsigs;
2931 	int			send_reinit = 0;
2932 
2933 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2934 		return (-1);
2935 	}
2936 
2937 	/* Must be a multinode diskset */
2938 	if (!MD_MNSET_DESC(sd)) {
2939 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2940 		return (-1);
2941 	}
2942 
2943 	/* Make sure we are blocking all signals */
2944 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2945 		mdclrerror(&xep);
2946 
2947 	/*
2948 	 * Lock the set on current set members.
2949 	 * For MN diskset lock_set and SUSPEND are used to protect against
2950 	 * other meta* commands running on the other nodes.
2951 	 */
2952 	nd = sd->sd_nodelist;
2953 	while (nd) {
2954 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2955 			nd = nd->nd_next;
2956 			continue;
2957 		}
2958 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2959 			rval = -1;
2960 			goto out;
2961 		}
2962 		nd = nd->nd_next;
2963 	}
2964 	/*
2965 	 * Lock out other meta* commands by suspending
2966 	 * class 1 messages across the diskset.
2967 	 */
2968 	nd = sd->sd_nodelist;
2969 	while (nd) {
2970 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2971 			nd = nd->nd_next;
2972 			continue;
2973 		}
2974 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2975 		    sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2976 			rval = -1;
2977 			goto out;
2978 		}
2979 		suspend1_flag = 1;
2980 		nd = nd->nd_next;
2981 	}
2982 
2983 	/* Get list of drives - needed in case of failure */
2984 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2985 	    ep)) == NULL) {
2986 		/* Error getting drives in list */
2987 		if (! mdisok(ep)) {
2988 			rval = -1;
2989 			goto out2;
2990 		}
2991 		/* no drives in list */
2992 		rval = -2;
2993 		goto out2;
2994 	}
2995 
2996 	/*
2997 	 * Verify that this host is a member (in the host list) of the set.
2998 	 */
2999 	nd = sd->sd_nodelist;
3000 	while (nd) {
3001 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
3002 			break;
3003 		}
3004 		nd = nd->nd_next;
3005 	}
3006 	if (!nd) {
3007 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
3008 		    sd->sd_mn_mynode->nd_nodename, NULL,
3009 		    sp->setname);
3010 		rval = -1;
3011 		goto out2;
3012 	}
3013 
3014 	/*
3015 	 * Call metaget_setownership that calls each node in diskset and
3016 	 * marks in set descriptor if node is an owner of the set or not.
3017 	 * metaget_setownership checks to see if a node is an owner by
3018 	 * checking to see if that node's kernel has the mddb loaded.
3019 	 * If a node had panic'd during a reconfig or an
3020 	 * add/delete/join/withdraw operation, the other nodes' node
3021 	 * records may not reflect the current state of the diskset,
3022 	 * so calling metaget_setownership is the safest thing to do.
3023 	 */
3024 	if (metaget_setownership(sp, ep) == -1) {
3025 		rval = -1;
3026 		goto out2;
3027 	}
3028 
3029 	/*
3030 	 * Verify that this node is joined
3031 	 * to diskset (i.e. is an owner of the diskset).
3032 	 */
3033 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
3034 		rval = -2;
3035 		goto out2;
3036 	}
3037 
3038 	/*
3039 	 * For a MN diskset, only withdraw master if it is
3040 	 * the only joined node.
3041 	 */
3042 	if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
3043 		nd = sd->sd_nodelist;
3044 		while (nd) {
3045 			/* Skip my node since checking for other owners */
3046 			if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
3047 				nd = nd->nd_next;
3048 				continue;
3049 			}
3050 			/* If another owner node if found, error */
3051 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3052 				(void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
3053 				    sp->setno,
3054 				    sd->sd_mn_mynode->nd_nodename, NULL,
3055 				    sp->setname);
3056 				rval = -1;
3057 				goto out2;
3058 			}
3059 			nd = nd->nd_next;
3060 		}
3061 	}
3062 
3063 	/*
3064 	 * Is current set STALE?
3065 	 */
3066 	(void) memset(&c, 0, sizeof (c));
3067 	c.c_id = 0;
3068 	c.c_setno = sp->setno;
3069 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
3070 		(void) mdstealerror(ep, &c.c_mde);
3071 		rval = -1;
3072 		goto out;
3073 	}
3074 	if (c.c_flags & MDDB_C_STALE) {
3075 		stale_bool = TRUE;
3076 	}
3077 
3078 	/*
3079 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3080 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
3081 	 * then change the nodelist followed by a reinit and resume.
3082 	 */
3083 	nd = sd->sd_nodelist;
3084 	while (nd) {
3085 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3086 			nd = nd->nd_next;
3087 			continue;
3088 		}
3089 
3090 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
3091 		    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
3092 			rval = -1;
3093 			goto out;
3094 		}
3095 		suspendall_flag = 1;
3096 		nd = nd->nd_next;
3097 	}
3098 
3099 	/*
3100 	 * Withdraw the set - halt set.
3101 	 * This will fail if any I/O is occuring to any metadevice which
3102 	 * includes a resync to a mirror metadevice.
3103 	 */
3104 	set_halted = 1;
3105 	if (halt_set(sp, ep)) {
3106 		/* Was set actually halted? */
3107 		if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
3108 			set_halted = 0;
3109 		}
3110 		rval = -1;
3111 		goto out;
3112 	}
3113 
3114 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
3115 	send_reinit = 1;
3116 
3117 	/* Reset master on withdrawn node */
3118 	if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
3119 	    MD_MN_INVALID_NID, ep)) {
3120 		rval = -1;
3121 		goto out;
3122 	}
3123 
3124 	/* Mark my node as withdrawn and send to other nodes */
3125 	nd = sd->sd_nodelist;
3126 	my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3127 	my_nd.nd_next = NULL;
3128 	while (nd) {
3129 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3130 			nd = nd->nd_next;
3131 			continue;
3132 		}
3133 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3134 		    MD_NR_WITHDRAW, NULL, ep)) {
3135 			rval = -1;
3136 			goto out;
3137 		}
3138 		nd = nd->nd_next;
3139 	}
3140 
3141 	/*
3142 	 * If withdrawn node is a mirror owner, reset mirror owner
3143 	 * to NULL.  If an error occurs, print a warning and continue.
3144 	 * Don't fail metaset because of mirror owner reset problem since
3145 	 * next node to grab mirror will resolve this issue.
3146 	 * Before next node grabs mirrors, metaset will show the withdrawn
3147 	 * node as owner which is why an attempt to reset the mirror owner
3148 	 * is made.
3149 	 */
3150 	node_id_list[0] = sd->sd_mn_mynode->nd_nodeid;	/* Setup my nodeid */
3151 	nd = sd->sd_nodelist;
3152 	while (nd) {
3153 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3154 			nd = nd->nd_next;
3155 			continue;
3156 		}
3157 		if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
3158 		    1, &node_id_list[0], &xep) == 01) {
3159 			mde_perror(&xep, dgettext(TEXT_DOMAIN,
3160 			    "Unable to reset mirror owner on node %s"),
3161 			    nd->nd_nodename);
3162 			mdclrerror(&xep);
3163 		}
3164 		nd = nd->nd_next;
3165 	}
3166 
3167 out:
3168 	if (rval == -1) {
3169 		/* Rejoin node - Mark node as joined and send to other nodes */
3170 		nd = sd->sd_nodelist;
3171 		my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3172 		my_nd.nd_next = NULL;
3173 		while (nd) {
3174 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3175 				nd = nd->nd_next;
3176 				continue;
3177 			}
3178 			if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3179 			    MD_NR_JOIN, NULL, &xep)) {
3180 				mdclrerror(&xep);
3181 			}
3182 			nd = nd->nd_next;
3183 		}
3184 
3185 		/* Set master on withdrawn node */
3186 		if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
3187 		    sd->sd_mn_master_nodenm,
3188 		    sd->sd_mn_master_nodeid, &xep)) {
3189 			mdclrerror(&xep);
3190 		}
3191 
3192 		/* Join set if halt_set had succeeded */
3193 		if (set_halted) {
3194 			/*
3195 			 * Causes mddbs to be loaded into the kernel.
3196 			 * Set the force flag so that replica locations can be
3197 			 * loaded into the kernel even if a mediator node was
3198 			 * unavailable.  This allows a node to join an MO
3199 			 * diskset when there are sufficient replicas available,
3200 			 * but a mediator node in unavailable.
3201 			 */
3202 			if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) {
3203 				mdclrerror(&xep);
3204 			}
3205 			/* If set previously stale - make it so at re-join */
3206 			if (snarf_set(sp, stale_bool, &xep) != 0) {
3207 				mdclrerror(&xep);
3208 				(void) halt_set(sp, &xep);
3209 				mdclrerror(&xep);
3210 			}
3211 		}
3212 	}
3213 
3214 	/*
3215 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3216 	 * Send reinit command to mdcommd which forces it to get
3217 	 * fresh set description.
3218 	 */
3219 	if (send_reinit) {
3220 		/* Send reinit */
3221 		nd = sd->sd_nodelist;
3222 		while (nd) {
3223 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3224 				nd = nd->nd_next;
3225 				continue;
3226 			}
3227 
3228 			/* Class is ignored for REINIT */
3229 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3230 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3231 				/*
3232 				 * We are here because we failed to resume
3233 				 * rpc.mdcommd.  However we potentially have
3234 				 * an error from the previous call.
3235 				 * If the previous call did fail,  we
3236 				 * capture that error and generate a perror
3237 				 * withthe string,  "Unable to resume...".
3238 				 * Setting rval to -1 ensures that in the
3239 				 * next iteration of the loop, ep is not
3240 				 * clobbered.
3241 				 */
3242 				if (rval == 0)
3243 					(void) mdstealerror(ep, &xep);
3244 				else
3245 					mdclrerror(&xep);
3246 				rval = -1;
3247 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3248 				    "Unable to reinit rpc.mdcommd."));
3249 			}
3250 			nd = nd->nd_next;
3251 		}
3252 	}
3253 
3254 out2:
3255 	/*
3256 	 * Unlock diskset by resuming messages across the diskset.
3257 	 * Just resume all classes so that resume is the same whether
3258 	 * just one class was locked or all classes were locked.
3259 	 */
3260 	if ((suspend1_flag) || (suspendall_flag)) {
3261 		nd = sd->sd_nodelist;
3262 		while (nd) {
3263 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3264 				nd = nd->nd_next;
3265 				continue;
3266 			}
3267 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3268 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
3269 				/*
3270 				 * We are here because we failed to resume
3271 				 * rpc.mdcommd.  However we potentially have
3272 				 * an error from the previous call
3273 				 * If the previous call did fail,  we capture
3274 				 * that error and generate a perror with
3275 				 * the string, "Unable to resume...".
3276 				 * Setting rval to -1 ensures that in the
3277 				 * next iteration of the loop, ep is not
3278 				 * clobbered.
3279 				 */
3280 				if (rval == 0)
3281 					(void) mdstealerror(ep, &xep);
3282 				else
3283 					mdclrerror(&xep);
3284 				rval = -1;
3285 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3286 				    "Unable to resume rpc.mdcommd."));
3287 			}
3288 			nd = nd->nd_next;
3289 		}
3290 		meta_ping_mnset(sp->setno);
3291 	}
3292 
3293 	/*
3294 	 * Unlock set.  This flushes the caches on the servers.
3295 	 */
3296 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3297 	nd = sd->sd_nodelist;
3298 	while (nd) {
3299 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3300 			nd = nd->nd_next;
3301 			continue;
3302 		}
3303 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
3304 			if (rval == 0)
3305 				(void) mdstealerror(ep, &xep);
3306 			else
3307 				mdclrerror(&xep);
3308 			rval = -1;
3309 		}
3310 		nd = nd->nd_next;
3311 	}
3312 
3313 	/*
3314 	 * call metaflushsetnames to reset local cache for master and
3315 	 * node information.
3316 	 */
3317 	metaflushsetname(sp);
3318 
3319 	/* release signals back to what they were on entry */
3320 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3321 		mdclrerror(&xep);
3322 
3323 	return (rval);
3324 
3325 }
3326 
3327 /*
3328  * Update nodelist with cluster member information.
3329  * A node not in the member list will be marked
3330  * as not ALIVE and not OWN.
3331  * A node in the member list will be marked ALIVE, but
3332  * the OWN bit will not be changed.
3333  *
3334  * If mynode isn't in the membership list, fail causing
3335  * another reconfig cycle to be started since a non-member
3336  * node shouldn't be taking part in the reconfig cycle.
3337  *
3338  * Return values:
3339  *	0 - No problem.
3340  *	1 - Any failure including RPC failure to my node.
3341  */
3342 int
3343 meta_reconfig_update_nodelist(
3344 	mdsetname_t			*sp,
3345 	mndiskset_membershiplist_t	*nl,
3346 	md_set_desc			*sd,
3347 	md_error_t			*ep
3348 )
3349 {
3350 	mndiskset_membershiplist_t	*nl2;
3351 	md_mnnode_desc			*nd;
3352 	md_error_t			xep = mdnullerror;
3353 	int				rval = 0;
3354 
3355 	/*
3356 	 * Walk through nodelist, checking to see if each
3357 	 * node is in the member list.
3358 	 * If node is not a member, reset ALIVE and OWN node flag.
3359 	 * If node is a member, set ALIVE.
3360 	 * If mynode's OWN flag gets reset, then halt the diskset on this node.
3361 	 */
3362 	nd = sd->sd_nodelist;
3363 	while (nd) {
3364 		nl2 = nl;
3365 		while (nl2) {
3366 			/* If node is in member list, set ALIVE */
3367 			if (nl2->msl_node_id == nd->nd_nodeid) {
3368 				nd->nd_flags |= MD_MN_NODE_ALIVE;
3369 				break;
3370 			} else {
3371 				nl2 = nl2->next;
3372 			}
3373 			/* node is not in member list, mark !ALIVE and !OWN */
3374 			if (nl2 == NULL) {
3375 				/* If node is mynode, then halt set if needed */
3376 				if (strcmp(mynode(), nd->nd_nodename) == 0) {
3377 					/*
3378 					 * This shouldn't happen, but just
3379 					 * in case...  Any node not in the
3380 					 * membership list should be dead and
3381 					 * not running reconfig step1.
3382 					 */
3383 					if (nd->nd_flags & MD_MN_NODE_OWN) {
3384 						if (halt_set(sp, &xep)) {
3385 							mde_perror(&xep, "");
3386 							mdclrerror(&xep);
3387 						}
3388 					}
3389 					/*
3390 					 * Return failure since this node
3391 					 * (mynode) is not in the membership
3392 					 * list, but process the rest of the
3393 					 * nodelist first so that rpc.metad
3394 					 * can be updated with the latest
3395 					 * membership information.
3396 					 */
3397 					(void) mddserror(ep,
3398 					    MDE_DS_NOTINMEMBERLIST,
3399 					    sp->setno, nd->nd_nodename, NULL,
3400 					    sp->setname);
3401 					rval = 1;
3402 				}
3403 				nd->nd_flags &= ~MD_MN_NODE_ALIVE;
3404 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3405 			}
3406 		}
3407 		nd = nd->nd_next;
3408 	}
3409 
3410 	/* Send this information to rpc.metad */
3411 	if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
3412 	    MD_NR_SET,  MNSET_IN_RECONFIG, &xep)) {
3413 		/* Return failure if can't send node flags to rpc.metad */
3414 		if (rval == 0) {
3415 			(void) mdstealerror(ep, &xep);
3416 			rval = 1;
3417 		}
3418 	}
3419 	return (rval);
3420 }
3421 
3422 /*
3423  * Choose master determines the master for a diskset.
3424  * Each node determines the master on its own and
3425  * adds this information to its local rpc.metad nodelist
3426  * and also sends it to the kernel.
3427  *
3428  * Nodelist in set descriptor (sd) is sorted in
3429  * monotonically increasing sequence of nodeid.
3430  *
3431  * Return values:
3432  *	0 - No problem.
3433  *	205 - There was an RPC problem to another node.
3434  *	-1 - There was an error.  This could be an RPC error to my node.
3435  *		This is a catastrophic failure causing node to panic.
3436  */
3437 int
3438 meta_reconfig_choose_master_for_set(
3439 	mdsetname_t	*sp,
3440 	md_set_desc	*sd,
3441 	md_error_t	*ep
3442 )
3443 {
3444 	int			is_owner;
3445 	md_mnset_record		*mnsr = NULL;
3446 	int			lowest_alive_nodeid = 0;
3447 	uint_t			master_nodeid;
3448 	md_mnnode_desc		*nd, *nd2;
3449 	md_mnnode_record	*nr;
3450 	md_drive_desc		*dd;
3451 	md_setkey_t		*cl_sk;
3452 	int			rval = 0;
3453 	md_error_t		xep = mdnullerror;
3454 	mddb_setflags_config_t	sf;
3455 
3456 	/*
3457 	 * Is current node joined to diskset?
3458 	 * Don't trust flags, really check to see if mddb is snarfed.
3459 	 */
3460 	if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
3461 		/*
3462 		 * If a node is joined to the diskset, this node checks
3463 		 * to see if the current master of the diskset is valid and
3464 		 * is still in the membership list (ALIVE) and is
3465 		 * still joined (OWN).  Need to verify if master is
3466 		 * really joined - don't trust the flags.  (Can trust
3467 		 * ALIVE since set during earlier part of reconfig cycle.)
3468 		 * If the current master is valid, still in the membership
3469 		 * list and joined, then master is not changed on this node.
3470 		 * Just return.
3471 		 *
3472 		 * Verify that nodeid is valid before accessing masternode.
3473 		 */
3474 		if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
3475 		    (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
3476 			if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
3477 			    &is_owner, ep) == -1) {
3478 				/* If RPC failure to another node return 205 */
3479 				if ((mdanyrpcerror(ep)) &&
3480 				    (sd->sd_mn_mynode->nd_nodeid !=
3481 				    sd->sd_mn_master_nodeid)) {
3482 					return (205);
3483 				} else {
3484 					/* Any other failure */
3485 					return (-1);
3486 				}
3487 			} else {
3488 				if (is_owner == TRUE) {
3489 
3490 					meta_mc_log(MC_LOG5, dgettext(
3491 					    TEXT_DOMAIN, "Set %s previous "
3492 					    "master chosen %s (%d): %s"),
3493 					    sp->setname,
3494 					    sd->sd_mn_master_nodenm,
3495 					    sd->sd_mn_master_nodeid,
3496 					    meta_print_hrtime(gethrtime() -
3497 					    start_time));
3498 
3499 					/* Previous master is ok - done */
3500 					return (0);
3501 				}
3502 			}
3503 		}
3504 
3505 		/*
3506 		 * If current master is no longer in the membership list or
3507 		 * is no longer joined, then this node uses the following
3508 		 * algorithm:
3509 		 * - node calls RPC routine clnt_ownset to get latest
3510 		 *	information on which nodes are owners of diskset.
3511 		 * 	clnt_ownset checks on each node to see if its kernel
3512 		 *	has that diskset snarfed.
3513 		 */
3514 		nd = sd->sd_nodelist;
3515 		while (nd) {
3516 			/* Don't consider node that isn't in member list */
3517 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3518 				nd = nd->nd_next;
3519 				continue;
3520 			}
3521 
3522 			if (clnt_ownset(nd->nd_nodename, sp,
3523 			    &is_owner, ep) == -1) {
3524 				/* If RPC failure to another node return 205 */
3525 				if ((mdanyrpcerror(ep)) &&
3526 				    (sd->sd_mn_mynode->nd_nodeid !=
3527 				    nd->nd_nodeid)) {
3528 					return (205);
3529 				} else {
3530 					/* Any other failure */
3531 					return (-1);
3532 				}
3533 			}
3534 
3535 			/*
3536 			 * Set owner flag for each node based on whether
3537 			 * that node really has a diskset mddb snarfed in
3538 			 * or not.
3539 			 */
3540 			if (is_owner == TRUE)
3541 				nd->nd_flags |= MD_MN_NODE_OWN;
3542 			else
3543 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3544 
3545 			nd = nd->nd_next;
3546 		}
3547 
3548 		/*
3549 		 * - node walks through nodelist looking for nodes that are
3550 		 *	owners of the diskset that are in the membership list.
3551 		 * - for each owner, node calls RPC routine clnt_getset to
3552 		 *	 see if that node has its node record set to OK.
3553 		 * - If so, master is chosen to be this owner node.
3554 		 */
3555 		nd = sd->sd_nodelist;
3556 		while (nd) {
3557 			/* Don't consider node that isn't in member list */
3558 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3559 				nd = nd->nd_next;
3560 				continue;
3561 			}
3562 
3563 			/* Don't consider a node that isn't an owner */
3564 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3565 				nd = nd->nd_next;
3566 				continue;
3567 			}
3568 
3569 			/* Does node has its own node record set to OK? */
3570 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3571 			    MD_SET_BAD, &mnsr, ep) == -1) {
3572 				/* If RPC failure to another node return 205 */
3573 				if ((mdanyrpcerror(ep)) &&
3574 				    (sd->sd_mn_mynode->nd_nodeid !=
3575 				    nd->nd_nodeid)) {
3576 					return (205);
3577 				} else {
3578 					/* Any other failure */
3579 					return (-1);
3580 				}
3581 			}
3582 			nr = mnsr->sr_nodechain;
3583 			while (nr) {
3584 				if (nd->nd_nodeid == nr->nr_nodeid) {
3585 					if (nr->nr_flags & MD_MN_NODE_OK) {
3586 						/* Found a master */
3587 						free_sr(
3588 						    (md_set_record *)mnsr);
3589 						goto found_master;
3590 					}
3591 				}
3592 				nr = nr->nr_next;
3593 			}
3594 			free_sr((md_set_record *)mnsr);
3595 			nd = nd->nd_next;
3596 		}
3597 
3598 		/*
3599 		 * - If no owner node has its own node record on its own node
3600 		 *	set to OK, then this node checks all of the non-owner
3601 		 * 	nodes that are in the membership list.
3602 		 * - for each non-owner, node calls RPC routine clnt_getset to
3603 		 *	 see if that node has its node record set to OK.
3604 		 * - If set doesn't exist, don't choose node for master.
3605 		 * - If so, master is chosen to be this non-owner node.
3606 		 *
3607 		 */
3608 		nd = sd->sd_nodelist;
3609 		while (nd) {
3610 			/* Don't consider node that isn't in member list */
3611 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3612 				nd = nd->nd_next;
3613 				continue;
3614 			}
3615 
3616 			/* Only checking non-owner nodes this time around */
3617 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3618 				nd = nd->nd_next;
3619 				continue;
3620 			}
3621 
3622 			/* Does node has its own node record set to OK? */
3623 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3624 			    MD_SET_BAD, &mnsr, ep) == -1) {
3625 				/*
3626 				 * If set doesn't exist on non-owner node,
3627 				 * don't consider this node for master.
3628 				 */
3629 				if (mdiserror(ep, MDE_NO_SET)) {
3630 					nd = nd->nd_next;
3631 					continue;
3632 				} else if ((mdanyrpcerror(ep)) &&
3633 				    (sd->sd_mn_mynode->nd_nodeid !=
3634 				    nd->nd_nodeid)) {
3635 					/* RPC failure to another node */
3636 					return (205);
3637 				} else {
3638 					/* Any other failure */
3639 					return (-1);
3640 				}
3641 			}
3642 			nr = mnsr->sr_nodechain;
3643 			while (nr) {
3644 				if (nd->nd_nodeid == nr->nr_nodeid) {
3645 					if (nr->nr_flags & MD_MN_NODE_OK) {
3646 						/* Found a master */
3647 						free_sr(
3648 						    (md_set_record *)mnsr);
3649 						goto found_master;
3650 					}
3651 				}
3652 				nr = nr->nr_next;
3653 			}
3654 			free_sr((md_set_record *)mnsr);
3655 			nd = nd->nd_next;
3656 		}
3657 
3658 		/*
3659 		 * - If no node can be found that has its own node record on
3660 		 *	its node to be set to OK, then all alive nodes
3661 		 * 	were in the process of being added to or deleted
3662 		 *	from set.  Each alive node will remove all
3663 		 *	information pertaining to this set from its node.
3664 		 *
3665 		 * If all nodes in set are ALIVE, then call sdssc end routines
3666 		 * since set was truly being initially created or destroyed.
3667 		 */
3668 		goto delete_set;
3669 	} else {
3670 
3671 		/*
3672 		 * If node is not joined to diskset, then this
3673 		 * node uses the following algorithm:
3674 		 * - If unjoined node doesn't have a node record for itself,
3675 		 *	just delete the diskset since diskset was in the
3676 		 *	process of being created.
3677 		 * - node needs to find master of diskset before
3678 		 *	reconfig cycle, if a master existed.
3679 		 * - node calls RPC routine clnt_ownset to get latest
3680 		 * 	information on which nodes are owners of diskset.
3681 		 *	clnt_ownset checks on each node to see if its
3682 		 *	kernel has that diskset snarfed.
3683 		 */
3684 
3685 		/*
3686 		 * Is my node in the set description?
3687 		 * If not, delete the set from this node.
3688 		 * sr2setdesc sets sd_mn_mynode pointer to the node
3689 		 * descriptor for this node if there was a node
3690 		 * record for this node.
3691 		 *
3692 		 */
3693 		if (sd->sd_mn_mynode == NULL) {
3694 			goto delete_set;
3695 		}
3696 
3697 		nd = sd->sd_nodelist;
3698 		while (nd) {
3699 			/* Don't consider node that isn't in member list */
3700 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3701 				nd = nd->nd_next;
3702 				continue;
3703 			}
3704 
3705 			if (clnt_ownset(nd->nd_nodename, sp,
3706 			    &is_owner, ep) == -1) {
3707 				/* If RPC failure to another node return 205 */
3708 				if ((mdanyrpcerror(ep)) &&
3709 				    (sd->sd_mn_mynode->nd_nodeid !=
3710 				    nd->nd_nodeid)) {
3711 					return (205);
3712 				} else {
3713 					/* Any other failure */
3714 					return (-1);
3715 				}
3716 			}
3717 
3718 			/*
3719 			 * Set owner flag for each node based on whether
3720 			 * that node really has a diskset mddb snarfed in
3721 			 * or not.
3722 			 */
3723 			if (is_owner == TRUE)
3724 				nd->nd_flags |= MD_MN_NODE_OWN;
3725 			else
3726 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3727 
3728 			nd = nd->nd_next;
3729 		}
3730 
3731 		/*
3732 		 * - node walks through nodelist looking for nodes that
3733 		 *	are owners of the diskset that are in
3734 		 *	the membership list.
3735 		 * - for each owner, node calls RPC routine clnt_getset to
3736 		 *	see if that node has a master set and to get the
3737 		 *	diskset description.
3738 		 * - If the owner node has a set description that doesn't
3739 		 *	include the non-joined node in the nodelist, this node
3740 		 *	removes its set description of that diskset
3741 		 *	(i.e. removes the set from its local mddbs).  This is
3742 		 *	handling the case of when a node was removed from a
3743 		 *	diskset while it was not in the cluster membership
3744 		 *	list.
3745 		 * - If that node has a master set and the master is in the
3746 		 *	membership list and is an owner, then either this was
3747 		 *	the master from before the reconfig cycle or this
3748 		 *	node has already chosen a new master - either way,
3749 		 *	the master value is valid as long as it is in the
3750 		 *	membership list and is an owner
3751 		 * - master is chosen to be owner node's master
3752 		 */
3753 		nd = sd->sd_nodelist;
3754 		while (nd) {
3755 			/* Don't consider node that isn't in member list */
3756 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3757 				nd = nd->nd_next;
3758 				continue;
3759 			}
3760 
3761 			/* Don't consider a node that isn't an owner */
3762 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3763 				nd = nd->nd_next;
3764 				continue;
3765 			}
3766 
3767 			/* Get owner node's set record */
3768 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3769 			    MD_SET_BAD, &mnsr, ep) == -1) {
3770 				/* If RPC failure to another node return 205 */
3771 				if ((mdanyrpcerror(ep)) &&
3772 				    (sd->sd_mn_mynode->nd_nodeid !=
3773 				    nd->nd_nodeid)) {
3774 					return (205);
3775 				} else {
3776 					/* Any other failure */
3777 					return (-1);
3778 				}
3779 			}
3780 
3781 			/* Is this node in the owner node's set record */
3782 			nr = mnsr->sr_nodechain;
3783 			while (nr) {
3784 				if (sd->sd_mn_mynode->nd_nodeid ==
3785 				    nr->nr_nodeid) {
3786 					break;
3787 				}
3788 				nr = nr->nr_next;
3789 			}
3790 			if (nr == NULL) {
3791 				/* my node not found - delete set */
3792 				free_sr((md_set_record *)mnsr);
3793 				goto delete_set;
3794 			}
3795 
3796 			/* Is owner's node's master valid? */
3797 			master_nodeid = mnsr->sr_master_nodeid;
3798 			free_sr((md_set_record *)mnsr);
3799 			if (master_nodeid == MD_MN_INVALID_NID) {
3800 				nd = nd->nd_next;
3801 				continue;
3802 			}
3803 
3804 			nd2 = sd->sd_nodelist;
3805 			while (nd2) {
3806 				if ((nd2->nd_nodeid == master_nodeid) &&
3807 				    (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
3808 				    (nd2->nd_flags & MD_MN_NODE_OWN)) {
3809 						nd = nd2;
3810 						goto found_master;
3811 				}
3812 				nd2 = nd2->nd_next;
3813 			}
3814 			nd = nd->nd_next;
3815 		}
3816 
3817 		/*
3818 		 * - If no owner node has a valid master, then follow
3819 		 * 	algorithm of when a node is joined to the diskset.
3820 		 * - node walks through nodelist looking for nodes that are
3821 		 *	owners of the diskset that are in the membership list.
3822 		 * - for each owner, node calls RPC routine clnt_getset to
3823 		 *	 see if that node has its node record set to OK.
3824 		 * - If so, master is chosen to be this owner node.
3825 		 */
3826 		nd = sd->sd_nodelist;
3827 		while (nd) {
3828 			/* Don't consider node that isn't in member list */
3829 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3830 				nd = nd->nd_next;
3831 				continue;
3832 			}
3833 
3834 			/* Don't consider a node that isn't an owner */
3835 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3836 				nd = nd->nd_next;
3837 				continue;
3838 			}
3839 
3840 			/* Does node has its own node record set to OK? */
3841 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3842 			    MD_SET_BAD, &mnsr, ep) == -1) {
3843 				/* If RPC failure to another node return 205 */
3844 				if ((mdanyrpcerror(ep)) &&
3845 				    (sd->sd_mn_mynode->nd_nodeid !=
3846 				    nd->nd_nodeid)) {
3847 					return (205);
3848 				} else {
3849 					/* Any other failure */
3850 					return (-1);
3851 				}
3852 			}
3853 			nr = mnsr->sr_nodechain;
3854 			while (nr) {
3855 				if (nd->nd_nodeid == nr->nr_nodeid) {
3856 					if (nr->nr_flags & MD_MN_NODE_OK) {
3857 						/* Found a master */
3858 						free_sr(
3859 						    (md_set_record *)mnsr);
3860 						goto found_master;
3861 					}
3862 				}
3863 				nr = nr->nr_next;
3864 			}
3865 			free_sr((md_set_record *)mnsr);
3866 			nd = nd->nd_next;
3867 		}
3868 
3869 		/*
3870 		 * - If no owner node has its own node record on its own node
3871 		 *	set to OK, then this node checks all of the non-owner
3872 		 *	nodes that are in the membership list.
3873 		 * - for each non-owner, node calls RPC routine clnt_getset to
3874 		 *	see if that node has its node record set to OK.
3875 		 * - If set doesn't exist, don't choose node for master.
3876 		 * - If this node doesn't exist in the nodelist on any of the
3877 		 *	non-owner nodes, this node removes its set description
3878 		 *	of that diskset (i.e. removes the set from its local
3879 		 *	mddbs). This is handling the case of when a node was
3880 		 *	removed from a diskset while it was not in the
3881 		 *	cluster membership list.
3882 		 * - If non-owner node has its node record set to OK and if
3883 		 *	this node hasn't removed this diskset (step directly
3884 		 *	before this one), then the master is chosen to be this
3885 		 *	non-owner node.
3886 		 */
3887 		nd = sd->sd_nodelist;
3888 		while (nd) {
3889 			/* Don't consider node that isn't in member list */
3890 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3891 				nd->nd_flags |= MD_MN_NODE_DEL;
3892 				nd = nd->nd_next;
3893 				continue;
3894 			}
3895 
3896 			/* Don't consider owner nodes since none are OK */
3897 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3898 				nd->nd_flags |= MD_MN_NODE_DEL;
3899 				nd = nd->nd_next;
3900 				continue;
3901 			}
3902 
3903 			/*
3904 			 * Don't need to get nodelist from my node since
3905 			 * this is where sd_nodelist was obtained.
3906 			 */
3907 			if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3908 				nd = nd->nd_next;
3909 				continue;
3910 			}
3911 
3912 			/*
3913 			 * If node has already been decided against for
3914 			 * master, then skip it.
3915 			 */
3916 			if (nd->nd_flags & MD_MN_NODE_DEL) {
3917 				nd = nd->nd_next;
3918 				continue;
3919 			}
3920 
3921 			/*
3922 			 * Does node in my nodelist have its own node
3923 			 * record marked OK on its node?  And does node
3924 			 * in my nodelist exist on all other nodes?
3925 			 * Don't want to choose a node for master unless
3926 			 * that node is marked OK on its own node and that
3927 			 * node exists on all other alive nodes.
3928 			 *
3929 			 * This is guarding against the case when several
3930 			 * nodes are down and one of the downed nodes is
3931 			 * deleted from the diskset.  When the down nodes
3932 			 * are rebooted into the cluster, you don't want
3933 			 * any node to pick the deleted node as the master.
3934 			 */
3935 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3936 			    MD_SET_BAD, &mnsr, ep) == -1) {
3937 				/*
3938 				 * If set doesn't exist on non-owner node,
3939 				 * don't consider this node for master.
3940 				 */
3941 				if (mdiserror(ep, MDE_NO_SET)) {
3942 					nd->nd_flags |= MD_MN_NODE_DEL;
3943 					nd = nd->nd_next;
3944 					continue;
3945 				} else if (mdanyrpcerror(ep)) {
3946 					/* RPC failure to another node */
3947 					return (205);
3948 				} else {
3949 					/* Any other failure */
3950 					return (-1);
3951 				}
3952 			}
3953 			/*
3954 			 * Is my node in the nodelist gotten from the other
3955 			 * node?  If not, then remove the set from my node
3956 			 * since set was deleted from my node while my node
3957 			 * was out of the cluster.
3958 			 */
3959 			nr = mnsr->sr_nodechain;
3960 			while (nr) {
3961 				if (sd->sd_mn_mynode->nd_nodeid ==
3962 				    nr->nr_nodeid) {
3963 					break;
3964 				}
3965 				nr = nr->nr_next;
3966 			}
3967 			if (nr == NULL) {
3968 				/* my node not found - delete set */
3969 				free_sr((md_set_record *)mnsr);
3970 				goto delete_set;
3971 			}
3972 
3973 			/* Is node being checked marked OK on its own node? */
3974 			nr = mnsr->sr_nodechain;
3975 			while (nr) {
3976 				if (nd->nd_nodeid == nr->nr_nodeid) {
3977 					if (!(nr->nr_flags & MD_MN_NODE_OK)) {
3978 						nd->nd_flags |= MD_MN_NODE_DEL;
3979 					}
3980 					break;
3981 				}
3982 				nr = nr->nr_next;
3983 			}
3984 			/*
3985 			 * If node being checked doesn't exist on its
3986 			 * own node - don't choose it as master.
3987 			 */
3988 			if (nr == NULL) {
3989 				nd->nd_flags |= MD_MN_NODE_DEL;
3990 			}
3991 
3992 			/*
3993 			 * Check every node in my node's nodelist against
3994 			 * the nodelist gotten from the other node.
3995 			 * If a node in my node's nodelist is not found in the
3996 			 * other node's nodelist, then set the DEL flag.
3997 			 */
3998 			nd2 = sd->sd_nodelist;
3999 			while (nd2) {
4000 				nr = mnsr->sr_nodechain;
4001 				while (nr) {
4002 					if (nd2->nd_nodeid == nr->nr_nodeid) {
4003 						break;
4004 					}
4005 					nr = nr->nr_next;
4006 				}
4007 				/* nd2 not found in other node's nodelist */
4008 				if (nr == NULL) {
4009 					nd2->nd_flags |= MD_MN_NODE_DEL;
4010 				}
4011 				nd2 = nd2->nd_next;
4012 			}
4013 
4014 			free_sr((md_set_record *)mnsr);
4015 			nd = nd->nd_next;
4016 		}
4017 
4018 		/*
4019 		 * Rescan list look for node that has not been marked DEL.
4020 		 * First node found is the master.
4021 		 */
4022 		nd = sd->sd_nodelist;
4023 		while (nd) {
4024 			if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4025 				break;
4026 			}
4027 			nd = nd->nd_next;
4028 			continue;
4029 		}
4030 		if (nd) {
4031 			/* Found a master */
4032 			goto found_master;
4033 		}
4034 
4035 		/*
4036 		 * - If no node can be found that has its own node record on
4037 		 *	its node to be set to OK, then all alive nodes
4038 		 * 	were in the process of being added to or deleted
4039 		 *	from set.  Each alive node will remove all
4040 		 *	information pertaining to this set from its node.
4041 		 *
4042 		 * If all nodes in set are ALIVE, then call sdssc end routines
4043 		 * since set was truly being initially created or destroyed.
4044 		 */
4045 		goto delete_set;
4046 	}
4047 
4048 found_master:
4049 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4050 	    "Set %s master chosen %s (%d): %s"),
4051 	    sp->setname, nd->nd_nodename, nd->nd_nodeid,
4052 	    meta_print_hrtime(gethrtime() - start_time));
4053 
4054 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
4055 		return (-1);
4056 	}
4057 
4058 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4059 
4060 	if (clnt_mnsetmaster(mynode(), sp,
4061 	    nd->nd_nodename, nd->nd_nodeid, ep)) {
4062 		rval = -1;
4063 	} else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
4064 		/* If this node is new master, set flag in this node's kernel */
4065 		(void) memset(&sf, 0, sizeof (sf));
4066 		sf.sf_setno = sp->setno;
4067 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
4068 		/* Use magic to help protect ioctl against attack. */
4069 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4070 		sf.sf_flags = MDDB_NM_SET;
4071 
4072 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4073 		    "Setting new master flag for set %s: %s"),
4074 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4075 
4076 		/*
4077 		 * Fail reconfig cycle if ioctl fails since it is critical
4078 		 * to set new master flag.
4079 		 */
4080 		if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
4081 		    NULL) != NULL) {
4082 			(void) mdstealerror(ep, &sf.sf_mde);
4083 			rval = -1;
4084 		}
4085 	}
4086 
4087 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4088 		if (rval == 0) {
4089 			(void) mdstealerror(ep, &xep);
4090 			rval = -1;
4091 		}
4092 	}
4093 
4094 	cl_set_setkey(NULL);
4095 
4096 	metaflushsetname(sp);
4097 
4098 	return (rval);
4099 
4100 delete_set:
4101 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4102 	    "Master not chosen, deleting set %s: %s"),
4103 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4104 
4105 	/*
4106 	 * Remove all set information from this node:
4107 	 *	- node records for this set
4108 	 *	- drive records for this set
4109 	 *	- set record for this set
4110 	 * (Only do this on this node since each node
4111 	 * will do it for its own local mddb.)
4112 	 *
4113 	 * If all nodes in set are ALIVE, then
4114 	 * the lowest numbered ALIVE nodeid in set
4115 	 * (irregardless of whether an owner node or not) will
4116 	 * call the DCS service to cleanup for create/delete of set.
4117 	 *   sdssc_create_end(cleanup) if set was being created or
4118 	 *   sdssc_delete_end(cleanup) if set was being deleted.
4119 	 * A node record with flag ADD denotes a set being
4120 	 * created.  A node record with flag DEL denotes a
4121 	 * set being deleted.
4122 	 */
4123 	nd = sd->sd_nodelist;
4124 	while (nd) {
4125 		/* Found a node that isn't alive */
4126 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
4127 			break;
4128 
4129 		/* Is my node the lowest numbered ALIVE node? */
4130 		if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
4131 			break;
4132 		}
4133 		nd = nd->nd_next;
4134 	}
4135 	if (nd == NULL) {
4136 		/* All nodes ALIVE and this is the lowest nodeid */
4137 		lowest_alive_nodeid = 1;
4138 	}
4139 
4140 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
4141 		return (-1);
4142 	}
4143 
4144 
4145 	/*
4146 	 * If this node had been joined, withdraw and reset master.
4147 	 *
4148 	 * This could happen if a node was being added to or removed
4149 	 * from a diskset and the node doing the add/delete operation and
4150 	 * all other nodes in the diskset have left the cluster.
4151 	 */
4152 	if (sd->sd_mn_mynode) {
4153 		nd = sd->sd_mn_mynode;
4154 		if (nd->nd_flags & MD_MN_NODE_OWN) {
4155 			if (clnt_withdrawset(mynode(), sp, ep)) {
4156 				rval = -1;
4157 				goto out;
4158 			}
4159 			if (clnt_mnsetmaster(mynode(), sp, "",
4160 			    MD_MN_INVALID_NID, ep)) {
4161 				rval = -1;
4162 				goto out;
4163 			}
4164 		}
4165 	}
4166 
4167 	/*
4168 	 * Remove side records for this node (side) from local mddb
4169 	 * (clnt_deldrvs does this) if there are drives in the set.
4170 	 *
4171 	 * Don't need to mark this node as DEL since already marked as
4172 	 * ADD or DEL (or this node would have been chosen as master).
4173 	 * Don't need to mark other node records, drive records or
4174 	 * set records as DEL.  If a panic occurs during clnt_delset,
4175 	 * these records will be deleted the next time this node
4176 	 * becomes a member and goes through the reconfig cycle.
4177 	 */
4178 	/* Get the drive descriptors for this set */
4179 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4180 	    ep)) == NULL) {
4181 		if (! mdisok(ep)) {
4182 			/*
4183 			 * Ignore and clear out any failures from
4184 			 * metaget_drivedesc since a panic could have
4185 			 * occurred when a node was partially added to a set.
4186 			 */
4187 			mdclrerror(ep);
4188 		}
4189 	} else {
4190 		if (clnt_deldrvs(mynode(), sp, dd, ep)) {
4191 			rval = -1;
4192 			goto out;
4193 		}
4194 	}
4195 
4196 	/*
4197 	 * Now, delete the set - this removes the node, drive
4198 	 * and set records from the local mddb.
4199 	 */
4200 	if (clnt_delset(mynode(), sp, ep)) {
4201 		rval = -1;
4202 		goto out;
4203 	}
4204 
4205 out:
4206 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4207 
4208 	/*
4209 	 * Ignore errors from unlock of set since set is no longer
4210 	 * known (if clnt_delset worked).
4211 	 */
4212 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4213 		mdclrerror(&xep);
4214 	}
4215 
4216 	cl_set_setkey(NULL);
4217 
4218 	metaflushsetname(sp);
4219 
4220 	/*
4221 	 * If this node is the lowest numbered nodeid then
4222 	 * call sdssc_create/delete_end depending on whether
4223 	 * this node is marked as ADD or DEL in the node record.
4224 	 */
4225 	if (lowest_alive_nodeid) {
4226 		if (nd->nd_flags & MD_MN_NODE_ADD)
4227 			sdssc_create_end(sp->setname, SDSSC_CLEANUP);
4228 		else if (nd->nd_flags & MD_MN_NODE_DEL)
4229 			sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
4230 	}
4231 
4232 	/* Finished with this set -- return */
4233 	return (rval);
4234 }
4235 
4236 /*
4237  * Reconfig step to choose a new master for all MN disksets.
4238  * Return values:
4239  *	0 - Everything is great.
4240  *	1 - This node failed to reconfig.
4241  *	205 - Cause another reconfig due to a nodelist problem
4242  *		or RPC failure to another node
4243  */
4244 int
4245 meta_reconfig_choose_master(
4246 	long		timeout,
4247 	md_error_t	*ep
4248 )
4249 {
4250 	set_t				max_sets, setno;
4251 	int				nodecnt;
4252 	mndiskset_membershiplist_t	*nl;
4253 	md_set_desc			*sd;
4254 	mdsetname_t			*sp;
4255 	int				rval = 0;
4256 	mddb_setflags_config_t		sf;
4257 	int				start_node_delayed = 0;
4258 
4259 	if ((max_sets = get_max_sets(ep)) == 0) {
4260 		mde_perror(ep, dgettext(TEXT_DOMAIN,
4261 		    "Unable to get number of sets"));
4262 		return (1);
4263 	}
4264 
4265 	/*
4266 	 * Get membershiplist from API routine.  If there's
4267 	 * an error, return a 205 to cause another reconfig.
4268 	 */
4269 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
4270 		mde_perror(ep, "");
4271 		return (205);
4272 	}
4273 
4274 	for (setno = 1; setno < max_sets; setno++) {
4275 		if ((sp = metasetnosetname(setno, ep)) == NULL) {
4276 			if (mdiserror(ep, MDE_NO_SET)) {
4277 				/* No set for this setno - continue */
4278 				mdclrerror(ep);
4279 				continue;
4280 			} else {
4281 				/*
4282 				 * If encountered an RPC error from my node,
4283 				 * then immediately fail.
4284 				 */
4285 				if (mdanyrpcerror(ep)) {
4286 					mde_perror(ep, "");
4287 					return (1);
4288 				}
4289 				/* Can't get set information */
4290 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4291 				    "Unable to get information for "
4292 				    "set number %d"), setno);
4293 				mdclrerror(ep);
4294 				continue;
4295 			}
4296 		}
4297 
4298 		/* If setname is there, set desc should exist. */
4299 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4300 			/*
4301 			 * If encountered an RPC error from my node,
4302 			 * then immediately fail.
4303 			 */
4304 			if (mdanyrpcerror(ep)) {
4305 				mde_perror(ep, "");
4306 				return (1);
4307 			}
4308 			mde_perror(ep, dgettext(TEXT_DOMAIN,
4309 			    "Unable to get set %s desc information"),
4310 			    sp->setname);
4311 			mdclrerror(ep);
4312 			continue;
4313 		}
4314 
4315 		/* Only reconfig MN disksets */
4316 		if (!MD_MNSET_DESC(sd)) {
4317 			continue;
4318 		}
4319 
4320 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4321 		    "Begin choose master for set %s: %s"),
4322 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4323 
4324 		/* Update nodelist with member information. */
4325 		if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
4326 			/*
4327 			 * If encountered an RPC error from my node,
4328 			 * then immediately fail.
4329 			 */
4330 			if (mdanyrpcerror(ep)) {
4331 				mde_perror(ep, "");
4332 				return (1);
4333 			}
4334 			mde_perror(ep, "");
4335 			mdclrerror(ep);
4336 			continue;
4337 		}
4338 
4339 		/*
4340 		 * If all nodes in a cluster are starting, then
4341 		 * all nodes will attempt to contact all other nodes
4342 		 * to determine a master node.  This can lead to a
4343 		 * problem where node 1 is trying to contact the rpc.metad
4344 		 * node 2 and node 2 is trying to contact the rpc.metad
4345 		 * on node 1 -- and this causes the rpc call to fail
4346 		 * on both nodes and causes a new reconfig cycle.
4347 		 *
4348 		 * In order to break this problem, a newly starting node
4349 		 * will delay a small amount of time (nodeid mod 4 seconds)
4350 		 * and will then run the code to choose a master for the
4351 		 * first set.  Delay will only be done once regardless of the
4352 		 * number of sets.
4353 		 */
4354 		if (start_node_delayed == 0) {
4355 			(void) memset(&sf, 0, sizeof (sf));
4356 			sf.sf_setno = sp->setno;
4357 			sf.sf_flags = MDDB_NM_GET;
4358 			/* Use magic to help protect ioctl against attack. */
4359 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4360 			if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
4361 			    &sf.sf_mde, NULL) == 0) &&
4362 			    ((sf.sf_setflags & MD_SET_MN_START_RC) ==
4363 			    MD_SET_MN_START_RC)) {
4364 				(void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
4365 			}
4366 			start_node_delayed = 1;
4367 		}
4368 
4369 		/* Choose master for this set */
4370 		rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
4371 		if (rval == -1) {
4372 			mde_perror(ep, "");
4373 			return (1);
4374 		} else if (rval == 205) {
4375 			mde_perror(ep, "");
4376 			return (205);
4377 		}
4378 
4379 		/* reinit rpc.mdcommd with new nodelist */
4380 		if (mdmn_reinit_set(sp->setno, timeout)) {
4381 			md_eprintf(dgettext(TEXT_DOMAIN,
4382 			    "Could not re-initialise rpc.mdcommd for "
4383 			    "set %s\n"), sp->setname);
4384 			return (1);
4385 		}
4386 
4387 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4388 		    "Choose master for set %s completed: %s"),
4389 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4390 	}
4391 
4392 	/*
4393 	 * Each node turns on I/Os for all MN disksets.
4394 	 * This is to recover from the situation where the master died
4395 	 * during a previous reconfig cycle when I/Os were suspended
4396 	 * for a MN diskset.
4397 	 * If a failure occurs return a 1 which will force this node to
4398 	 * panic.  Cannot leave node in the situation where I/Os are
4399 	 * not resumed.
4400 	 */
4401 	setno = 0; /* 0 means all MN sets */
4402 	if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
4403 		mde_perror(ep, "");
4404 		return (1);
4405 	}
4406 
4407 	/* Free the nodelist */
4408 	if (nodecnt)
4409 		meta_free_nodelist(nl);
4410 
4411 	return (0);
4412 }
4413 
4414 /*
4415  * meta_mnsync_user_records will synchronize the diskset user records across
4416  * all nodes in the diskset.  The diskset user records are stored in
4417  * each node's local set mddb.
4418  *
4419  * This needs to be done even if there is no master change during the
4420  * reconfig cycle since this routine should clean up any mess left by
4421  * the untimely termination of a metaset or metadb command (due to a
4422  * node panic or to user intervention).
4423  *
4424  * Caller is the Master node.
4425  *
4426  * Returns	 0 - Success
4427  *		205 - Failure during RPC to another node
4428  *		-1 - Any other failure and ep is filled in.
4429  */
4430 int
4431 meta_mnsync_user_records(
4432 	mdsetname_t	*sp,
4433 	md_error_t	*ep
4434 )
4435 {
4436 	md_set_desc		*sd;
4437 	md_mnnode_desc		*master_nodelist, *nd, *nd2, *ndtail;
4438 	md_mnset_record		*mnsr;
4439 	md_mnsr_node_t		*master_mnsr_node = NULL, *mnsr_node = NULL;
4440 	md_mnnode_record	*nr;
4441 	md_drive_record		*dr;
4442 	int			dr_cnt, dd_cnt;
4443 	int			found_my_nr;
4444 	md_drive_desc		*dd, *dd_prev, *master_dd, *other_dd;
4445 	int			all_drives_ok;
4446 	int			rval = 0;
4447 	int			max_genid = 0;
4448 	int			num_alive_nodes, num_alive_nodes_del = 0;
4449 	int			set_locked = 0;
4450 	md_setkey_t		*cl_sk;
4451 	md_error_t		xep = mdnullerror;
4452 	char			*anode[1];
4453 	mddb_setflags_config_t	sf;
4454 
4455 	/*
4456 	 * Sync up node records first.
4457 	 * Construct a master nodelist using the nodelist from this
4458 	 * node's rpc.metad node records and then setting the state of each
4459 	 * node following these rules:
4460 	 *	- If a node record is marked OK on its node, mark it OK
4461 	 *		in the master nodelist (and later OK on all nodes)
4462 	 *		If a node record is also marked OWN on its node,
4463 	 *		mark it OWN in the master nodelist.
4464 	 *	- If a node record is not marked OK on its node, then mark
4465 	 *		it as DEL in the master list (later deleting it)
4466 	 *	- If node record doesn't exist on that node, then mark it DEL
4467 	 *		(later deleting it)
4468 	 *	- If set record doesn't exist on that node, mark node as DEL
4469 	 *	- If a node record doesn't exist on all nodes, then mark it DEL
4470 	 *	- If a node is not ALIVE, then
4471 	 *		- If that node marked DEL on any node - mark it DEL
4472 	 *			in master list but leave in nodelist
4473 	 *		- If that node is marked as ADD on any node, mark it
4474 	 *			ADD in the master list but leave in nodelist
4475 	 *		- When that node returns to the living, the DEL
4476 	 *			node record will be removed and the ADD node
4477 	 *			record may be removed if marked ADD on that
4478 	 *			node.
4479 	 * The key rule is to not remove a node from the nodelist until
4480 	 * that node record is removed from its own node.  Do not want to
4481 	 * remove a node's record from all other nodes and then have
4482 	 * that node have its own record marked OK so that a node will pick
4483 	 * a different master than the other nodes.
4484 	 *
4485 	 * Next,
4486 	 * If node is ALIVE and node record is marked DEL in master nodelist,
4487 	 * remove node from set.
4488 	 * If node is ALIVE and node record is marked OK in master nodelist,
4489 	 * mark it OK on all other nodes.
4490 	 * If node is not ALIVE and node record is marked DEL in master
4491 	 * nodelist, mark it DEL on all other nodes.
4492 	 * If node is not ALIVE and node record is marked ADD in master,
4493 	 * nodelist, mark it ADD on all other nodes.
4494 	 */
4495 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4496 		return (-1);
4497 	}
4498 	master_nodelist = sd->sd_nodelist;
4499 
4500 	/*
4501 	 * Walk through nodelist creating a master nodelist.
4502 	 */
4503 	num_alive_nodes = 0;
4504 	nd = master_nodelist;
4505 	while (nd) {
4506 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4507 			nd = nd->nd_next;
4508 			continue;
4509 		}
4510 		num_alive_nodes++;
4511 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
4512 		    MD_SET_BAD, &mnsr, ep) == -1) {
4513 			if (mdiserror(ep, MDE_NO_SET)) {
4514 				/* set doesn't exist, mark node as DEL */
4515 				nd->nd_flags &= ~MD_MN_NODE_OK;
4516 				nd->nd_flags &= ~MD_MN_NODE_ADD;
4517 				nd->nd_flags |= MD_MN_NODE_DEL;
4518 				nd->nd_flags |= MD_MN_NODE_NOSET;
4519 				nd = nd->nd_next;
4520 				continue;
4521 			} else {
4522 				/* If RPC failure to another node return 205 */
4523 				if ((mdanyrpcerror(ep)) &&
4524 				    (sd->sd_mn_mynode->nd_nodeid !=
4525 				    nd->nd_nodeid)) {
4526 					rval = 205;
4527 				} else {
4528 					/* Any other failure */
4529 					rval = -1;
4530 				}
4531 				goto out;
4532 			}
4533 		}
4534 		/* Find biggest genid in records for this diskset */
4535 		if (mnsr->sr_genid > max_genid)
4536 			max_genid = mnsr->sr_genid;
4537 
4538 		dr = mnsr->sr_drivechain;
4539 		while (dr) {
4540 			/* Find biggest genid in records for this diskset */
4541 			if (dr->dr_genid > max_genid) {
4542 				max_genid = dr->dr_genid;
4543 			}
4544 			dr = dr->dr_next;
4545 		}
4546 
4547 		found_my_nr = 0;
4548 		nr = mnsr->sr_nodechain;
4549 		/* nr is the list of node recs from nd_nodename node */
4550 		while (nr) {
4551 			/* Find biggest genid in records for this diskset */
4552 			if (nr->nr_genid > max_genid)
4553 				max_genid = nr->nr_genid;
4554 			nd2 = master_nodelist;
4555 			ndtail = NULL;
4556 			/* For each node record, is it in master list? */
4557 			while (nd2) {
4558 				if (nd2->nd_nodeid == nr->nr_nodeid)
4559 					break;
4560 				if (nd2->nd_next == NULL)
4561 					ndtail = nd2;
4562 				nd2 = nd2->nd_next;
4563 			}
4564 			/*
4565 			 * Found node record not in master list -- add it
4566 			 * to list marking it as DEL since node record
4567 			 * should exist on all nodes unless a panic occurred
4568 			 * during addition or deletion of host to diskset.
4569 			 */
4570 			if (nd2 == NULL) {
4571 				nd2 = Zalloc(sizeof (*nd2));
4572 				(void) strcpy(nd2->nd_nodename,
4573 				    nr->nr_nodename);
4574 				nd2->nd_flags = nr->nr_flags;
4575 				nd2->nd_flags |= MD_MN_NODE_DEL;
4576 				nd2->nd_nodeid = nr->nr_nodeid;
4577 				nd2->nd_next = NULL;
4578 				ndtail->nd_next = nd2;
4579 				nd2 = NULL;
4580 				nr = nr->nr_next;
4581 				continue;
4582 			}
4583 			/*
4584 			 * Is this the node record for the node that
4585 			 * we requested the set desc from?
4586 			 * If so, check if node has its own node record
4587 			 * marked OK. If marked OK, check for the OWN bit.
4588 			 */
4589 			if (nr->nr_nodeid == nd->nd_nodeid) {
4590 				found_my_nr = 1;
4591 				if (nr->nr_flags & MD_MN_NODE_OK) {
4592 					/*
4593 					 * If node record is marked OK
4594 					 * on its own node, then mark it OK
4595 					 * in the master list.  Node record
4596 					 * would have to exist on all nodes
4597 					 * in the ADD state before it could
4598 					 * be put into the OK state.
4599 					 */
4600 					nd->nd_flags |= MD_MN_NODE_OK;
4601 					nd->nd_flags &=
4602 					    ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
4603 					/*
4604 					 * Mark own in master list as marked
4605 					 * on own node.
4606 					 */
4607 					if (nr->nr_flags & MD_MN_NODE_OWN)
4608 						nd->nd_flags |= MD_MN_NODE_OWN;
4609 					else
4610 						nd->nd_flags &= ~MD_MN_NODE_OWN;
4611 				} else {
4612 					/* Otherwise, mark node as DEL */
4613 					nd->nd_flags &= ~MD_MN_NODE_OK;
4614 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4615 					nd->nd_flags |= MD_MN_NODE_DEL;
4616 				}
4617 			}
4618 			/*
4619 			 * If node is not ALIVE and marked DEL
4620 			 * on any node, make it DEL in master list.
4621 			 * If node is not ALIVE and marked ADD
4622 			 * on any node, make it ADD in master list
4623 			 * unless node record has already been marked DEL.
4624 			 */
4625 			if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
4626 				if (nr->nr_flags & MD_MN_NODE_ADD) {
4627 					if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4628 						/* If not DEL - mark it ADD */
4629 						nd->nd_flags |= MD_MN_NODE_ADD;
4630 						nd->nd_flags &= ~MD_MN_NODE_OK;
4631 					}
4632 				}
4633 				if (nr->nr_flags & MD_MN_NODE_DEL) {
4634 					nd->nd_flags |= MD_MN_NODE_DEL;
4635 					nd->nd_flags &= ~MD_MN_NODE_OK;
4636 					/* Could already be ADD - make it DEL */
4637 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4638 				}
4639 			}
4640 			nr = nr->nr_next;
4641 		}
4642 		/*
4643 		 * If a node record doesn't exist on its own node,
4644 		 * then mark node as DEL.
4645 		 */
4646 		if (found_my_nr == 0) {
4647 			nd->nd_flags &= ~MD_MN_NODE_OK;
4648 			nd->nd_flags |= MD_MN_NODE_DEL;
4649 		}
4650 
4651 		/*
4652 		 * If node is OK - put mnsr onto master_mnsr_node list for
4653 		 * later use when syncing up the drive records in the set.
4654 		 */
4655 		if (nd->nd_flags & MD_MN_NODE_OK) {
4656 			mnsr_node = Zalloc(sizeof (*mnsr_node));
4657 			mnsr_node->mmn_mnsr = mnsr;
4658 			(void) strncpy(mnsr_node->mmn_nodename,
4659 			    nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
4660 			mnsr_node->mmn_next = master_mnsr_node;
4661 			master_mnsr_node = mnsr_node;
4662 		} else {
4663 			free_sr((struct md_set_record *)mnsr);
4664 		}
4665 
4666 		nd = nd->nd_next;
4667 	}
4668 
4669 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4670 	    "Master nodelist created for set %s: %s"),
4671 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4672 
4673 	/*
4674 	 * Send master nodelist to the rpc.metad on all nodes (including
4675 	 * myself) and each node will update itself.  This will set the
4676 	 * ADD and DEL flags on each node as setup in the master nodelist.
4677 	 * Don't send nodelist to node where set doesn't exist.
4678 	 */
4679 	nd = master_nodelist;
4680 	while (nd) {
4681 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4682 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4683 			nd = nd->nd_next;
4684 			continue;
4685 		}
4686 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4687 		    master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
4688 			/* If RPC failure to another node return 205 */
4689 			if ((mdanyrpcerror(ep)) &&
4690 			    (sd->sd_mn_mynode->nd_nodeid !=
4691 			    nd->nd_nodeid)) {
4692 				rval = 205;
4693 			} else {
4694 				/* Any other failure */
4695 				rval = -1;
4696 			}
4697 			goto out;
4698 		}
4699 		nd = nd->nd_next;
4700 	}
4701 
4702 	/*
4703 	 * Now, delete nodes that need to be deleted.
4704 	 */
4705 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4706 	    ep))  == NULL) {
4707 		if (! mdisok(ep)) {
4708 			rval = -1;
4709 			goto out;
4710 		}
4711 	}
4712 
4713 	/*
4714 	 * May be doing lots of RPC commands to the nodes, so lock the
4715 	 * ALIVE members of the set since most of the rpc.metad routines
4716 	 * require this for security reasons.
4717 	 */
4718 	nd = master_nodelist;
4719 	while (nd) {
4720 		/* Skip non-alive nodes and node without set */
4721 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4722 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4723 			nd = nd->nd_next;
4724 			continue;
4725 		}
4726 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
4727 			/* If RPC failure to another node return 205 */
4728 			if ((mdanyrpcerror(ep)) &&
4729 			    (sd->sd_mn_mynode->nd_nodeid !=
4730 			    nd->nd_nodeid)) {
4731 				rval = 205;
4732 			} else {
4733 				/* Any other failure */
4734 				rval = -1;
4735 			}
4736 			goto out;
4737 		}
4738 		set_locked = 1;
4739 		nd = nd->nd_next;
4740 	}
4741 
4742 	nd = master_nodelist;
4743 	while (nd) {
4744 		/* Skip non-alive nodes */
4745 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4746 			nd = nd->nd_next;
4747 			continue;
4748 		}
4749 		if (nd->nd_flags & MD_MN_NODE_DEL) {
4750 			num_alive_nodes_del++;
4751 			/*
4752 			 * Delete this node rec from all ALIVE nodes in diskset.
4753 			 */
4754 			nd2 = master_nodelist;
4755 			while (nd2) {
4756 				/* Skip non-alive nodes and node without set */
4757 				if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
4758 				    (nd2->nd_flags & MD_MN_NODE_NOSET)) {
4759 					nd2 = nd2->nd_next;
4760 					continue;
4761 				}
4762 
4763 				/* This is a node being deleted from set */
4764 				if (nd2->nd_nodeid == nd->nd_nodeid) {
4765 					/* Mark set record as DEL */
4766 					if (clnt_upd_sr_flags(nd->nd_nodename,
4767 					    sp, MD_SR_DEL, ep)) {
4768 						/* RPC failure to !my node */
4769 						if ((mdanyrpcerror(ep)) &&
4770 						    (sd->sd_mn_mynode->
4771 						    nd_nodeid
4772 						    != nd->nd_nodeid)) {
4773 							rval = 205;
4774 						} else {
4775 							/* Any other failure */
4776 							rval = -1;
4777 						}
4778 						goto out;
4779 					}
4780 					if (clnt_deldrvs(nd->nd_nodename, sp,
4781 					    dd, ep)) {
4782 						/* RPC failure to !my node */
4783 						if ((mdanyrpcerror(ep)) &&
4784 						    (sd->sd_mn_mynode->
4785 						    nd_nodeid
4786 						    != nd->nd_nodeid)) {
4787 							rval = 205;
4788 						} else {
4789 							/* Any other failure */
4790 							rval = -1;
4791 						}
4792 						goto out;
4793 					}
4794 					if (clnt_delset(nd->nd_nodename, sp,
4795 					    ep) == -1) {
4796 						/* RPC failure to !my node */
4797 						if ((mdanyrpcerror(ep)) &&
4798 						    (sd->sd_mn_mynode->
4799 						    nd_nodeid
4800 						    != nd->nd_nodeid)) {
4801 							rval = 205;
4802 						} else {
4803 							/* Any other failure */
4804 							rval = -1;
4805 						}
4806 						goto out;
4807 					}
4808 				} else {
4809 					/*
4810 					 * Delete host from sets on hosts
4811 					 * not being deleted.
4812 					 */
4813 					anode[0] = Strdup(nd->nd_nodename);
4814 					if (clnt_delhosts(nd2->nd_nodename, sp,
4815 					    1, anode, ep) == -1) {
4816 						Free(anode[0]);
4817 						/* RPC failure to !my node */
4818 						if ((mdanyrpcerror(ep)) &&
4819 						    (sd->sd_mn_mynode->
4820 						    nd_nodeid
4821 						    != nd2->nd_nodeid)) {
4822 							rval = 205;
4823 						} else {
4824 							/* Any other failure */
4825 							rval = -1;
4826 						}
4827 						goto out;
4828 					}
4829 
4830 					meta_mc_log(MC_LOG5,
4831 					    dgettext(TEXT_DOMAIN,
4832 					    "Deleted node %s (%d) on node %s "
4833 					    "from set %s: %s"),
4834 					    nd->nd_nodename, nd->nd_nodeid,
4835 					    nd2->nd_nodename,
4836 					    sp->setname,
4837 					    meta_print_hrtime(
4838 					    gethrtime() - start_time));
4839 
4840 					Free(anode[0]);
4841 				}
4842 				nd2 = nd2->nd_next;
4843 			}
4844 		}
4845 		nd = nd->nd_next;
4846 	}
4847 
4848 	nd = master_nodelist;
4849 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4850 	while (nd) {
4851 		/* Skip non-alive nodes and node without set */
4852 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4853 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4854 			nd = nd->nd_next;
4855 			continue;
4856 		}
4857 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
4858 			/* If RPC failure to another node return 205 */
4859 			if ((mdanyrpcerror(ep)) &&
4860 			    (sd->sd_mn_mynode->nd_nodeid !=
4861 			    nd->nd_nodeid)) {
4862 				rval = 205;
4863 			} else {
4864 				/* Any other failure */
4865 				rval = -1;
4866 			}
4867 			goto out;
4868 		}
4869 		nd = nd->nd_next;
4870 	}
4871 	cl_set_setkey(NULL);
4872 	set_locked = 0;
4873 
4874 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4875 	    "Nodelist syncronization complete for set %s: %s"),
4876 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4877 
4878 	metaflushsetname(sp);
4879 
4880 	/*
4881 	 * If all alive nodes have been deleted from set, just
4882 	 * return since nothing else can be done until non-alive
4883 	 * nodes (if there are any) rejoin the cluster.
4884 	 */
4885 	if (num_alive_nodes == num_alive_nodes_del) {
4886 		rval = 0;
4887 		goto out;
4888 	}
4889 
4890 	/*
4891 	 * Sync up drive records.
4892 	 *
4893 	 * If a node panic'd (or metaset command was killed) during the
4894 	 * addition or deletion of a drive to the diskset, the nodes
4895 	 * may have a different view of the drive list.  During cleanup
4896 	 * of the drive list during reconfig, a drive will be deleted
4897 	 * from the list if the master node sees that the drive has been
4898 	 * marked in the ADD state on any node or is marked in the DEL state
4899 	 * on all nodes.
4900 	 * This cleanup must occur even if all nodes in the cluster are
4901 	 * not part of the cluster so that all nodes have the same view
4902 	 * of the drivelist.
4903 	 * Then if the entire cluster goes down and comes back up, the
4904 	 * new master node could be a node that wasn't in the cluster when
4905 	 * the node was deleted.  This could lead to a situation where the
4906 	 * master node thinks that a drive is OK, but this drive isn't
4907 	 * known to the other nodes.
4908 	 * This situation can also occur during the addition of a drive
4909 	 * where a node has the drive marked OK, but the node executing the
4910 	 * metaset command encountered a failure before marking that drive OK
4911 	 * on the rest of the nodes.  If the node with the OK drive then
4912 	 * panics, then rest of the nodes will remove that drive marked ADD
4913 	 * and when the node with the OK drive rejoins the cluster, it will
4914 	 * have a drive marked OK that is unknown by the other nodes.
4915 	 *
4916 	 * There are 2 situations to consider:
4917 	 * A) Master knows about a drive that other nodes don't know about.
4918 	 * B) At least one slave node knows about a drive that the master
4919 	 *    node doesn't know about.
4920 	 *
4921 	 * To handle these situations the following steps are followed:
4922 	 * 1) Count number of drives known by this master node and the
4923 	 *    other slave nodes.
4924 	 *    If all nodes have the same number of drives and the master has
4925 	 *    all drives marked OK, then skip to step4.
4926 	 *
4927 	 * 2) If a node has less drives listed than the master, the master
4928 	 *    must get the drive descriptor list from that node so that
4929 	 *    master can determine which drive it needs to delete from that
4930 	 *    node.  Master must get the drive descriptor list since the
4931 	 *    drive record list does not contain the name of the drive, but
4932 	 *    only a key and the key can only be interpreted on that other
4933 	 *    node.
4934 	 *
4935 	 * 3) The master will then create the master drive list by doing:
4936 	 *	- Master starts with drive list known by master.
4937 	 *	- Any drive marked ADD will be removed from the list.
4938 	 *	- Any drive not known by another node (from step2) will be
4939 	 *	removed from the drive list.
4940 	 *	- If a drive is marked DEL on the master, the master must
4941 	 *	verify that the drive record is marked DEL on all nodes.
4942 	 *	If any node has the drive record marked OK, mark it OK
4943 	 *	on the master.  (The reason why is described below).
4944 	 *
4945 	 * 4) The master sends out the master drive list and the slave
4946 	 *    nodes will force their drive lists to match the master
4947 	 *    drive list by deleting drives, if necessary and by changing
4948 	 *    the drive record states from ADD->OK if master has drive
4949 	 *    marked OK and slave has drive marked ADD.
4950 	 *
4951 	 * Interesting scenarios:
4952 	 *
4953 	 * 1) System has 4 nodes with node 1 as the master.  Node 3 starts
4954 	 *    to delete a drive record (drive record on node 1 is marked DEL),
4955 	 *    but is stopped when node 3 panics.  Node 1 also panics.
4956 	 *    During reconfig cycle, node 2 is picked as master and the drive
4957 	 *    record is left alone since all nodes in the cluster have it
4958 	 *    marked OK.  User now sees drive as part of diskset.
4959 	 *    Now, entire cluster is rebooted and node 1 rejoins the cluster.
4960 	 *    Node 1 is picked as the master and node 1 has drive record
4961 	 *    marked DEL.  Node 1 contacts all other nodes in the cluster
4962 	 *    and since at least one node has the drive record marked OK,
4963 	 *    the master marks the drive record OK.
4964 	 *    User continues to see the drive as part of the diskset.
4965 	 */
4966 
4967 	/* Reget set descriptor since flushed above */
4968 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4969 		rval = -1;
4970 		goto out;
4971 	}
4972 
4973 	/* Has side effect of setting sd->sd_drvs to same as master_dd */
4974 	if ((master_dd = metaget_drivedesc_sideno(sp,
4975 	    sd->sd_mn_mynode->nd_nodeid,
4976 	    (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
4977 		/* No drives in list */
4978 		if (!mdisok(ep)) {
4979 			/*
4980 			 * Can't get drive list for this node, so
4981 			 * return -1 causing this node to be removed
4982 			 * cluster config and fixed.
4983 			 */
4984 			rval = -1;
4985 			goto out;
4986 		}
4987 	}
4988 
4989 	/* Count the number of drives for all nodes */
4990 	mnsr_node = master_mnsr_node;
4991 	while (mnsr_node) {
4992 		dr_cnt = 0;
4993 		dr = mnsr_node->mmn_mnsr->sr_drivechain;
4994 		while (dr) {
4995 			dr_cnt++;
4996 			dr = dr->dr_next;
4997 		}
4998 		mnsr_node->mmn_numdrives = dr_cnt;
4999 		mnsr_node = mnsr_node->mmn_next;
5000 	}
5001 
5002 	/* Count the number of drives for the master; also check flags */
5003 	all_drives_ok = 1;
5004 	dd_cnt = 0;
5005 	dd = master_dd;
5006 	while (dd) {
5007 		dd_cnt++;
5008 		if (!(dd->dd_flags & MD_DR_OK))
5009 			all_drives_ok = 0;
5010 		dd = dd->dd_next;
5011 	}
5012 
5013 	/* If all drives are ok, do quick check against number of drives */
5014 	if (all_drives_ok) {
5015 		/* If all nodes have same number of drives, almost done */
5016 		mnsr_node = master_mnsr_node;
5017 		while (mnsr_node) {
5018 			if (mnsr_node->mmn_numdrives != dd_cnt)
5019 				break;
5020 			mnsr_node = mnsr_node->mmn_next;
5021 		}
5022 		/* All nodes have same number of drives, just send flags */
5023 		if (mnsr_node == NULL) {
5024 			goto send_drive_list;
5025 		}
5026 	}
5027 
5028 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5029 	    "Begin detailed drive synchronization for set %s: %s"),
5030 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5031 
5032 	/* Detailed check required  */
5033 	mnsr_node = master_mnsr_node;
5034 	while (mnsr_node) {
5035 		/* Does slave node have less drives than master? */
5036 		if (mnsr_node->mmn_numdrives < dd_cnt) {
5037 			/* Yes - must determine which drive is missing */
5038 			if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
5039 			    &other_dd, ep)) {
5040 				/* RPC failure to !my node */
5041 				if ((mdanyrpcerror(ep)) &&
5042 				    (strcmp(mynode(), mnsr_node->mmn_nodename)
5043 				    != 0)) {
5044 					rval = 205;
5045 				} else {
5046 					/* Any other failure */
5047 					rval = -1;
5048 				}
5049 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5050 				    "Master node %s unable to "
5051 				    "retrieve drive list from node %s"),
5052 				    mynode(), mnsr_node->mmn_nodename);
5053 				goto out;
5054 			}
5055 			mnsr_node->mmn_dd = other_dd;
5056 			dd = master_dd;
5057 			while (dd) {
5058 				if (!(dd->dd_flags & MD_DR_OK)) {
5059 					dd = dd->dd_next;
5060 					continue;
5061 				}
5062 				other_dd = mnsr_node->mmn_dd;
5063 				while (other_dd) {
5064 					/* Convert to devids, when available */
5065 					if (strcmp(other_dd->dd_dnp->cname,
5066 					    dd->dd_dnp->cname) == 0) {
5067 						break;
5068 					}
5069 					other_dd = other_dd->dd_next;
5070 				}
5071 				/*
5072 				 * dd not found on slave so mark it
5073 				 * ADD for later deletion (drives in ADD
5074 				 * state are deleted later in this routine).
5075 				 */
5076 				if (other_dd == NULL) {
5077 					dd->dd_flags = MD_DR_ADD;
5078 				}
5079 				dd = dd->dd_next;
5080 			}
5081 
5082 		}
5083 		mnsr_node = mnsr_node->mmn_next;
5084 	}
5085 
5086 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5087 	    "Drive check completed for set %s: %s"),
5088 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5089 
5090 	dd = master_dd;
5091 	dd_prev = 0;
5092 	while (dd) {
5093 		/* Remove any ADD drives from list */
5094 		if (dd->dd_flags & MD_DR_ADD) {
5095 			if (dd_prev) {
5096 				dd_prev->dd_next = dd->dd_next;
5097 				dd->dd_next = NULL;
5098 				metafreedrivedesc(&dd);
5099 				dd = dd_prev->dd_next;
5100 			} else {
5101 				/*
5102 				 * If removing drive descriptor from head
5103 				 * of linked list, also change sd->sd_drvs.
5104 				 */
5105 				master_dd = sd->sd_drvs = dd->dd_next;
5106 				dd->dd_next = NULL;
5107 				metafreedrivedesc(&dd);
5108 				dd = master_dd;
5109 			}
5110 			/* dd setup in if/else above */
5111 			continue;
5112 		}
5113 		/*
5114 		 * If drive is marked DEL, check all other nodes.
5115 		 * If drive on another node is marked OK, mark drive OK
5116 		 * in master list.  If drive is marked DEL or doesn't exist
5117 		 * on all nodes, remove drive from list.
5118 		 */
5119 		if (dd->dd_flags & MD_DR_DEL) {
5120 			mnsr_node = master_mnsr_node;
5121 			while (mnsr_node) {
5122 				if (mnsr_node->mmn_dd == NULL) {
5123 					if (clnt_getdrivedesc(
5124 					    mnsr_node->mmn_nodename, sp,
5125 					    &other_dd, ep)) {
5126 						/* RPC failure to !my node */
5127 						if ((mdanyrpcerror(ep)) &&
5128 						    (strcmp(mynode(),
5129 						    mnsr_node->mmn_nodename)
5130 						    != 0)) {
5131 							rval = 205;
5132 						} else {
5133 							/* Any other failure */
5134 							rval = -1;
5135 						}
5136 						mde_perror(ep,
5137 						    dgettext(TEXT_DOMAIN,
5138 						    "Master node %s unable "
5139 						    "to retrieve drive list "
5140 						    "from node %s"), mynode(),
5141 						    mnsr_node->mmn_nodename);
5142 						goto out;
5143 					}
5144 					mnsr_node->mmn_dd = other_dd;
5145 				}
5146 				other_dd = mnsr_node->mmn_dd;
5147 				while (other_dd) {
5148 					/* Found drive (OK) from other node */
5149 					if (strcmp(dd->dd_dnp->cname,
5150 					    other_dd->dd_dnp->cname)
5151 					    == 0) {
5152 						/* Drive marked OK */
5153 						if (other_dd->dd_flags &
5154 						    MD_DR_OK) {
5155 							dd->dd_flags = MD_DR_OK;
5156 						}
5157 						break;
5158 					}
5159 					other_dd = other_dd->dd_next;
5160 				}
5161 				if (dd->dd_flags == MD_DR_OK)
5162 					break;
5163 
5164 				mnsr_node = mnsr_node->mmn_next;
5165 			}
5166 			/*
5167 			 * If no node had this drive marked OK, delete it.
5168 			 */
5169 			if (dd->dd_flags & MD_DR_DEL) {
5170 				if (dd_prev) {
5171 					dd_prev->dd_next = dd->dd_next;
5172 					dd->dd_next = NULL;
5173 					metafreedrivedesc(&dd);
5174 					dd = dd_prev->dd_next;
5175 				} else {
5176 					/*
5177 					 * If removing drive descriptor from
5178 					 * head of linked list, also change
5179 					 * sd->sd_drvs.
5180 					 */
5181 					master_dd = sd->sd_drvs = dd->dd_next;
5182 					dd->dd_next = NULL;
5183 					metafreedrivedesc(&dd);
5184 					dd = master_dd;
5185 				}
5186 				/* dd setup in if/else above */
5187 				continue;
5188 			}
5189 		}
5190 		dd_prev = dd;
5191 		dd = dd->dd_next;
5192 	}
5193 
5194 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5195 	    "Setting drive states completed for set %s: %s"),
5196 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5197 
5198 send_drive_list:
5199 	/*
5200 	 * Set genid on all drives to be the highest value seen.
5201 	 */
5202 	dd = master_dd;
5203 	while (dd) {
5204 		dd->dd_genid = max_genid;
5205 		dd = dd->dd_next;
5206 	}
5207 	/*
5208 	 * Send updated drive list to all alive nodes.
5209 	 * Will also set genid on set and node records to have same
5210 	 * as the drive records.
5211 	 */
5212 	nd = sd->sd_nodelist;
5213 	while (nd) {
5214 		/* Skip non-alive nodes */
5215 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5216 			nd = nd->nd_next;
5217 			continue;
5218 		}
5219 		if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
5220 			/* RPC failure to another node */
5221 			if ((mdanyrpcerror(ep)) &&
5222 			    (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
5223 				rval = 205;
5224 			} else {
5225 				/* Any other failure */
5226 				rval = -1;
5227 			}
5228 			goto out;
5229 		}
5230 		nd = nd->nd_next;
5231 	}
5232 
5233 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5234 	    "Sent drive list to all nodes for set %s: %s"),
5235 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5236 
5237 	/*
5238 	 * If no drive records left in set and nodes had been joined,
5239 	 * withdraw the nodes.  Always reset the master and mark
5240 	 * all nodes as withdrawn on all nodes.
5241 	 */
5242 	if (master_dd == NULL) {
5243 		/* Reset new master flag since no longer master */
5244 		(void) memset(&sf, 0, sizeof (sf));
5245 		sf.sf_setno = sp->setno;
5246 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5247 		sf.sf_flags = MDDB_NM_RESET;
5248 		/* Use magic to help protect ioctl against attack. */
5249 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5250 		/* Ignore failure, failure to reset flag isn't catastrophic */
5251 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5252 		    &sf.sf_mde, NULL);
5253 
5254 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5255 		    "Reset new master flag for " "set %s: %s"),
5256 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5257 
5258 		nd = sd->sd_nodelist;
5259 		while (nd) {
5260 			/* Skip non-alive nodes  */
5261 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5262 				nd = nd->nd_next;
5263 				continue;
5264 			}
5265 
5266 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
5267 				/* RPC failure to another node */
5268 				if ((mdanyrpcerror(ep)) &&
5269 				    (sd->sd_mn_mynode->nd_nodeid !=
5270 				    nd->nd_nodeid)) {
5271 					rval = 205;
5272 				} else {
5273 					/* Any other failure */
5274 					rval = -1;
5275 				}
5276 				goto out;
5277 			}
5278 			set_locked = 1;
5279 
5280 			/* Withdraw node from set if owner */
5281 			if ((nd->nd_flags & MD_MN_NODE_OWN) &&
5282 			    (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
5283 				/* RPC failure to another node */
5284 				if ((mdanyrpcerror(ep)) &&
5285 				    (sd->sd_mn_mynode->nd_nodeid !=
5286 				    nd->nd_nodeid)) {
5287 					rval = 205;
5288 				} else {
5289 					/* Any other failure */
5290 					rval = -1;
5291 				}
5292 				goto out;
5293 			}
5294 
5295 			/* Mark all nodes as withdrawn on this node */
5296 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5297 			    sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
5298 				/* RPC failure to another node */
5299 				if ((mdanyrpcerror(ep)) &&
5300 				    (sd->sd_mn_mynode->nd_nodeid !=
5301 				    nd->nd_nodeid)) {
5302 					rval = 205;
5303 				} else {
5304 					/* Any other failure */
5305 					rval = -1;
5306 				}
5307 				goto out;
5308 			}
5309 
5310 			/* Resets master to no-master on this node */
5311 			if (clnt_mnsetmaster(nd->nd_nodename, sp,
5312 			    "", MD_MN_INVALID_NID, ep)) {
5313 				/* RPC failure to another node */
5314 				if ((mdanyrpcerror(ep)) &&
5315 				    (sd->sd_mn_mynode->nd_nodeid !=
5316 				    nd->nd_nodeid)) {
5317 					rval = 205;
5318 				} else {
5319 					/* Any other failure */
5320 					rval = -1;
5321 				}
5322 				goto out;
5323 			}
5324 
5325 			cl_sk = cl_get_setkey(sp->setno, sp->setname);
5326 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
5327 				/* RPC failure to another node */
5328 				if ((mdanyrpcerror(ep)) &&
5329 				    (sd->sd_mn_mynode->nd_nodeid !=
5330 				    nd->nd_nodeid)) {
5331 					rval = 205;
5332 				} else {
5333 					/* Any other failure */
5334 					rval = -1;
5335 				}
5336 				goto out;
5337 			}
5338 			set_locked = 0;
5339 			nd = nd->nd_next;
5340 		}
5341 	}
5342 
5343 out:
5344 	/*
5345 	 * If got here and set is still locked, then an error has
5346 	 * occurred and master_nodelist is still valid.
5347 	 * If error is not an RPC error, then unlock.
5348 	 * If error is an RPC error, skip unlocks since this could cause
5349 	 * yet another RPC timeout if a node has failed.
5350 	 * Ignore failures in unlock since unlock is just trying to
5351 	 * clean things up.
5352 	 */
5353 	if ((set_locked) && !(mdanyrpcerror(ep))) {
5354 		nd = master_nodelist;
5355 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
5356 		while (nd) {
5357 			/* Skip non-alive nodes */
5358 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5359 				nd = nd->nd_next;
5360 				continue;
5361 			}
5362 			/*
5363 			 * If clnt_unlock fails, just break out since next
5364 			 * reconfig cycle will reset the locks anyway.
5365 			 */
5366 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
5367 				break;
5368 			}
5369 			nd = nd->nd_next;
5370 		}
5371 		cl_set_setkey(NULL);
5372 	}
5373 	/* Free master_mnsr and drive descs */
5374 	mnsr_node = master_mnsr_node;
5375 	while (mnsr_node) {
5376 		master_mnsr_node = mnsr_node->mmn_next;
5377 		free_sr((md_set_record *)mnsr_node->mmn_mnsr);
5378 		free_rem_dd(mnsr_node->mmn_dd);
5379 		Free(mnsr_node);
5380 		mnsr_node = master_mnsr_node;
5381 	}
5382 
5383 	/* Frees sd->sd_drvs (which is also master_dd) */
5384 	metaflushsetname(sp);
5385 	return (rval);
5386 }
5387 
5388 /*
5389  * meta_mnsync_diskset_mddbs
5390  * Calling node is guaranteed to be an owner node.
5391  * Calling node is the master node.
5392  *
5393  * Master node verifies that ondisk mddb format matches its incore format.
5394  * If no nodes are joined to set, remove the change log entries.
5395  * If a node is joined to set, play the change log.
5396  *
5397  * Returns	 0 - Success
5398  *		 1 - Master unable to join to set.
5399  *		205 - Failure during RPC to another node
5400  *		-1 - Any other failure and ep is filled in.
5401  *			-1 return will eventually cause node to panic
5402  *			in a SunCluster environment.
5403  */
5404 int
5405 meta_mnsync_diskset_mddbs(
5406 	mdsetname_t	*sp,
5407 	md_error_t	*ep
5408 )
5409 {
5410 	md_set_desc		*sd;
5411 	mddb_config_t		c;
5412 	md_mn_msgclass_t	class;
5413 	mddb_setflags_config_t	sf;
5414 	md_mnnode_desc		*nd, *nd2;
5415 	md_error_t		xep = mdnullerror;
5416 	int			stale_set = 0;
5417 
5418 	/* If setname is there, set desc should exist. */
5419 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5420 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5421 		    "Unable to get set %s desc information"), sp->setname);
5422 		return (-1);
5423 	}
5424 
5425 	/* Are there drives in the set? */
5426 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5427 	    ep) == NULL) {
5428 		if (! mdisok(ep)) {
5429 			return (-1);
5430 		}
5431 		/* No drives in set -- nothing to sync up */
5432 		return (0);
5433 	}
5434 
5435 	/*
5436 	 * Is master node (which is this node) joined to set?
5437 	 * If master node isn't joined (which means that no nodes
5438 	 * are joined to diskset), remove the change log entries
5439 	 * since no need to replay them - all nodes will have same
5440 	 * view of mddbs since all nodes are reading in the mddbs
5441 	 * from disk.
5442 	 * There is also no need to sync up the master and ondisk mddbs
5443 	 * since master has no incore knowledge.
5444 	 * Need to join master to set in order to flush the change
5445 	 * log entries. Don't need to block I/O during join of master
5446 	 * to set since no other nodes are joined to set and so no I/O
5447 	 * can be occurring.
5448 	 */
5449 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
5450 		/* Join master to set */
5451 		if (clnt_joinset(mynode(), sp,
5452 		    MNSET_IN_RECONFIG, ep)) {
5453 			if (mdismddberror(ep, MDE_DB_STALE)) {
5454 				/*
5455 				 * If STALE, print message and continue on.
5456 				 * Don't do any writes or reads to mddbs
5457 				 * so don't clear change log.
5458 				 */
5459 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5460 				    "Join of master node to STALE set %s"),
5461 				    sp->setname);
5462 				stale_set = 1;
5463 				mdclrerror(ep);
5464 			} else if (mdismddberror(ep, MDE_DB_ACCOK)) {
5465 				/* ACCOK means mediator provided extra vote */
5466 				mdclrerror(ep);
5467 			} else {
5468 				/*
5469 				 * If master is unable to join set, print an
5470 				 * error message.  Don't return failure or node
5471 				 * will panic during cluster reconfig cycle.
5472 				 * Also, withdraw node from set in order to
5473 				 * cleanup from failed join attempt.
5474 				 */
5475 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5476 				    "Join of master node in set %s failed"),
5477 				    sp->setname);
5478 				if (clnt_withdrawset(mynode(), sp, &xep))
5479 					mdclrerror(&xep);
5480 				return (1);
5481 			}
5482 		}
5483 		/*
5484 		 * Master node successfully joined.
5485 		 * Set local copy of flags to OWN and
5486 		 * send owner flag to rpc.metad. If not stale,
5487 		 * flush the change log.
5488 		 */
5489 		sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
5490 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
5491 		    MNSET_IN_RECONFIG, ep)) {
5492 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5493 			    "Flag update of master node join in set %s failed"),
5494 			    sp->setname);
5495 			return (-1);
5496 		}
5497 
5498 		if (!stale_set) {
5499 			if (mdmn_reset_changelog(sp, ep,
5500 			    MDMN_CLF_RESETLOG) != 0) {
5501 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5502 				    "Unable to reset changelog."));
5503 				return (-1);
5504 			}
5505 			meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5506 			    "Removed changelog entries for set %s: %s"),
5507 			    sp->setname,
5508 			    meta_print_hrtime(gethrtime() - start_time));
5509 		}
5510 		/* Reset new master flag before return */
5511 		(void) memset(&sf, 0, sizeof (sf));
5512 		sf.sf_setno = sp->setno;
5513 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5514 		sf.sf_flags = MDDB_NM_RESET;
5515 		/* Use magic to help protect ioctl against attack. */
5516 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5517 		/* Ignore failure, failure to reset flag isn't catastrophic */
5518 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5519 		    &sf.sf_mde, NULL);
5520 
5521 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5522 		    "Reset new master flag for set %s: %s"),
5523 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5524 
5525 		return (0);
5526 	}
5527 
5528 	/*
5529 	 * Is master already joined to STALE set (< 50% mddbs avail)?
5530 	 * If so, can make no config changes to mddbs so don't check or play
5531 	 * changelog and don't sync master node to ondisk mddbs.
5532 	 * To get out of the stale state all nodes must be withdrawn
5533 	 * from set.  Then as nodes are re-joined, all nodes will
5534 	 * have same view of mddbs since all nodes are reading the
5535 	 * mddbs from disk.
5536 	 */
5537 	(void) memset(&c, 0, sizeof (c));
5538 	c.c_id = 0;
5539 	c.c_setno = sp->setno;
5540 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
5541 		(void) mdstealerror(ep, &c.c_mde);
5542 		return (-1);
5543 	}
5544 	if (c.c_flags & MDDB_C_STALE) {
5545 		return (0);
5546 	}
5547 
5548 	/*
5549 	 * If this node is NOT a newly chosen master, then there's
5550 	 * nothing else to do since the change log should be empty and
5551 	 * the ondisk and incore mddbs are already consistent.
5552 	 *
5553 	 * A newly chosen master is a node that was not the master
5554 	 * at the beginning of the reconfig cycle.  If a node is a new
5555 	 * master, then the new master state is reset after the ondisk
5556 	 * and incore mddbs are consistent and the change log has
5557 	 * been replayed.
5558 	 */
5559 	(void) memset(&sf, 0, sizeof (sf));
5560 	sf.sf_setno = sp->setno;
5561 	sf.sf_flags = MDDB_NM_GET;
5562 	/* Use magic to help protect ioctl against attack. */
5563 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5564 	if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
5565 	    ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
5566 		return (0);
5567 	}
5568 
5569 	/*
5570 	 * Now, sync up incore master view to ondisk mddbs.
5571 	 * This is needed in the case where a master node
5572 	 * had made a change to the mddb, but this change
5573 	 * may not have been relayed to the slaves yet.
5574 	 * So, the new master needs to verify that the ondisk
5575 	 * mddbs match what the new master has incore -
5576 	 * if different, new master rewrites all of the mddbs.
5577 	 * Then the new master will replay the changelog and the
5578 	 * new master will then execute what the old master had
5579 	 * done.
5580 	 *
5581 	 * Block all I/Os to disks in this diskset on all nodes in
5582 	 * the diskset.  This will allow the rewriting of the mddbs
5583 	 * (if needed), to proceed in a timely manner.
5584 	 *
5585 	 * If block of I/Os fail, return a -1.
5586 	 */
5587 
5588 	nd = sd->sd_nodelist;
5589 	while (nd) {
5590 		/* Skip non-alive and non-owner nodes  */
5591 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5592 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5593 			nd = nd->nd_next;
5594 			continue;
5595 		}
5596 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5597 		    MN_SUSP_IO, ep)) {
5598 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5599 			    "Unable to suspend I/O on node %s in set %s"),
5600 			    nd->nd_nodename, sp->setname);
5601 
5602 			/*
5603 			 * Resume all other nodes that had been suspended.
5604 			 * (Reconfig return step also resumes I/Os
5605 			 * for all sets.)
5606 			 */
5607 			nd2 = sd->sd_nodelist;
5608 			while (nd2) {
5609 				/* Stop when reaching failed node */
5610 				if (nd2->nd_nodeid == nd->nd_nodeid)
5611 					break;
5612 				/* Skip non-alive and non-owner nodes  */
5613 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5614 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5615 					nd2 = nd2->nd_next;
5616 					continue;
5617 				}
5618 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5619 				    sp->setno, MN_RES_IO, &xep));
5620 				nd2 = nd2->nd_next;
5621 			}
5622 
5623 			/*
5624 			 * If an RPC failure on another node, return a 205.
5625 			 * Otherwise, exit with failure.
5626 			 */
5627 			if ((mdanyrpcerror(ep)) &&
5628 			    (sd->sd_mn_mynode->nd_nodeid !=
5629 			    nd->nd_nodeid)) {
5630 				return (205);
5631 			} else {
5632 				return (-1);
5633 			}
5634 
5635 		}
5636 		nd = nd->nd_next;
5637 	}
5638 
5639 	(void) memset(&c, 0, sizeof (c));
5640 	c.c_id = 0;
5641 	c.c_setno = sp->setno;
5642 	/* Master can't sync up to ondisk mddbs?  Kick it out of cluster */
5643 	if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
5644 		return (-1);
5645 
5646 	/*
5647 	 * Resume I/Os that were suspended above.
5648 	 */
5649 	nd = sd->sd_nodelist;
5650 	while (nd) {
5651 		/* Skip non-alive and non-owner nodes  */
5652 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5653 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5654 			nd = nd->nd_next;
5655 			continue;
5656 		}
5657 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5658 		    MN_RES_IO, ep)) {
5659 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5660 			    "Unable to resume I/O on node %s in set %s"),
5661 			    nd->nd_nodename, sp->setname);
5662 
5663 			/*
5664 			 * If an RPC failure then don't do any
5665 			 * more RPC calls, since one timeout is enough
5666 			 * to endure.  If RPC failure to another node, return
5667 			 * 205.  If RPC failure to my node, return -1.
5668 			 * If not an RPC failure, continue resuming the
5669 			 * rest of the nodes and then return -1.
5670 			 */
5671 			if (mdanyrpcerror(ep)) {
5672 				if (sd->sd_mn_mynode->nd_nodeid ==
5673 				    nd->nd_nodeid) {
5674 					return (-1);
5675 				} else {
5676 					return (205);
5677 				}
5678 			}
5679 
5680 			/*
5681 			 * If not an RPC error, continue resuming rest of
5682 			 * nodes, ignoring any failures except for an
5683 			 * RPC failure which constitutes an immediate exit.
5684 			 * Start in middle of list with failing node.
5685 			 */
5686 			nd2 = nd->nd_next;
5687 			while (nd2) {
5688 				/* Skip non-alive and non-owner nodes  */
5689 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5690 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5691 					nd2 = nd2->nd_next;
5692 					continue;
5693 				}
5694 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5695 				    sp->setno, MN_RES_IO, &xep));
5696 				if (mdanyrpcerror(&xep)) {
5697 					return (-1);
5698 				}
5699 				nd2 = nd2->nd_next;
5700 			}
5701 		}
5702 		nd = nd->nd_next;
5703 	}
5704 
5705 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
5706 	    "checking/writing the mddb for set %s: %s"), sp->setname,
5707 	    meta_print_hrtime(gethrtime() - start_time));
5708 
5709 	/*
5710 	 * Send (aka replay) all messages we find in the changelog.
5711 	 * Flag the messages with
5712 	 *   MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
5713 	 *   MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
5714 	 */
5715 	for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
5716 		mdmn_changelog_record_t	*lr;
5717 		md_error_t	xep = mdnullerror;
5718 		md_mn_result_t	*resultp = NULL;
5719 		int		ret;
5720 
5721 		lr = mdmn_get_changelogrec(sp->setno, class);
5722 		if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
5723 			/* no entry for this class */
5724 			continue;
5725 		}
5726 
5727 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5728 		    "replaying message ID=(%d, 0x%llx-%d)\n"),
5729 		    MSGID_ELEMS(lr->lr_msg.msg_msgid));
5730 
5731 		ret = mdmn_send_message_with_msgid(
5732 		    lr->lr_msg.msg_setno,
5733 		    lr->lr_msg.msg_type,
5734 		    lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG |
5735 		    MD_MSGF_OVERRIDE_SUSPEND,
5736 		    lr->lr_msg.msg_event_data,
5737 		    lr->lr_msg.msg_event_size,
5738 		    &resultp,
5739 		    &lr->lr_msg.msg_msgid,
5740 		    &xep);
5741 
5742 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5743 		    "mdmn_send_message returned %d\n"), ret);
5744 
5745 		if (resultp)
5746 			free_result(resultp);
5747 	}
5748 
5749 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5750 	    "Playing changelog completed for set %s: %s"),
5751 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5752 
5753 	/*
5754 	 * Now that new master has ondisk and incore mddbs in sync, reset
5755 	 * this node's new master kernel flag (for this set).  If this node
5756 	 * re-enters another reconfig cycle before the completion of this
5757 	 * reconfig cycle, this master node won't need to check if the ondisk
5758 	 * and incore mddbs are in sync since this node won't be considered
5759 	 * a new master (since this flag is being reset here in the middle of
5760 	 * step2).  This will save time during any subsequent reconfig
5761 	 * cycles as long as this node continues to be master.
5762 	 */
5763 	(void) memset(&sf, 0, sizeof (sf));
5764 	sf.sf_setno = sp->setno;
5765 	sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5766 	sf.sf_flags = MDDB_NM_RESET;
5767 	/* Use magic to help protect ioctl against attack. */
5768 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5769 	/* Ignore failure, since failure to reset flag isn't catastrophic */
5770 	(void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
5771 
5772 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5773 	    "Reset new master flag for set %s: %s"),
5774 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5775 
5776 	return (0);
5777 }
5778 
5779 /*
5780  * meta_mnjoin_all will join all starting nodes in the diskset.
5781  * A starting node is considered to be any node that is not
5782  * an owner of the set but is a member of the cluster.
5783  * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
5784  *
5785  * Caller is the Master node.
5786  *
5787  * Returns	 0 - Success
5788  *		205 - Failure during RPC to another node
5789  *		-1 - Any other failure and ep is filled in.
5790  */
5791 int
5792 meta_mnjoin_all(
5793 	mdsetname_t	*sp,
5794 	md_error_t	*ep
5795 )
5796 {
5797 	md_set_desc		*sd;
5798 	md_mnnode_desc		*nd, *nd2;
5799 	int			rval = 0;
5800 	int			stale_flag = 0;
5801 	mddb_config_t		c;
5802 	int			susp_res_flag = 0;
5803 	md_error_t		xep = mdnullerror;
5804 
5805 	/* If setname is there, set desc should exist. */
5806 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5807 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5808 		    "Unable to get set %s desc information"), sp->setname);
5809 		return (-1);
5810 	}
5811 
5812 	/* Are there drives in the set? */
5813 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5814 	    ep) == NULL) {
5815 		if (! mdisok(ep)) {
5816 			return (-1);
5817 		}
5818 		/* No drives in set -- nothing to join */
5819 		return (0);
5820 	}
5821 
5822 	/*
5823 	 * Is set currently stale?
5824 	 */
5825 	(void) memset(&c, 0, sizeof (c));
5826 	c.c_id = 0;
5827 	c.c_setno = sp->setno;
5828 	/* Ignore failure since master node may not be joined yet */
5829 	(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
5830 	if (c.c_flags & MDDB_C_STALE) {
5831 		stale_flag = MNSET_IS_STALE;
5832 	}
5833 
5834 	/*
5835 	 * If any nodes are going to be joined to diskset, then
5836 	 * suspend I/O to all disks in diskset so that nodes can join
5837 	 * (read in mddbs) in a reasonable amount of time even under
5838 	 * high I/O load.  Don't need to do this if set is STALE since
5839 	 * no I/O can be occurring to a STALE set.
5840 	 */
5841 	if (stale_flag != MNSET_IS_STALE) {
5842 		nd = sd->sd_nodelist;
5843 		while (nd) {
5844 			/* Found a node that will be joined to diskset */
5845 			if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5846 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5847 				/* Set flag that diskset should be suspended */
5848 				susp_res_flag = 1;
5849 				break;
5850 			}
5851 			nd = nd->nd_next;
5852 		}
5853 	}
5854 
5855 	if (susp_res_flag) {
5856 		/*
5857 		 * Block all I/Os to disks in this diskset on all joined
5858 		 * nodes in the diskset.
5859 		 * If block of I/Os fails due to an RPC failure on another
5860 		 * node, return 205; otherwise, return -1.
5861 		 */
5862 		nd = sd->sd_nodelist;
5863 		while (nd) {
5864 			/* Skip non-alive and non-owner nodes  */
5865 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5866 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5867 				nd = nd->nd_next;
5868 				continue;
5869 			}
5870 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5871 			    MN_SUSP_IO, ep)) {
5872 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5873 				    "Unable to suspend I/O on node %s"
5874 				    " in set %s"), nd->nd_nodename,
5875 				    sp->setname);
5876 				/*
5877 				 * Resume other nodes that had been suspended.
5878 				 * (Reconfig return step also resumes I/Os
5879 				 * for all sets.)
5880 				 */
5881 				nd2 = sd->sd_nodelist;
5882 				while (nd2) {
5883 					/* Stop when reaching failed node */
5884 					if (nd2->nd_nodeid == nd->nd_nodeid)
5885 						break;
5886 					/* Skip non-alive/non-owner nodes  */
5887 					if ((!(nd2->nd_flags &
5888 					    MD_MN_NODE_ALIVE)) ||
5889 					    (!(nd2->nd_flags &
5890 					    MD_MN_NODE_OWN))) {
5891 						nd2 = nd2->nd_next;
5892 						continue;
5893 					}
5894 					(void) (clnt_mn_susp_res_io(
5895 					    nd2->nd_nodename, sp->setno,
5896 					    MN_RES_IO, &xep));
5897 					nd2 = nd2->nd_next;
5898 				}
5899 
5900 				/*
5901 				 * If the suspend failed due to an
5902 				 * RPC failure on another node, return
5903 				 * a 205.
5904 				 * Otherwise, exit with failure.
5905 				 * The return reconfig step will resume
5906 				 * I/Os for all disksets.
5907 				 */
5908 				if ((mdanyrpcerror(ep)) &&
5909 				    (sd->sd_mn_mynode->nd_nodeid !=
5910 				    nd->nd_nodeid)) {
5911 					return (205);
5912 				} else {
5913 					return (-1);
5914 				}
5915 			}
5916 			nd = nd->nd_next;
5917 		}
5918 	}
5919 
5920 	nd = sd->sd_nodelist;
5921 	while (nd) {
5922 		/*
5923 		 * If a node is in the membership list but isn't joined
5924 		 * to the set, try to join the node.
5925 		 */
5926 		if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5927 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5928 			if (clnt_joinset(nd->nd_nodename, sp,
5929 			    (MNSET_IN_RECONFIG | stale_flag), ep)) {
5930 				/*
5931 				 * If RPC failure to another node
5932 				 * then exit without attempting anything else.
5933 				 * (Reconfig return step will resume I/Os
5934 				 * for all sets.)
5935 				 */
5936 				if (mdanyrpcerror(ep)) {
5937 					mde_perror(ep, "");
5938 					return (205);
5939 				}
5940 				/*
5941 				 * STALE and ACCOK failures aren't true
5942 				 * failures.  STALE means that <50% mddbs
5943 				 * are available. ACCOK means that the
5944 				 * mediator provided the extra vote.
5945 				 * If a true failure, then print messasge
5946 				 * and withdraw node from set in order to
5947 				 * cleanup from failed join attempt.
5948 				 */
5949 				if ((!mdismddberror(ep, MDE_DB_STALE)) &&
5950 				    (!mdismddberror(ep, MDE_DB_ACCOK))) {
5951 					mde_perror(ep,
5952 					    "WARNING: Unable to join node %s "
5953 					    "to set %s", nd->nd_nodename,
5954 					    sp->setname);
5955 					mdclrerror(ep);
5956 					if (clnt_withdrawset(nd->nd_nodename,
5957 					    sp, &xep))
5958 						mdclrerror(&xep);
5959 					nd = nd->nd_next;
5960 					continue;
5961 				}
5962 			}
5963 			/* Set owner flag even if STALE or ACCOK */
5964 			nd->nd_flags |= MD_MN_NODE_OWN;
5965 		}
5966 		nd = nd->nd_next;
5967 	}
5968 	/*
5969 	 * Resume I/Os if suspended above.
5970 	 */
5971 	if (susp_res_flag) {
5972 		nd = sd->sd_nodelist;
5973 		while (nd) {
5974 			/*
5975 			 * Skip non-alive and non-owner nodes
5976 			 * (this list doesn't include any of
5977 			 * the nodes that were joined).
5978 			 */
5979 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5980 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5981 				nd = nd->nd_next;
5982 				continue;
5983 			}
5984 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5985 			    MN_RES_IO, ep)) {
5986 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5987 				    "Unable to resume I/O on node %s"
5988 				    " in set %s"), nd->nd_nodename,
5989 				    sp->setname);
5990 
5991 				/*
5992 				 * If an RPC failure then don't do any
5993 				 * more RPC calls, since one timeout is enough
5994 				 * to endure.  If RPC failure to another node,
5995 				 * return 205.  If RPC failure to my node,
5996 				 * return -1.
5997 				 * (Reconfig return step will resume I/Os
5998 				 * for all sets.)
5999 				 * If not an RPC failure, continue resuming the
6000 				 * rest of the nodes and then return -1.
6001 				 */
6002 				if (mdanyrpcerror(ep)) {
6003 					if (sd->sd_mn_mynode->nd_nodeid ==
6004 					    nd->nd_nodeid) {
6005 						return (-1);
6006 					} else {
6007 						return (205);
6008 					}
6009 				}
6010 
6011 				/*
6012 				 * If not an RPC error, continue resuming rest
6013 				 * of nodes, ignoring any failures except for
6014 				 * an RPC failure which constitutes an
6015 				 * immediate exit.
6016 				 * Start in middle of list with failing node.
6017 				 */
6018 				nd2 = nd->nd_next;
6019 				while (nd2) {
6020 					/* Skip non-owner nodes  */
6021 					if ((!(nd2->nd_flags &
6022 					    MD_MN_NODE_ALIVE)) ||
6023 					    (!(nd2->nd_flags &
6024 					    MD_MN_NODE_OWN))) {
6025 						nd2 = nd2->nd_next;
6026 						continue;
6027 					}
6028 					(void) (clnt_mn_susp_res_io(
6029 					    nd2->nd_nodename, sp->setno,
6030 					    MN_RES_IO, &xep));
6031 					if (mdanyrpcerror(&xep)) {
6032 						return (-1);
6033 					}
6034 					nd2 = nd2->nd_next;
6035 				}
6036 			}
6037 			nd = nd->nd_next;
6038 		}
6039 	}
6040 
6041 	nd = sd->sd_nodelist;
6042 	while (nd) {
6043 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
6044 			nd = nd->nd_next;
6045 			continue;
6046 		}
6047 		/*
6048 		 * If 1 node fails - go ahead and update the rest except
6049 		 * in the case of an RPC failure, fail immediately.
6050 		 */
6051 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
6052 		    sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
6053 			/* RPC failure to another node */
6054 			if (mdanyrpcerror(ep)) {
6055 				return (205);
6056 			}
6057 			nd = nd->nd_next;
6058 			rval = -1;
6059 			continue;
6060 		}
6061 		nd = nd->nd_next;
6062 	}
6063 
6064 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
6065 	    "Join of all nodes completed for set %s: %s"),
6066 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
6067 
6068 	return (rval);
6069 }
6070