xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_set.c (revision 9113a79cf228b8f7bd509b1328adf88659dfe218)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 /*
37  * Metadevice diskset interfaces
38  */
39 
40 #include "meta_set_prv.h"
41 #include <meta.h>
42 #include <metad.h>
43 #include <mdmn_changelog.h>
44 #include <sys/lvm/md_crc.h>
45 #include <sys/utsname.h>
46 #include <sdssc.h>
47 
48 #include <sys/sysevent/eventdefs.h>
49 #include <sys/sysevent/svm.h>
50 extern	char	*blkname(char *);
51 
52 static md_drive_desc *
53 dr2drivedesc(
54 	mdsetname_t	*sp,
55 	side_t		sideno,
56 	int		flags,
57 	md_error_t	*ep
58 )
59 {
60 	md_set_record	*sr;
61 	md_drive_record	*dr;
62 	mddrivename_t	*dnp;
63 	md_drive_desc	*dd_head = NULL;
64 	md_set_desc	*sd;
65 
66 	if (flags & MD_BYPASS_DAEMON) {
67 		if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
68 			return (NULL);
69 		sd = metaget_setdesc(sp, ep);
70 		sideno = getnodeside(mynode(), sd);
71 		sp = metafakesetname(sp->setno, sr->sr_setname);
72 	} else {
73 		if ((sr = getsetbyname(sp->setname, ep)) == NULL)
74 			return (NULL);
75 	}
76 
77 	assert(sideno != MD_SIDEWILD);
78 
79 	/*
80 	 * WARNING:
81 	 * The act of getting the dnp from the namespace means that we
82 	 * will get the devid of the disk as recorded in the namespace.
83 	 * This devid has the potential to be stale if the disk is being
84 	 * replaced via a rebind, this means that any code that relies
85 	 * on any of the dnp information should take the appropriate action
86 	 * to preserve that information. For example in the rebind code the
87 	 * devid of the new disk is saved off and then copied back in once
88 	 * the code that has called this function has completed.
89 	 */
90 	for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
91 		if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
92 		    flags, ep)) == NULL) {
93 			if (!(flags & MD_BYPASS_DAEMON))
94 				free_sr(sr);
95 			metafreedrivedesc(&dd_head);
96 			return (NULL);
97 		}
98 
99 		(void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
100 		    dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
101 	}
102 
103 	if (!(flags & MD_BYPASS_DAEMON)) {
104 		free_sr(sr);
105 	}
106 	return (dd_head);
107 }
108 
109 static int
110 get_sidenmlist(
111 	mdsetname_t	*sp,
112 	mddrivename_t	*dnp,
113 	md_error_t	*ep
114 )
115 {
116 	md_set_desc	*sd;
117 	mdsidenames_t	*sn, **sn_next;
118 	int		i;
119 
120 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
121 		return (-1);
122 
123 	metaflushsidenames(dnp);
124 	sn_next = &dnp->side_names;
125 	if (MD_MNSET_DESC(sd)) {
126 		/*
127 		 * Only get sidenames for this node since
128 		 * that is the only side information stored in
129 		 * the local mddb for a multi-node diskset.
130 		 */
131 		if (sd->sd_mn_mynode) {
132 			sn = Zalloc(sizeof (*sn));
133 			sn->sideno = sd->sd_mn_mynode->nd_nodeid;
134 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
135 			    sn->sideno, dnp->side_names_key, &sn->dname,
136 			    &sn->mnum, NULL, ep)) == NULL) {
137 				if (sn->dname != NULL)
138 					Free(sn->dname);
139 				Free(sn);
140 				return (-1);
141 			}
142 
143 			/* Add to the end of the linked list */
144 			assert(*sn_next == NULL);
145 			*sn_next = sn;
146 			sn_next = &sn->next;
147 		}
148 	} else {
149 		for (i = 0; i < MD_MAXSIDES; i++) {
150 			/* Skip empty slots */
151 			if (sd->sd_nodes[i][0] == '\0')
152 				continue;
153 
154 			sn = Zalloc(sizeof (*sn));
155 			sn->sideno = i;
156 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
157 			    i+SKEW, dnp->side_names_key, &sn->dname,
158 			    &sn->mnum, NULL, ep)) == NULL) {
159 				/*
160 				 * It is possible that during the add of a
161 				 * host to have a 'missing' side as the side
162 				 * for this disk will be added later. So ignore
163 				 * the error. The 'missing' side will be added
164 				 * once the addhosts process has completed.
165 				 */
166 				if (mdissyserror(ep, ENOENT)) {
167 					mdclrerror(ep);
168 					Free(sn);
169 					continue;
170 				}
171 
172 				if (sn->dname != NULL)
173 					Free(sn->dname);
174 				Free(sn);
175 				return (-1);
176 			}
177 
178 			/* Add to the end of the linked list */
179 			assert(*sn_next == NULL);
180 			*sn_next = sn;
181 			sn_next = &sn->next;
182 		}
183 	}
184 
185 	return (0);
186 }
187 
188 static md_drive_desc *
189 rl_to_dd(
190 	mdsetname_t		*sp,
191 	md_replicalist_t	*rlp,
192 	md_error_t		*ep
193 )
194 {
195 	md_replicalist_t	*rl;
196 	md_replica_t		*r;
197 	md_drive_desc		*dd = NULL;
198 	md_drive_desc		*d;
199 	int			found;
200 	md_set_desc		*sd;
201 	daddr_t			nblks = 0;
202 
203 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
204 		return (NULL);
205 
206 	/* find the smallest existing replica */
207 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
208 		r = rl->rl_repp;
209 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
210 	}
211 
212 	if (nblks <= 0)
213 		nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
214 
215 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
216 		r = rl->rl_repp;
217 
218 		found = 0;
219 		for (d = dd; d != NULL; d = d->dd_next) {
220 			if (strcmp(r->r_namep->drivenamep->cname,
221 			    d->dd_dnp->cname) == 0) {
222 				found = 1;
223 				dd->dd_dbcnt++;
224 				break;
225 			}
226 		}
227 
228 		if (! found)
229 			(void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
230 			    1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
231 	}
232 
233 	return (dd);
234 }
235 
236 /*
237  * Exported Entry Points
238  */
239 
240 set_t
241 get_max_sets(md_error_t *ep)
242 {
243 
244 	static set_t		max_sets = 0;
245 
246 	if (max_sets == 0)
247 		if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
248 			return (0);
249 
250 	return (max_sets);
251 }
252 
253 int
254 get_max_meds(md_error_t *ep)
255 {
256 	static int		max_meds = 0;
257 
258 	if (max_meds == 0)
259 		if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
260 			return (0);
261 
262 	return (max_meds);
263 }
264 
265 side_t
266 getmyside(mdsetname_t *sp, md_error_t *ep)
267 {
268 	md_set_desc		*sd;
269 	char 			*node = NULL;
270 	side_t			sideno;
271 
272 	if (sp->setno == 0)
273 		return (0);
274 
275 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
276 		return (MD_SIDEWILD);
277 
278 	node = mynode();
279 
280 	assert(node != NULL);
281 
282 	sideno = getnodeside(node, sd);
283 
284 	if (sideno != MD_SIDEWILD)
285 		return (sideno);
286 
287 	return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
288 }
289 
290 /*
291  * get set info from name
292  */
293 md_set_record *
294 getsetbyname(char *setname, md_error_t *ep)
295 {
296 	md_set_record		*sr = NULL;
297 	md_mnset_record		*mnsr = NULL;
298 	char			*p;
299 	size_t			len;
300 
301 	/* get set info from daemon */
302 	if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
303 		return (NULL);
304 	if (sr != NULL) {
305 		/*
306 		 * Returned record could be for a multi-node set or a
307 		 * non-multi-node set.
308 		 */
309 		if (MD_MNSET_REC(sr)) {
310 			/*
311 			 * Record is for a multi-node set.  Reissue call
312 			 * to get mnset information.  Need to free
313 			 * record as if a non-multi-node set record since
314 			 * that is what clnt_getset gave us.  If in
315 			 * the daemon, don't free since this is a pointer
316 			 * into the setrecords array.
317 			 */
318 			if (! md_in_daemon) {
319 				sr->sr_flags &= ~MD_SR_MN;
320 				free_sr(sr);
321 			}
322 			if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
323 			    ep) == -1)
324 				return (NULL);
325 			if (mnsr != NULL)
326 				return ((struct md_set_record *)mnsr);
327 		} else {
328 			return (sr);
329 		}
330 	}
331 
332 	/* no such set */
333 	len = strlen(setname) + 30;
334 	p = Malloc(len);
335 	(void) snprintf(p, len, "setname \"%s\"", setname);
336 	(void) mderror(ep, MDE_NO_SET, p);
337 	Free(p);
338 	return (NULL);
339 }
340 
341 /*
342  * get set info from number
343  */
344 md_set_record *
345 getsetbynum(set_t setno, md_error_t *ep)
346 {
347 	md_set_record		*sr;
348 	md_mnset_record		*mnsr = NULL;
349 	char			buf[100];
350 
351 	if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
352 		return (NULL);
353 
354 	if (sr != NULL) {
355 		/*
356 		 * Record is for a multi-node set.  Reissue call
357 		 * to get mnset information.  Need to free
358 		 * record as if a non-multi-node set record since
359 		 * that is what clnt_getset gave us.  If in
360 		 * the daemon, don't free since this is a pointer
361 		 * into the setrecords array.
362 		 */
363 		if (MD_MNSET_REC(sr)) {
364 			/*
365 			 * Record is for a multi-node set.  Reissue call
366 			 * to get mnset information.
367 			 */
368 			if (! md_in_daemon) {
369 				sr->sr_flags &= ~MD_SR_MN;
370 				free_sr(sr);
371 			}
372 			if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
373 			    ep) == -1)
374 				return (NULL);
375 			if (mnsr != NULL)
376 				return ((struct md_set_record *)mnsr);
377 		} else {
378 			return (sr);
379 		}
380 	}
381 
382 	(void) sprintf(buf, "setno %u", setno);
383 	(void) mderror(ep, MDE_NO_SET, buf);
384 	return (NULL);
385 }
386 
387 int
388 meta_check_drive_inuse(
389 	mdsetname_t	*sp,
390 	mddrivename_t	*dnp,
391 	int		check_db,
392 	md_error_t	*ep
393 )
394 {
395 	mdnamelist_t	*nlp = NULL;
396 	mdnamelist_t	*p;
397 	int		rval = 0;
398 
399 	/* get all underlying partitions */
400 	if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
401 		return (-1);
402 
403 	/* search for drive */
404 	for (p = nlp; (p != NULL); p = p->next) {
405 		mdname_t	*np = p->namep;
406 
407 		if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
408 			rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
409 			    NULL, dnp->cname, sp->setname));
410 			break;
411 		}
412 	}
413 
414 	/* cleanup, return success */
415 	metafreenamelist(nlp);
416 	return (rval);
417 }
418 
419 /*
420  * simple check for ownership
421  */
422 int
423 meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
424 {
425 	int			ownset;
426 	md_set_desc		*sd;
427 	md_drive_desc		*dd;
428 	md_replicalist_t	*rlp = NULL;
429 	md_error_t		xep = mdnullerror;
430 
431 	if (metaislocalset(sp))
432 		return (0);
433 
434 	ownset = own_set(sp, NULL, TRUE, ep);
435 	if (! mdisok(ep))
436 		return (-1);
437 
438 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
439 		return (-1);
440 
441 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
442 	if (! mdisok(ep))
443 		return (-1);
444 
445 	/* If we have no drive descriptors, check for no ownership */
446 	if (dd == NULL) {
447 		if (ownset == MD_SETOWNER_NONE)
448 			return (0);
449 
450 		/* If ownership somehow has come to exist, we must clean up */
451 
452 		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
453 		    &xep) < 0)
454 			mdclrerror(&xep);
455 
456 		if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
457 			if (! mdisok(&xep))
458 				mdclrerror(&xep);
459 
460 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
461 			if (rel_own_bydd(sp, dd, TRUE, &xep))
462 				mdclrerror(&xep);
463 		}
464 
465 		if (halt_set(sp, &xep))
466 			mdclrerror(&xep);
467 
468 		metafreereplicalist(rlp);
469 
470 		metafreedrivedesc(&dd);
471 
472 		return (0);
473 	}
474 
475 	metafreedrivedesc(&sd->sd_drvs);
476 
477 	if (ownset == MD_SETOWNER_YES)
478 		return (0);
479 
480 	return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
481 	    sp->setname));
482 }
483 
484 /*
485  * simple check for ownership
486  */
487 int
488 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
489 {
490 	md_set_desc	*sd;
491 	md_drive_desc	*dd;
492 	int		bool;
493 
494 	if (metaislocalset(sp))
495 		return (0);
496 
497 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
498 		return (-1);
499 
500 	if (getnodeside(hostname, sd) == MD_SIDEWILD)
501 		return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
502 		    hostname, NULL, sp->setname));
503 
504 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
505 	if (! mdisok(ep))
506 		return (-1);
507 
508 	if (clnt_ownset(hostname, sp, &bool, ep) == -1)
509 		return (-1);
510 
511 	if (dd == NULL)
512 		return (0);
513 
514 	metafreedrivedesc(&sd->sd_drvs);
515 
516 	if (bool == TRUE)
517 		return (0);
518 
519 	return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
520 	    sp->setname));
521 }
522 
523 /*
524  * Function that determines if a node is in the multinode diskset
525  * membership list.  Calling node passes in node to be checked and
526  * the nodelist as returned from meta_read_nodelist.  This routine
527  * anticipates being called many times using the same diskset membership
528  * list which is why the alloc and free of the diskset membership list
529  * is left to the calling routine.
530  * Returns:
531  *	1 - if a member
532  *	0 - not a member
533  */
534 int
535 meta_is_member(
536 	char				*node_name,
537 	md_mn_nodeid_t			node_id,
538 	mndiskset_membershiplist_t	*nl
539 )
540 {
541 	mndiskset_membershiplist_t	*nl2;
542 	int				flag_check_name;
543 
544 	if (node_id != 0)
545 		flag_check_name = 0;
546 	else if (node_name != NULL)
547 		flag_check_name = 1;
548 	else
549 		return (0);
550 
551 	nl2 = nl;
552 	while (nl2) {
553 		if (flag_check_name) {
554 			/* Compare given name against name in member list */
555 			if (strcmp(nl2->msl_node_name, node_name) == 0)
556 				break;
557 		} else {
558 			/* Compare given nodeid against nodeid in member list */
559 			if (nl2->msl_node_id == node_id)
560 				break;
561 		}
562 		nl2 = nl2->next;
563 	}
564 	/* No match found in member list */
565 	if (nl2 == NULL) {
566 		return (0);
567 	}
568 	/* Return 1 if node is in member list */
569 	return (1);
570 }
571 
572 /*
573  * meta_getnext_devinfo should go to the host that
574  * has the device, to return the device name, driver name, minor num.
575  * We can take the big cheat for now, since it is a requirement
576  * that the device names and device numbers are the same, and
577  * just get the info locally.
578  *
579  * This routine is very similar to meta_getnextside_devinfo except
580  * that the specific side to be used is being passed in.
581  *
582  * Exit status:
583  *	 0 - No more side info to return
584  *	 1 - More side info's to return
585  *	-1 - An error has been detected
586  */
587 /*ARGSUSED*/
588 int
589 meta_getside_devinfo(
590 	mdsetname_t	*sp,		/* for this set */
591 	char		*bname,		/* local block name (myside) */
592 	side_t		sideno,		/* sideno */
593 	char		**ret_bname,	/* block device name of returned side */
594 	char		**ret_dname,	/* driver name of returned side */
595 	minor_t		*ret_mnum,	/* minor number of returned side */
596 	md_error_t	*ep
597 )
598 {
599 	mdname_t	*np;
600 
601 	if (ret_bname != NULL)
602 		*ret_bname = NULL;
603 	if (ret_dname != NULL)
604 		*ret_dname = NULL;
605 	if (ret_mnum != NULL)
606 		*ret_mnum = NODEV32;
607 
608 
609 	if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL)
610 		return (-1);
611 
612 /*
613  * NOTE (future) - There will be more work here once devids are integrated
614  * into disksets.  Then the side should be used to find the correct
615  * host and the b/d names should be gotten from that host.
616  */
617 
618 	/*
619 	 * Return the side info.
620 	 */
621 	if (ret_bname != NULL)
622 		*ret_bname = Strdup(np->bname);
623 
624 	if (ret_dname != NULL) {
625 		mdcinfo_t	*cinfo;
626 
627 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
628 			return (-1);
629 
630 		*ret_dname = Strdup(cinfo->dname);
631 	}
632 
633 	if (ret_mnum != NULL)
634 		*ret_mnum = meta_getminor(np->dev);
635 
636 	return (1);
637 }
638 
639 /*
640  * Get the information on the device from the remote node using the devid
641  * of the disk.
642  *
643  * Exit status:
644  *	 0 - No more side info to return
645  *	 1 - More side info's to return
646  *	-1 - An error has been detected
647  */
648 int
649 meta_getnextside_devinfo(
650 	mdsetname_t	*sp,		/* for this set */
651 	char		*bname,		/* local block name (myside) */
652 	side_t		*sideno,	/* previous sideno & returned sideno */
653 	char		**ret_bname,	/* block device name of returned side */
654 	char		**ret_dname,	/* driver name of returned side */
655 	minor_t		*ret_mnum,	/* minor number of returned side */
656 	md_error_t	*ep
657 )
658 {
659 	md_set_desc	*sd;
660 	int		i;
661 	mdname_t	*np;
662 	mddrivename_t	*dnp;
663 	char		*devidstr = NULL;
664 	int		devidstrlen;
665 	md_dev64_t	retdev = NODEV64;
666 	char		*ret_devname = NULL;
667 	char		*ret_blkdevname = NULL;
668 	char		*ret_driver = NULL;
669 	char		*nodename;
670 	int		fd;
671 	int		ret = -1;
672 	char		*minor_name = NULL;
673 	md_mnnode_desc	*nd;
674 
675 
676 	if (ret_bname != NULL)
677 		*ret_bname = NULL;
678 	if (ret_dname != NULL)
679 		*ret_dname = NULL;
680 	if (ret_mnum != NULL)
681 		*ret_mnum = NODEV32;
682 
683 	if (metaislocalset(sp)) {
684 		/* no more sides - we are done */
685 		if (*sideno != MD_SIDEWILD)
686 			return (0);
687 
688 		/* First time through -  set up return sideno */
689 		*sideno = 0;
690 	} else {
691 
692 		/*
693 		 * Find the next sideno, starting after the one given.
694 		 */
695 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
696 			return (-1);
697 
698 		if (MD_MNSET_DESC(sd)) {
699 			nd = sd->sd_nodelist;
700 			if ((*sideno == MD_SIDEWILD) &&
701 			    (nd != (struct md_mnnode_desc *)NULL)) {
702 				*sideno = nd->nd_nodeid;
703 			} else {
704 				while (nd) {
705 					/*
706 					 * Found given sideno, now find
707 					 * next sideno, if there is one.
708 					 */
709 					if ((*sideno == nd->nd_nodeid) &&
710 					    (nd->nd_next !=
711 					    (struct md_mnnode_desc *)NULL)) {
712 						*sideno =
713 						    nd->nd_next->nd_nodeid;
714 						break;
715 					}
716 					nd = nd->nd_next;
717 				}
718 				if (nd == NULL) {
719 					return (0);
720 				}
721 			}
722 			if (*sideno == MD_SIDEWILD)
723 				return (0);
724 		} else {
725 			for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
726 				/* Find next full slot */
727 				if (sd->sd_nodes[i][0] != '\0')
728 					break;
729 
730 			/* No more sides - we are done */
731 			if (i == MD_MAXSIDES)
732 				return (0);
733 
734 			/* Set up the return sideno */
735 			*sideno = i;
736 			nodename = (char *)sd->sd_nodes[i];
737 		}
738 	}
739 
740 	/*
741 	 * Need to pass the node the devid of the disk and get it to
742 	 * send back the details of the disk from that side.
743 	 */
744 	if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL)
745 		return (-1);
746 
747 	dnp = np->drivenamep;
748 
749 	/*
750 	 * By default, set up the parameters so that they are copied out.
751 	 */
752 	if (ret_bname != NULL)
753 		*ret_bname = Strdup(np->bname);
754 
755 	if (ret_dname != NULL) {
756 		mdcinfo_t	*cinfo;
757 
758 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
759 			return (-1);
760 
761 		*ret_dname = Strdup(cinfo->dname);
762 	}
763 
764 	if (ret_mnum != NULL)
765 		*ret_mnum = meta_getminor(np->dev);
766 
767 	/*
768 	 * Try some optimization. If this is the local set or the device
769 	 * is a metadevice then just copy the information. If the device
770 	 * does not have a devid (due to not having a minor name) then
771 	 * fall back to the pre-devid behaviour of copying the information
772 	 * on the device: this is okay because the sanity checks before this
773 	 * call would have found any issues with the device. If it's a
774 	 * multi-node diskset also just return ie. copy.
775 	 */
776 	if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
777 	    (MD_MNSET_DESC(sd)))
778 		return (1);
779 
780 	if (np->minor_name == (char *)NULL) {
781 		/*
782 		 * Have to get the minor name then. The slice should exist
783 		 * on the disk because it will have already been repartitioned
784 		 * up prior to getting to this point.
785 		 */
786 		if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
787 			(void) mdsyserror(ep, errno, np->bname);
788 			return (-1);
789 		}
790 		(void) devid_get_minor_name(fd, &minor_name);
791 		np->minor_name = Strdup(minor_name);
792 		devid_str_free(minor_name);
793 		(void) close(fd);
794 	}
795 
796 	/* allocate extra space for "/" and NULL hence +2 */
797 	devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
798 	devidstr = (char *)Malloc(devidstrlen);
799 
800 	/*
801 	 * As a minor name is supplied then the ret_devname will be
802 	 * appropriate to that minor_name and in this case it will be
803 	 * a block device ie /dev/dsk.
804 	 */
805 	(void) snprintf(devidstr, devidstrlen,
806 		"%s/%s", dnp->devid, np->minor_name);
807 
808 	ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
809 	    np->bname, &ret_devname, &ret_driver, ep);
810 
811 	Free(devidstr);
812 
813 	/*
814 	 * If the other side is not running device id in disksets,
815 	 * 'ret' is set to ENOTSUP in which case we fallback to
816 	 * the existing behaviour
817 	 */
818 	if (ret == ENOTSUP)
819 		return (1);
820 	else if (ret == -1)
821 		return (-1);
822 
823 	/*
824 	 * ret_devname comes from the rpc call and is a
825 	 * raw device name. We need to make this into a
826 	 * block device via blkname for further processing.
827 	 * Unfortunately, when our device id isn't found in
828 	 * the system, the rpc call will return a " " in
829 	 * ret_devname in which case we need to fill that in
830 	 * as ret_blkname because blkname of " " returns NULL.
831 	 */
832 	if (ret_bname != NULL && ret_devname != NULL) {
833 		ret_blkdevname = blkname(ret_devname);
834 		if (ret_blkdevname == NULL)
835 			*ret_bname = Strdup(ret_devname);
836 		else
837 			*ret_bname = Strdup(ret_blkdevname);
838 	}
839 
840 	if (ret_dname != NULL && ret_driver != NULL)
841 		*ret_dname = Strdup(ret_driver);
842 
843 	if (ret_mnum != NULL)
844 		*ret_mnum = meta_getminor(retdev);
845 
846 	return (1);
847 }
848 
849 int
850 meta_is_drive_in_anyset(
851 	mddrivename_t	*dnp,
852 	mdsetname_t	**spp,
853 	int		bypass_daemon,
854 	md_error_t 	*ep
855 )
856 {
857 	set_t		setno;
858 	mdsetname_t	*this_sp;
859 	int		is_it;
860 	set_t		max_sets;
861 
862 	if ((max_sets = get_max_sets(ep)) == 0)
863 		return (-1);
864 
865 	assert(spp != NULL);
866 	*spp = NULL;
867 
868 	for (setno = 1; setno < max_sets; setno++) {
869 		if (!bypass_daemon) {
870 			if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
871 				if (mdismddberror(ep, MDE_DB_NODB)) {
872 					mdclrerror(ep);
873 					return (0);
874 				}
875 				if (mdiserror(ep, MDE_NO_SET)) {
876 					mdclrerror(ep);
877 					continue;
878 				}
879 				return (-1);
880 			}
881 		} else
882 			this_sp = metafakesetname(setno, NULL);
883 
884 		if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
885 		    bypass_daemon, ep)) == -1) {
886 			if (mdiserror(ep, MDE_NO_SET)) {
887 				mdclrerror(ep);
888 				continue;
889 			}
890 			return (-1);
891 		}
892 		if (is_it) {
893 			*spp = this_sp;
894 			return (0);
895 		}
896 	}
897 	return (0);
898 }
899 
900 int
901 meta_is_drive_in_thisset(
902 	mdsetname_t	*sp,
903 	mddrivename_t	*dnp,
904 	int		bypass_daemon,
905 	md_error_t	*ep
906 )
907 {
908 	md_drive_desc	*dd, *p;
909 
910 	if (bypass_daemon)
911 		dd = dr2drivedesc(sp, MD_SIDEWILD,
912 		    (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
913 	else
914 		dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
915 
916 	if (dd == NULL) {
917 		if (! mdisok(ep))
918 			return (-1);
919 		return (0);
920 	}
921 
922 
923 	for (p = dd; p != NULL; p = p->dd_next)
924 		if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
925 			return (1);
926 	return (0);
927 }
928 
929 /*
930  * Check to see if devid is in use in any diskset.
931  * This is used in the case when a partial diskset is being imported
932  * to make sure that the unvailable drive isn't already in use in an
933  * already imported partial diskset.  Can't check on the cname since the
934  * unavailable disk's cname is from the previous system and may collide
935  * with a cname on this system.
936  * Return values:
937  *	1: devid has been found in a diskset
938  *	0: devid not found in any diskset
939  */
940 int
941 meta_is_devid_in_anyset(
942 	void		*devid,
943 	mdsetname_t	**spp,
944 	md_error_t 	*ep
945 )
946 {
947 	set_t		setno;
948 	mdsetname_t	*this_sp;
949 	int		is_it;
950 	set_t		max_sets;
951 
952 	if ((max_sets = get_max_sets(ep)) == 0)
953 		return (-1);
954 
955 	assert(spp != NULL);
956 	*spp = NULL;
957 
958 	for (setno = 1; setno < max_sets; setno++) {
959 		if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
960 			if (mdismddberror(ep, MDE_DB_NODB)) {
961 				mdclrerror(ep);
962 				return (0);
963 			}
964 			if (mdiserror(ep, MDE_NO_SET)) {
965 				mdclrerror(ep);
966 				continue;
967 			}
968 			return (-1);
969 		}
970 
971 		if ((is_it = meta_is_devid_in_thisset(this_sp,
972 		    devid, ep)) == -1) {
973 			if (mdiserror(ep, MDE_NO_SET)) {
974 				mdclrerror(ep);
975 				continue;
976 			}
977 			return (-1);
978 		}
979 		if (is_it) {
980 			*spp = this_sp;
981 			return (0);
982 		}
983 	}
984 	return (0);
985 }
986 
987 int
988 meta_is_devid_in_thisset(
989 	mdsetname_t	*sp,
990 	void		*devid,
991 	md_error_t	*ep
992 )
993 {
994 	md_drive_desc	*dd, *p;
995 	ddi_devid_t	dd_devid;
996 
997 	dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
998 	if (dd == NULL) {
999 		if (! mdisok(ep))
1000 			return (-1);
1001 		return (0);
1002 	}
1003 
1004 	for (p = dd; p != NULL; p = p->dd_next) {
1005 		if (p->dd_dnp->devid == NULL)
1006 			continue;
1007 		(void) devid_str_decode(p->dd_dnp->devid,
1008 		    &dd_devid, NULL);
1009 		if (dd_devid == NULL)
1010 			continue;
1011 		if (devid_compare(devid, dd_devid) == 0) {
1012 			devid_free(dd_devid);
1013 			return (1);
1014 		}
1015 		devid_free(dd_devid);
1016 	}
1017 	return (0);
1018 }
1019 
1020 int
1021 meta_set_balance(
1022 	mdsetname_t		*sp,
1023 	md_error_t		*ep
1024 )
1025 {
1026 	md_set_desc		*sd;
1027 	md_drive_desc		*dd, *curdd;
1028 	daddr_t			dbsize;
1029 	daddr_t			nblks;
1030 	int			i;
1031 	int			rval = 0;
1032 	sigset_t		oldsigs;
1033 	md_setkey_t		*cl_sk;
1034 	md_error_t		xep = mdnullerror;
1035 	md_mnnode_desc		*nd;
1036 	int			suspend1_flag = 0;
1037 
1038 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1039 		return (-1);
1040 
1041 	dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
1042 
1043 	/* Make sure we own the set */
1044 	if (meta_check_ownership(sp, ep) != 0)
1045 		return (-1);
1046 
1047 	/* END CHECK CODE */
1048 
1049 	/*
1050 	 * Get drive descriptors for the drives that are currently in the set.
1051 	 */
1052 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
1053 
1054 	if (! mdisok(ep))
1055 		return (-1);
1056 
1057 	/* Find the minimum replica size in use is or use the default */
1058 	if ((nblks = meta_db_minreplica(sp, ep)) < 0)
1059 		mdclrerror(ep);
1060 	else
1061 		dbsize = nblks;	/* adjust replica size */
1062 
1063 	/* Make sure we are blocking all signals */
1064 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
1065 		mdclrerror(&xep);
1066 
1067 	/*
1068 	 * Lock the set on current set members.
1069 	 * For MN diskset lock_set and SUSPEND are used to protect against
1070 	 * other meta* commands running on the other nodes.
1071 	 */
1072 	if (MD_MNSET_DESC(sd)) {
1073 		nd = sd->sd_nodelist;
1074 		while (nd) {
1075 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1076 				nd = nd->nd_next;
1077 				continue;
1078 			}
1079 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1080 				rval = -1;
1081 				goto out;
1082 			}
1083 			nd = nd->nd_next;
1084 		}
1085 		/*
1086 		 * Lock out other meta* commands by suspending
1087 		 * class 1 messages across the diskset.
1088 		 */
1089 		nd = sd->sd_nodelist;
1090 		while (nd) {
1091 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1092 				nd = nd->nd_next;
1093 				continue;
1094 			}
1095 			if (clnt_mdcommdctl(nd->nd_nodename,
1096 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1097 			    MD_MSCF_NO_FLAGS, ep)) {
1098 				rval = -1;
1099 				goto out;
1100 			}
1101 			suspend1_flag = 1;
1102 			nd = nd->nd_next;
1103 		}
1104 	} else {
1105 		for (i = 0; i < MD_MAXSIDES; i++) {
1106 			/* Skip empty slots */
1107 			if (sd->sd_nodes[i][0] == '\0') continue;
1108 
1109 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1110 				rval = -1;
1111 				goto out;
1112 			}
1113 		}
1114 	}
1115 
1116 	/* We are not adding or deleting any drives, just balancing */
1117 	dd = NULL;
1118 
1119 	/*
1120 	 * Balance the DB's according to the list of existing drives and the
1121 	 * list of added drives.
1122 	 */
1123 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
1124 		goto out;
1125 
1126 out:
1127 	/*
1128 	 * Unlock diskset by resuming class 1 messages across the diskset.
1129 	 * Just resume all classes so that resume is the same whether
1130 	 * just one class was locked or all classes were locked.
1131 	 */
1132 	if (suspend1_flag) {
1133 		nd = sd->sd_nodelist;
1134 		while (nd) {
1135 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1136 				nd = nd->nd_next;
1137 				continue;
1138 			}
1139 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1140 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1141 				/*
1142 				 * We are here because we failed to resume
1143 				 * rpc.mdcommd.  However we potentially have
1144 				 * an error from the previous call
1145 				 * (meta_db_balance). If the previous call
1146 				 * did fail,  we capture that error and
1147 				 * generate a perror withthe string,
1148 				 * "Unable to resume...".
1149 				 * Setting rval to -1 ensures that in the
1150 				 * next iteration of the loop, ep is not
1151 				 * clobbered.
1152 				 */
1153 				if (rval == 0)
1154 					(void) mdstealerror(ep, &xep);
1155 				else
1156 					mdclrerror(&xep);
1157 				rval = -1;
1158 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1159 				    "Unable to resume rpc.mdcommd."));
1160 			}
1161 			nd = nd->nd_next;
1162 		}
1163 	}
1164 
1165 	/* Unlock the set */
1166 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1167 	if (MD_MNSET_DESC(sd)) {
1168 		nd = sd->sd_nodelist;
1169 		while (nd) {
1170 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1171 				nd = nd->nd_next;
1172 				continue;
1173 			}
1174 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1175 				if (rval == 0)
1176 					(void) mdstealerror(ep, &xep);
1177 				else
1178 					mdclrerror(&xep);
1179 				rval = -1;
1180 			}
1181 			nd = nd->nd_next;
1182 		}
1183 	} else {
1184 		for (i = 0; i < MD_MAXSIDES; i++) {
1185 			/* Skip empty slots */
1186 			if (sd->sd_nodes[i][0] == '\0')
1187 				continue;
1188 
1189 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1190 				if (rval == 0)
1191 					(void) mdstealerror(ep, &xep);
1192 				rval = -1;
1193 			}
1194 		}
1195 	}
1196 
1197 	/* release signals back to what they were on entry */
1198 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1199 		mdclrerror(&xep);
1200 
1201 	cl_set_setkey(NULL);
1202 
1203 	metaflushsetname(sp);
1204 
1205 	return (rval);
1206 }
1207 
1208 int
1209 meta_set_destroy(
1210 	mdsetname_t	*sp,
1211 	int		lock_set,
1212 	md_error_t	*ep
1213 )
1214 {
1215 	int		i;
1216 	med_rec_t	medr;
1217 	md_set_desc	*sd;
1218 	md_drive_desc	*dd, *p, *p1;
1219 	mddrivename_t	*dnp;
1220 	mdname_t	*np;
1221 	mdnamelist_t	*nlp = NULL;
1222 	int		num_users = 0;
1223 	int		has_set;
1224 	side_t		mysideno;
1225 	sigset_t	oldsigs;
1226 	md_error_t	xep = mdnullerror;
1227 	md_setkey_t	*cl_sk;
1228 	int		rval = 0;
1229 	int		delete_end = 1;
1230 
1231 	/* Make sure we are blocking all signals */
1232 	if (procsigs(TRUE, &oldsigs, ep) < 0)
1233 		return (-1);
1234 
1235 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1236 		if (! mdisok(ep))
1237 			rval = -1;
1238 		goto out;
1239 	}
1240 
1241 	/*
1242 	 * meta_set_destroy should not be called for a MN diskset.
1243 	 * This routine destroys a set without communicating this information
1244 	 * to the other nodes which would lead to an inconsistency in
1245 	 * the MN diskset.
1246 	 */
1247 	if (MD_MNSET_DESC(sd)) {
1248 		rval = -1;
1249 		goto out;
1250 	}
1251 
1252 	/* Continue if a traditional diskset */
1253 
1254 	/*
1255 	 * Check to see who has the set.  If we are not the last user of the
1256 	 * set, we will not touch the replicas.
1257 	 */
1258 	for (i = 0; i < MD_MAXSIDES; i++) {
1259 		/* Skip empty slots */
1260 		if (sd->sd_nodes[i][0] == '\0')
1261 			continue;
1262 
1263 		has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
1264 		    ep);
1265 
1266 		if (has_set < 0) {
1267 			mdclrerror(ep);
1268 		} else
1269 			num_users++;
1270 	}
1271 
1272 	if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
1273 		if (! mdisok(ep)) {
1274 			rval = -1;
1275 			goto out;
1276 		}
1277 	}
1278 
1279 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
1280 		rval = -1;
1281 		goto out;
1282 	}
1283 
1284 	if (lock_set == TRUE) {
1285 		/* Lock the set on our side */
1286 		if (clnt_lock_set(mynode(), sp, ep)) {
1287 			rval = -1;
1288 			goto out;
1289 		}
1290 	}
1291 
1292 	/*
1293 	 * A traditional diskset has no diskset stale information to send
1294 	 * since there can only be one owner node at a time.
1295 	 */
1296 	if (snarf_set(sp, FALSE, ep))
1297 		mdclrerror(ep);
1298 
1299 	if (dd != NULL) {
1300 		/*
1301 		 * Make sure that no drives are in use as parts of metadrives
1302 		 * or hot spare pools, this is one of the few error conditions
1303 		 * that will stop this routine, unless the environment has
1304 		 * META_DESTROY_SET_OK set, in which case, the operation will
1305 		 * proceed.
1306 		 */
1307 		if (getenv("META_DESTROY_SET_OK") == NULL) {
1308 			for (p = dd; p != NULL; p = p->dd_next) {
1309 				dnp = p->dd_dnp;
1310 
1311 				i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
1312 				if (i == -1) {
1313 					/* need xep - wire calls clear error */
1314 					i = metaget_setownership(sp, &xep);
1315 					if (i == -1) {
1316 						rval = -1;
1317 						goto out;
1318 					}
1319 
1320 					mysideno = getmyside(sp, &xep);
1321 
1322 					if (mysideno == MD_SIDEWILD) {
1323 						rval = -1;
1324 						goto out;
1325 					}
1326 
1327 					if (sd->sd_isown[mysideno] == FALSE)
1328 						if (halt_set(sp, &xep)) {
1329 							rval = -1;
1330 							goto out;
1331 						}
1332 
1333 					rval = -1;
1334 					goto out;
1335 				}
1336 			}
1337 		}
1338 
1339 		for (i = 0; i < MD_MAXSIDES; i++) {
1340 			/* Skip empty slots */
1341 			if (sd->sd_nodes[i][0] == '\0')
1342 				continue;
1343 
1344 			/* Skip non local nodes */
1345 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1346 				continue;
1347 
1348 			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
1349 				mdclrerror(ep);
1350 		}
1351 
1352 		/*
1353 		 * Go thru each drive and individually delete the replicas.
1354 		 * This way we can ignore individual errors.
1355 		 */
1356 		for (p = dd; p != NULL; p = p->dd_next) {
1357 			uint_t	rep_slice;
1358 
1359 			dnp = p->dd_dnp;
1360 			if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
1361 			    (((np = metaslicename(dnp, rep_slice, ep))
1362 				== NULL) &&
1363 				((np = metaslicename(dnp, MD_SLICE0, ep))
1364 				    == NULL))) {
1365 				rval = -1;
1366 				goto out;
1367 			}
1368 
1369 			if ((np = metaslicename(dnp,
1370 			    rep_slice, ep)) == NULL) {
1371 				if ((np = metaslicename(dnp,
1372 				    MD_SLICE0, ep)) == NULL) {
1373 					rval = -1;
1374 					goto out;
1375 				}
1376 				mdclrerror(ep);
1377 			}
1378 
1379 			/* Yes this is UGLY!!! */
1380 			p1 = p->dd_next;
1381 			p->dd_next = NULL;
1382 			if (rel_own_bydd(sp, p, FALSE, ep))
1383 				mdclrerror(ep);
1384 			p->dd_next = p1;
1385 
1386 			if (p->dd_dbcnt == 0)
1387 				continue;
1388 
1389 			/*
1390 			 * Skip the replica removal if we are not the last user
1391 			 */
1392 			if (num_users != 1)
1393 				continue;
1394 
1395 			nlp = NULL;
1396 			(void) metanamelist_append(&nlp, np);
1397 			if (meta_db_detach(sp, nlp,
1398 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
1399 				mdclrerror(ep);
1400 			metafreenamelist(nlp);
1401 		}
1402 	}
1403 
1404 	if (halt_set(sp, ep)) {
1405 		rval = -1;
1406 		goto out;
1407 	}
1408 
1409 	/* Setup the mediator record */
1410 	(void) memset(&medr, '\0', sizeof (med_rec_t));
1411 	medr.med_rec_mag = MED_REC_MAGIC;
1412 	medr.med_rec_rev = MED_REC_REV;
1413 	medr.med_rec_fl  = 0;
1414 	medr.med_rec_sn  = sp->setno;
1415 	(void) strcpy(medr.med_rec_snm, sp->setname);
1416 	medr.med_rec_meds = sd->sd_med;	/* structure assigment */
1417 	(void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
1418 	medr.med_rec_foff = 0;
1419 
1420 	/*
1421 	 * If we are the last remaining user, then remove the mediator hosts
1422 	 */
1423 	if (num_users == 1) {
1424 		for (i = 0; i < MED_MAX_HOSTS; i++) {
1425 			if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
1426 				SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
1427 				    SVM_TAG_MEDIATOR, sp->setno, i);
1428 			(void) memset(&medr.med_rec_meds.n_lst[i], '\0',
1429 			    sizeof (md_h_t));
1430 		}
1431 		medr.med_rec_meds.n_cnt = 0;
1432 	} else { 	/* Remove this host from the mediator node list. */
1433 		for (i = 0; i < MD_MAXSIDES; i++) {
1434 			/* Skip empty slots */
1435 			if (sd->sd_nodes[i][0] == '\0')
1436 				continue;
1437 
1438 			/* Copy non local node */
1439 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
1440 				(void) strcpy(medr.med_rec_nodes[i],
1441 				    sd->sd_nodes[i]);
1442 				continue;
1443 			}
1444 
1445 			/* Clear local node */
1446 			(void) memset(&medr.med_rec_nodes[i], '\0',
1447 			    sizeof (md_node_nm_t));
1448 		}
1449 	}
1450 
1451 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
1452 
1453 	/*
1454 	 * If the client is part of a cluster put the DCS service
1455 	 * into a deleteing state.
1456 	 */
1457 	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1458 		if (metad_isautotakebyname(sp->setname)) {
1459 			delete_end = 0;
1460 		} else {
1461 			mdclrerror(ep);
1462 			goto out;
1463 		}
1464 	}
1465 
1466 	/* Inform the mediator hosts of the new information */
1467 	for (i = 0; i < MED_MAX_HOSTS; i++) {
1468 		if (sd->sd_med.n_lst[i].a_cnt == 0)
1469 			continue;
1470 
1471 		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
1472 			mdclrerror(ep);
1473 	}
1474 
1475 	/* Delete the set locally */
1476 	for (i = 0; i < MD_MAXSIDES; i++) {
1477 		/* Skip empty slots */
1478 		if (sd->sd_nodes[i][0] == '\0')
1479 			continue;
1480 
1481 		/* Skip non local nodes */
1482 		if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1483 			continue;
1484 
1485 		if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
1486 			mdclrerror(ep);
1487 	}
1488 	if (delete_end &&
1489 	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1490 		rval = -1;
1491 
1492 out:
1493 	/* release signals back to what they were on entry */
1494 	if (procsigs(FALSE, &oldsigs, &xep) < 0) {
1495 		if (rval == 0)
1496 			(void) mdstealerror(ep, &xep);
1497 		rval = -1;
1498 	}
1499 
1500 	if (lock_set == TRUE) {
1501 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1502 		if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1503 			if (rval == 0)
1504 				(void) mdstealerror(ep, &xep);
1505 			rval = -1;
1506 		}
1507 		cl_set_setkey(NULL);
1508 	}
1509 
1510 	metaflushsetname(sp);
1511 	return (rval);
1512 }
1513 
1514 int
1515 meta_set_purge(
1516 	mdsetname_t	*sp,
1517 	int		bypass_cluster,
1518 	int		forceflg,
1519 	md_error_t	*ep
1520 )
1521 {
1522 	char		*thishost = mynode();
1523 	md_set_desc	*sd;
1524 	md_setkey_t	*cl_sk;
1525 	md_error_t	xep = mdnullerror;
1526 	int		rval = 0;
1527 	int		i, num_hosts = 0;
1528 	int		has_set = 0;
1529 	int		max_node = 0;
1530 	int		delete_end = 1;
1531 	md_mnnode_desc	*nd;
1532 
1533 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1534 		/* unable to find set description */
1535 		rval = 1;
1536 		return (rval);
1537 	}
1538 
1539 	if (MD_MNSET_DESC(sd)) {
1540 		/*
1541 		 * Get a count of the hosts in the set and also lock the set
1542 		 * on those hosts that know about it.
1543 		 */
1544 		nd = sd->sd_nodelist;
1545 		while (nd) {
1546 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1547 				nd = nd->nd_next;
1548 				continue;
1549 			}
1550 			has_set = nodehasset(sp, nd->nd_nodename,
1551 				NHS_NST_EQ, ep);
1552 
1553 			/*
1554 			 * The host is not aware of this set (has_set < 0) or
1555 			 * the set does not match (has_set == 0). This check
1556 			 * prevents the code getting confused by an apparent
1557 			 * inconsistancy in the set's state, this is in the
1558 			 * purge code so something is broken in any case and
1559 			 * this is just trying to fix the brokeness.
1560 			 */
1561 			if (has_set <= 0) {
1562 				mdclrerror(ep);
1563 				nd->nd_flags |= MD_MN_NODE_NOSET;
1564 			} else {
1565 				num_hosts++;
1566 				if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1567 					/*
1568 					 * If the force flag is set then
1569 					 * ignore any RPC failures because we
1570 					 * are only really interested with
1571 					 * the set on local node.
1572 					 */
1573 					if (forceflg && mdanyrpcerror(ep)) {
1574 						mdclrerror(ep);
1575 					} else {
1576 						/*
1577 						 * set max_node so that in the
1578 						 * unlock code nodes in the
1579 						 * set that have not been
1580 						 * locked are not unlocked.
1581 						 */
1582 						max_node = nd->nd_nodeid;
1583 						rval = 2;
1584 						goto out1;
1585 					}
1586 				}
1587 
1588 			}
1589 			nd = nd->nd_next;
1590 		}
1591 		max_node = 0;
1592 	} else {
1593 		/*
1594 		 * Get a count of the hosts in the set and also lock the set
1595 		 * on those hosts that know about it.
1596 		 */
1597 		for (i = 0; i < MD_MAXSIDES; i++) {
1598 			/* Skip empty slots */
1599 			if (sd->sd_nodes[i][0] == '\0')
1600 				continue;
1601 
1602 			has_set = nodehasset(sp, sd->sd_nodes[i],
1603 				NHS_NST_EQ, ep);
1604 
1605 			/*
1606 			 * The host is not aware of this set (has_set < 0) or
1607 			 * the set does not match (has_set == 0). This check
1608 			 * prevents the code getting confused by an apparent
1609 			 * inconsistancy in the set's state, this is in the
1610 			 * purge code so something is broken in any case and
1611 			 * this is just trying to fix the brokeness.
1612 			 */
1613 			if (has_set <= 0) {
1614 				mdclrerror(ep);
1615 				/*
1616 				 * set the node to NULL to prevent further
1617 				 * requests to this unresponsive node.
1618 				 */
1619 				sd->sd_nodes[i][0] = '\0';
1620 			} else {
1621 				num_hosts++;
1622 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1623 					/*
1624 					 * If the force flag is set then
1625 					 * ignore any RPC failures because we
1626 					 * are only really interested with
1627 					 * the set on local node.
1628 					 */
1629 					if (forceflg && mdanyrpcerror(ep)) {
1630 						mdclrerror(ep);
1631 					} else {
1632 						rval = 2;
1633 						/*
1634 						 * set max_node so that in the
1635 						 * unlock code nodes in the
1636 						 * set that have not been
1637 						 * locked are not unlocked.
1638 						 */
1639 						max_node = i;
1640 						goto out1;
1641 					}
1642 				}
1643 			}
1644 		}
1645 		max_node = i;	/* now MD_MAXSIDES */
1646 	}
1647 	if (!bypass_cluster) {
1648 		/*
1649 		 * If there is only one host associated with the
1650 		 * set then remove the set from the cluster.
1651 		 */
1652 		if (num_hosts == 1) {
1653 			if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1654 				if (metad_isautotakebyname(sp->setname)) {
1655 					delete_end = 0;
1656 				} else {
1657 					mdclrerror(ep);
1658 					rval = 3;
1659 					goto out1;
1660 				}
1661 			}
1662 		}
1663 	}
1664 
1665 	if (MD_MNSET_DESC(sd)) {
1666 		/*
1667 		 * Get a count of the hosts in the set and also lock the set
1668 		 * on those hosts that know about it.
1669 		 */
1670 		nd = sd->sd_nodelist;
1671 		while (nd) {
1672 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1673 				nd = nd->nd_next;
1674 				continue;
1675 			}
1676 			if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
1677 				/*
1678 				 * Tell the remote node to remove this node
1679 				 */
1680 				if (clnt_delhosts(nd->nd_nodename, sp, 1,
1681 					&thishost, ep) == -1) {
1682 					/*
1683 					 * If we fail to delete ourselves
1684 					 * from the remote host it does not
1685 					 * really matter because the set is
1686 					 * being "purged" from this node. The
1687 					 * set can be purged from the other
1688 					 * node at a later time.
1689 					 */
1690 					mdclrerror(ep);
1691 				}
1692 				nd = nd->nd_next;
1693 				continue;
1694 			}
1695 			/* remove the set from this host */
1696 			if (clnt_delset(nd->nd_nodename, sp, ep) == -1) {
1697 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1698 				if (!bypass_cluster && num_hosts == 1)
1699 					(void) sdssc_delete_end(sp->setname,
1700 					    SDSSC_CLEANUP);
1701 				mdclrerror(ep);
1702 				goto out1;
1703 			}
1704 			nd = nd->nd_next;
1705 		}
1706 	} else {
1707 		for (i = 0; i < MD_MAXSIDES; i++) {
1708 			/* Skip empty slots */
1709 			if (sd->sd_nodes[i][0] == '\0')
1710 				continue;
1711 			if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
1712 				/*
1713 				 * Tell the remote node to remove this node
1714 				 */
1715 				if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
1716 				    &thishost, ep) == -1) {
1717 					/*
1718 					 * If we fail to delete ourselves
1719 					 * from the remote host it does not
1720 					 * really matter because the set is
1721 					 * being "purged" from this node. The
1722 					 * set can be purged from the other
1723 					 * node at a later time.
1724 					 */
1725 					mdclrerror(ep);
1726 				}
1727 				continue;
1728 			}
1729 
1730 			/* remove the set from this host */
1731 			if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
1732 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1733 				if (!bypass_cluster && num_hosts == 1)
1734 					(void) sdssc_delete_end(sp->setname,
1735 					    SDSSC_CLEANUP);
1736 				mdclrerror(ep);
1737 				goto out1;
1738 			}
1739 		}
1740 	}
1741 
1742 	if (!bypass_cluster && num_hosts == 1) {
1743 		if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
1744 		    SDSSC_ERROR) {
1745 			rval = 4;
1746 		}
1747 	}
1748 
1749 out1:
1750 
1751 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1752 
1753 	/*
1754 	 * Remove the set lock on those nodes that had the set locked
1755 	 * max_node will either be MD_MAXSIDES or array index of the last
1756 	 * node contacted (or rather failed to contact) for traditional
1757 	 * diskset.  For a MN diskset, max_node is the node_id of the node
1758 	 * that failed the lock.
1759 	 */
1760 	if (MD_MNSET_DESC(sd)) {
1761 		nd = sd->sd_nodelist;
1762 		while (nd) {
1763 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1764 				nd = nd->nd_next;
1765 				continue;
1766 			}
1767 			if (nd->nd_nodeid == max_node)
1768 				break;
1769 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1770 				if (forceflg && mdanyrpcerror(&xep)) {
1771 					mdclrerror(&xep);
1772 					nd = nd->nd_next;
1773 					continue;
1774 				}
1775 				if (rval == 0)
1776 					(void) mdstealerror(ep, &xep);
1777 				rval = 5;
1778 			}
1779 			nd = nd->nd_next;
1780 		}
1781 	} else {
1782 		for (i = 0; i < max_node; i++) {
1783 			/* Skip empty slots */
1784 			if (sd->sd_nodes[i][0] == '\0')
1785 				continue;
1786 
1787 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1788 				if (forceflg && mdanyrpcerror(&xep)) {
1789 					mdclrerror(&xep);
1790 					continue;
1791 				}
1792 				if (rval == 0)
1793 					(void) mdstealerror(ep, &xep);
1794 				rval = 5;
1795 			}
1796 		}
1797 	}
1798 
1799 	cl_set_setkey(NULL);
1800 
1801 	return (rval);
1802 }
1803 
1804 int
1805 meta_set_query(
1806 	mdsetname_t		*sp,
1807 	mddb_dtag_lst_t		**dtlpp,
1808 	md_error_t		*ep
1809 )
1810 {
1811 	mddb_dtag_get_parm_t	dtgp;
1812 
1813 	(void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
1814 	dtgp.dtgp_setno = sp->setno;
1815 
1816 	/*CONSTCOND*/
1817 	while (1) {
1818 		if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
1819 			if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
1820 			    *dtlpp == NULL)
1821 				return (mdstealerror(ep, &dtgp.dtgp_mde));
1822 			else
1823 				break;
1824 
1825 		/*
1826 		 * Run to the end of the list
1827 		 */
1828 		for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
1829 			/* void */;
1830 
1831 		*dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
1832 
1833 		(void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
1834 		    sizeof (mddb_dtag_t));
1835 
1836 		dtgp.dtgp_dt.dt_id++;
1837 	}
1838 	return (0);
1839 }
1840 
1841 /*
1842  * return drivename get by key
1843  */
1844 mddrivename_t *
1845 metadrivename_withdrkey(
1846 	mdsetname_t	*sp,
1847 	side_t		sideno,
1848 	mdkey_t		key,
1849 	int		flags,
1850 	md_error_t	*ep
1851 )
1852 {
1853 	char		*nm;
1854 	mdname_t	*np;
1855 	mddrivename_t	*dnp;
1856 	ddi_devid_t	devidp;
1857 	md_set_desc	*sd;
1858 
1859 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1860 		return (NULL);
1861 	}
1862 
1863 
1864 	/*
1865 	 * Get the devid associated with the key.
1866 	 *
1867 	 * If a devid was returned, it MUST be valid even in
1868 	 * the case where a device id has been "updated". The
1869 	 * "update" of the device id may have occured due to
1870 	 * a firmware upgrade.
1871 	 */
1872 	if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
1873 	    != NULL) {
1874 		/*
1875 		 * Look for the correct dnp using the devid for comparison.
1876 		 */
1877 		dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
1878 		free(devidp);
1879 		dnp->side_names_key = key;
1880 	} else {
1881 		/*
1882 		 * We didn't get a devid. We'll try for a dnp using the
1883 		 * name. If we have a MN diskset or if the dnp is a did
1884 		 * device, we're done because then we don't have devids.
1885 		 * Otherwise we'll try to set the devid
1886 		 * and get the dnp via devid again.
1887 		 * We also need to clear the ep structure. When the
1888 		 * above call to meta_getdidbykey returned a null, it
1889 		 * also put an error code into ep. In this case, the null
1890 		 * return is actually OK and any errors can be ignored. The
1891 		 * reason it is OK is because this could be a MN set or
1892 		 * we could  be running without devids (ex cluster).
1893 		 */
1894 		mdclrerror(ep);
1895 
1896 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key,
1897 		    ep)) == NULL)
1898 			return (NULL);
1899 		/* get device name */
1900 		if (flags & PRINT_FAST) {
1901 			if ((np = metaname_fast(&sp, nm,
1902 			    LOGICAL_DEVICE, ep)) == NULL) {
1903 				Free(nm);
1904 				return (NULL);
1905 			}
1906 		} else {
1907 			if ((np = metaname(&sp, nm, LOGICAL_DEVICE,
1908 			    ep)) == NULL) {
1909 				Free(nm);
1910 				return (NULL);
1911 			}
1912 		}
1913 		Free(nm);
1914 		/* make sure it's OK */
1915 		if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np,
1916 		    ep) != 0))
1917 			return (NULL);
1918 
1919 		/* get drivename */
1920 		dnp = np->drivenamep;
1921 		dnp->side_names_key = key;
1922 		/*
1923 		 * Skip the devid set/check for the following cases:
1924 		 * 1) If MN diskset, there are no devid's
1925 		 * 2) if dnp is did device
1926 		 * The device id is disabled for did device due to the
1927 		 * lack of minor name support in the did driver. The following
1928 		 * devid code path can set and propagate the error and
1929 		 * eventually prevent did disks from being added to the
1930 		 * diskset under SunCluster systems
1931 		 *
1932 		 * Note that this code can be called through rpc.mdcommd.
1933 		 * sdssc_version cannot be used because the library won't
1934 		 * be bound.
1935 		 */
1936 		if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/"))
1937 		    == 0) || (MD_MNSET_DESC(sd)))
1938 			goto out;
1939 
1940 		/*
1941 		 * It is okay if replica is not in devid mode
1942 		 */
1943 		if (mdissyserror(ep, MDDB_F_NODEVID)) {
1944 			mdclrerror(ep);
1945 			goto out;
1946 		}
1947 
1948 		/*
1949 		 * We're not MN or did devices but
1950 		 * devid is missing so this means that we have
1951 		 * just upgraded from a configuration where
1952 		 * devid's were not used so try to add in
1953 		 * the devid and requery. If the devid still isn't there,
1954 		 * that's OK. dnp->devid will be null as it is in any
1955 		 * configuration with no devids.
1956 		 */
1957 		if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, ep) < 0)
1958 			return (NULL);
1959 		if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
1960 		    sideno+SKEW, key, ep)) != NULL) {
1961 			/*
1962 			 * Found a devid so look for the dnp using the
1963 			 * devid as the search mechanism.
1964 			 */
1965 			dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
1966 			free(devidp);
1967 			dnp->side_names_key = key;
1968 		}
1969 	}
1970 
1971 
1972 
1973 out:
1974 	if (flags & MD_BYPASS_DAEMON)
1975 		return (dnp);
1976 
1977 	if (get_sidenmlist(sp, dnp, ep))
1978 		return (NULL);
1979 
1980 	/* return success */
1981 	return (dnp);
1982 }
1983 
1984 void
1985 metafreedrivedesc(md_drive_desc **dd)
1986 {
1987 	md_drive_desc	*p, *next = NULL;
1988 
1989 	for (p = *dd; p != NULL; p = next) {
1990 		next = p->dd_next;
1991 		Free(p);
1992 	}
1993 	*dd = NULL;
1994 }
1995 
1996 md_drive_desc *
1997 metaget_drivedesc(
1998 	mdsetname_t	*sp,
1999 	int		flags,
2000 	md_error_t	*ep
2001 )
2002 {
2003 	side_t		sideno = MD_SIDEWILD;
2004 
2005 	assert(! (flags & MD_BYPASS_DAEMON));
2006 
2007 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
2008 		return (NULL);
2009 
2010 	return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
2011 }
2012 
2013 md_drive_desc *
2014 metaget_drivedesc_fromnamelist(
2015 	mdsetname_t	*sp,
2016 	mdnamelist_t	*nlp,
2017 	md_error_t	*ep
2018 )
2019 {
2020 	md_set_desc		*sd;
2021 	mdnamelist_t		*p;
2022 	md_drive_desc		*dd = NULL;
2023 
2024 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2025 		return (NULL);
2026 
2027 	for (p = nlp; p != NULL; p = p->next)
2028 		(void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
2029 		    sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
2030 
2031 	return (dd);
2032 }
2033 
2034 md_drive_desc *
2035 metaget_drivedesc_sideno(
2036 	mdsetname_t *sp,
2037 	side_t sideno,
2038 	int flags,
2039 	md_error_t *ep
2040 )
2041 {
2042 	md_set_desc	*sd = NULL;
2043 
2044 	assert(! (flags & MD_BYPASS_DAEMON));
2045 
2046 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2047 		return (NULL);
2048 
2049 	if (sd->sd_drvs)
2050 		return (sd->sd_drvs);
2051 
2052 	if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
2053 		return (NULL);
2054 
2055 	return (sd->sd_drvs);
2056 }
2057 
2058 int
2059 metaget_setownership(
2060 	mdsetname_t	*sp,
2061 	md_error_t	*ep
2062 )
2063 {
2064 	md_set_desc	*sd;
2065 	int		bool;
2066 	int		i;
2067 	md_mnnode_desc	*nd;
2068 
2069 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2070 		return (-1);
2071 
2072 	if (MD_MNSET_DESC(sd)) {
2073 		nd = sd->sd_nodelist;
2074 		while (nd) {
2075 			/* If node isn't alive, can't own diskset */
2076 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2077 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2078 				nd = nd->nd_next;
2079 				continue;
2080 			}
2081 			/*
2082 			 * If can't communicate with rpc.metad, then mark
2083 			 * this node as not an owner.  That node may
2084 			 * in fact, be an owner, but without rpc.metad running
2085 			 * that node can't do much.
2086 			 */
2087 			if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
2088 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2089 			} else if (bool == TRUE) {
2090 				nd->nd_flags |= MD_MN_NODE_OWN;
2091 			} else {
2092 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2093 			}
2094 			nd = nd->nd_next;
2095 		}
2096 		return (0);
2097 	}
2098 
2099 	/* Rest of code handles traditional disksets */
2100 
2101 	for (i = 0; i < MD_MAXSIDES; i++)
2102 		sd->sd_isown[i] = 0;
2103 
2104 	if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
2105 		return (-1);
2106 
2107 	if (bool == TRUE)
2108 		sd->sd_isown[getmyside(sp, ep)] = 1;
2109 
2110 	return (0);
2111 }
2112 
2113 char *
2114 mynode(void)
2115 {
2116 	static struct utsname	myuname;
2117 	static int		done = 0;
2118 
2119 	if (! done) {
2120 		if (uname(&myuname) == -1) {
2121 			md_perror(dgettext(TEXT_DOMAIN, "uname"));
2122 			assert(0);
2123 		}
2124 		done = 1;
2125 	}
2126 	return (myuname.nodename);
2127 }
2128 
/*
 * Return TRUE if str is equal to one of the first cnt strings in lst,
 * FALSE otherwise (including when cnt is zero or negative).
 */
int
strinlst(char *str, int cnt, char **lst)
{
	char	**entry = lst;
	int	left;

	for (left = cnt; left > 0; left--) {
		if (strcmp(*entry, str) == 0)
			return (TRUE);
		entry++;
	}

	return (FALSE);
}
2140 
2141 /*
2142  * meta_get_reserved_names
2143  *  returns an mdnamelist_t of reserved slices
2144  *  reserved slices are those that are used but don't necessarily
2145  *  show up as metadevices (ex. reserved slice for db in sets, logs)
2146  */
2147 
2148 /*ARGSUSED*/
2149 int
2150 meta_get_reserved_names(
2151 	mdsetname_t	*sp,
2152 	mdnamelist_t	**nlpp,
2153 	int		options,
2154 	md_error_t	*ep)
2155 {
2156 	int		 count		= 0;
2157 	mdname_t	*np		= NULL;
2158 	mdnamelist_t	*transnlp	= NULL;
2159 	mdnamelist_t	**tailpp 	= nlpp;
2160 	mdnamelist_t	*nlp;
2161 	md_drive_desc	*dd, *di;
2162 
2163 	if (metaislocalset(sp))
2164 		goto out;
2165 
2166 	if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
2167 		count = -1;
2168 		goto out;
2169 	}
2170 
2171 	/* db in for sets on reserved slice */
2172 	for (di = dd; di && count >= 0; di = di->dd_next) {
2173 		uint_t	rep_slice;
2174 
2175 		/*
2176 		 * Add the name struct to the end of the
2177 		 * namelist but keep a pointer to the last
2178 		 * element so that we don't incur the overhead
2179 		 * of traversing the list each time
2180 		 */
2181 		if (di->dd_dnp &&
2182 		    (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
2183 		    (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
2184 		    (tailpp = meta_namelist_append_wrapper(tailpp, np)))
2185 			count++;
2186 		else
2187 			count = -1;
2188 	}
2189 
2190 	/* now find logs */
2191 	if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
2192 		count = -1;
2193 		goto out;
2194 	}
2195 
2196 	for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
2197 		mdname_t	*transnp = nlp->namep;
2198 		md_trans_t	*transp;
2199 
2200 		if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
2201 			count = -1;
2202 			goto out;
2203 		}
2204 		if (transp->lognamep) {
2205 			/*
2206 			 * Add the name struct to the end of the
2207 			 * namelist but keep a pointer to the last
2208 			 * element so that we don't incur the overhead
2209 			 * of traversing the list each time
2210 			 */
2211 			tailpp = meta_namelist_append_wrapper(
2212 			    tailpp, transp->lognamep);
2213 		}
2214 	}
2215 out:
2216 	metafreenamelist(transnlp);
2217 	return (count);
2218 }
2219 
2220 /*
2221  * Entry point to join a node to MultiNode diskset.
2222  *
2223  * Validate host in diskset.
2224  *	- Should be in membership list from API
2225  *	- Should not already be joined into diskset.
2226  *	- Set must have drives
2227  * Assume valid configuration is stored in the set/drive/node records
2228  * in the local mddb since no node or drive can be added to the MNset
2229  * unless all drives and nodes are available.  Reconfig steps will
2230  * resync all ALIVE nodes in case of panic in critical areas.
2231  *
2232  * Lock down the set.
2233  * Verify host is a member of this diskset.
2234  * If drives exist in the configuration, load the mddbs.
2235  * Set this node to active by notifying master if one exists.
2236  * If this is the first node active in the diskset, this node
2237  * 	becomes the master.
2238  * Unlock the set.
2239  *
2240  * Mirror Resync:
2241  * If this node is the last node to join the set and clustering
2242  * isn't running, then start the 'metasync -r' type resync
2243  * on all mirrors in this diskset.
2244  * If clustering is running, this resync operation will
2245  * be handled by the reconfig steps and should NOT
2246  * be handled during a join operation.
2247  *
2248  * There are multiple return values in order to assist
2249  * the join operation of all sets in the metaset command.
2250  *
2251  * Return values:
2252  *	0  - Node successfully joined to set.
2253  *	-1 - Join attempted but failed
2254  *		- any failure from libmeta calls
2255  *		- node not in the member list
2256  *	-2 - Join not attempted since
2257  *		- this set had no drives in set
2258  *		- this node already joined to set
2259  *		- set is not a multinode set
2260  *	-3 - Node joined to STALE set.
2261  */
2262 extern int
2263 meta_set_join(
2264 	mdsetname_t	*sp,
2265 	md_error_t	*ep
2266 )
2267 {
2268 	md_set_desc		*sd;
2269 	md_drive_desc		*dd;
2270 	md_mnnode_desc		*nd, *nd2, my_nd;
2271 	int			rval = 0;
2272 	md_setkey_t		*cl_sk;
2273 	md_error_t		xep = mdnullerror;
2274 	md_error_t		ep_snarf = mdnullerror;
2275 	int			master_flag = 0;
2276 	md_mnset_record		*mas_mnsr = NULL;
2277 	int			clear_nr_flags = 0;
2278 	md_mnnode_record	*nr;
2279 	int			stale_set = 0;
2280 	int			rb_flags = 0;
2281 	int			stale_bool = FALSE;
2282 	int			suspendall_flag = 0;
2283 	int			suspend1_flag = 0;
2284 	sigset_t		oldsigs;
2285 	int			send_reinit = 0;
2286 
2287 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2288 		return (-1);
2289 	}
2290 
2291 	/* Must be a multinode diskset */
2292 	if (!MD_MNSET_DESC(sd)) {
2293 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2294 		return (-2);
2295 	}
2296 
2297 	/* Verify that the node is ALIVE (i.e. is in the API membership list) */
2298 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
2299 		(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
2300 			sd->sd_mn_mynode->nd_nodename, NULL,
2301 			sp->setname);
2302 		return (-1);
2303 	}
2304 
2305 	/* Make sure we are blocking all signals */
2306 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2307 		mdclrerror(&xep);
2308 
2309 	/*
2310 	 * Lock the set on current set members.
2311 	 * For MN diskset lock_set and SUSPEND are used to protect against
2312 	 * other meta* commands running on the other nodes.
2313 	 */
2314 	nd = sd->sd_nodelist;
2315 	while (nd) {
2316 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2317 			nd = nd->nd_next;
2318 			continue;
2319 		}
2320 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2321 			rval = -1;
2322 			goto out;
2323 		}
2324 		nd = nd->nd_next;
2325 	}
2326 
2327 	/*
2328 	 * Lock out other meta* commands by suspending
2329 	 * class 1 messages across the diskset.
2330 	 */
2331 	nd = sd->sd_nodelist;
2332 	while (nd) {
2333 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2334 			nd = nd->nd_next;
2335 			continue;
2336 		}
2337 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2338 			    sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2339 			rval = -1;
2340 			goto out;
2341 		}
2342 		suspend1_flag = 1;
2343 		nd = nd->nd_next;
2344 	}
2345 
2346 	/*
2347 	 * Verify that this host is a member (in the host list) of the set.
2348 	 */
2349 	nd = sd->sd_nodelist;
2350 	while (nd) {
2351 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2352 			break;
2353 		}
2354 		nd = nd->nd_next;
2355 	}
2356 	if (!nd) {
2357 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2358 			sd->sd_mn_mynode->nd_nodename, NULL,
2359 			sp->setname);
2360 		rval = -1;
2361 		goto out;
2362 	}
2363 
2364 	/*
2365 	 * Need to return failure if host is already 'joined'
2366 	 * into the set.  This is done so that if later the user
2367 	 * issues a command to join all sets and a failure is
2368 	 * encountered - that the resulting cleanup effort
2369 	 * (withdrawing from all sets that were joined
2370 	 * during that command) won't withdraw from this set.
2371 	 */
2372 	if (nd->nd_flags & MD_MN_NODE_OWN) {
2373 		rval = -2;
2374 		goto out2;
2375 	}
2376 
2377 	/*
2378 	 * Call metaget_setownership that calls each node in diskset and
2379 	 * marks in set descriptor if node is an owner of the set or not.
2380 	 * metaget_setownership checks to see if a node is an owner by
2381 	 * checking to see if that node's kernel has the mddb loaded.
2382 	 * If a node had panic'd during a reconfig or an
2383 	 * add/delete/join/withdraw operation, the other nodes' node
2384 	 * records may not reflect the current state of the diskset,
2385 	 * so calling metaget_setownership is the safest thing to do.
2386 	 */
2387 	if (metaget_setownership(sp, ep) == -1) {
2388 		rval = -1;
2389 		goto out;
2390 	}
2391 
2392 	/* If first active member of diskset, become the master. */
2393 	nd = sd->sd_nodelist;
2394 	while (nd) {
2395 		if (nd->nd_flags & MD_MN_NODE_OWN)
2396 			break;
2397 		nd = nd->nd_next;
2398 	}
2399 	if (nd == NULL)
2400 		master_flag = 1;
2401 
2402 	/*
2403 	 * If not first active member of diskset, then get the
2404 	 * master information from a node that is already joined
2405 	 * and set the master information for this node.  Be sure
2406 	 * that this node (the already joined node) has its own
2407 	 * join flag set.  If not, then this diskset isn't currently
2408 	 * consistent and shouldn't allow a node to join.  This diskset
2409 	 * inconsistency should only occur when a node has panic'd in
2410 	 * the set while doing a metaset operation and the sysadmin is
2411 	 * attempting to join a node into the set.  This inconsistency
2412 	 * will be fixed during a reconfig cycle which should be occurring
2413 	 * soon since a node panic'd.
2414 	 *
2415 	 * If unable to get this information from an owning node, then
2416 	 * this diskset isn't currently consistent and shouldn't
2417 	 * allow a node to join.
2418 	 */
2419 	if (!master_flag) {
2420 		/* get master information from an owner (joined) node */
2421 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
2422 		    sp->setno, &mas_mnsr, ep) == -1) {
2423 			rval = -1;
2424 			goto out;
2425 		}
2426 
2427 		/* Verify that owner (joined) node has its own JOIN flag set */
2428 		nr = mas_mnsr->sr_nodechain;
2429 		while (nr) {
2430 			if ((nd->nd_nodeid == nr->nr_nodeid) &&
2431 			    ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
2432 				(void) mddserror(ep, MDE_DS_NODENOSET,
2433 				    sp->setno, nd->nd_nodename, NULL,
2434 				    nd->nd_nodename);
2435 				free_sr((md_set_record *)mas_mnsr);
2436 				rval = -1;
2437 				goto out;
2438 			}
2439 			nr = nr->nr_next;
2440 		}
2441 
2442 		/*
2443 		 * Does master have set marked as STALE?
2444 		 * If so, need to pass this down to kernel when
2445 		 * this node snarfs the set.
2446 		 */
2447 		if (clnt_mn_is_stale(nd->nd_nodename, sp,
2448 		    &stale_bool, ep) == -1) {
2449 			rval = -1;
2450 			goto out;
2451 		}
2452 
2453 		/* set master information in my rpc.metad's set record */
2454 		if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
2455 		    mas_mnsr->sr_master_nodeid, ep)) {
2456 			free_sr((md_set_record *)mas_mnsr);
2457 			rval = -1;
2458 			goto out;
2459 		}
2460 
2461 		/* set master information in my cached set desc */
2462 		(void) strcpy(sd->sd_mn_master_nodenm,
2463 		    mas_mnsr->sr_master_nodenm);
2464 		sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
2465 		nd2 = sd->sd_nodelist;
2466 		while (nd2) {
2467 		    if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
2468 			sd->sd_mn_masternode = nd2;
2469 			break;
2470 		    }
2471 		    nd2 = nd2->nd_next;
2472 		}
2473 		free_sr((md_set_record *)mas_mnsr);
2474 
2475 		/*
2476 		 * Set the node flags in mynode's rpc.metad node records for
2477 		 * the nodes that are in the diskset.  Can use my sd
2478 		 * since earlier call to metaget_setownership set the
2479 		 * owner flags based on whether that node had snarfed
2480 		 * the MN diskset mddb.  Reconfig steps guarantee that
2481 		 * return of metaget_setownership will match the owning
2482 		 * node's owner list except in the case where a node
2483 		 * has just panic'd and in this case, a reconfig will
2484 		 * be starting immediately and the owner lists will
2485 		 * be sync'd up by the reconfig.
2486 		 *
2487 		 * Flag of SET means to take no action except to
2488 		 * set the node flags as given in the nodelist linked list.
2489 		 */
2490 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
2491 		    MD_NR_SET, NULL, ep)) {
2492 			rval = -1;
2493 			goto out;
2494 		}
2495 	}
2496 
2497 	/*
2498 	 * Read in the mddb if there are drives in the set.
2499 	 */
2500 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2501 	    ep)) == NULL) {
2502 		/* No drives in list */
2503 		if (! mdisok(ep)) {
2504 			rval = -1;
2505 			goto out;
2506 		}
2507 		rval = -2;
2508 		goto out;
2509 	}
2510 
2511 	/*
2512 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2513 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2514 	 * then change the nodelist followed by a reinit and resume.
2515 	 */
2516 	nd = sd->sd_nodelist;
2517 	while (nd) {
2518 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2519 			nd = nd->nd_next;
2520 			continue;
2521 		}
2522 
2523 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
2524 		    MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2525 			rval = -1;
2526 			goto out;
2527 		}
2528 		suspendall_flag = 1;
2529 		nd = nd->nd_next;
2530 	}
2531 
2532 	/* Set master in my set record in rpc.metad */
2533 	if (master_flag) {
2534 		if (clnt_mnsetmaster(mynode(), sp,
2535 		    sd->sd_mn_mynode->nd_nodename,
2536 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
2537 			rval = -1;
2538 			goto out;
2539 		}
2540 	}
2541 	/*
2542 	 * Causes mddbs to be loaded into the kernel.
2543 	 * Set the force flag so that replica locations can be
2544 	 * loaded into the kernel even if a mediator node was
2545 	 * unavailable.  This allows a node to join an MO
2546 	 * diskset when there are sufficient replicas available,
2547 	 * but a mediator node in unavailable.
2548 	 */
2549 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
2550 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2551 		    "Host not able to start diskset."));
2552 		rval = -1;
2553 		goto out;
2554 	}
2555 
2556 	if (! mdisok(ep)) {
2557 		rval = -1;
2558 		goto out;
2559 	}
2560 
2561 	/*
2562 	 * Set rollback flags to 1 so that halt_set is called if a failure
2563 	 * is seen after this point.  If snarf_set fails, still need to
2564 	 * call halt_set to cleanup the diskset.
2565 	 */
2566 	rb_flags = 1;
2567 
2568 	/* Starts the set */
2569 	if (snarf_set(sp, stale_bool, ep) != 0) {
2570 		if (mdismddberror(ep, MDE_DB_STALE)) {
2571 			/*
2572 			 * Don't fail join, STALE means that set has
2573 			 * < 50% mddbs.
2574 			 */
2575 			(void) mdstealerror(&ep_snarf, ep);
2576 			stale_set = 1;
2577 		} else if (mdisok(ep)) {
2578 			/* If snarf failed, but no error was set - set it */
2579 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2580 			    sp->setno, 0, NULL);
2581 				rval = -1;
2582 				goto out;
2583 		} else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
2584 			/*
2585 			 * Don't fail join if ACCOK; ACCOK means that mediator
2586 			 * provided extra vote.
2587 			 */
2588 			rval = -1;
2589 			goto out;
2590 		}
2591 	}
2592 
2593 	/* Did set really get snarfed? */
2594 	if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
2595 		if (mdisok(ep)) {
2596 			/* If snarf failed, but no error was set - set it */
2597 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2598 				sp->setno, 0, NULL);
2599 		}
2600 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2601 		    "Host not able to start diskset."));
2602 		rval = -1;
2603 		goto out;
2604 	}
2605 
2606 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2607 	send_reinit = 1;
2608 
2609 	/* If first node to enter set, setup master and clear change log */
2610 	if (master_flag) {
2611 		/* Set master in my locally cached set descriptor */
2612 		(void) strcpy(sd->sd_mn_master_nodenm,
2613 		    sd->sd_mn_mynode->nd_nodename);
2614 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
2615 		sd->sd_mn_am_i_master = 1;
2616 
2617 		/*
2618 		 * If first node to join set, then clear out change log
2619 		 * entries.  Change log entries are only needed when a
2620 		 * change of master is occurring in a diskset that has
2621 		 * multiple owners.   Since this node is the first owner
2622 		 * of the diskset, clear the entries.
2623 		 *
2624 		 * Only do this if we are in a single node non-SC3.x
2625 		 * situation.
2626 		 */
2627 		if (meta_mn_singlenode() &&
2628 			mdmn_reset_changelog(sp, ep,  MDMN_CLF_RESETLOG) != 0) {
2629 			mde_perror(ep, dgettext(TEXT_DOMAIN,
2630 			    "Unable to reset changelog."));
2631 			rval = -1;
2632 			goto out;
2633 		}
2634 	}
2635 
2636 	/* Set my locally cached flag */
2637 	sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
2638 
2639 	/*
2640 	 * Set this node's own flag on all joined nodes in the set
2641 	 * (including my node).
2642 	 */
2643 	clear_nr_flags = 1;
2644 
2645 	my_nd = *(sd->sd_mn_mynode);
2646 	my_nd.nd_next = NULL;
2647 	nd = sd->sd_nodelist;
2648 	while (nd) {
2649 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2650 			nd = nd->nd_next;
2651 			continue;
2652 		}
2653 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2654 		    MD_NR_JOIN, NULL, ep)) {
2655 			rval = -1;
2656 			goto out;
2657 		}
2658 		nd = nd->nd_next;
2659 	}
2660 
2661 out:
2662 	if (rval != NULL) {
2663 		/*
2664 		 * If rollback flag is 1, then node was joined to set.
2665 		 * Since an error occurred, withdraw node from set in
2666 		 * order to rollback to before command was run.
2667 		 * Need to preserve ep so that calling function can
2668 		 * get error information.
2669 		 */
2670 		if (rb_flags == 1) {
2671 			if (halt_set(sp, &xep)) {
2672 				mdclrerror(&xep);
2673 			}
2674 		}
2675 
2676 		/*
2677 		 * If error, reset master to INVALID.
2678 		 * Ignore error since (next) first node to successfully join
2679 		 * will set master on all nodes.
2680 		 */
2681 		(void) clnt_mnsetmaster(mynode(), sp, "",
2682 			MD_MN_INVALID_NID, &xep);
2683 		mdclrerror(&xep);
2684 		/* Reset master in my locally cached set descriptor */
2685 		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
2686 		sd->sd_mn_am_i_master = 0;
2687 
2688 		/*
2689 		 * If nr flags set on other nodes, reset them.
2690 		 */
2691 		if (clear_nr_flags) {
2692 			nd = sd->sd_nodelist;
2693 			while (nd) {
2694 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2695 					nd = nd->nd_next;
2696 					continue;
2697 				}
2698 				(void) clnt_upd_nr_flags(nd->nd_nodename, sp,
2699 					&my_nd, MD_NR_WITHDRAW, NULL, &xep);
2700 				mdclrerror(&xep);
2701 				nd = nd->nd_next;
2702 			}
2703 			/* Reset my locally cached flag */
2704 			sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
2705 		}
2706 	}
2707 
2708 	/*
2709 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2710 	 * Send reinit command to mdcommd which forces it to get
2711 	 * fresh set description.
2712 	 */
2713 	if (send_reinit) {
2714 		/* Send reinit */
2715 		nd = sd->sd_nodelist;
2716 		while (nd) {
2717 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2718 				nd = nd->nd_next;
2719 				continue;
2720 			}
2721 
2722 			/* Class is ignored for REINIT */
2723 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2724 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2725 				/*
2726 				 * We are here because we failed to resume
2727 				 * rpc.mdcommd.  However we potentially have
2728 				 * an error from the previous call
2729 				 * If the previous call did fail,  we capture
2730 				 * that error and generate a perror with
2731 				 * the string, "Unable to resume...".
2732 				 * Setting rval to -1 ensures that in the
2733 				 * next iteration of the loop, ep is not
2734 				 * clobbered.
2735 				 */
2736 				if (rval == 0)
2737 					(void) mdstealerror(ep, &xep);
2738 				else
2739 					mdclrerror(&xep);
2740 				rval = -1;
2741 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2742 				    "Unable to reinit rpc.mdcommd."));
2743 			}
2744 			nd = nd->nd_next;
2745 		}
2746 
2747 	}
2748 
2749 out2:
2750 	/*
2751 	 * Unlock diskset by resuming messages across the diskset.
2752 	 * Just resume all classes so that resume is the same whether
2753 	 * just one class was locked or all classes were locked.
2754 	 */
2755 	if ((suspend1_flag) || (suspendall_flag)) {
2756 		nd = sd->sd_nodelist;
2757 		while (nd) {
2758 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2759 				nd = nd->nd_next;
2760 				continue;
2761 			}
2762 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2763 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2764 				/*
2765 				 * We are here because we failed to resume
2766 				 * rpc.mdcommd.  However we potentially have
2767 				 * an error from the previous call
2768 				 * If the previous call did fail,  we capture
2769 				 * that error and generate a perror with
2770 				 * the string, "Unable to resume...".
2771 				 * Setting rval to -1 ensures that in the
2772 				 * next iteration of the loop, ep is not
2773 				 * clobbered.
2774 				 */
2775 				if (rval == 0)
2776 					(void) mdstealerror(ep, &xep);
2777 				else
2778 					mdclrerror(&xep);
2779 				rval = -1;
2780 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2781 				    "Unable to resume rpc.mdcommd."));
2782 			}
2783 			nd = nd->nd_next;
2784 		}
2785 		meta_ping_mnset(sp->setno);
2786 	}
2787 
2788 	/*
2789 	 * Unlock set.  This flushes the caches on the servers.
2790 	 */
2791 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2792 	nd = sd->sd_nodelist;
2793 	while (nd) {
2794 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2795 			nd = nd->nd_next;
2796 			continue;
2797 		}
2798 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2799 			if (rval == 0)
2800 				(void) mdstealerror(ep, &xep);
2801 			else
2802 				mdclrerror(&xep);
2803 			rval = -1;
2804 		}
2805 		nd = nd->nd_next;
2806 	}
2807 
2808 	/*
2809 	 * If this node is the last to join the diskset and clustering isn't
2810 	 * running, then resync the mirrors in the diskset. We have to wait
2811 	 * until all nodes are joined so that the status gets propagated to
2812 	 * all of the members of the set.
2813 	 * Ignore any error from the resync as the join function shouldn't fail
2814 	 * because the mirror resync had a problem.
2815 	 *
2816 	 * Don't start resync if set is stale.
2817 	 */
2818 	if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
2819 	    (stale_set != 1)) {
2820 		nd = sd->sd_nodelist;
2821 		while (nd) {
2822 			if (!(nd->nd_flags & MD_MN_NODE_OWN))
2823 				break;
2824 			nd = nd->nd_next;
2825 		}
2826 		/*
2827 		 * nd set to NULL means that we have no nodes in the set that
2828 		 * haven't joined. In this case we start the resync.
2829 		 */
2830 		if (nd == NULL) {
2831 			(void) meta_mirror_resync_all(sp, 0, &xep);
2832 			mdclrerror(&xep);
2833 		}
2834 	}
2835 
2836 	/* Update ABR state for all soft partitions */
2837 	(void) meta_sp_update_abr(sp, &xep);
2838 	mdclrerror(&xep);
2839 
2840 	/*
2841 	 * call metaflushsetnames to reset local cache for master and
2842 	 * node information.
2843 	 */
2844 	metaflushsetname(sp);
2845 
2846 	/* release signals back to what they were on entry */
2847 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2848 		mdclrerror(&xep);
2849 
2850 	/*
2851 	 * If no error and stale_set is set, then set ep back
2852 	 * to ep from snarf_set call and return -3.  If another error
2853 	 * occurred and rval is not 0, then that error would have
2854 	 * caused the node to be withdrawn from the set and would
2855 	 * have set ep to that error information.
2856 	 */
2857 	if ((rval == 0) && (stale_set)) {
2858 		(void) mdstealerror(ep, &ep_snarf);
2859 		return (-3);
2860 	}
2861 
2862 	return (rval);
2863 }
2864 
2865 /*
2866  * Entry point to withdraw a node from MultiNode diskset.
2867  *
2868  * Validate host in diskset.
2869  *	- Should be joined into diskset.
2870  * Assume valid configuration is stored in the set/drive/node records
2871  * in the local mddb since no node or drive can be added to the MNset
2872  * unless all drives and nodes are available.  Reconfig steps will
2873  * resync all ALIVE nodes in case of panic in critical areas.
2874  *
2875  * Lock down the set.
2876  * Verify that drives exist in configuration.
2877  * Verify host is a member of this diskset.
2878  * Verify host is an owner of the diskset (host is joined to diskset).
2879  * Only allow withdrawal of master node if master node is the only joined
2880  * in the diskset.
2881  * Halt the diskset on this node.
2882  * Reset Master on this node.
2883  * Updated node flags that this node with withdrawn.
2884  * Unlock the set.
2885  *
2886  * Return values:
2887  *	0  - Node successfully withdrew from set.
2888  *	-1 - Withdrawal attempted but failed
2889  *		- any failure from libmeta calls
2890  *		- node not in the member list
2891  *	-2 - Withdrawal not attempted since
2892  *		- this set had no drives in set
2893  *		- this node not joined to set
2894  *		- set is not a multinode set
2895  */
2896 extern int
2897 meta_set_withdraw(
2898 	mdsetname_t	*sp,
2899 	md_error_t	*ep
2900 )
2901 {
2902 	md_set_desc		*sd;
2903 	md_drive_desc		*dd = 0;
2904 	md_mnnode_desc		*nd, my_nd;
2905 	int			rval = 0;
2906 	md_setkey_t		*cl_sk;
2907 	md_error_t		xep = mdnullerror;
2908 	int			set_halted = 0;
2909 	int			suspendall_flag = 0;
2910 	int			suspend1_flag = 0;
2911 	bool_t			stale_bool = FALSE;
2912 	mddb_config_t		c;
2913 	int			node_id_list[1];
2914 	sigset_t		oldsigs;
2915 	int			send_reinit = 0;
2916 
2917 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2918 		return (-1);
2919 	}
2920 
2921 	/* Must be a multinode diskset */
2922 	if (!MD_MNSET_DESC(sd)) {
2923 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2924 		return (-1);
2925 	}
2926 
2927 	/* Make sure we are blocking all signals */
2928 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2929 		mdclrerror(&xep);
2930 
2931 	/*
2932 	 * Lock the set on current set members.
2933 	 * For MN diskset lock_set and SUSPEND are used to protect against
2934 	 * other meta* commands running on the other nodes.
2935 	 */
2936 	nd = sd->sd_nodelist;
2937 	while (nd) {
2938 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2939 			nd = nd->nd_next;
2940 			continue;
2941 		}
2942 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2943 			rval = -1;
2944 			goto out;
2945 		}
2946 		nd = nd->nd_next;
2947 	}
2948 	/*
2949 	 * Lock out other meta* commands by suspending
2950 	 * class 1 messages across the diskset.
2951 	 */
2952 	nd = sd->sd_nodelist;
2953 	while (nd) {
2954 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2955 			nd = nd->nd_next;
2956 			continue;
2957 		}
2958 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2959 			sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2960 			rval = -1;
2961 			goto out;
2962 		}
2963 		suspend1_flag = 1;
2964 		nd = nd->nd_next;
2965 	}
2966 
2967 	/* Get list of drives - needed in case of failure */
2968 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2969 	    ep)) == NULL) {
2970 		/* Error getting drives in list */
2971 		if (! mdisok(ep)) {
2972 			rval = -1;
2973 			goto out2;
2974 		}
2975 		/* no drives in list */
2976 		rval = -2;
2977 		goto out2;
2978 	}
2979 
2980 	/*
2981 	 * Verify that this host is a member (in the host list) of the set.
2982 	 */
2983 	nd = sd->sd_nodelist;
2984 	while (nd) {
2985 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2986 			break;
2987 		}
2988 		nd = nd->nd_next;
2989 	}
2990 	if (!nd) {
2991 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2992 			sd->sd_mn_mynode->nd_nodename, NULL,
2993 			sp->setname);
2994 		rval = -1;
2995 		goto out2;
2996 	}
2997 
2998 	/*
2999 	 * Call metaget_setownership that calls each node in diskset and
3000 	 * marks in set descriptor if node is an owner of the set or not.
3001 	 * metaget_setownership checks to see if a node is an owner by
3002 	 * checking to see if that node's kernel has the mddb loaded.
3003 	 * If a node had panic'd during a reconfig or an
3004 	 * add/delete/join/withdraw operation, the other nodes' node
3005 	 * records may not reflect the current state of the diskset,
3006 	 * so calling metaget_setownership is the safest thing to do.
3007 	 */
3008 	if (metaget_setownership(sp, ep) == -1) {
3009 		rval = -1;
3010 		goto out2;
3011 	}
3012 
3013 	/*
3014 	 * Verify that this node is joined
3015 	 * to diskset (i.e. is an owner of the diskset).
3016 	 */
3017 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
3018 		rval = -2;
3019 		goto out2;
3020 	}
3021 
3022 	/*
3023 	 * For a MN diskset, only withdraw master if it is
3024 	 * the only joined node.
3025 	 */
3026 	if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
3027 		nd = sd->sd_nodelist;
3028 		while (nd) {
3029 			/* Skip my node since checking for other owners */
3030 			if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
3031 				nd = nd->nd_next;
3032 				continue;
3033 			}
3034 			/* If another owner node if found, error */
3035 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3036 				(void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
3037 					sp->setno,
3038 					sd->sd_mn_mynode->nd_nodename, NULL,
3039 					sp->setname);
3040 				rval = -1;
3041 				goto out2;
3042 			}
3043 			nd = nd->nd_next;
3044 		}
3045 	}
3046 
3047 	/*
3048 	 * Is current set STALE?
3049 	 */
3050 	(void) memset(&c, 0, sizeof (c));
3051 	c.c_id = 0;
3052 	c.c_setno = sp->setno;
3053 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
3054 		(void) mdstealerror(ep, &c.c_mde);
3055 		rval = -1;
3056 		goto out;
3057 	}
3058 	if (c.c_flags & MDDB_C_STALE) {
3059 		stale_bool = TRUE;
3060 	}
3061 
3062 	/*
3063 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3064 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
3065 	 * then change the nodelist followed by a reinit and resume.
3066 	 */
3067 	nd = sd->sd_nodelist;
3068 	while (nd) {
3069 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3070 			nd = nd->nd_next;
3071 			continue;
3072 		}
3073 
3074 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
3075 		    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
3076 			rval = -1;
3077 			goto out;
3078 		}
3079 		suspendall_flag = 1;
3080 		nd = nd->nd_next;
3081 	}
3082 
3083 	/*
3084 	 * Withdraw the set - halt set.
3085 	 * This will fail if any I/O is occuring to any metadevice which
3086 	 * includes a resync to a mirror metadevice.
3087 	 */
3088 	set_halted = 1;
3089 	if (halt_set(sp, ep)) {
3090 		/* Was set actually halted? */
3091 		if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
3092 			set_halted = 0;
3093 		}
3094 		rval = -1;
3095 		goto out;
3096 	}
3097 
3098 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
3099 	send_reinit = 1;
3100 
3101 	/* Reset master on withdrawn node */
3102 	if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
3103 	    MD_MN_INVALID_NID, ep)) {
3104 		rval = -1;
3105 		goto out;
3106 	}
3107 
3108 	/* Mark my node as withdrawn and send to other nodes */
3109 	nd = sd->sd_nodelist;
3110 	my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3111 	my_nd.nd_next = NULL;
3112 	while (nd) {
3113 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3114 			nd = nd->nd_next;
3115 			continue;
3116 		}
3117 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3118 		    MD_NR_WITHDRAW, NULL, ep)) {
3119 			rval = -1;
3120 			goto out;
3121 		}
3122 		nd = nd->nd_next;
3123 	}
3124 
3125 	/*
3126 	 * If withdrawn node is a mirror owner, reset mirror owner
3127 	 * to NULL.  If an error occurs, print a warning and continue.
3128 	 * Don't fail metaset because of mirror owner reset problem since
3129 	 * next node to grab mirror will resolve this issue.
3130 	 * Before next node grabs mirrors, metaset will show the withdrawn
3131 	 * node as owner which is why an attempt to reset the mirror owner
3132 	 * is made.
3133 	 */
3134 	node_id_list[0] = sd->sd_mn_mynode->nd_nodeid;	/* Setup my nodeid */
3135 	nd = sd->sd_nodelist;
3136 	while (nd) {
3137 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3138 			nd = nd->nd_next;
3139 			continue;
3140 		}
3141 		if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
3142 		    1, &node_id_list[0], &xep) == 01) {
3143 			mde_perror(&xep, dgettext(TEXT_DOMAIN,
3144 			    "Unable to reset mirror owner on node %s"),
3145 			    nd->nd_nodename);
3146 			mdclrerror(&xep);
3147 		}
3148 		nd = nd->nd_next;
3149 	}
3150 
3151 out:
3152 	if (rval == -1) {
3153 		/* Rejoin node - Mark node as joined and send to other nodes */
3154 		nd = sd->sd_nodelist;
3155 		my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3156 		my_nd.nd_next = NULL;
3157 		while (nd) {
3158 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3159 				nd = nd->nd_next;
3160 				continue;
3161 			}
3162 			if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3163 			    MD_NR_JOIN, NULL, &xep)) {
3164 				mdclrerror(&xep);
3165 			}
3166 			nd = nd->nd_next;
3167 		}
3168 
3169 		/* Set master on withdrawn node */
3170 		if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
3171 		    sd->sd_mn_master_nodenm,
3172 		    sd->sd_mn_master_nodeid, &xep)) {
3173 			mdclrerror(&xep);
3174 		}
3175 
3176 		/* Join set if halt_set had succeeded */
3177 		if (set_halted) {
3178 			/*
3179 			 * Causes mddbs to be loaded into the kernel.
3180 			 * Set the force flag so that replica locations can be
3181 			 * loaded into the kernel even if a mediator node was
3182 			 * unavailable.  This allows a node to join an MO
3183 			 * diskset when there are sufficient replicas available,
3184 			 * but a mediator node in unavailable.
3185 			 */
3186 			if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) {
3187 				mdclrerror(&xep);
3188 			}
3189 			/* If set previously stale - make it so at re-join */
3190 			if (snarf_set(sp, stale_bool, &xep) != 0) {
3191 				mdclrerror(&xep);
3192 				(void) halt_set(sp, &xep);
3193 				mdclrerror(&xep);
3194 			}
3195 		}
3196 	}
3197 
3198 	/*
3199 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3200 	 * Send reinit command to mdcommd which forces it to get
3201 	 * fresh set description.
3202 	 */
3203 	if (send_reinit) {
3204 		/* Send reinit */
3205 		nd = sd->sd_nodelist;
3206 		while (nd) {
3207 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3208 				nd = nd->nd_next;
3209 				continue;
3210 			}
3211 
3212 			/* Class is ignored for REINIT */
3213 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3214 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3215 				/*
3216 				 * We are here because we failed to resume
3217 				 * rpc.mdcommd.  However we potentially have
3218 				 * an error from the previous call.
3219 				 * If the previous call did fail,  we
3220 				 * capture that error and generate a perror
3221 				 * withthe string,  "Unable to resume...".
3222 				 * Setting rval to -1 ensures that in the
3223 				 * next iteration of the loop, ep is not
3224 				 * clobbered.
3225 				 */
3226 				if (rval == 0)
3227 					(void) mdstealerror(ep, &xep);
3228 				else
3229 					mdclrerror(&xep);
3230 				rval = -1;
3231 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3232 				    "Unable to reinit rpc.mdcommd."));
3233 			}
3234 			nd = nd->nd_next;
3235 		}
3236 	}
3237 
3238 out2:
3239 	/*
3240 	 * Unlock diskset by resuming messages across the diskset.
3241 	 * Just resume all classes so that resume is the same whether
3242 	 * just one class was locked or all classes were locked.
3243 	 */
3244 	if ((suspend1_flag) || (suspendall_flag)) {
3245 		nd = sd->sd_nodelist;
3246 		while (nd) {
3247 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3248 				nd = nd->nd_next;
3249 				continue;
3250 			}
3251 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3252 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
3253 				/*
3254 				 * We are here because we failed to resume
3255 				 * rpc.mdcommd.  However we potentially have
3256 				 * an error from the previous call
3257 				 * If the previous call did fail,  we capture
3258 				 * that error and generate a perror with
3259 				 * the string, "Unable to resume...".
3260 				 * Setting rval to -1 ensures that in the
3261 				 * next iteration of the loop, ep is not
3262 				 * clobbered.
3263 				 */
3264 				if (rval == 0)
3265 					(void) mdstealerror(ep, &xep);
3266 				else
3267 					mdclrerror(&xep);
3268 				rval = -1;
3269 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3270 				    "Unable to resume rpc.mdcommd."));
3271 			}
3272 			nd = nd->nd_next;
3273 		}
3274 		meta_ping_mnset(sp->setno);
3275 	}
3276 
3277 	/*
3278 	 * Unlock set.  This flushes the caches on the servers.
3279 	 */
3280 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3281 	nd = sd->sd_nodelist;
3282 	while (nd) {
3283 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3284 			nd = nd->nd_next;
3285 			continue;
3286 		}
3287 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
3288 			if (rval == 0)
3289 				(void) mdstealerror(ep, &xep);
3290 			else
3291 				mdclrerror(&xep);
3292 			rval = -1;
3293 		}
3294 		nd = nd->nd_next;
3295 	}
3296 
3297 	/*
3298 	 * call metaflushsetnames to reset local cache for master and
3299 	 * node information.
3300 	 */
3301 	metaflushsetname(sp);
3302 
3303 	/* release signals back to what they were on entry */
3304 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3305 		mdclrerror(&xep);
3306 
3307 	return (rval);
3308 
3309 }
3310 
3311 /*
3312  * Update nodelist with cluster member information.
3313  * A node not in the member list will be marked
3314  * as not ALIVE and not OWN.
3315  * A node in the member list will be marked ALIVE, but
3316  * the OWN bit will not be changed.
3317  *
3318  * If mynode isn't in the membership list, fail causing
3319  * another reconfig cycle to be started since a non-member
3320  * node shouldn't be taking part in the reconfig cycle.
3321  *
3322  * Return values:
3323  *	0 - No problem.
3324  *	1 - Any failure including RPC failure to my node.
3325  */
3326 int
3327 meta_reconfig_update_nodelist(
3328 	mdsetname_t			*sp,
3329 	mndiskset_membershiplist_t	*nl,
3330 	md_set_desc			*sd,
3331 	md_error_t			*ep
3332 )
3333 {
3334 	mndiskset_membershiplist_t	*nl2;
3335 	md_mnnode_desc			*nd;
3336 	md_error_t			xep = mdnullerror;
3337 	int				rval = 0;
3338 
3339 	/*
3340 	 * Walk through nodelist, checking to see if each
3341 	 * node is in the member list.
3342 	 * If node is not a member, reset ALIVE and OWN node flag.
3343 	 * If node is a member, set ALIVE.
3344 	 * If mynode's OWN flag gets reset, then halt the diskset on this node.
3345 	 */
3346 	nd = sd->sd_nodelist;
3347 	while (nd) {
3348 		nl2 = nl;
3349 		while (nl2) {
3350 			/* If node is in member list, set ALIVE */
3351 			if (nl2->msl_node_id == nd->nd_nodeid) {
3352 				nd->nd_flags |= MD_MN_NODE_ALIVE;
3353 				break;
3354 			} else {
3355 				nl2 = nl2->next;
3356 			}
3357 			/* node is not in member list, mark !ALIVE and !OWN */
3358 			if (nl2 == NULL) {
3359 				/* If node is mynode, then halt set if needed */
3360 				if (strcmp(mynode(), nd->nd_nodename) == 0) {
3361 					/*
3362 					 * This shouldn't happen, but just
3363 					 * in case...  Any node not in the
3364 					 * membership list should be dead and
3365 					 * not running reconfig step1.
3366 					 */
3367 					if (nd->nd_flags & MD_MN_NODE_OWN) {
3368 						if (halt_set(sp, &xep)) {
3369 							mde_perror(&xep, "");
3370 							mdclrerror(&xep);
3371 						}
3372 					}
3373 					/*
3374 					 * Return failure since this node
3375 					 * (mynode) is not in the membership
3376 					 * list, but process the rest of the
3377 					 * nodelist first so that rpc.metad
3378 					 * can be updated with the latest
3379 					 * membership information.
3380 					 */
3381 					(void) mddserror(ep,
3382 					    MDE_DS_NOTINMEMBERLIST,
3383 					    sp->setno, nd->nd_nodename, NULL,
3384 					    sp->setname);
3385 					rval = 1;
3386 				}
3387 				nd->nd_flags &= ~MD_MN_NODE_ALIVE;
3388 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3389 			}
3390 		}
3391 		nd = nd->nd_next;
3392 	}
3393 
3394 	/* Send this information to rpc.metad */
3395 	if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
3396 	    MD_NR_SET,  MNSET_IN_RECONFIG, &xep)) {
3397 		/* Return failure if can't send node flags to rpc.metad */
3398 		if (rval == 0) {
3399 			(void) mdstealerror(ep, &xep);
3400 			rval = 1;
3401 		}
3402 	}
3403 	return (rval);
3404 }
3405 
3406 /*
3407  * Choose master determines the master for a diskset.
3408  * Each node determines the master on its own and
3409  * adds this information to its local rpc.metad nodelist
3410  * and also sends it to the kernel.
3411  *
3412  * Nodelist in set descriptor (sd) is sorted in
3413  * monotonically increasing sequence of nodeid.
3414  *
3415  * Return values:
3416  *	0 - No problem.
3417  *	205 - There was an RPC problem to another node.
3418  *	-1 - There was an error.  This could be an RPC error to my node.
3419  *		This is a catastrophic failure causing node to panic.
3420  */
3421 int
3422 meta_reconfig_choose_master_for_set(
3423 	mdsetname_t	*sp,
3424 	md_set_desc	*sd,
3425 	md_error_t	*ep
3426 )
3427 {
3428 	int			is_owner;
3429 	md_mnset_record		*mnsr = NULL;
3430 	int			lowest_alive_nodeid = 0;
3431 	uint_t			master_nodeid;
3432 	md_mnnode_desc		*nd, *nd2;
3433 	md_mnnode_record	*nr;
3434 	md_drive_desc		*dd;
3435 	md_setkey_t		*cl_sk;
3436 	int			rval = 0;
3437 	md_error_t		xep = mdnullerror;
3438 	mddb_setflags_config_t	sf;
3439 
3440 	/*
3441 	 * Is current node joined to diskset?
3442 	 * Don't trust flags, really check to see if mddb is snarfed.
3443 	 */
3444 	if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
3445 		/*
3446 		 * If a node is joined to the diskset, this node checks
3447 		 * to see if the current master of the diskset is valid and
3448 		 * is still in the membership list (ALIVE) and is
3449 		 * still joined (OWN).  Need to verify if master is
3450 		 * really joined - don't trust the flags.  (Can trust
3451 		 * ALIVE since set during earlier part of reconfig cycle.)
3452 		 * If the current master is valid, still in the membership
3453 		 * list and joined, then master is not changed on this node.
3454 		 * Just return.
3455 		 *
3456 		 * Verify that nodeid is valid before accessing masternode.
3457 		 */
3458 		if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
3459 		    (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
3460 			if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
3461 			    &is_owner, ep) == -1) {
3462 				/* If RPC failure to another node return 205 */
3463 				if ((mdanyrpcerror(ep)) &&
3464 				    (sd->sd_mn_mynode->nd_nodeid !=
3465 				    sd->sd_mn_master_nodeid)) {
3466 					return (205);
3467 				} else {
3468 					/* Any other failure */
3469 					return (-1);
3470 				}
3471 			} else {
3472 				if (is_owner == TRUE) {
3473 
3474 					meta_mc_log(MC_LOG5, dgettext(
3475 					    TEXT_DOMAIN, "Set %s previous "
3476 					    "master chosen %s (%d): %s"),
3477 					    sp->setname,
3478 					    sd->sd_mn_master_nodenm,
3479 					    sd->sd_mn_master_nodeid,
3480 					    meta_print_hrtime(gethrtime() -
3481 					    start_time));
3482 
3483 					/* Previous master is ok - done */
3484 					return (0);
3485 				}
3486 			}
3487 		}
3488 
3489 		/*
3490 		 * If current master is no longer in the membership list or
3491 		 * is no longer joined, then this node uses the following
3492 		 * algorithm:
3493 		 * - node calls RPC routine clnt_ownset to get latest
3494 		 *	information on which nodes are owners of diskset.
3495 		 * 	clnt_ownset checks on each node to see if its kernel
3496 		 *	has that diskset snarfed.
3497 		 */
3498 		nd = sd->sd_nodelist;
3499 		while (nd) {
3500 			/* Don't consider node that isn't in member list */
3501 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3502 				nd = nd->nd_next;
3503 				continue;
3504 			}
3505 
3506 			if (clnt_ownset(nd->nd_nodename, sp,
3507 			    &is_owner, ep) == -1) {
3508 				/* If RPC failure to another node return 205 */
3509 				if ((mdanyrpcerror(ep)) &&
3510 				    (sd->sd_mn_mynode->nd_nodeid !=
3511 				    nd->nd_nodeid)) {
3512 					return (205);
3513 				} else {
3514 					/* Any other failure */
3515 					return (-1);
3516 				}
3517 			}
3518 
3519 			/*
3520 			 * Set owner flag for each node based on whether
3521 			 * that node really has a diskset mddb snarfed in
3522 			 * or not.
3523 			 */
3524 			if (is_owner == TRUE)
3525 				nd->nd_flags |= MD_MN_NODE_OWN;
3526 			else
3527 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3528 
3529 			nd = nd->nd_next;
3530 		}
3531 
3532 		/*
3533 		 * - node walks through nodelist looking for nodes that are
3534 		 *	owners of the diskset that are in the membership list.
3535 		 * - for each owner, node calls RPC routine clnt_getset to
3536 		 *	 see if that node has its node record set to OK.
3537 		 * - If so, master is chosen to be this owner node.
3538 		 */
3539 		nd = sd->sd_nodelist;
3540 		while (nd) {
3541 			/* Don't consider node that isn't in member list */
3542 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3543 				nd = nd->nd_next;
3544 				continue;
3545 			}
3546 
3547 			/* Don't consider a node that isn't an owner */
3548 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3549 				nd = nd->nd_next;
3550 				continue;
3551 			}
3552 
3553 			/* Does node has its own node record set to OK? */
3554 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3555 			    MD_SET_BAD, &mnsr, ep) == -1) {
3556 				/* If RPC failure to another node return 205 */
3557 				if ((mdanyrpcerror(ep)) &&
3558 				    (sd->sd_mn_mynode->nd_nodeid !=
3559 				    nd->nd_nodeid)) {
3560 					return (205);
3561 				} else {
3562 					/* Any other failure */
3563 					return (-1);
3564 				}
3565 			}
3566 			nr = mnsr->sr_nodechain;
3567 			while (nr) {
3568 				if (nd->nd_nodeid == nr->nr_nodeid) {
3569 					if (nr->nr_flags & MD_MN_NODE_OK) {
3570 						/* Found a master */
3571 						free_sr(
3572 						    (md_set_record *)mnsr);
3573 						goto found_master;
3574 					}
3575 				}
3576 				nr = nr->nr_next;
3577 			}
3578 			free_sr((md_set_record *)mnsr);
3579 			nd = nd->nd_next;
3580 		}
3581 
3582 		/*
3583 		 * - If no owner node has its own node record on its own node
3584 		 *	set to OK, then this node checks all of the non-owner
3585 		 * 	nodes that are in the membership list.
3586 		 * - for each non-owner, node calls RPC routine clnt_getset to
3587 		 *	 see if that node has its node record set to OK.
3588 		 * - If set doesn't exist, don't choose node for master.
3589 		 * - If so, master is chosen to be this non-owner node.
3590 		 *
3591 		 */
3592 		nd = sd->sd_nodelist;
3593 		while (nd) {
3594 			/* Don't consider node that isn't in member list */
3595 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3596 				nd = nd->nd_next;
3597 				continue;
3598 			}
3599 
3600 			/* Only checking non-owner nodes this time around */
3601 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3602 				nd = nd->nd_next;
3603 				continue;
3604 			}
3605 
3606 			/* Does node has its own node record set to OK? */
3607 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3608 			    MD_SET_BAD, &mnsr, ep) == -1) {
3609 				/*
3610 				 * If set doesn't exist on non-owner node,
3611 				 * don't consider this node for master.
3612 				 */
3613 				if (mdiserror(ep, MDE_NO_SET)) {
3614 					nd = nd->nd_next;
3615 					continue;
3616 				} else if ((mdanyrpcerror(ep)) &&
3617 				    (sd->sd_mn_mynode->nd_nodeid !=
3618 				    nd->nd_nodeid)) {
3619 					/* RPC failure to another node */
3620 					return (205);
3621 				} else {
3622 					/* Any other failure */
3623 					return (-1);
3624 				}
3625 			}
3626 			nr = mnsr->sr_nodechain;
3627 			while (nr) {
3628 				if (nd->nd_nodeid == nr->nr_nodeid) {
3629 					if (nr->nr_flags & MD_MN_NODE_OK) {
3630 						/* Found a master */
3631 						free_sr(
3632 						    (md_set_record *)mnsr);
3633 						goto found_master;
3634 					}
3635 				}
3636 				nr = nr->nr_next;
3637 			}
3638 			free_sr((md_set_record *)mnsr);
3639 			nd = nd->nd_next;
3640 		}
3641 
3642 		/*
3643 		 * - If no node can be found that has its own node record on
3644 		 *	its node to be set to OK, then all alive nodes
3645 		 * 	were in the process of being added to or deleted
3646 		 *	from set.  Each alive node will remove all
3647 		 *	information pertaining to this set from its node.
3648 		 *
3649 		 * If all nodes in set are ALIVE, then call sdssc end routines
3650 		 * since set was truly being initially created or destroyed.
3651 		 */
3652 		goto delete_set;
3653 	} else {
3654 
3655 		/*
3656 		 * If node is not joined to diskset, then this
3657 		 * node uses the following algorithm:
3658 		 * - If unjoined node doesn't have a node record for itself,
3659 		 *	just delete the diskset since diskset was in the
3660 		 *	process of being created.
3661 		 * - node needs to find master of diskset before
3662 		 *	reconfig cycle, if a master existed.
3663 		 * - node calls RPC routine clnt_ownset to get latest
3664 		 * 	information on which nodes are owners of diskset.
3665 		 *	clnt_ownset checks on each node to see if its
3666 		 *	kernel has that diskset snarfed.
3667 		 */
3668 
3669 		/*
3670 		 * Is my node in the set description?
3671 		 * If not, delete the set from this node.
3672 		 * sr2setdesc sets sd_mn_mynode pointer to the node
3673 		 * descriptor for this node if there was a node
3674 		 * record for this node.
3675 		 *
3676 		 */
3677 		if (sd->sd_mn_mynode == NULL) {
3678 			goto delete_set;
3679 		}
3680 
3681 		nd = sd->sd_nodelist;
3682 		while (nd) {
3683 			/* Don't consider node that isn't in member list */
3684 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3685 				nd = nd->nd_next;
3686 				continue;
3687 			}
3688 
3689 			if (clnt_ownset(nd->nd_nodename, sp,
3690 			    &is_owner, ep) == -1) {
3691 				/* If RPC failure to another node return 205 */
3692 				if ((mdanyrpcerror(ep)) &&
3693 				    (sd->sd_mn_mynode->nd_nodeid !=
3694 				    nd->nd_nodeid)) {
3695 					return (205);
3696 				} else {
3697 					/* Any other failure */
3698 					return (-1);
3699 				}
3700 			}
3701 
3702 			/*
3703 			 * Set owner flag for each node based on whether
3704 			 * that node really has a diskset mddb snarfed in
3705 			 * or not.
3706 			 */
3707 			if (is_owner == TRUE)
3708 				nd->nd_flags |= MD_MN_NODE_OWN;
3709 			else
3710 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3711 
3712 			nd = nd->nd_next;
3713 		}
3714 
3715 		/*
3716 		 * - node walks through nodelist looking for nodes that
3717 		 *	are owners of the diskset that are in
3718 		 *	the membership list.
3719 		 * - for each owner, node calls RPC routine clnt_getset to
3720 		 *	see if that node has a master set and to get the
3721 		 *	diskset description.
3722 		 * - If the owner node has a set description that doesn't
3723 		 *	include the non-joined node in the nodelist, this node
3724 		 *	removes its set description of that diskset
3725 		 *	(i.e. removes the set from its local mddbs).  This is
3726 		 *	handling the case of when a node was removed from a
3727 		 *	diskset while it was not in the cluster membership
3728 		 *	list.
3729 		 * - If that node has a master set and the master is in the
3730 		 *	membership list and is an owner, then either this was
3731 		 *	the master from before the reconfig cycle or this
3732 		 *	node has already chosen a new master - either way,
3733 		 *	the master value is valid as long as it is in the
3734 		 *	membership list and is an owner
3735 		 * - master is chosen to be owner node's master
3736 		 */
3737 		nd = sd->sd_nodelist;
3738 		while (nd) {
3739 			/* Don't consider node that isn't in member list */
3740 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3741 				nd = nd->nd_next;
3742 				continue;
3743 			}
3744 
3745 			/* Don't consider a node that isn't an owner */
3746 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3747 				nd = nd->nd_next;
3748 				continue;
3749 			}
3750 
3751 			/* Get owner node's set record */
3752 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3753 			    MD_SET_BAD, &mnsr, ep) == -1) {
3754 				/* If RPC failure to another node return 205 */
3755 				if ((mdanyrpcerror(ep)) &&
3756 				    (sd->sd_mn_mynode->nd_nodeid !=
3757 				    nd->nd_nodeid)) {
3758 					return (205);
3759 				} else {
3760 					/* Any other failure */
3761 					return (-1);
3762 				}
3763 			}
3764 
3765 			/* Is this node in the owner node's set record */
3766 			nr = mnsr->sr_nodechain;
3767 			while (nr) {
3768 				if (sd->sd_mn_mynode->nd_nodeid ==
3769 				    nr->nr_nodeid) {
3770 					break;
3771 				}
3772 				nr = nr->nr_next;
3773 			}
3774 			if (nr == NULL) {
3775 				/* my node not found - delete set */
3776 				free_sr((md_set_record *)mnsr);
3777 				goto delete_set;
3778 			}
3779 
3780 			/* Is owner's node's master valid? */
3781 			master_nodeid = mnsr->sr_master_nodeid;
3782 			free_sr((md_set_record *)mnsr);
3783 			if (master_nodeid == MD_MN_INVALID_NID) {
3784 				nd = nd->nd_next;
3785 				continue;
3786 			}
3787 
3788 			nd2 = sd->sd_nodelist;
3789 			while (nd2) {
3790 				if ((nd2->nd_nodeid == master_nodeid) &&
3791 				    (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
3792 				    (nd2->nd_flags & MD_MN_NODE_OWN)) {
3793 						nd = nd2;
3794 						goto found_master;
3795 				}
3796 				nd2 = nd2->nd_next;
3797 			}
3798 			nd = nd->nd_next;
3799 		}
3800 
3801 		/*
3802 		 * - If no owner node has a valid master, then follow
3803 		 * 	algorithm of when a node is joined to the diskset.
3804 		 * - node walks through nodelist looking for nodes that are
3805 		 *	owners of the diskset that are in the membership list.
3806 		 * - for each owner, node calls RPC routine clnt_getset to
3807 		 *	 see if that node has its node record set to OK.
3808 		 * - If so, master is chosen to be this owner node.
3809 		 */
3810 		nd = sd->sd_nodelist;
3811 		while (nd) {
3812 			/* Don't consider node that isn't in member list */
3813 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3814 				nd = nd->nd_next;
3815 				continue;
3816 			}
3817 
3818 			/* Don't consider a node that isn't an owner */
3819 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3820 				nd = nd->nd_next;
3821 				continue;
3822 			}
3823 
3824 			/* Does node has its own node record set to OK? */
3825 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3826 			    MD_SET_BAD, &mnsr, ep) == -1) {
3827 				/* If RPC failure to another node return 205 */
3828 				if ((mdanyrpcerror(ep)) &&
3829 				    (sd->sd_mn_mynode->nd_nodeid !=
3830 				    nd->nd_nodeid)) {
3831 					return (205);
3832 				} else {
3833 					/* Any other failure */
3834 					return (-1);
3835 				}
3836 			}
3837 			nr = mnsr->sr_nodechain;
3838 			while (nr) {
3839 				if (nd->nd_nodeid == nr->nr_nodeid) {
3840 					if (nr->nr_flags & MD_MN_NODE_OK) {
3841 						/* Found a master */
3842 						free_sr(
3843 						    (md_set_record *)mnsr);
3844 						goto found_master;
3845 					}
3846 				}
3847 				nr = nr->nr_next;
3848 			}
3849 			free_sr((md_set_record *)mnsr);
3850 			nd = nd->nd_next;
3851 		}
3852 
3853 		/*
3854 		 * - If no owner node has its own node record on its own node
3855 		 *	set to OK, then this node checks all of the non-owner
3856 		 *	nodes that are in the membership list.
3857 		 * - for each non-owner, node calls RPC routine clnt_getset to
3858 		 *	see if that node has its node record set to OK.
3859 		 * - If set doesn't exist, don't choose node for master.
3860 		 * - If this node doesn't exist in the nodelist on any of the
3861 		 *	non-owner nodes, this node removes its set description
3862 		 *	of that diskset (i.e. removes the set from its local
3863 		 *	mddbs). This is handling the case of when a node was
3864 		 *	removed from a diskset while it was not in the
3865 		 *	cluster membership list.
3866 		 * - If non-owner node has its node record set to OK and if
3867 		 *	this node hasn't removed this diskset (step directly
3868 		 *	before this one), then the master is chosen to be this
3869 		 *	non-owner node.
3870 		 */
3871 		nd = sd->sd_nodelist;
3872 		while (nd) {
3873 			/* Don't consider node that isn't in member list */
3874 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3875 				nd->nd_flags |= MD_MN_NODE_DEL;
3876 				nd = nd->nd_next;
3877 				continue;
3878 			}
3879 
3880 			/* Don't consider owner nodes since none are OK */
3881 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3882 				nd->nd_flags |= MD_MN_NODE_DEL;
3883 				nd = nd->nd_next;
3884 				continue;
3885 			}
3886 
3887 			/*
3888 			 * Don't need to get nodelist from my node since
3889 			 * this is where sd_nodelist was obtained.
3890 			 */
3891 			if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3892 				nd = nd->nd_next;
3893 				continue;
3894 			}
3895 
3896 			/*
3897 			 * If node has already been decided against for
3898 			 * master, then skip it.
3899 			 */
3900 			if (nd->nd_flags & MD_MN_NODE_DEL) {
3901 				nd = nd->nd_next;
3902 				continue;
3903 			}
3904 
3905 			/*
3906 			 * Does node in my nodelist have its own node
3907 			 * record marked OK on its node?  And does node
3908 			 * in my nodelist exist on all other nodes?
3909 			 * Don't want to choose a node for master unless
3910 			 * that node is marked OK on its own node and that
3911 			 * node exists on all other alive nodes.
3912 			 *
3913 			 * This is guarding against the case when several
3914 			 * nodes are down and one of the downed nodes is
3915 			 * deleted from the diskset.  When the down nodes
3916 			 * are rebooted into the cluster, you don't want
3917 			 * any node to pick the deleted node as the master.
3918 			 */
3919 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3920 			    MD_SET_BAD, &mnsr, ep) == -1) {
3921 				/*
3922 				 * If set doesn't exist on non-owner node,
3923 				 * don't consider this node for master.
3924 				 */
3925 				if (mdiserror(ep, MDE_NO_SET)) {
3926 					nd->nd_flags |= MD_MN_NODE_DEL;
3927 					nd = nd->nd_next;
3928 					continue;
3929 				} else if (mdanyrpcerror(ep)) {
3930 					/* RPC failure to another node */
3931 					return (205);
3932 				} else {
3933 					/* Any other failure */
3934 					return (-1);
3935 				}
3936 			}
3937 			/*
3938 			 * Is my node in the nodelist gotten from the other
3939 			 * node?  If not, then remove the set from my node
3940 			 * since set was deleted from my node while my node
3941 			 * was out of the cluster.
3942 			 */
3943 			nr = mnsr->sr_nodechain;
3944 			while (nr) {
3945 				if (sd->sd_mn_mynode->nd_nodeid ==
3946 				    nr->nr_nodeid) {
3947 					break;
3948 				}
3949 				nr = nr->nr_next;
3950 			}
3951 			if (nr == NULL) {
3952 				/* my node not found - delete set */
3953 				free_sr((md_set_record *)mnsr);
3954 				goto delete_set;
3955 			}
3956 
3957 			/* Is node being checked marked OK on its own node? */
3958 			nr = mnsr->sr_nodechain;
3959 			while (nr) {
3960 				if (nd->nd_nodeid == nr->nr_nodeid) {
3961 					if (!(nr->nr_flags & MD_MN_NODE_OK)) {
3962 						nd->nd_flags |= MD_MN_NODE_DEL;
3963 					}
3964 					break;
3965 				}
3966 				nr = nr->nr_next;
3967 			}
3968 			/*
3969 			 * If node being checked doesn't exist on its
3970 			 * own node - don't choose it as master.
3971 			 */
3972 			if (nr == NULL) {
3973 				nd->nd_flags |= MD_MN_NODE_DEL;
3974 			}
3975 
3976 			/*
3977 			 * Check every node in my node's nodelist against
3978 			 * the nodelist gotten from the other node.
3979 			 * If a node in my node's nodelist is not found in the
3980 			 * other node's nodelist, then set the DEL flag.
3981 			 */
3982 			nd2 = sd->sd_nodelist;
3983 			while (nd2) {
3984 				nr = mnsr->sr_nodechain;
3985 				while (nr) {
3986 					if (nd2->nd_nodeid == nr->nr_nodeid) {
3987 						break;
3988 					}
3989 					nr = nr->nr_next;
3990 				}
3991 				/* nd2 not found in other node's nodelist */
3992 				if (nr == NULL) {
3993 					nd2->nd_flags |= MD_MN_NODE_DEL;
3994 				}
3995 				nd2 = nd2->nd_next;
3996 			}
3997 
3998 			free_sr((md_set_record *)mnsr);
3999 			nd = nd->nd_next;
4000 		}
4001 
4002 		/*
4003 		 * Rescan list look for node that has not been marked DEL.
4004 		 * First node found is the master.
4005 		 */
4006 		nd = sd->sd_nodelist;
4007 		while (nd) {
4008 			if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4009 				break;
4010 			}
4011 			nd = nd->nd_next;
4012 			continue;
4013 		}
4014 		if (nd) {
4015 			/* Found a master */
4016 			goto found_master;
4017 		}
4018 
4019 		/*
4020 		 * - If no node can be found that has its own node record on
4021 		 *	its node to be set to OK, then all alive nodes
4022 		 * 	were in the process of being added to or deleted
4023 		 *	from set.  Each alive node will remove all
4024 		 *	information pertaining to this set from its node.
4025 		 *
4026 		 * If all nodes in set are ALIVE, then call sdssc end routines
4027 		 * since set was truly being initially created or destroyed.
4028 		 */
4029 		goto delete_set;
4030 	}
4031 
4032 found_master:
4033 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4034 	    "Set %s master chosen %s (%d): %s"),
4035 	    sp->setname, nd->nd_nodename, nd->nd_nodeid,
4036 	    meta_print_hrtime(gethrtime() - start_time));
4037 
4038 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
4039 		return (-1);
4040 	}
4041 
4042 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4043 
4044 	if (clnt_mnsetmaster(mynode(), sp,
4045 	    nd->nd_nodename, nd->nd_nodeid, ep)) {
4046 		rval = -1;
4047 	} else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
4048 		/* If this node is new master, set flag in this node's kernel */
4049 		(void) memset(&sf, 0, sizeof (sf));
4050 		sf.sf_setno = sp->setno;
4051 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
4052 		/* Use magic to help protect ioctl against attack. */
4053 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4054 		sf.sf_flags = MDDB_NM_SET;
4055 
4056 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4057 		    "Setting new master flag for set %s: %s"),
4058 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4059 
4060 		/*
4061 		 * Fail reconfig cycle if ioctl fails since it is critical
4062 		 * to set new master flag.
4063 		 */
4064 		if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
4065 		    NULL) != NULL) {
4066 			(void) mdstealerror(ep, &sf.sf_mde);
4067 			rval = -1;
4068 		}
4069 	}
4070 
4071 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4072 		if (rval == 0) {
4073 			(void) mdstealerror(ep, &xep);
4074 			rval = -1;
4075 		}
4076 	}
4077 
4078 	cl_set_setkey(NULL);
4079 
4080 	metaflushsetname(sp);
4081 
4082 	return (rval);
4083 
4084 delete_set:
4085 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4086 	    "Master not chosen, deleting set %s: %s"),
4087 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4088 
4089 	/*
4090 	 * Remove all set information from this node:
4091 	 *	- node records for this set
4092 	 *	- drive records for this set
4093 	 *	- set record for this set
4094 	 * (Only do this on this node since each node
4095 	 * will do it for its own local mddb.)
4096 	 *
4097 	 * If all nodes in set are ALIVE, then
4098 	 * the lowest numbered ALIVE nodeid in set
4099 	 * (irregardless of whether an owner node or not) will
4100 	 * call the DCS service to cleanup for create/delete of set.
4101 	 *   sdssc_create_end(cleanup) if set was being created or
4102 	 *   sdssc_delete_end(cleanup) if set was being deleted.
4103 	 * A node record with flag ADD denotes a set being
4104 	 * created.  A node record with flag DEL denotes a
4105 	 * set being deleted.
4106 	 */
4107 	nd = sd->sd_nodelist;
4108 	while (nd) {
4109 		/* Found a node that isn't alive */
4110 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
4111 			break;
4112 
4113 		/* Is my node the lowest numbered ALIVE node? */
4114 		if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
4115 			break;
4116 		}
4117 		nd = nd->nd_next;
4118 	}
4119 	if (nd == NULL) {
4120 		/* All nodes ALIVE and this is the lowest nodeid */
4121 		lowest_alive_nodeid = 1;
4122 	}
4123 
4124 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
4125 		return (-1);
4126 	}
4127 
4128 
4129 	/*
4130 	 * If this node had been joined, withdraw and reset master.
4131 	 *
4132 	 * This could happen if a node was being added to or removed
4133 	 * from a diskset and the node doing the add/delete operation and
4134 	 * all other nodes in the diskset have left the cluster.
4135 	 */
4136 	if (sd->sd_mn_mynode) {
4137 		nd = sd->sd_mn_mynode;
4138 		if (nd->nd_flags & MD_MN_NODE_OWN) {
4139 			if (clnt_withdrawset(mynode(), sp, ep)) {
4140 				rval = -1;
4141 				goto out;
4142 			}
4143 			if (clnt_mnsetmaster(mynode(), sp, "",
4144 			    MD_MN_INVALID_NID, ep)) {
4145 				rval = -1;
4146 				goto out;
4147 			}
4148 		}
4149 	}
4150 
4151 	/*
4152 	 * Remove side records for this node (side) from local mddb
4153 	 * (clnt_deldrvs does this) if there are drives in the set.
4154 	 *
4155 	 * Don't need to mark this node as DEL since already marked as
4156 	 * ADD or DEL (or this node would have been chosen as master).
4157 	 * Don't need to mark other node records, drive records or
4158 	 * set records as DEL.  If a panic occurs during clnt_delset,
4159 	 * these records will be deleted the next time this node
4160 	 * becomes a member and goes through the reconfig cycle.
4161 	 */
4162 	/* Get the drive descriptors for this set */
4163 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4164 	    ep)) == NULL) {
4165 		if (! mdisok(ep)) {
4166 			/*
4167 			 * Ignore and clear out any failures from
4168 			 * metaget_drivedesc since a panic could have
4169 			 * occurred when a node was partially added to a set.
4170 			 */
4171 			mdclrerror(ep);
4172 		}
4173 	} else {
4174 		if (clnt_deldrvs(mynode(), sp, dd, ep)) {
4175 			rval = -1;
4176 			goto out;
4177 		}
4178 	}
4179 
4180 	/*
4181 	 * Now, delete the set - this removes the node, drive
4182 	 * and set records from the local mddb.
4183 	 */
4184 	if (clnt_delset(mynode(), sp, ep)) {
4185 		rval = -1;
4186 		goto out;
4187 	}
4188 
4189 out:
4190 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4191 
4192 	/*
4193 	 * Ignore errors from unlock of set since set is no longer
4194 	 * known (if clnt_delset worked).
4195 	 */
4196 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4197 		mdclrerror(&xep);
4198 	}
4199 
4200 	cl_set_setkey(NULL);
4201 
4202 	metaflushsetname(sp);
4203 
4204 	/*
4205 	 * If this node is the lowest numbered nodeid then
4206 	 * call sdssc_create/delete_end depending on whether
4207 	 * this node is marked as ADD or DEL in the node record.
4208 	 */
4209 	if (lowest_alive_nodeid) {
4210 		if (nd->nd_flags & MD_MN_NODE_ADD)
4211 			sdssc_create_end(sp->setname, SDSSC_CLEANUP);
4212 		else if (nd->nd_flags & MD_MN_NODE_DEL)
4213 			sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
4214 	}
4215 
4216 	/* Finished with this set -- return */
4217 	return (rval);
4218 }
4219 
4220 /*
4221  * Reconfig step to choose a new master for all MN disksets.
4222  * Return values:
4223  *	0 - Everything is great.
4224  *	1 - This node failed to reconfig.
4225  *	205 - Cause another reconfig due to a nodelist problem
4226  *		or RPC failure to another node
4227  */
4228 int
4229 meta_reconfig_choose_master(
4230 	long		timeout,
4231 	md_error_t	*ep
4232 )
4233 {
4234 	set_t				max_sets, setno;
4235 	int				nodecnt;
4236 	mndiskset_membershiplist_t	*nl;
4237 	md_set_desc			*sd;
4238 	mdsetname_t			*sp;
4239 	int				rval = 0;
4240 	mddb_setflags_config_t		sf;
4241 	int				start_node_delayed = 0;
4242 
4243 	if ((max_sets = get_max_sets(ep)) == 0) {
4244 		mde_perror(ep, dgettext(TEXT_DOMAIN,
4245 		    "Unable to get number of sets"));
4246 		return (1);
4247 	}
4248 
4249 	/*
4250 	 * Get membershiplist from API routine.  If there's
4251 	 * an error, return a 205 to cause another reconfig.
4252 	 */
4253 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
4254 		mde_perror(ep, "");
4255 		return (205);
4256 	}
4257 
4258 	for (setno = 1; setno < max_sets; setno++) {
4259 		if ((sp = metasetnosetname(setno, ep)) == NULL) {
4260 			if (mdiserror(ep, MDE_NO_SET)) {
4261 				/* No set for this setno - continue */
4262 				mdclrerror(ep);
4263 				continue;
4264 			} else {
4265 				/*
4266 				 * If encountered an RPC error from my node,
4267 				 * then immediately fail.
4268 				 */
4269 				if (mdanyrpcerror(ep)) {
4270 					mde_perror(ep, "");
4271 					return (1);
4272 				}
4273 				/* Can't get set information */
4274 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4275 					"Unable to get information for "
4276 					"set number %d"), setno);
4277 				mdclrerror(ep);
4278 				continue;
4279 			}
4280 		}
4281 
4282 		/* If setname is there, set desc should exist. */
4283 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4284 			/*
4285 			 * If encountered an RPC error from my node,
4286 			 * then immediately fail.
4287 			 */
4288 			if (mdanyrpcerror(ep)) {
4289 				mde_perror(ep, "");
4290 				return (1);
4291 			}
4292 			mde_perror(ep, dgettext(TEXT_DOMAIN,
4293 				"Unable to get set %s desc information"),
4294 				sp->setname);
4295 			mdclrerror(ep);
4296 			continue;
4297 		}
4298 
4299 		/* Only reconfig MN disksets */
4300 		if (!MD_MNSET_DESC(sd)) {
4301 			continue;
4302 		}
4303 
4304 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4305 		    "Begin choose master for set %s: %s"),
4306 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4307 
4308 		/* Update nodelist with member information. */
4309 		if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
4310 			/*
4311 			 * If encountered an RPC error from my node,
4312 			 * then immediately fail.
4313 			 */
4314 			if (mdanyrpcerror(ep)) {
4315 				mde_perror(ep, "");
4316 				return (1);
4317 			}
4318 			mde_perror(ep, "");
4319 			mdclrerror(ep);
4320 			continue;
4321 		}
4322 
4323 		/*
4324 		 * If all nodes in a cluster are starting, then
4325 		 * all nodes will attempt to contact all other nodes
4326 		 * to determine a master node.  This can lead to a
4327 		 * problem where node 1 is trying to contact the rpc.metad
4328 		 * node 2 and node 2 is trying to contact the rpc.metad
4329 		 * on node 1 -- and this causes the rpc call to fail
4330 		 * on both nodes and causes a new reconfig cycle.
4331 		 *
4332 		 * In order to break this problem, a newly starting node
4333 		 * will delay a small amount of time (nodeid mod 4 seconds)
4334 		 * and will then run the code to choose a master for the
4335 		 * first set.  Delay will only be done once regardless of the
4336 		 * number of sets.
4337 		 */
4338 		if (start_node_delayed == 0) {
4339 			(void) memset(&sf, 0, sizeof (sf));
4340 			sf.sf_setno = sp->setno;
4341 			sf.sf_flags = MDDB_NM_GET;
4342 			/* Use magic to help protect ioctl against attack. */
4343 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4344 			if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
4345 			    &sf.sf_mde, NULL) == 0) &&
4346 			    ((sf.sf_setflags & MD_SET_MN_START_RC) ==
4347 			    MD_SET_MN_START_RC)) {
4348 				(void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
4349 			}
4350 			start_node_delayed = 1;
4351 		}
4352 
4353 		/* Choose master for this set */
4354 		rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
4355 		if (rval == -1) {
4356 			mde_perror(ep, "");
4357 			return (1);
4358 		} else if (rval == 205) {
4359 			mde_perror(ep, "");
4360 			return (205);
4361 		}
4362 
4363 		/* reinit rpc.mdcommd with new nodelist */
4364 		if (mdmn_reinit_set(sp->setno, timeout)) {
4365 			md_eprintf(dgettext(TEXT_DOMAIN,
4366 			    "Could not re-initialise rpc.mdcommd for "
4367 			    "set %s\n"), sp->setname);
4368 			return (1);
4369 		}
4370 
4371 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4372 		    "Choose master for set %s completed: %s"),
4373 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4374 	}
4375 
4376 	/*
4377 	 * Each node turns on I/Os for all MN disksets.
4378 	 * This is to recover from the situation where the master died
4379 	 * during a previous reconfig cycle when I/Os were suspended
4380 	 * for a MN diskset.
4381 	 * If a failure occurs return a 1 which will force this node to
4382 	 * panic.  Cannot leave node in the situation where I/Os are
4383 	 * not resumed.
4384 	 */
4385 	setno = 0; /* 0 means all MN sets */
4386 	if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
4387 		mde_perror(ep, "");
4388 		return (1);
4389 	}
4390 
4391 	/* Free the nodelist */
4392 	if (nodecnt)
4393 		meta_free_nodelist(nl);
4394 
4395 	return (0);
4396 }
4397 
4398 /*
4399  * meta_mnsync_user_records will synchronize the diskset user records across
4400  * all nodes in the diskset.  The diskset user records are stored in
4401  * each node's local set mddb.
4402  *
4403  * This needs to be done even if there is no master change during the
4404  * reconfig cycle since this routine should clean up any mess left by
4405  * the untimely termination of a metaset or metadb command (due to a
4406  * node panic or to user intervention).
4407  *
4408  * Caller is the Master node.
4409  *
4410  * Returns	 0 - Success
4411  *		205 - Failure during RPC to another node
4412  *		-1 - Any other failure and ep is filled in.
4413  */
4414 int
4415 meta_mnsync_user_records(
4416 	mdsetname_t	*sp,
4417 	md_error_t	*ep
4418 )
4419 {
4420 	md_set_desc		*sd;
4421 	md_mnnode_desc		*master_nodelist, *nd, *nd2, *ndtail;
4422 	md_mnset_record		*mnsr;
4423 	md_mnsr_node_t		*master_mnsr_node = NULL, *mnsr_node = NULL;
4424 	md_mnnode_record	*nr;
4425 	md_drive_record		*dr;
4426 	int			dr_cnt, dd_cnt;
4427 	int			found_my_nr;
4428 	md_drive_desc		*dd, *dd_prev, *master_dd, *other_dd;
4429 	int			all_drives_ok;
4430 	int			rval = 0;
4431 	int			max_genid = 0;
4432 	int			num_alive_nodes, num_alive_nodes_del = 0;
4433 	int			set_locked = 0;
4434 	md_setkey_t		*cl_sk;
4435 	md_error_t		xep = mdnullerror;
4436 	char			*anode[1];
4437 	mddb_setflags_config_t	sf;
4438 
4439 	/*
4440 	 * Sync up node records first.
4441 	 * Construct a master nodelist using the nodelist from this
4442 	 * node's rpc.metad node records and then setting the state of each
4443 	 * node following these rules:
4444 	 *	- If a node record is marked OK on its node, mark it OK
4445 	 *		in the master nodelist (and later OK on all nodes)
4446 	 *		If a node record is also marked OWN on its node,
4447 	 *		mark it OWN in the master nodelist.
4448 	 *	- If a node record is not marked OK on its node, then mark
4449 	 *		it as DEL in the master list (later deleting it)
4450 	 *	- If node record doesn't exist on that node, then mark it DEL
4451 	 *		(later deleting it)
4452 	 *	- If set record doesn't exist on that node, mark node as DEL
4453 	 *	- If a node record doesn't exist on all nodes, then mark it DEL
4454 	 *	- If a node is not ALIVE, then
4455 	 *		- If that node marked DEL on any node - mark it DEL
4456 	 *			in master list but leave in nodelist
4457 	 *		- If that node is marked as ADD on any node, mark it
4458 	 *			ADD in the master list but leave in nodelist
4459 	 *		- When that node returns to the living, the DEL
4460 	 *			node record will be removed and the ADD node
4461 	 *			record may be removed if marked ADD on that
4462 	 *			node.
4463 	 * The key rule is to not remove a node from the nodelist until
4464 	 * that node record is removed from its own node.  Do not want to
4465 	 * remove a node's record from all other nodes and then have
4466 	 * that node have its own record marked OK so that a node will pick
4467 	 * a different master than the other nodes.
4468 	 *
4469 	 * Next,
4470 	 * If node is ALIVE and node record is marked DEL in master nodelist,
4471 	 * remove node from set.
4472 	 * If node is ALIVE and node record is marked OK in master nodelist,
4473 	 * mark it OK on all other nodes.
4474 	 * If node is not ALIVE and node record is marked DEL in master
4475 	 * nodelist, mark it DEL on all other nodes.
4476 	 * If node is not ALIVE and node record is marked ADD in master
4477 	 * nodelist, mark it ADD on all other nodes.
4478 	 */
4479 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4480 		return (-1);
4481 	}
4482 	master_nodelist = sd->sd_nodelist;
4483 
4484 	/*
4485 	 * Walk through nodelist creating a master nodelist.
4486 	 */
4487 	num_alive_nodes = 0;
4488 	nd = master_nodelist;
4489 	while (nd) {
4490 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4491 			nd = nd->nd_next;
4492 			continue;
4493 		}
4494 		num_alive_nodes++;
4495 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
4496 		    MD_SET_BAD, &mnsr, ep) == -1) {
4497 			if (mdiserror(ep, MDE_NO_SET)) {
4498 				/* set doesn't exist, mark node as DEL */
4499 				nd->nd_flags &= ~MD_MN_NODE_OK;
4500 				nd->nd_flags &= ~MD_MN_NODE_ADD;
4501 				nd->nd_flags |= MD_MN_NODE_DEL;
4502 				nd->nd_flags |= MD_MN_NODE_NOSET;
4503 				nd = nd->nd_next;
4504 				continue;
4505 			} else {
4506 				/* If RPC failure to another node return 205 */
4507 				if ((mdanyrpcerror(ep)) &&
4508 				    (sd->sd_mn_mynode->nd_nodeid !=
4509 				    nd->nd_nodeid)) {
4510 					rval = 205;
4511 				} else {
4512 					/* Any other failure */
4513 					rval = -1;
4514 				}
4515 				goto out;
4516 			}
4517 		}
4518 		/* Find biggest genid in records for this diskset */
4519 		if (mnsr->sr_genid > max_genid)
4520 			max_genid = mnsr->sr_genid;
4521 
4522 		dr = mnsr->sr_drivechain;
4523 		while (dr) {
4524 			/* Find biggest genid in records for this diskset */
4525 			if (dr->dr_genid > max_genid) {
4526 				max_genid = dr->dr_genid;
4527 			}
4528 			dr = dr->dr_next;
4529 		}
4530 
4531 		found_my_nr = 0;
4532 		nr = mnsr->sr_nodechain;
4533 		/* nr is the list of node recs from nd_nodename node */
4534 		while (nr) {
4535 			/* Find biggest genid in records for this diskset */
4536 			if (nr->nr_genid > max_genid)
4537 				max_genid = nr->nr_genid;
4538 			nd2 = master_nodelist;
4539 			ndtail = NULL;
4540 			/* For each node record, is it in master list? */
4541 			while (nd2) {
4542 				if (nd2->nd_nodeid == nr->nr_nodeid)
4543 					break;
4544 				if (nd2->nd_next == NULL)
4545 					ndtail = nd2;
4546 				nd2 = nd2->nd_next;
4547 			}
4548 			/*
4549 			 * Found node record not in master list -- add it
4550 			 * to list marking it as DEL since node record
4551 			 * should exist on all nodes unless a panic occurred
4552 			 * during addition or deletion of host to diskset.
4553 			 */
4554 			if (nd2 == NULL) {
4555 				nd2 = Zalloc(sizeof (*nd2));
4556 				(void) strcpy(nd2->nd_nodename,
4557 				    nr->nr_nodename);
4558 				nd2->nd_flags = nr->nr_flags;
4559 				nd2->nd_flags |= MD_MN_NODE_DEL;
4560 				nd2->nd_nodeid = nr->nr_nodeid;
4561 				nd2->nd_next = NULL;
4562 				ndtail->nd_next = nd2;
4563 				nd2 = NULL;
4564 				nr = nr->nr_next;
4565 				continue;
4566 			}
4567 			/*
4568 			 * Is this the node record for the node that
4569 			 * we requested the set desc from?
4570 			 * If so, check if node has its own node record
4571 			 * marked OK. If marked OK, check for the OWN bit.
4572 			 */
4573 			if (nr->nr_nodeid == nd->nd_nodeid) {
4574 				found_my_nr = 1;
4575 				if (nr->nr_flags & MD_MN_NODE_OK) {
4576 					/*
4577 					 * If node record is marked OK
4578 					 * on its own node, then mark it OK
4579 					 * in the master list.  Node record
4580 					 * would have to exist on all nodes
4581 					 * in the ADD state before it could
4582 					 * be put into the OK state.
4583 					 */
4584 					nd->nd_flags |= MD_MN_NODE_OK;
4585 					nd->nd_flags &=
4586 					    ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
4587 					/*
4588 					 * Mark own in master list as marked
4589 					 * on own node.
4590 					 */
4591 					if (nr->nr_flags & MD_MN_NODE_OWN)
4592 						nd->nd_flags |= MD_MN_NODE_OWN;
4593 					else
4594 						nd->nd_flags &= ~MD_MN_NODE_OWN;
4595 				} else {
4596 					/* Otherwise, mark node as DEL */
4597 					nd->nd_flags &= ~MD_MN_NODE_OK;
4598 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4599 					nd->nd_flags |= MD_MN_NODE_DEL;
4600 				}
4601 			}
4602 			/*
4603 			 * If node is not ALIVE and marked DEL
4604 			 * on any node, make it DEL in master list.
4605 			 * If node is not ALIVE and marked ADD
4606 			 * on any node, make it ADD in master list
4607 			 * unless node record has already been marked DEL.
4608 			 */
4609 			if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
4610 				if (nr->nr_flags & MD_MN_NODE_ADD) {
4611 					if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4612 						/* If not DEL - mark it ADD */
4613 						nd->nd_flags |= MD_MN_NODE_ADD;
4614 						nd->nd_flags &= ~MD_MN_NODE_OK;
4615 					}
4616 				}
4617 				if (nr->nr_flags & MD_MN_NODE_DEL) {
4618 					nd->nd_flags |= MD_MN_NODE_DEL;
4619 					nd->nd_flags &= ~MD_MN_NODE_OK;
4620 					/* Could already be ADD - make it DEL */
4621 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4622 				}
4623 			}
4624 			nr = nr->nr_next;
4625 		}
4626 		/*
4627 		 * If a node record doesn't exist on its own node,
4628 		 * then mark node as DEL.
4629 		 */
4630 		if (found_my_nr == 0) {
4631 			nd->nd_flags &= ~MD_MN_NODE_OK;
4632 			nd->nd_flags |= MD_MN_NODE_DEL;
4633 		}
4634 
4635 		/*
4636 		 * If node is OK - put mnsr onto master_mnsr_node list for
4637 		 * later use when syncing up the drive records in the set.
4638 		 */
4639 		if (nd->nd_flags & MD_MN_NODE_OK) {
4640 			mnsr_node = Zalloc(sizeof (*mnsr_node));
4641 			mnsr_node->mmn_mnsr = mnsr;
4642 			(void) strncpy(mnsr_node->mmn_nodename,
4643 				nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
4644 			mnsr_node->mmn_next = master_mnsr_node;
4645 			master_mnsr_node = mnsr_node;
4646 		} else {
4647 			free_sr((struct md_set_record *)mnsr);
4648 		}
4649 
4650 		nd = nd->nd_next;
4651 	}
4652 
4653 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4654 	    "Master nodelist created for set %s: %s"),
4655 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4656 
4657 	/*
4658 	 * Send master nodelist to the rpc.metad on all nodes (including
4659 	 * myself) and each node will update itself.  This will set the
4660 	 * ADD and DEL flags on each node as setup in the master nodelist.
4661 	 * Don't send nodelist to node where set doesn't exist.
4662 	 */
4663 	nd = master_nodelist;
4664 	while (nd) {
4665 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4666 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4667 			nd = nd->nd_next;
4668 			continue;
4669 		}
4670 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4671 		    master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
4672 			/* If RPC failure to another node return 205 */
4673 			if ((mdanyrpcerror(ep)) &&
4674 			    (sd->sd_mn_mynode->nd_nodeid !=
4675 			    nd->nd_nodeid)) {
4676 				rval = 205;
4677 			} else {
4678 				/* Any other failure */
4679 				rval = -1;
4680 			}
4681 			goto out;
4682 		}
4683 		nd = nd->nd_next;
4684 	}
4685 
4686 	/*
4687 	 * Now, delete nodes that need to be deleted.
4688 	 */
4689 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4690 	    ep))  == NULL) {
4691 		if (! mdisok(ep)) {
4692 			rval = -1;
4693 			goto out;
4694 		}
4695 	}
4696 
4697 	/*
4698 	 * May be doing lots of RPC commands to the nodes, so lock the
4699 	 * ALIVE members of the set since most of the rpc.metad routines
4700 	 * require this for security reasons.
4701 	 */
4702 	nd = master_nodelist;
4703 	while (nd) {
4704 		/* Skip non-alive nodes and node without set */
4705 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4706 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4707 			nd = nd->nd_next;
4708 			continue;
4709 		}
4710 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
4711 			/* If RPC failure to another node return 205 */
4712 			if ((mdanyrpcerror(ep)) &&
4713 			    (sd->sd_mn_mynode->nd_nodeid !=
4714 			    nd->nd_nodeid)) {
4715 				rval = 205;
4716 			} else {
4717 				/* Any other failure */
4718 				rval = -1;
4719 			}
4720 			goto out;
4721 		}
4722 		set_locked = 1;
4723 		nd = nd->nd_next;
4724 	}
4725 
4726 	nd = master_nodelist;
4727 	while (nd) {
4728 		/* Skip non-alive nodes */
4729 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4730 			nd = nd->nd_next;
4731 			continue;
4732 		}
4733 		if (nd->nd_flags & MD_MN_NODE_DEL) {
4734 			num_alive_nodes_del++;
4735 			/*
4736 			 * Delete this node rec from all ALIVE nodes in diskset.
4737 			 */
4738 			nd2 = master_nodelist;
4739 			while (nd2) {
4740 				/* Skip non-alive nodes and node without set */
4741 				if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
4742 				    (nd2->nd_flags & MD_MN_NODE_NOSET)) {
4743 					nd2 = nd2->nd_next;
4744 					continue;
4745 				}
4746 
4747 				/* This is a node being deleted from set */
4748 				if (nd2->nd_nodeid == nd->nd_nodeid) {
4749 					/* Mark set record as DEL */
4750 					if (clnt_upd_sr_flags(nd->nd_nodename,
4751 					    sp, MD_SR_DEL, ep)) {
4752 						/* RPC failure to !my node */
4753 						if ((mdanyrpcerror(ep)) &&
4754 						    (sd->sd_mn_mynode->
4755 						    nd_nodeid
4756 						    != nd->nd_nodeid)) {
4757 							rval = 205;
4758 						} else {
4759 							/* Any other failure */
4760 							rval = -1;
4761 						}
4762 						goto out;
4763 					}
4764 					if (clnt_deldrvs(nd->nd_nodename, sp,
4765 					    dd, ep)) {
4766 						/* RPC failure to !my node */
4767 						if ((mdanyrpcerror(ep)) &&
4768 						    (sd->sd_mn_mynode->
4769 						    nd_nodeid
4770 						    != nd->nd_nodeid)) {
4771 							rval = 205;
4772 						} else {
4773 							/* Any other failure */
4774 							rval = -1;
4775 						}
4776 						goto out;
4777 					}
4778 					if (clnt_delset(nd->nd_nodename, sp,
4779 					    ep) == -1) {
4780 						/* RPC failure to !my node */
4781 						if ((mdanyrpcerror(ep)) &&
4782 						    (sd->sd_mn_mynode->
4783 						    nd_nodeid
4784 						    != nd->nd_nodeid)) {
4785 							rval = 205;
4786 						} else {
4787 							/* Any other failure */
4788 							rval = -1;
4789 						}
4790 						goto out;
4791 					}
4792 				} else {
4793 					/*
4794 					 * Delete host from sets on hosts
4795 					 * not being deleted.
4796 					 */
4797 					anode[0] = Strdup(nd->nd_nodename);
4798 					if (clnt_delhosts(nd2->nd_nodename, sp,
4799 					    1, anode, ep) == -1) {
4800 						Free(anode[0]);
4801 						/* RPC failure to !my node */
4802 						if ((mdanyrpcerror(ep)) &&
4803 						    (sd->sd_mn_mynode->
4804 						    nd_nodeid
4805 						    != nd2->nd_nodeid)) {
4806 							rval = 205;
4807 						} else {
4808 							/* Any other failure */
4809 							rval = -1;
4810 						}
4811 						goto out;
4812 					}
4813 
4814 					meta_mc_log(MC_LOG5,
4815 					    dgettext(TEXT_DOMAIN,
4816 					    "Deleted node %s (%d) on node %s "
4817 					    "from set %s: %s"),
4818 					    nd->nd_nodename, nd->nd_nodeid,
4819 					    nd2->nd_nodename,
4820 					    sp->setname,
4821 					    meta_print_hrtime(
4822 					    gethrtime() - start_time));
4823 
4824 					Free(anode[0]);
4825 				}
4826 				nd2 = nd2->nd_next;
4827 			}
4828 		}
4829 		nd = nd->nd_next;
4830 	}
4831 
4832 	nd = master_nodelist;
4833 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4834 	while (nd) {
4835 		/* Skip non-alive nodes and node without set */
4836 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4837 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4838 			nd = nd->nd_next;
4839 			continue;
4840 		}
4841 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
4842 			/* If RPC failure to another node return 205 */
4843 			if ((mdanyrpcerror(ep)) &&
4844 			    (sd->sd_mn_mynode->nd_nodeid !=
4845 			    nd->nd_nodeid)) {
4846 				rval = 205;
4847 			} else {
4848 				/* Any other failure */
4849 				rval = -1;
4850 			}
4851 			goto out;
4852 		}
4853 		nd = nd->nd_next;
4854 	}
4855 	cl_set_setkey(NULL);
4856 	set_locked = 0;
4857 
4858 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4859 	    "Nodelist syncronization complete for set %s: %s"),
4860 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4861 
4862 	metaflushsetname(sp);
4863 
4864 	/*
4865 	 * If all alive nodes have been deleted from set, just
4866 	 * return since nothing else can be done until non-alive
4867 	 * nodes (if there are any) rejoin the cluster.
4868 	 */
4869 	if (num_alive_nodes == num_alive_nodes_del) {
4870 		rval = 0;
4871 		goto out;
4872 	}
4873 
4874 	/*
4875 	 * Sync up drive records.
4876 	 *
4877 	 * If a node panic'd (or metaset command was killed) during the
4878 	 * addition or deletion of a drive to the diskset, the nodes
4879 	 * may have a different view of the drive list.  During cleanup
4880 	 * of the drive list during reconfig, a drive will be deleted
4881 	 * from the list if the master node sees that the drive has been
4882 	 * marked in the ADD state on any node or is marked in the DEL state
4883 	 * on all nodes.
4884 	 * This cleanup must occur even if all nodes in the cluster are
4885 	 * not part of the cluster so that all nodes have the same view
4886 	 * of the drivelist.
4887 	 * Then if the entire cluster goes down and comes back up, the
4888 	 * new master node could be a node that wasn't in the cluster when
4889 	 * the node was deleted.  This could lead to a situation where the
4890 	 * master node thinks that a drive is OK, but this drive isn't
4891 	 * known to the other nodes.
4892 	 * This situation can also occur during the addition of a drive
4893 	 * where a node has the drive marked OK, but the node executing the
4894 	 * metaset command encountered a failure before marking that drive OK
4895 	 * on the rest of the nodes.  If the node with the OK drive then
4896 	 * panics, then rest of the nodes will remove that drive marked ADD
4897 	 * and when the node with the OK drive rejoins the cluster, it will
4898 	 * have a drive marked OK that is unknown by the other nodes.
4899 	 *
4900 	 * There are 2 situations to consider:
4901 	 * A) Master knows about a drive that other nodes don't know about.
4902 	 * B) At least one slave node knows about a drive that the master
4903 	 *    node doesn't know about.
4904 	 *
4905 	 * To handle these situations the following steps are followed:
4906 	 * 1) Count number of drives known by this master node and the
4907 	 *    other slave nodes.
4908 	 *    If all nodes have the same number of drives and the master has
4909 	 *    all drives marked OK, then skip to step4.
4910 	 *
4911 	 * 2) If a node has fewer drives listed than the master, the master
4912 	 *    must get the drive descriptor list from that node so that
4913 	 *    master can determine which drive it needs to delete from that
4914 	 *    node.  Master must get the drive descriptor list since the
4915 	 *    drive record list does not contain the name of the drive, but
4916 	 *    only a key and the key can only be interpreted on that other
4917 	 *    node.
4918 	 *
4919 	 * 3) The master will then create the master drive list by doing:
4920 	 *	- Master starts with drive list known by master.
4921 	 *	- Any drive marked ADD will be removed from the list.
4922 	 *	- Any drive not known by another node (from step2) will be
4923 	 *	removed from the drive list.
4924 	 *	- If a drive is marked DEL on the master, the master must
4925 	 *	verify that the drive record is marked DEL on all nodes.
4926 	 *	If any node has the drive record marked OK, mark it OK
4927 	 *	on the master.  (The reason why is described below).
4928 	 *
4929 	 * 4) The master sends out the master drive list and the slave
4930 	 *    nodes will force their drive lists to match the master
4931 	 *    drive list by deleting drives, if necessary and by changing
4932 	 *    the drive record states from ADD->OK if master has drive
4933 	 *    marked OK and slave has drive marked ADD.
4934 	 *
4935 	 * Interesting scenarios:
4936 	 *
4937 	 * 1) System has 4 nodes with node 1 as the master.  Node 3 starts
4938 	 *    to delete a drive record (drive record on node 1 is marked DEL),
4939 	 *    but is stopped when node 3 panics.  Node 1 also panics.
4940 	 *    During reconfig cycle, node 2 is picked as master and the drive
4941 	 *    record is left alone since all nodes in the cluster have it
4942 	 *    marked OK.  User now sees drive as part of diskset.
4943 	 *    Now, entire cluster is rebooted and node 1 rejoins the cluster.
4944 	 *    Node 1 is picked as the master and node 1 has drive record
4945 	 *    marked DEL.  Node 1 contacts all other nodes in the cluster
4946 	 *    and since at least one node has the drive record marked OK,
4947 	 *    the master marks the drive record OK.
4948 	 *    User continues to see the drive as part of the diskset.
4949 	 */
4950 
4951 	/* Reget set descriptor since flushed above */
4952 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4953 		rval = -1;
4954 		goto out;
4955 	}
4956 
4957 	/* Has side effect of setting sd->sd_drvs to same as master_dd */
4958 	if ((master_dd = metaget_drivedesc_sideno(sp,
4959 	    sd->sd_mn_mynode->nd_nodeid,
4960 	    (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
4961 		/* No drives in list */
4962 		if (!mdisok(ep)) {
4963 			/*
4964 			 * Can't get drive list for this node, so
4965 			 * return -1 causing this node to be removed
4966 			 * from cluster config and fixed.
4967 			 */
4968 			rval = -1;
4969 			goto out;
4970 		}
4971 	}
4972 
4973 	/* Count the number of drives for all nodes */
4974 	mnsr_node = master_mnsr_node;
4975 	while (mnsr_node) {
4976 		dr_cnt = 0;
4977 		dr = mnsr_node->mmn_mnsr->sr_drivechain;
4978 		while (dr) {
4979 			dr_cnt++;
4980 			dr = dr->dr_next;
4981 		}
4982 		mnsr_node->mmn_numdrives = dr_cnt;
4983 		mnsr_node = mnsr_node->mmn_next;
4984 	}
4985 
4986 	/* Count the number of drives for the master; also check flags */
4987 	all_drives_ok = 1;
4988 	dd_cnt = 0;
4989 	dd = master_dd;
4990 	while (dd) {
4991 		dd_cnt++;
4992 		if (!(dd->dd_flags & MD_DR_OK))
4993 			all_drives_ok = 0;
4994 		dd = dd->dd_next;
4995 	}
4996 
4997 	/* If all drives are ok, do quick check against number of drives */
4998 	if (all_drives_ok) {
4999 		/* If all nodes have same number of drives, almost done */
5000 		mnsr_node = master_mnsr_node;
5001 		while (mnsr_node) {
5002 			if (mnsr_node->mmn_numdrives != dd_cnt)
5003 				break;
5004 			mnsr_node = mnsr_node->mmn_next;
5005 		}
5006 		/* All nodes have same number of drives, just send flags */
5007 		if (mnsr_node == NULL) {
5008 			goto send_drive_list;
5009 		}
5010 	}
5011 
5012 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5013 	    "Begin detailed drive synchronization for set %s: %s"),
5014 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5015 
5016 	/* Detailed check required  */
5017 	mnsr_node = master_mnsr_node;
5018 	while (mnsr_node) {
5019 		/* Does slave node have less drives than master? */
5020 		if (mnsr_node->mmn_numdrives < dd_cnt) {
5021 			/* Yes - must determine which drive is missing */
5022 			if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
5023 			    &other_dd, ep)) {
5024 				/* RPC failure to !my node */
5025 				if ((mdanyrpcerror(ep)) &&
5026 				    (strcmp(mynode(), mnsr_node->mmn_nodename)
5027 				    != 0)) {
5028 					rval = 205;
5029 				} else {
5030 					/* Any other failure */
5031 					rval = -1;
5032 				}
5033 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5034 				    "Master node %s unable to "
5035 				    "retrieve drive list from node %s"),
5036 				    mynode(), mnsr_node->mmn_nodename);
5037 				goto out;
5038 			}
5039 			mnsr_node->mmn_dd = other_dd;
5040 			dd = master_dd;
5041 			while (dd) {
5042 				if (!(dd->dd_flags & MD_DR_OK)) {
5043 					dd = dd->dd_next;
5044 					continue;
5045 				}
5046 				other_dd = mnsr_node->mmn_dd;
5047 				while (other_dd) {
5048 					/* Convert to devids, when available */
5049 					if (strcmp(other_dd->dd_dnp->cname,
5050 					    dd->dd_dnp->cname) == 0) {
5051 						break;
5052 					}
5053 					other_dd = other_dd->dd_next;
5054 				}
5055 				/*
5056 				 * dd not found on slave so mark it
5057 				 * ADD for later deletion (drives in ADD
5058 				 * state are deleted later in this routine).
5059 				 */
5060 				if (other_dd == NULL) {
5061 					dd->dd_flags = MD_DR_ADD;
5062 				}
5063 				dd = dd->dd_next;
5064 			}
5065 
5066 		}
5067 		mnsr_node = mnsr_node->mmn_next;
5068 	}
5069 
5070 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5071 	    "Drive check completed for set %s: %s"),
5072 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5073 
5074 	dd = master_dd;
5075 	dd_prev = 0;
5076 	while (dd) {
5077 		/* Remove any ADD drives from list */
5078 		if (dd->dd_flags & MD_DR_ADD) {
5079 			if (dd_prev) {
5080 				dd_prev->dd_next = dd->dd_next;
5081 				dd->dd_next = NULL;
5082 				metafreedrivedesc(&dd);
5083 				dd = dd_prev->dd_next;
5084 			} else {
5085 				/*
5086 				 * If removing drive descriptor from head
5087 				 * of linked list, also change sd->sd_drvs.
5088 				 */
5089 				master_dd = sd->sd_drvs = dd->dd_next;
5090 				dd->dd_next = NULL;
5091 				metafreedrivedesc(&dd);
5092 				dd = master_dd;
5093 			}
5094 			/* dd setup in if/else above */
5095 			continue;
5096 		}
5097 		/*
5098 		 * If drive is marked DEL, check all other nodes.
5099 		 * If drive on another node is marked OK, mark drive OK
5100 		 * in master list.  If drive is marked DEL or doesn't exist
5101 		 * on all nodes, remove drive from list.
5102 		 */
5103 		if (dd->dd_flags & MD_DR_DEL) {
5104 			mnsr_node = master_mnsr_node;
5105 			while (mnsr_node) {
5106 				if (mnsr_node->mmn_dd == NULL) {
5107 				    if (clnt_getdrivedesc(
5108 					mnsr_node->mmn_nodename, sp,
5109 					&other_dd, ep)) {
5110 					    /* RPC failure to !my node */
5111 					    if ((mdanyrpcerror(ep)) &&
5112 						(strcmp(mynode(),
5113 						mnsr_node->mmn_nodename)
5114 						!= 0)) {
5115 						    rval = 205;
5116 					    } else {
5117 						    /* Any other failure */
5118 						    rval = -1;
5119 					    }
5120 					    mde_perror(ep, dgettext(TEXT_DOMAIN,
5121 						"Master node %s unable "
5122 						"to retrieve drive list from "
5123 						"node %s"), mynode(),
5124 						mnsr_node->mmn_nodename);
5125 					    goto out;
5126 				    }
5127 				    mnsr_node->mmn_dd = other_dd;
5128 				}
5129 				other_dd = mnsr_node->mmn_dd;
5130 				while (other_dd) {
5131 					/* Found drive (OK) from other node */
5132 					if (strcmp(dd->dd_dnp->cname,
5133 					    other_dd->dd_dnp->cname)
5134 					    == 0) {
5135 						/* Drive marked OK */
5136 						if (other_dd->dd_flags &
5137 						    MD_DR_OK) {
5138 						    dd->dd_flags = MD_DR_OK;
5139 						}
5140 						break;
5141 					}
5142 					other_dd = other_dd->dd_next;
5143 				}
5144 				if (dd->dd_flags == MD_DR_OK)
5145 					break;
5146 
5147 				mnsr_node = mnsr_node->mmn_next;
5148 			}
5149 			/*
5150 			 * If no node had this drive marked OK, delete it.
5151 			 */
5152 			if (dd->dd_flags & MD_DR_DEL) {
5153 				if (dd_prev) {
5154 					dd_prev->dd_next = dd->dd_next;
5155 					dd->dd_next = NULL;
5156 					metafreedrivedesc(&dd);
5157 					dd = dd_prev->dd_next;
5158 				} else {
5159 					/*
5160 					 * If removing drive descriptor from
5161 					 * head of linked list, also change
5162 					 * sd->sd_drvs.
5163 					 */
5164 					master_dd = sd->sd_drvs = dd->dd_next;
5165 					dd->dd_next = NULL;
5166 					metafreedrivedesc(&dd);
5167 					dd = master_dd;
5168 				}
5169 				/* dd setup in if/else above */
5170 				continue;
5171 			}
5172 		}
5173 		dd_prev = dd;
5174 		dd = dd->dd_next;
5175 	}
5176 
5177 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5178 	    "Setting drive states completed for set %s: %s"),
5179 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5180 
5181 send_drive_list:
5182 	/*
5183 	 * Set genid on all drives to be the highest value seen.
5184 	 */
5185 	dd = master_dd;
5186 	while (dd) {
5187 		dd->dd_genid = max_genid;
5188 		dd = dd->dd_next;
5189 	}
5190 	/*
5191 	 * Send updated drive list to all alive nodes.
5192 	 * Will also set genid on set and node records to have same
5193 	 * as the drive records.
5194 	 */
5195 	nd = sd->sd_nodelist;
5196 	while (nd) {
5197 		/* Skip non-alive nodes */
5198 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5199 			nd = nd->nd_next;
5200 			continue;
5201 		}
5202 		if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
5203 			/* RPC failure to another node */
5204 			if ((mdanyrpcerror(ep)) &&
5205 			    (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
5206 				rval = 205;
5207 			} else {
5208 				/* Any other failure */
5209 				rval = -1;
5210 			}
5211 			goto out;
5212 		}
5213 		nd = nd->nd_next;
5214 	}
5215 
5216 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5217 	    "Sent drive list to all nodes for set %s: %s"),
5218 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5219 
5220 	/*
5221 	 * If no drive records left in set and nodes had been joined,
5222 	 * withdraw the nodes.  Always reset the master and mark
5223 	 * all nodes as withdrawn on all nodes.
5224 	 */
5225 	if (master_dd == NULL) {
5226 		/* Reset new master flag since no longer master */
5227 		(void) memset(&sf, 0, sizeof (sf));
5228 		sf.sf_setno = sp->setno;
5229 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5230 		sf.sf_flags = MDDB_NM_RESET;
5231 		/* Use magic to help protect ioctl against attack. */
5232 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5233 		/* Ignore failure, failure to reset flag isn't catastrophic */
5234 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5235 		    &sf.sf_mde, NULL);
5236 
5237 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5238 		    "Reset new master flag for " "set %s: %s"),
5239 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5240 
5241 		nd = sd->sd_nodelist;
5242 		while (nd) {
5243 			/* Skip non-alive nodes  */
5244 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5245 				nd = nd->nd_next;
5246 				continue;
5247 			}
5248 
5249 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
5250 				/* RPC failure to another node */
5251 				if ((mdanyrpcerror(ep)) &&
5252 				    (sd->sd_mn_mynode->nd_nodeid !=
5253 				    nd->nd_nodeid)) {
5254 					rval = 205;
5255 				} else {
5256 					/* Any other failure */
5257 					rval = -1;
5258 				}
5259 				goto out;
5260 			}
5261 			set_locked = 1;
5262 
5263 			/* Withdraw node from set if owner */
5264 			if ((nd->nd_flags & MD_MN_NODE_OWN) &&
5265 			    (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
5266 				/* RPC failure to another node */
5267 				if ((mdanyrpcerror(ep)) &&
5268 				    (sd->sd_mn_mynode->nd_nodeid !=
5269 				    nd->nd_nodeid)) {
5270 					rval = 205;
5271 				} else {
5272 					/* Any other failure */
5273 					rval = -1;
5274 				}
5275 				goto out;
5276 			}
5277 
5278 			/* Mark all nodes as withdrawn on this node */
5279 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5280 			    sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
5281 				/* RPC failure to another node */
5282 				if ((mdanyrpcerror(ep)) &&
5283 				    (sd->sd_mn_mynode->nd_nodeid !=
5284 				    nd->nd_nodeid)) {
5285 					rval = 205;
5286 				} else {
5287 					/* Any other failure */
5288 					rval = -1;
5289 				}
5290 				goto out;
5291 			}
5292 
5293 			/* Resets master to no-master on this node */
5294 			if (clnt_mnsetmaster(nd->nd_nodename, sp,
5295 			    "", MD_MN_INVALID_NID, ep)) {
5296 				/* RPC failure to another node */
5297 				if ((mdanyrpcerror(ep)) &&
5298 				    (sd->sd_mn_mynode->nd_nodeid !=
5299 				    nd->nd_nodeid)) {
5300 					rval = 205;
5301 				} else {
5302 					/* Any other failure */
5303 					rval = -1;
5304 				}
5305 				goto out;
5306 			}
5307 
5308 			cl_sk = cl_get_setkey(sp->setno, sp->setname);
5309 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
5310 				/* RPC failure to another node */
5311 				if ((mdanyrpcerror(ep)) &&
5312 				    (sd->sd_mn_mynode->nd_nodeid !=
5313 				    nd->nd_nodeid)) {
5314 					rval = 205;
5315 				} else {
5316 					/* Any other failure */
5317 					rval = -1;
5318 				}
5319 				goto out;
5320 			}
5321 			set_locked = 0;
5322 			nd = nd->nd_next;
5323 		}
5324 	}
5325 
5326 out:
5327 	/*
5328 	 * If got here and set is still locked, then an error has
5329 	 * occurred and master_nodelist is still valid.
5330 	 * If error is not an RPC error, then unlock.
5331 	 * If error is an RPC error, skip unlocks since this could cause
5332 	 * yet another RPC timeout if a node has failed.
5333 	 * Ignore failures in unlock since unlock is just trying to
5334 	 * clean things up.
5335 	 */
5336 	if ((set_locked) && !(mdanyrpcerror(ep))) {
5337 		nd = master_nodelist;
5338 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
5339 		while (nd) {
5340 			/* Skip non-alive nodes */
5341 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5342 				nd = nd->nd_next;
5343 				continue;
5344 			}
5345 			/*
5346 			 * If clnt_unlock fails, just break out since next
5347 			 * reconfig cycle will reset the locks anyway.
5348 			 */
5349 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
5350 				break;
5351 			}
5352 			nd = nd->nd_next;
5353 		}
5354 		cl_set_setkey(NULL);
5355 	}
5356 	/* Free master_mnsr and drive descs */
5357 	mnsr_node = master_mnsr_node;
5358 	while (mnsr_node) {
5359 		master_mnsr_node = mnsr_node->mmn_next;
5360 		free_sr((md_set_record *)mnsr_node->mmn_mnsr);
5361 		free_rem_dd(mnsr_node->mmn_dd);
5362 		Free(mnsr_node);
5363 		mnsr_node = master_mnsr_node;
5364 	}
5365 
5366 	/* Frees sd->sd_drvs (which is also master_dd) */
5367 	metaflushsetname(sp);
5368 	return (rval);
5369 }
5370 
5371 /*
5372  * meta_mnsync_diskset_mddbs
5373  * Calling node is guaranteed to be an owner node.
5374  * Calling node is the master node.
5375  *
5376  * Master node verifies that ondisk mddb format matches its incore format.
5377  * If no nodes are joined to set, remove the change log entries.
5378  * If a node is joined to set, play the change log.
5379  *
5380  * Returns	 0 - Success
5381  *		 1 - Master unable to join to set.
5382  *		205 - Failure during RPC to another node
5383  *		-1 - Any other failure and ep is filled in.
5384  *			-1 return will eventually cause node to panic
5385  *			in a SunCluster environment.
5386  */
5387 int
5388 meta_mnsync_diskset_mddbs(
5389 	mdsetname_t	*sp,
5390 	md_error_t	*ep
5391 )
5392 {
5393 	md_set_desc		*sd;
5394 	mddb_config_t		c;
5395 	md_mn_msgclass_t	class;
5396 	mddb_setflags_config_t	sf;
5397 	md_mnnode_desc		*nd, *nd2;
5398 	md_error_t		xep = mdnullerror;
5399 	int			stale_set = 0;
5400 
5401 	/* If setname is there, set desc should exist. */
5402 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5403 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5404 		    "Unable to get set %s desc information"), sp->setname);
5405 		return (-1);
5406 	}
5407 
5408 	/* Are there drives in the set? */
5409 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5410 	    ep) == NULL) {
5411 		if (! mdisok(ep)) {
5412 			return (-1);
5413 		}
5414 		/* No drives in set -- nothing to sync up */
5415 		return (0);
5416 	}
5417 
5418 	/*
5419 	 * Is master node (which is this node) joined to set?
5420 	 * If master node isn't joined (which means that no nodes
5421 	 * are joined to diskset), remove the change log entries
5422 	 * since no need to replay them - all nodes will have same
5423 	 * view of mddbs since all nodes are reading in the mddbs
5424 	 * from disk.
5425 	 * There is also no need to sync up the master and ondisk mddbs
5426 	 * since master has no incore knowledge.
5427 	 * Need to join master to set in order to flush the change
5428 	 * log entries. Don't need to block I/O during join of master
5429 	 * to set since no other nodes are joined to set and so no I/O
5430 	 * can be occurring.
5431 	 */
5432 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
5433 		/* Join master to set */
5434 		if (clnt_joinset(mynode(), sp,
5435 		    MNSET_IN_RECONFIG, ep)) {
5436 			if (mdismddberror(ep, MDE_DB_STALE)) {
5437 				/*
5438 				 * If STALE, print message and continue on.
5439 				 * Don't do any writes or reads to mddbs
5440 				 * so don't clear change log.
5441 				 */
5442 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5443 				    "Join of master node to STALE set %s"),
5444 				    sp->setname);
5445 				stale_set = 1;
5446 				mdclrerror(ep);
5447 			} else if (mdismddberror(ep, MDE_DB_ACCOK)) {
5448 				/* ACCOK means mediator provided extra vote */
5449 				mdclrerror(ep);
5450 			} else {
5451 				/*
5452 				 * If master is unable to join set, print an
5453 				 * error message.  Don't return failure or node
5454 				 * will panic during cluster reconfig cycle.
5455 				 * Also, withdraw node from set in order to
5456 				 * cleanup from failed join attempt.
5457 				 */
5458 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5459 				    "Join of master node in set %s failed"),
5460 				    sp->setname);
5461 				if (clnt_withdrawset(mynode(), sp, &xep))
5462 					mdclrerror(&xep);
5463 				return (1);
5464 			}
5465 		}
5466 		/*
5467 		 * Master node successfully joined.
5468 		 * Set local copy of flags to OWN and
5469 		 * send owner flag to rpc.metad. If not stale,
5470 		 * flush the change log.
5471 		 */
5472 		sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
5473 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
5474 		    MNSET_IN_RECONFIG, ep)) {
5475 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5476 			    "Flag update of master node join in set %s failed"),
5477 			    sp->setname);
5478 			return (-1);
5479 		}
5480 
5481 		if (!stale_set) {
5482 			if (mdmn_reset_changelog(sp, ep,
5483 			    MDMN_CLF_RESETLOG) != 0) {
5484 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5485 				    "Unable to reset changelog."));
5486 				return (-1);
5487 			}
5488 			meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5489 			    "Removed changelog entries for set %s: %s"),
5490 			    sp->setname,
5491 			    meta_print_hrtime(gethrtime() - start_time));
5492 		}
5493 		/* Reset new master flag before return */
5494 		(void) memset(&sf, 0, sizeof (sf));
5495 		sf.sf_setno = sp->setno;
5496 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5497 		sf.sf_flags = MDDB_NM_RESET;
5498 		/* Use magic to help protect ioctl against attack. */
5499 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5500 		/* Ignore failure, failure to reset flag isn't catastrophic */
5501 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5502 		    &sf.sf_mde, NULL);
5503 
5504 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5505 		    "Reset new master flag for set %s: %s"),
5506 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5507 
5508 		return (0);
5509 	}
5510 
5511 	/*
5512 	 * Is master already joined to STALE set (< 50% mddbs avail)?
5513 	 * If so, can make no config changes to mddbs so don't check or play
5514 	 * changelog and don't sync master node to ondisk mddbs.
5515 	 * To get out of the stale state all nodes must be withdrawn
5516 	 * from set.  Then as nodes are re-joined, all nodes will
5517 	 * have same view of mddbs since all nodes are reading the
5518 	 * mddbs from disk.
5519 	 */
5520 	(void) memset(&c, 0, sizeof (c));
5521 	c.c_id = 0;
5522 	c.c_setno = sp->setno;
5523 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
5524 		(void) mdstealerror(ep, &c.c_mde);
5525 		return (-1);
5526 	}
5527 	if (c.c_flags & MDDB_C_STALE) {
5528 		return (0);
5529 	}
5530 
5531 	/*
5532 	 * If this node is NOT a newly chosen master, then there's
5533 	 * nothing else to do since the change log should be empty and
5534 	 * the ondisk and incore mddbs are already consistent.
5535 	 *
5536 	 * A newly chosen master is a node that was not the master
5537 	 * at the beginning of the reconfig cycle.  If a node is a new
5538 	 * master, then the new master state is reset after the ondisk
5539 	 * and incore mddbs are consistent and the change log has
5540 	 * been replayed.
5541 	 */
5542 	(void) memset(&sf, 0, sizeof (sf));
5543 	sf.sf_setno = sp->setno;
5544 	sf.sf_flags = MDDB_NM_GET;
5545 	/* Use magic to help protect ioctl against attack. */
5546 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5547 	if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
5548 	    ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
5549 		return (0);
5550 	}
5551 
5552 	/*
5553 	 * Now, sync up incore master view to ondisk mddbs.
5554 	 * This is needed in the case where a master node
5555 	 * had made a change to the mddb, but this change
5556 	 * may not have been relayed to the slaves yet.
5557 	 * So, the new master needs to verify that the ondisk
5558 	 * mddbs match what the new master has incore -
5559 	 * if different, new master rewrites all of the mddbs.
5560 	 * Then the new master will replay the changelog and the
5561 	 * new master will then execute what the old master had
5562 	 * done.
5563 	 *
5564 	 * Block all I/Os to disks in this diskset on all nodes in
5565 	 * the diskset.  This will allow the rewriting of the mddbs
5566 	 * (if needed), to proceed in a timely manner.
5567 	 *
5568 	 * If block of I/Os fail, return a -1.
5569 	 */
5570 
5571 	nd = sd->sd_nodelist;
5572 	while (nd) {
5573 		/* Skip non-alive and non-owner nodes  */
5574 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5575 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5576 			nd = nd->nd_next;
5577 			continue;
5578 		}
5579 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5580 		    MN_SUSP_IO, ep)) {
5581 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5582 			    "Unable to suspend I/O on node %s in set %s"),
5583 			    nd->nd_nodename, sp->setname);
5584 
5585 			/*
5586 			 * Resume all other nodes that had been suspended.
5587 			 * (Reconfig return step also resumes I/Os
5588 			 * for all sets.)
5589 			 */
5590 			nd2 = sd->sd_nodelist;
5591 			while (nd2) {
5592 				/* Stop when reaching failed node */
5593 				if (nd2->nd_nodeid == nd->nd_nodeid)
5594 					break;
5595 				/* Skip non-alive and non-owner nodes  */
5596 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5597 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5598 					nd2 = nd2->nd_next;
5599 					continue;
5600 				}
5601 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5602 					sp->setno, MN_RES_IO, &xep));
5603 				nd2 = nd2->nd_next;
5604 			}
5605 
5606 			/*
5607 			 * If an RPC failure on another node, return a 205.
5608 			 * Otherwise, exit with failure.
5609 			 */
5610 			if ((mdanyrpcerror(ep)) &&
5611 			    (sd->sd_mn_mynode->nd_nodeid !=
5612 			    nd->nd_nodeid)) {
5613 				return (205);
5614 			} else {
5615 				return (-1);
5616 			}
5617 
5618 		}
5619 		nd = nd->nd_next;
5620 	}
5621 
5622 	(void) memset(&c, 0, sizeof (c));
5623 	c.c_id = 0;
5624 	c.c_setno = sp->setno;
5625 	/* Master can't sync up to ondisk mddbs?  Kick it out of cluster */
5626 	if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
5627 		return (-1);
5628 
5629 	/*
5630 	 * Resume I/Os that were suspended above.
5631 	 */
5632 	nd = sd->sd_nodelist;
5633 	while (nd) {
5634 		/* Skip non-alive and non-owner nodes  */
5635 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5636 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5637 			nd = nd->nd_next;
5638 			continue;
5639 		}
5640 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5641 		    MN_RES_IO, ep)) {
5642 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5643 			    "Unable to resume I/O on node %s in set %s"),
5644 			    nd->nd_nodename, sp->setname);
5645 
5646 			/*
5647 			 * If an RPC failure then don't do any
5648 			 * more RPC calls, since one timeout is enough
5649 			 * to endure.  If RPC failure to another node, return
5650 			 * 205.  If RPC failure to my node, return -1.
5651 			 * If not an RPC failure, continue resuming the
5652 			 * rest of the nodes and then return -1.
5653 			 */
5654 			if (mdanyrpcerror(ep)) {
5655 				if (sd->sd_mn_mynode->nd_nodeid ==
5656 				    nd->nd_nodeid) {
5657 					return (-1);
5658 				} else {
5659 					return (205);
5660 				}
5661 			}
5662 
5663 			/*
5664 			 * If not an RPC error, continue resuming rest of
5665 			 * nodes, ignoring any failures except for an
5666 			 * RPC failure which constitutes an immediate exit.
5667 			 * Start in middle of list with failing node.
5668 			 */
5669 			nd2 = nd->nd_next;
5670 			while (nd2) {
5671 				/* Skip non-alive and non-owner nodes  */
5672 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5673 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5674 					nd2 = nd2->nd_next;
5675 					continue;
5676 				}
5677 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5678 					sp->setno, MN_RES_IO, &xep));
5679 				if (mdanyrpcerror(&xep)) {
5680 					return (-1);
5681 				}
5682 				nd2 = nd2->nd_next;
5683 			}
5684 		}
5685 		nd = nd->nd_next;
5686 	}
5687 
5688 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
5689 	    "checking/writing the mddb for set %s: %s"), sp->setname,
5690 	    meta_print_hrtime(gethrtime() - start_time));
5691 
5692 	/*
5693 	 * Send (aka replay) all messages we find in the changelog.
5694 	 * Flag the messages with
5695 	 *   MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
5696 	 *   MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
5697 	 */
5698 	for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
5699 		mdmn_changelog_record_t	*lr;
5700 		md_error_t	xep = mdnullerror;
5701 		md_mn_result_t	*resultp = NULL;
5702 		int		ret;
5703 
5704 		lr = mdmn_get_changelogrec(sp->setno, class);
5705 		if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
5706 			/* no entry for this class */
5707 			continue;
5708 		}
5709 
5710 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5711 		    "replaying message ID=(%d, 0x%llx-%d)\n"),
5712 		    MSGID_ELEMS(lr->lr_msg.msg_msgid));
5713 
5714 		ret = mdmn_send_message_with_msgid(
5715 			lr->lr_msg.msg_setno,
5716 			lr->lr_msg.msg_type,
5717 			lr->lr_msg.msg_flags |  MD_MSGF_REPLAY_MSG |
5718 						MD_MSGF_OVERRIDE_SUSPEND,
5719 			lr->lr_msg.msg_event_data,
5720 			lr->lr_msg.msg_event_size,
5721 			&resultp,
5722 			&lr->lr_msg.msg_msgid,
5723 			&xep);
5724 
5725 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5726 		    "mdmn_send_message returned %d\n"), ret);
5727 
5728 		if (resultp)
5729 			free_result(resultp);
5730 	}
5731 
5732 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5733 	    "Playing changelog completed for set %s: %s"),
5734 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5735 
5736 	/*
5737 	 * Now that new master has ondisk and incore mddbs in sync, reset
5738 	 * this node's new master kernel flag (for this set).  If this node
5739 	 * re-enters another reconfig cycle before the completion of this
5740 	 * reconfig cycle, this master node won't need to check if the ondisk
5741 	 * and incore mddbs are in sync since this node won't be considered
5742 	 * a new master (since this flag is being reset here in the middle of
5743 	 * step2).  This will save time during any subsequent reconfig
5744 	 * cycles as long as this node continues to be master.
5745 	 */
5746 	(void) memset(&sf, 0, sizeof (sf));
5747 	sf.sf_setno = sp->setno;
5748 	sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5749 	sf.sf_flags = MDDB_NM_RESET;
5750 	/* Use magic to help protect ioctl against attack. */
5751 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5752 	/* Ignore failure, since failure to reset flag isn't catastrophic */
5753 	(void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
5754 
5755 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5756 	    "Reset new master flag for set %s: %s"),
5757 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5758 
5759 	return (0);
5760 }
5761 
5762 /*
5763  * meta_mnjoin_all will join all starting nodes in the diskset.
5764  * A starting node is considered to be any node that is not
5765  * an owner of the set but is a member of the cluster.
5766  * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
5767  *
5768  * Caller is the Master node.
5769  *
5770  * Returns	 0 - Success
5771  *		205 - Failure during RPC to another node
5772  *		-1 - Any other failure and ep is filled in.
5773  */
5774 int
5775 meta_mnjoin_all(
5776 	mdsetname_t	*sp,
5777 	md_error_t	*ep
5778 )
5779 {
5780 	md_set_desc		*sd;
5781 	md_mnnode_desc		*nd, *nd2;
5782 	int			rval = 0;
5783 	int			stale_flag = 0;
5784 	mddb_config_t		c;
5785 	int			susp_res_flag = 0;
5786 	md_error_t		xep = mdnullerror;
5787 
5788 	/* If setname is there, set desc should exist. */
5789 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5790 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5791 		    "Unable to get set %s desc information"), sp->setname);
5792 		return (-1);
5793 	}
5794 
5795 	/* Are there drives in the set? */
5796 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5797 	    ep) == NULL) {
5798 		if (! mdisok(ep)) {
5799 			return (-1);
5800 		}
5801 		/* No drives in set -- nothing to join */
5802 		return (0);
5803 	}
5804 
5805 	/*
5806 	 * Is set currently stale?
5807 	 */
5808 	(void) memset(&c, 0, sizeof (c));
5809 	c.c_id = 0;
5810 	c.c_setno = sp->setno;
5811 	/* Ignore failure since master node may not be joined yet */
5812 	(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
5813 	if (c.c_flags & MDDB_C_STALE) {
5814 		stale_flag = MNSET_IS_STALE;
5815 	}
5816 
5817 	/*
5818 	 * If any nodes are going to be joined to diskset, then
5819 	 * suspend I/O to all disks in diskset so that nodes can join
5820 	 * (read in mddbs) in a reasonable amount of time even under
5821 	 * high I/O load.  Don't need to do this if set is STALE since
5822 	 * no I/O can be occurring to a STALE set.
5823 	 */
5824 	if (stale_flag != MNSET_IS_STALE) {
5825 		nd = sd->sd_nodelist;
5826 		while (nd) {
5827 			/* Found a node that will be joined to diskset */
5828 			if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5829 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5830 				/* Set flag that diskset should be suspended */
5831 				susp_res_flag = 1;
5832 				break;
5833 			}
5834 			nd = nd->nd_next;
5835 		}
5836 	}
5837 
5838 	if (susp_res_flag) {
5839 		/*
5840 		 * Block all I/Os to disks in this diskset on all joined
5841 		 * nodes in the diskset.
5842 		 * If block of I/Os fails due to an RPC failure on another
5843 		 * node, return 205; otherwise, return -1.
5844 		 */
5845 		nd = sd->sd_nodelist;
5846 		while (nd) {
5847 			/* Skip non-alive and non-owner nodes  */
5848 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5849 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5850 				nd = nd->nd_next;
5851 				continue;
5852 			}
5853 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5854 			    MN_SUSP_IO, ep)) {
5855 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5856 				    "Unable to suspend I/O on node %s"
5857 				    " in set %s"), nd->nd_nodename,
5858 				    sp->setname);
5859 				/*
5860 				 * Resume other nodes that had been suspended.
5861 				 * (Reconfig return step also resumes I/Os
5862 				 * for all sets.)
5863 				 */
5864 				nd2 = sd->sd_nodelist;
5865 				while (nd2) {
5866 					/* Stop when reaching failed node */
5867 					if (nd2->nd_nodeid == nd->nd_nodeid)
5868 						break;
5869 					/* Skip non-alive/non-owner nodes  */
5870 					if ((!(nd2->nd_flags &
5871 					    MD_MN_NODE_ALIVE)) ||
5872 					    (!(nd2->nd_flags &
5873 					    MD_MN_NODE_OWN))) {
5874 						nd2 = nd2->nd_next;
5875 						continue;
5876 					}
5877 					(void) (clnt_mn_susp_res_io(
5878 					    nd2->nd_nodename, sp->setno,
5879 					    MN_RES_IO, &xep));
5880 					nd2 = nd2->nd_next;
5881 				}
5882 
5883 				/*
5884 				 * If the suspend failed due to an
5885 				 * RPC failure on another node, return
5886 				 * a 205.
5887 				 * Otherwise, exit with failure.
5888 				 * The return reconfig step will resume
5889 				 * I/Os for all disksets.
5890 				 */
5891 				if ((mdanyrpcerror(ep)) &&
5892 				    (sd->sd_mn_mynode->nd_nodeid !=
5893 				    nd->nd_nodeid)) {
5894 					return (205);
5895 				} else {
5896 					return (-1);
5897 				}
5898 			}
5899 			nd = nd->nd_next;
5900 		}
5901 	}
5902 
5903 	nd = sd->sd_nodelist;
5904 	while (nd) {
5905 		/*
5906 		 * If a node is in the membership list but isn't joined
5907 		 * to the set, try to join the node.
5908 		 */
5909 		if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5910 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5911 			if (clnt_joinset(nd->nd_nodename, sp,
5912 			    (MNSET_IN_RECONFIG | stale_flag), ep)) {
5913 				/*
5914 				 * If RPC failure to another node
5915 				 * then exit without attempting anything else.
5916 				 * (Reconfig return step will resume I/Os
5917 				 * for all sets.)
5918 				 */
5919 				if (mdanyrpcerror(ep)) {
5920 					mde_perror(ep, "");
5921 					return (205);
5922 				}
5923 				/*
5924 				 * STALE and ACCOK failures aren't true
5925 				 * failures.  STALE means that <50% mddbs
5926 				 * are available. ACCOK means that the
5927 				 * mediator provided the extra vote.
5928 				 * If a true failure, then print messasge
5929 				 * and withdraw node from set in order to
5930 				 * cleanup from failed join attempt.
5931 				 */
5932 				if ((!mdismddberror(ep, MDE_DB_STALE)) &&
5933 				    (!mdismddberror(ep, MDE_DB_ACCOK))) {
5934 					mde_perror(ep,
5935 					    "WARNING: Unable to join node %s "
5936 					    "to set %s", nd->nd_nodename,
5937 					    sp->setname);
5938 					mdclrerror(ep);
5939 					if (clnt_withdrawset(nd->nd_nodename,
5940 					    sp, &xep))
5941 						mdclrerror(&xep);
5942 					nd = nd->nd_next;
5943 					continue;
5944 				}
5945 			}
5946 			/* Set owner flag even if STALE or ACCOK */
5947 			nd->nd_flags |= MD_MN_NODE_OWN;
5948 		}
5949 		nd = nd->nd_next;
5950 	}
5951 	/*
5952 	 * Resume I/Os if suspended above.
5953 	 */
5954 	if (susp_res_flag) {
5955 		nd = sd->sd_nodelist;
5956 		while (nd) {
5957 			/*
5958 			 * Skip non-alive and non-owner nodes
5959 			 * (this list doesn't include any of
5960 			 * the nodes that were joined).
5961 			 */
5962 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5963 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5964 				nd = nd->nd_next;
5965 				continue;
5966 			}
5967 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5968 			    MN_RES_IO, ep)) {
5969 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5970 				    "Unable to resume I/O on node %s"
5971 				    " in set %s"), nd->nd_nodename,
5972 				    sp->setname);
5973 
5974 				/*
5975 				 * If an RPC failure then don't do any
5976 				 * more RPC calls, since one timeout is enough
5977 				 * to endure.  If RPC failure to another node,
5978 				 * return 205.  If RPC failure to my node,
5979 				 * return -1.
5980 				 * (Reconfig return step will resume I/Os
5981 				 * for all sets.)
5982 				 * If not an RPC failure, continue resuming the
5983 				 * rest of the nodes and then return -1.
5984 				 */
5985 				if (mdanyrpcerror(ep)) {
5986 					if (sd->sd_mn_mynode->nd_nodeid ==
5987 					    nd->nd_nodeid) {
5988 						return (-1);
5989 					} else {
5990 						return (205);
5991 					}
5992 				}
5993 
5994 				/*
5995 				 * If not an RPC error, continue resuming rest
5996 				 * of nodes, ignoring any failures except for
5997 				 * an RPC failure which constitutes an
5998 				 * immediate exit.
5999 				 * Start in middle of list with failing node.
6000 				 */
6001 				nd2 = nd->nd_next;
6002 				while (nd2) {
6003 					/* Skip non-owner nodes  */
6004 					if ((!(nd2->nd_flags &
6005 					    MD_MN_NODE_ALIVE)) ||
6006 					    (!(nd2->nd_flags &
6007 					    MD_MN_NODE_OWN))) {
6008 						nd2 = nd2->nd_next;
6009 						continue;
6010 					}
6011 					(void) (clnt_mn_susp_res_io(
6012 					    nd2->nd_nodename, sp->setno,
6013 					    MN_RES_IO, &xep));
6014 					if (mdanyrpcerror(&xep)) {
6015 						return (-1);
6016 					}
6017 					nd2 = nd2->nd_next;
6018 				}
6019 			}
6020 			nd = nd->nd_next;
6021 		}
6022 	}
6023 
6024 	nd = sd->sd_nodelist;
6025 	while (nd) {
6026 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
6027 			nd = nd->nd_next;
6028 			continue;
6029 		}
6030 		/*
6031 		 * If 1 node fails - go ahead and update the rest except
6032 		 * in the case of an RPC failure, fail immediately.
6033 		 */
6034 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
6035 		    sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
6036 			/* RPC failure to another node */
6037 			if (mdanyrpcerror(ep)) {
6038 				return (205);
6039 			}
6040 			nd = nd->nd_next;
6041 			rval = -1;
6042 			continue;
6043 		}
6044 		nd = nd->nd_next;
6045 	}
6046 
6047 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
6048 	    "Join of all nodes completed for set %s: %s"),
6049 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
6050 
6051 	return (rval);
6052 }
6053