xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_set.c (revision b494511a9cf72b1fc4eb13a0e593f55c624ab829)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Just in case we're not in a build environment, make sure that
28  * TEXT_DOMAIN gets set to something.
29  */
30 #if !defined(TEXT_DOMAIN)
31 #define	TEXT_DOMAIN "SYS_TEST"
32 #endif
33 
34 /*
35  * Metadevice diskset interfaces
36  */
37 
38 #include "meta_set_prv.h"
39 #include <meta.h>
40 #include <metad.h>
41 #include <mdmn_changelog.h>
42 #include <sys/lvm/md_crc.h>
43 #include <sys/utsname.h>
44 #include <sdssc.h>
45 
46 #include <sys/sysevent/eventdefs.h>
47 #include <sys/sysevent/svm.h>
48 extern	char	*blkname(char *);
49 
50 static md_drive_desc *
51 dr2drivedesc(
52 	mdsetname_t	*sp,
53 	side_t		sideno,
54 	int		flags,
55 	md_error_t	*ep
56 )
57 {
58 	md_set_record	*sr;
59 	md_drive_record	*dr;
60 	mddrivename_t	*dnp;
61 	md_drive_desc	*dd_head = NULL;
62 	md_set_desc	*sd;
63 
64 	if (flags & MD_BYPASS_DAEMON) {
65 		if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
66 			return (NULL);
67 		sd = metaget_setdesc(sp, ep);
68 		sideno = getnodeside(mynode(), sd);
69 		sp = metafakesetname(sp->setno, sr->sr_setname);
70 	} else {
71 		if ((sr = getsetbyname(sp->setname, ep)) == NULL)
72 			return (NULL);
73 	}
74 
75 	assert(sideno != MD_SIDEWILD);
76 
77 	/*
78 	 * WARNING:
79 	 * The act of getting the dnp from the namespace means that we
80 	 * will get the devid of the disk as recorded in the namespace.
81 	 * This devid has the potential to be stale if the disk is being
82 	 * replaced via a rebind, this means that any code that relies
83 	 * on any of the dnp information should take the appropriate action
84 	 * to preserve that information. For example in the rebind code the
85 	 * devid of the new disk is saved off and then copied back in once
86 	 * the code that has called this function has completed.
87 	 */
88 	for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
89 		if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
90 		    flags, ep)) == NULL) {
91 			if (!(flags & MD_BYPASS_DAEMON))
92 				free_sr(sr);
93 			metafreedrivedesc(&dd_head);
94 			return (NULL);
95 		}
96 
97 		(void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
98 		    dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
99 	}
100 
101 	if (!(flags & MD_BYPASS_DAEMON)) {
102 		free_sr(sr);
103 	}
104 	return (dd_head);
105 }
106 
107 static int
108 get_sidenmlist(
109 	mdsetname_t	*sp,
110 	mddrivename_t	*dnp,
111 	md_error_t	*ep
112 )
113 {
114 	md_set_desc	*sd;
115 	mdsidenames_t	*sn, **sn_next;
116 	int		i;
117 
118 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
119 		return (-1);
120 
121 	metaflushsidenames(dnp);
122 	sn_next = &dnp->side_names;
123 	if (MD_MNSET_DESC(sd)) {
124 		/*
125 		 * Only get sidenames for this node since
126 		 * that is the only side information stored in
127 		 * the local mddb for a multi-node diskset.
128 		 */
129 		if (sd->sd_mn_mynode) {
130 			sn = Zalloc(sizeof (*sn));
131 			sn->sideno = sd->sd_mn_mynode->nd_nodeid;
132 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
133 			    sn->sideno, dnp->side_names_key, &sn->dname,
134 			    &sn->mnum, NULL, ep)) == NULL) {
135 				if (sn->dname != NULL)
136 					Free(sn->dname);
137 				Free(sn);
138 				return (-1);
139 			}
140 
141 			/* Add to the end of the linked list */
142 			assert(*sn_next == NULL);
143 			*sn_next = sn;
144 			sn_next = &sn->next;
145 		}
146 	} else {
147 		for (i = 0; i < MD_MAXSIDES; i++) {
148 			/* Skip empty slots */
149 			if (sd->sd_nodes[i][0] == '\0')
150 				continue;
151 
152 			sn = Zalloc(sizeof (*sn));
153 			sn->sideno = i;
154 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
155 			    i+SKEW, dnp->side_names_key, &sn->dname,
156 			    &sn->mnum, NULL, ep)) == NULL) {
157 				/*
158 				 * It is possible that during the add of a
159 				 * host to have a 'missing' side as the side
160 				 * for this disk will be added later. So ignore
161 				 * the error. The 'missing' side will be added
162 				 * once the addhosts process has completed.
163 				 */
164 				if (mdissyserror(ep, ENOENT)) {
165 					mdclrerror(ep);
166 					Free(sn);
167 					continue;
168 				}
169 
170 				if (sn->dname != NULL)
171 					Free(sn->dname);
172 				Free(sn);
173 				return (-1);
174 			}
175 
176 			/* Add to the end of the linked list */
177 			assert(*sn_next == NULL);
178 			*sn_next = sn;
179 			sn_next = &sn->next;
180 		}
181 	}
182 
183 	return (0);
184 }
185 
186 static md_drive_desc *
187 rl_to_dd(
188 	mdsetname_t		*sp,
189 	md_replicalist_t	*rlp,
190 	md_error_t		*ep
191 )
192 {
193 	md_replicalist_t	*rl;
194 	md_replica_t		*r;
195 	md_drive_desc		*dd = NULL;
196 	md_drive_desc		*d;
197 	int			found;
198 	md_set_desc		*sd;
199 	daddr_t			nblks = 0;
200 
201 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
202 		return (NULL);
203 
204 	/* find the smallest existing replica */
205 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
206 		r = rl->rl_repp;
207 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
208 	}
209 
210 	if (nblks <= 0)
211 		nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
212 
213 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
214 		r = rl->rl_repp;
215 
216 		found = 0;
217 		for (d = dd; d != NULL; d = d->dd_next) {
218 			if (strcmp(r->r_namep->drivenamep->cname,
219 			    d->dd_dnp->cname) == 0) {
220 				found = 1;
221 				dd->dd_dbcnt++;
222 				break;
223 			}
224 		}
225 
226 		if (! found)
227 			(void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
228 			    1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
229 	}
230 
231 	return (dd);
232 }
233 
234 /*
235  * Exported Entry Points
236  */
237 
238 set_t
239 get_max_sets(md_error_t *ep)
240 {
241 
242 	static set_t		max_sets = 0;
243 
244 	if (max_sets == 0)
245 		if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
246 			return (0);
247 
248 	return (max_sets);
249 }
250 
251 int
252 get_max_meds(md_error_t *ep)
253 {
254 	static int		max_meds = 0;
255 
256 	if (max_meds == 0)
257 		if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
258 			return (0);
259 
260 	return (max_meds);
261 }
262 
263 side_t
264 getmyside(mdsetname_t *sp, md_error_t *ep)
265 {
266 	md_set_desc		*sd;
267 	char 			*node = NULL;
268 	side_t			sideno;
269 
270 	if (sp->setno == 0)
271 		return (0);
272 
273 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
274 		return (MD_SIDEWILD);
275 
276 	node = mynode();
277 
278 	assert(node != NULL);
279 
280 	sideno = getnodeside(node, sd);
281 
282 	if (sideno != MD_SIDEWILD)
283 		return (sideno);
284 
285 	return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
286 }
287 
288 /*
289  * get set info from name
290  */
291 md_set_record *
292 getsetbyname(char *setname, md_error_t *ep)
293 {
294 	md_set_record		*sr = NULL;
295 	md_mnset_record		*mnsr = NULL;
296 	char			*p;
297 	size_t			len;
298 
299 	/* get set info from daemon */
300 	if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
301 		return (NULL);
302 	if (sr != NULL) {
303 		/*
304 		 * Returned record could be for a multi-node set or a
305 		 * non-multi-node set.
306 		 */
307 		if (MD_MNSET_REC(sr)) {
308 			/*
309 			 * Record is for a multi-node set.  Reissue call
310 			 * to get mnset information.  Need to free
311 			 * record as if a non-multi-node set record since
312 			 * that is what clnt_getset gave us.  If in
313 			 * the daemon, don't free since this is a pointer
314 			 * into the setrecords array.
315 			 */
316 			if (! md_in_daemon) {
317 				sr->sr_flags &= ~MD_SR_MN;
318 				free_sr(sr);
319 			}
320 			if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
321 			    ep) == -1)
322 				return (NULL);
323 			if (mnsr != NULL)
324 				return ((struct md_set_record *)mnsr);
325 		} else {
326 			return (sr);
327 		}
328 	}
329 
330 	/* no such set */
331 	len = strlen(setname) + 30;
332 	p = Malloc(len);
333 	(void) snprintf(p, len, "setname \"%s\"", setname);
334 	(void) mderror(ep, MDE_NO_SET, p);
335 	Free(p);
336 	return (NULL);
337 }
338 
339 /*
340  * get set info from number
341  */
342 md_set_record *
343 getsetbynum(set_t setno, md_error_t *ep)
344 {
345 	md_set_record		*sr;
346 	md_mnset_record		*mnsr = NULL;
347 	char			buf[100];
348 
349 	if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
350 		return (NULL);
351 
352 	if (sr != NULL) {
353 		/*
354 		 * Record is for a multi-node set.  Reissue call
355 		 * to get mnset information.  Need to free
356 		 * record as if a non-multi-node set record since
357 		 * that is what clnt_getset gave us.  If in
358 		 * the daemon, don't free since this is a pointer
359 		 * into the setrecords array.
360 		 */
361 		if (MD_MNSET_REC(sr)) {
362 			/*
363 			 * Record is for a multi-node set.  Reissue call
364 			 * to get mnset information.
365 			 */
366 			if (! md_in_daemon) {
367 				sr->sr_flags &= ~MD_SR_MN;
368 				free_sr(sr);
369 			}
370 			if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
371 			    ep) == -1)
372 				return (NULL);
373 			if (mnsr != NULL)
374 				return ((struct md_set_record *)mnsr);
375 		} else {
376 			return (sr);
377 		}
378 	}
379 
380 	(void) sprintf(buf, "setno %u", setno);
381 	(void) mderror(ep, MDE_NO_SET, buf);
382 	return (NULL);
383 }
384 
385 int
386 meta_check_drive_inuse(
387 	mdsetname_t	*sp,
388 	mddrivename_t	*dnp,
389 	int		check_db,
390 	md_error_t	*ep
391 )
392 {
393 	mdnamelist_t	*nlp = NULL;
394 	mdnamelist_t	*p;
395 	int		rval = 0;
396 
397 	/* get all underlying partitions */
398 	if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
399 		return (-1);
400 
401 	/* search for drive */
402 	for (p = nlp; (p != NULL); p = p->next) {
403 		mdname_t	*np = p->namep;
404 
405 		if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
406 			rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
407 			    NULL, dnp->cname, sp->setname));
408 			break;
409 		}
410 	}
411 
412 	/* cleanup, return success */
413 	metafreenamelist(nlp);
414 	return (rval);
415 }
416 
417 /*
418  * simple check for ownership
419  */
420 int
421 meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
422 {
423 	int			ownset;
424 	md_set_desc		*sd;
425 	md_drive_desc		*dd;
426 	md_replicalist_t	*rlp = NULL;
427 	md_error_t		xep = mdnullerror;
428 
429 	if (metaislocalset(sp))
430 		return (0);
431 
432 	ownset = own_set(sp, NULL, TRUE, ep);
433 	if (! mdisok(ep))
434 		return (-1);
435 
436 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
437 		return (-1);
438 
439 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
440 	if (! mdisok(ep))
441 		return (-1);
442 
443 	/* If we have no drive descriptors, check for no ownership */
444 	if (dd == NULL) {
445 		if (ownset == MD_SETOWNER_NONE)
446 			return (0);
447 
448 		/* If ownership somehow has come to exist, we must clean up */
449 
450 		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
451 		    &xep) < 0)
452 			mdclrerror(&xep);
453 
454 		if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
455 			if (! mdisok(&xep))
456 				mdclrerror(&xep);
457 
458 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
459 			if (rel_own_bydd(sp, dd, TRUE, &xep))
460 				mdclrerror(&xep);
461 		}
462 
463 		if (halt_set(sp, &xep))
464 			mdclrerror(&xep);
465 
466 		metafreereplicalist(rlp);
467 
468 		metafreedrivedesc(&dd);
469 
470 		return (0);
471 	}
472 
473 	metafreedrivedesc(&sd->sd_drvs);
474 
475 	if (ownset == MD_SETOWNER_YES)
476 		return (0);
477 
478 	return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
479 	    sp->setname));
480 }
481 
482 /*
483  * simple check for ownership
484  */
485 int
486 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
487 {
488 	md_set_desc	*sd;
489 	md_drive_desc	*dd;
490 	int		bool;
491 
492 	if (metaislocalset(sp))
493 		return (0);
494 
495 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
496 		return (-1);
497 
498 	if (getnodeside(hostname, sd) == MD_SIDEWILD)
499 		return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
500 		    hostname, NULL, sp->setname));
501 
502 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
503 	if (! mdisok(ep))
504 		return (-1);
505 
506 	if (clnt_ownset(hostname, sp, &bool, ep) == -1)
507 		return (-1);
508 
509 	if (dd == NULL)
510 		return (0);
511 
512 	metafreedrivedesc(&sd->sd_drvs);
513 
514 	if (bool == TRUE)
515 		return (0);
516 
517 	return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
518 	    sp->setname));
519 }
520 
521 /*
522  * Function that determines if a node is in the multinode diskset
523  * membership list.  Calling node passes in node to be checked and
524  * the nodelist as returned from meta_read_nodelist.  This routine
525  * anticipates being called many times using the same diskset membership
526  * list which is why the alloc and free of the diskset membership list
527  * is left to the calling routine.
528  * Returns:
529  *	1 - if a member
530  *	0 - not a member
531  */
532 int
533 meta_is_member(
534 	char				*node_name,
535 	md_mn_nodeid_t			node_id,
536 	mndiskset_membershiplist_t	*nl
537 )
538 {
539 	mndiskset_membershiplist_t	*nl2;
540 	int				flag_check_name;
541 
542 	if (node_id != 0)
543 		flag_check_name = 0;
544 	else if (node_name != NULL)
545 		flag_check_name = 1;
546 	else
547 		return (0);
548 
549 	nl2 = nl;
550 	while (nl2) {
551 		if (flag_check_name) {
552 			/* Compare given name against name in member list */
553 			if (strcmp(nl2->msl_node_name, node_name) == 0)
554 				break;
555 		} else {
556 			/* Compare given nodeid against nodeid in member list */
557 			if (nl2->msl_node_id == node_id)
558 				break;
559 		}
560 		nl2 = nl2->next;
561 	}
562 	/* No match found in member list */
563 	if (nl2 == NULL) {
564 		return (0);
565 	}
566 	/* Return 1 if node is in member list */
567 	return (1);
568 }
569 
570 /*
571  * meta_getnext_devinfo should go to the host that
572  * has the device, to return the device name, driver name, minor num.
573  * We can take the big cheat for now, since it is a requirement
574  * that the device names and device numbers are the same, and
575  * just get the info locally.
576  *
577  * This routine is very similar to meta_getnextside_devinfo except
578  * that the specific side to be used is being passed in.
579  *
580  * Exit status:
581  *	 0 - No more side info to return
582  *	 1 - More side info's to return
583  *	-1 - An error has been detected
584  */
585 /*ARGSUSED*/
586 int
587 meta_getside_devinfo(
588 	mdsetname_t	*sp,		/* for this set */
589 	char		*bname,		/* local block name (myside) */
590 	side_t		sideno,		/* sideno */
591 	char		**ret_bname,	/* block device name of returned side */
592 	char		**ret_dname,	/* driver name of returned side */
593 	minor_t		*ret_mnum,	/* minor number of returned side */
594 	md_error_t	*ep
595 )
596 {
597 	mdname_t	*np;
598 
599 	if (ret_bname != NULL)
600 		*ret_bname = NULL;
601 	if (ret_dname != NULL)
602 		*ret_dname = NULL;
603 	if (ret_mnum != NULL)
604 		*ret_mnum = NODEV32;
605 
606 
607 	if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL)
608 		return (-1);
609 
610 /*
611  * NOTE (future) - There will be more work here once devids are integrated
612  * into disksets.  Then the side should be used to find the correct
613  * host and the b/d names should be gotten from that host.
614  */
615 
616 	/*
617 	 * Return the side info.
618 	 */
619 	if (ret_bname != NULL)
620 		*ret_bname = Strdup(np->bname);
621 
622 	if (ret_dname != NULL) {
623 		mdcinfo_t	*cinfo;
624 
625 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
626 			return (-1);
627 
628 		*ret_dname = Strdup(cinfo->dname);
629 	}
630 
631 	if (ret_mnum != NULL)
632 		*ret_mnum = meta_getminor(np->dev);
633 
634 	return (1);
635 }
636 
637 /*
638  * Get the information on the device from the remote node using the devid
639  * of the disk.
640  *
641  * Exit status:
642  *	 0 - No more side info to return
643  *	 1 - More side info's to return
644  *	-1 - An error has been detected
645  */
646 int
647 meta_getnextside_devinfo(
648 	mdsetname_t	*sp,		/* for this set */
649 	char		*bname,		/* local block name (myside) */
650 	side_t		*sideno,	/* previous sideno & returned sideno */
651 	char		**ret_bname,	/* block device name of returned side */
652 	char		**ret_dname,	/* driver name of returned side */
653 	minor_t		*ret_mnum,	/* minor number of returned side */
654 	md_error_t	*ep
655 )
656 {
657 	md_set_desc	*sd;
658 	int		i;
659 	mdname_t	*np;
660 	mddrivename_t	*dnp;
661 	char		*devidstr = NULL;
662 	int		devidstrlen;
663 	md_dev64_t	retdev = NODEV64;
664 	char		*ret_devname = NULL;
665 	char		*ret_blkdevname = NULL;
666 	char		*ret_driver = NULL;
667 	char		*nodename;
668 	int		fd;
669 	int		ret = -1;
670 	char		*minor_name = NULL;
671 	md_mnnode_desc	*nd;
672 
673 
674 	if (ret_bname != NULL)
675 		*ret_bname = NULL;
676 	if (ret_dname != NULL)
677 		*ret_dname = NULL;
678 	if (ret_mnum != NULL)
679 		*ret_mnum = NODEV32;
680 
681 	if (metaislocalset(sp)) {
682 		/* no more sides - we are done */
683 		if (*sideno != MD_SIDEWILD)
684 			return (0);
685 
686 		/* First time through -  set up return sideno */
687 		*sideno = 0;
688 	} else {
689 
690 		/*
691 		 * Find the next sideno, starting after the one given.
692 		 */
693 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
694 			return (-1);
695 
696 		if (MD_MNSET_DESC(sd)) {
697 			nd = sd->sd_nodelist;
698 			if ((*sideno == MD_SIDEWILD) &&
699 			    (nd != (struct md_mnnode_desc *)NULL)) {
700 				*sideno = nd->nd_nodeid;
701 			} else {
702 				while (nd) {
703 					/*
704 					 * Found given sideno, now find
705 					 * next sideno, if there is one.
706 					 */
707 					if ((*sideno == nd->nd_nodeid) &&
708 					    (nd->nd_next !=
709 					    (struct md_mnnode_desc *)NULL)) {
710 						*sideno =
711 						    nd->nd_next->nd_nodeid;
712 						break;
713 					}
714 					nd = nd->nd_next;
715 				}
716 				if (nd == NULL) {
717 					return (0);
718 				}
719 			}
720 			if (*sideno == MD_SIDEWILD)
721 				return (0);
722 		} else {
723 			for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
724 				/* Find next full slot */
725 				if (sd->sd_nodes[i][0] != '\0')
726 					break;
727 
728 			/* No more sides - we are done */
729 			if (i == MD_MAXSIDES)
730 				return (0);
731 
732 			/* Set up the return sideno */
733 			*sideno = i;
734 			nodename = (char *)sd->sd_nodes[i];
735 		}
736 	}
737 
738 	/*
739 	 * Need to pass the node the devid of the disk and get it to
740 	 * send back the details of the disk from that side.
741 	 */
742 	if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL)
743 		return (-1);
744 
745 	dnp = np->drivenamep;
746 
747 	/*
748 	 * By default, set up the parameters so that they are copied out.
749 	 */
750 	if (ret_bname != NULL)
751 		*ret_bname = Strdup(np->bname);
752 
753 	if (ret_dname != NULL) {
754 		mdcinfo_t	*cinfo;
755 
756 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
757 			return (-1);
758 
759 		*ret_dname = Strdup(cinfo->dname);
760 	}
761 
762 	if (ret_mnum != NULL)
763 		*ret_mnum = meta_getminor(np->dev);
764 
765 	/*
766 	 * Try some optimization. If this is the local set or the device
767 	 * is a metadevice then just copy the information. If the device
768 	 * does not have a devid (due to not having a minor name) then
769 	 * fall back to the pre-devid behaviour of copying the information
770 	 * on the device: this is okay because the sanity checks before this
771 	 * call would have found any issues with the device. If it's a
772 	 * multi-node diskset also just return ie. copy.
773 	 */
774 	if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
775 	    (MD_MNSET_DESC(sd)))
776 		return (1);
777 
778 	if (np->minor_name == (char *)NULL) {
779 		/*
780 		 * Have to get the minor name then. The slice should exist
781 		 * on the disk because it will have already been repartitioned
782 		 * up prior to getting to this point.
783 		 */
784 		if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
785 			(void) mdsyserror(ep, errno, np->bname);
786 			return (-1);
787 		}
788 		(void) devid_get_minor_name(fd, &minor_name);
789 		np->minor_name = Strdup(minor_name);
790 		devid_str_free(minor_name);
791 		(void) close(fd);
792 	}
793 
794 	/* allocate extra space for "/" and NULL hence +2 */
795 	devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
796 	devidstr = (char *)Malloc(devidstrlen);
797 
798 	/*
799 	 * As a minor name is supplied then the ret_devname will be
800 	 * appropriate to that minor_name and in this case it will be
801 	 * a block device ie /dev/dsk.
802 	 */
803 	(void) snprintf(devidstr, devidstrlen,
804 	    "%s/%s", dnp->devid, np->minor_name);
805 
806 	ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
807 	    np->bname, &ret_devname, &ret_driver, ep);
808 
809 	Free(devidstr);
810 
811 	/*
812 	 * If the other side is not running device id in disksets,
813 	 * 'ret' is set to ENOTSUP in which case we fallback to
814 	 * the existing behaviour
815 	 */
816 	if (ret == ENOTSUP)
817 		return (1);
818 	else if (ret == -1)
819 		return (-1);
820 
821 	/*
822 	 * ret_devname comes from the rpc call and is a
823 	 * raw device name. We need to make this into a
824 	 * block device via blkname for further processing.
825 	 * Unfortunately, when our device id isn't found in
826 	 * the system, the rpc call will return a " " in
827 	 * ret_devname in which case we need to fill that in
828 	 * as ret_blkname because blkname of " " returns NULL.
829 	 */
830 	if (ret_bname != NULL && ret_devname != NULL) {
831 		ret_blkdevname = blkname(ret_devname);
832 		if (ret_blkdevname == NULL)
833 			*ret_bname = Strdup(ret_devname);
834 		else
835 			*ret_bname = Strdup(ret_blkdevname);
836 	}
837 
838 	if (ret_dname != NULL && ret_driver != NULL)
839 		*ret_dname = Strdup(ret_driver);
840 
841 	if (ret_mnum != NULL)
842 		*ret_mnum = meta_getminor(retdev);
843 
844 	return (1);
845 }
846 
847 int
848 meta_is_drive_in_anyset(
849 	mddrivename_t	*dnp,
850 	mdsetname_t	**spp,
851 	int		bypass_daemon,
852 	md_error_t 	*ep
853 )
854 {
855 	set_t		setno;
856 	mdsetname_t	*this_sp;
857 	int		is_it;
858 	set_t		max_sets;
859 
860 	if ((max_sets = get_max_sets(ep)) == 0)
861 		return (-1);
862 
863 	assert(spp != NULL);
864 	*spp = NULL;
865 
866 	for (setno = 1; setno < max_sets; setno++) {
867 		if (!bypass_daemon) {
868 			if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
869 				if (mdismddberror(ep, MDE_DB_NODB)) {
870 					mdclrerror(ep);
871 					return (0);
872 				}
873 				if (mdiserror(ep, MDE_NO_SET)) {
874 					mdclrerror(ep);
875 					continue;
876 				}
877 				return (-1);
878 			}
879 		} else
880 			this_sp = metafakesetname(setno, NULL);
881 
882 		if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
883 		    bypass_daemon, ep)) == -1) {
884 			if (mdiserror(ep, MDE_NO_SET)) {
885 				mdclrerror(ep);
886 				continue;
887 			}
888 			return (-1);
889 		}
890 		if (is_it) {
891 			*spp = this_sp;
892 			return (0);
893 		}
894 	}
895 	return (0);
896 }
897 
898 int
899 meta_is_drive_in_thisset(
900 	mdsetname_t	*sp,
901 	mddrivename_t	*dnp,
902 	int		bypass_daemon,
903 	md_error_t	*ep
904 )
905 {
906 	md_drive_desc	*dd, *p;
907 
908 	if (bypass_daemon)
909 		dd = dr2drivedesc(sp, MD_SIDEWILD,
910 		    (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
911 	else
912 		dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
913 
914 	if (dd == NULL) {
915 		if (! mdisok(ep))
916 			return (-1);
917 		return (0);
918 	}
919 
920 
921 	for (p = dd; p != NULL; p = p->dd_next)
922 		if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
923 			return (1);
924 	return (0);
925 }
926 
927 /*
928  * Check to see if devid is in use in any diskset.
929  * This is used in the case when a partial diskset is being imported
930  * to make sure that the unvailable drive isn't already in use in an
931  * already imported partial diskset.  Can't check on the cname since the
932  * unavailable disk's cname is from the previous system and may collide
933  * with a cname on this system.
934  * Return values:
935  *	1: devid has been found in a diskset
936  *	0: devid not found in any diskset
937  */
938 int
939 meta_is_devid_in_anyset(
940 	void		*devid,
941 	mdsetname_t	**spp,
942 	md_error_t 	*ep
943 )
944 {
945 	set_t		setno;
946 	mdsetname_t	*this_sp;
947 	int		is_it;
948 	set_t		max_sets;
949 
950 	if ((max_sets = get_max_sets(ep)) == 0)
951 		return (-1);
952 
953 	assert(spp != NULL);
954 	*spp = NULL;
955 
956 	for (setno = 1; setno < max_sets; setno++) {
957 		if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
958 			if (mdismddberror(ep, MDE_DB_NODB)) {
959 				mdclrerror(ep);
960 				return (0);
961 			}
962 			if (mdiserror(ep, MDE_NO_SET)) {
963 				mdclrerror(ep);
964 				continue;
965 			}
966 			return (-1);
967 		}
968 
969 		if ((is_it = meta_is_devid_in_thisset(this_sp,
970 		    devid, ep)) == -1) {
971 			if (mdiserror(ep, MDE_NO_SET)) {
972 				mdclrerror(ep);
973 				continue;
974 			}
975 			return (-1);
976 		}
977 		if (is_it) {
978 			*spp = this_sp;
979 			return (0);
980 		}
981 	}
982 	return (0);
983 }
984 
985 int
986 meta_is_devid_in_thisset(
987 	mdsetname_t	*sp,
988 	void		*devid,
989 	md_error_t	*ep
990 )
991 {
992 	md_drive_desc	*dd, *p;
993 	ddi_devid_t	dd_devid;
994 
995 	dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
996 	if (dd == NULL) {
997 		if (! mdisok(ep))
998 			return (-1);
999 		return (0);
1000 	}
1001 
1002 	for (p = dd; p != NULL; p = p->dd_next) {
1003 		if (p->dd_dnp->devid == NULL)
1004 			continue;
1005 		(void) devid_str_decode(p->dd_dnp->devid,
1006 		    &dd_devid, NULL);
1007 		if (dd_devid == NULL)
1008 			continue;
1009 		if (devid_compare(devid, dd_devid) == 0) {
1010 			devid_free(dd_devid);
1011 			return (1);
1012 		}
1013 		devid_free(dd_devid);
1014 	}
1015 	return (0);
1016 }
1017 
1018 int
1019 meta_set_balance(
1020 	mdsetname_t		*sp,
1021 	md_error_t		*ep
1022 )
1023 {
1024 	md_set_desc		*sd;
1025 	md_drive_desc		*dd, *curdd;
1026 	daddr_t			dbsize;
1027 	daddr_t			nblks;
1028 	int			i;
1029 	int			rval = 0;
1030 	sigset_t		oldsigs;
1031 	md_setkey_t		*cl_sk;
1032 	md_error_t		xep = mdnullerror;
1033 	md_mnnode_desc		*nd;
1034 	int			suspend1_flag = 0;
1035 
1036 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1037 		return (-1);
1038 
1039 	dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
1040 
1041 	/* Make sure we own the set */
1042 	if (meta_check_ownership(sp, ep) != 0)
1043 		return (-1);
1044 
1045 	/* END CHECK CODE */
1046 
1047 	/*
1048 	 * Get drive descriptors for the drives that are currently in the set.
1049 	 */
1050 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
1051 
1052 	if (! mdisok(ep))
1053 		return (-1);
1054 
1055 	/* Find the minimum replica size in use is or use the default */
1056 	if ((nblks = meta_db_minreplica(sp, ep)) < 0)
1057 		mdclrerror(ep);
1058 	else
1059 		dbsize = nblks;	/* adjust replica size */
1060 
1061 	/* Make sure we are blocking all signals */
1062 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
1063 		mdclrerror(&xep);
1064 
1065 	/*
1066 	 * Lock the set on current set members.
1067 	 * For MN diskset lock_set and SUSPEND are used to protect against
1068 	 * other meta* commands running on the other nodes.
1069 	 */
1070 	if (MD_MNSET_DESC(sd)) {
1071 		nd = sd->sd_nodelist;
1072 		while (nd) {
1073 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1074 				nd = nd->nd_next;
1075 				continue;
1076 			}
1077 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1078 				rval = -1;
1079 				goto out;
1080 			}
1081 			nd = nd->nd_next;
1082 		}
1083 		/*
1084 		 * Lock out other meta* commands by suspending
1085 		 * class 1 messages across the diskset.
1086 		 */
1087 		nd = sd->sd_nodelist;
1088 		while (nd) {
1089 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1090 				nd = nd->nd_next;
1091 				continue;
1092 			}
1093 			if (clnt_mdcommdctl(nd->nd_nodename,
1094 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1095 			    MD_MSCF_NO_FLAGS, ep)) {
1096 				rval = -1;
1097 				goto out;
1098 			}
1099 			suspend1_flag = 1;
1100 			nd = nd->nd_next;
1101 		}
1102 	} else {
1103 		for (i = 0; i < MD_MAXSIDES; i++) {
1104 			/* Skip empty slots */
1105 			if (sd->sd_nodes[i][0] == '\0') continue;
1106 
1107 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1108 				rval = -1;
1109 				goto out;
1110 			}
1111 		}
1112 	}
1113 
1114 	/* We are not adding or deleting any drives, just balancing */
1115 	dd = NULL;
1116 
1117 	/*
1118 	 * Balance the DB's according to the list of existing drives and the
1119 	 * list of added drives.
1120 	 */
1121 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
1122 		goto out;
1123 
1124 out:
1125 	/*
1126 	 * Unlock diskset by resuming class 1 messages across the diskset.
1127 	 * Just resume all classes so that resume is the same whether
1128 	 * just one class was locked or all classes were locked.
1129 	 */
1130 	if (suspend1_flag) {
1131 		nd = sd->sd_nodelist;
1132 		while (nd) {
1133 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1134 				nd = nd->nd_next;
1135 				continue;
1136 			}
1137 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1138 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1139 				/*
1140 				 * We are here because we failed to resume
1141 				 * rpc.mdcommd.  However we potentially have
1142 				 * an error from the previous call
1143 				 * (meta_db_balance). If the previous call
1144 				 * did fail,  we capture that error and
1145 				 * generate a perror withthe string,
1146 				 * "Unable to resume...".
1147 				 * Setting rval to -1 ensures that in the
1148 				 * next iteration of the loop, ep is not
1149 				 * clobbered.
1150 				 */
1151 				if (rval == 0)
1152 					(void) mdstealerror(ep, &xep);
1153 				else
1154 					mdclrerror(&xep);
1155 				rval = -1;
1156 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1157 				    "Unable to resume rpc.mdcommd."));
1158 			}
1159 			nd = nd->nd_next;
1160 		}
1161 	}
1162 
1163 	/* Unlock the set */
1164 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1165 	if (MD_MNSET_DESC(sd)) {
1166 		nd = sd->sd_nodelist;
1167 		while (nd) {
1168 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1169 				nd = nd->nd_next;
1170 				continue;
1171 			}
1172 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1173 				if (rval == 0)
1174 					(void) mdstealerror(ep, &xep);
1175 				else
1176 					mdclrerror(&xep);
1177 				rval = -1;
1178 			}
1179 			nd = nd->nd_next;
1180 		}
1181 	} else {
1182 		for (i = 0; i < MD_MAXSIDES; i++) {
1183 			/* Skip empty slots */
1184 			if (sd->sd_nodes[i][0] == '\0')
1185 				continue;
1186 
1187 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1188 				if (rval == 0)
1189 					(void) mdstealerror(ep, &xep);
1190 				rval = -1;
1191 			}
1192 		}
1193 	}
1194 
1195 	/* release signals back to what they were on entry */
1196 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1197 		mdclrerror(&xep);
1198 
1199 	cl_set_setkey(NULL);
1200 
1201 	metaflushsetname(sp);
1202 
1203 	return (rval);
1204 }
1205 
/*
 * meta_set_destroy
 *  Destroy a traditional (non multi-node) diskset as seen from this node.
 *
 *  With all signals blocked the routine:
 *	- counts the nodes for which nodehasset() does not fail (num_users),
 *	- optionally locks the set on this node (lock_set == TRUE),
 *	- stops if a drive is still in use, unless META_DESTROY_SET_OK is
 *	  present in the environment,
 *	- calls clnt_deldrvs() for the local node and, when num_users == 1,
 *	  detaches the replicas drive by drive,
 *	- halts the set, sends an updated mediator record to the mediator
 *	  hosts and calls clnt_delset() for the local node.
 *
 *  Many of the individual failures are deliberately cleared and ignored so
 *  that as much of the set as possible gets torn down.
 *
 * Parameters:
 *	sp	 - set to destroy
 *	lock_set - TRUE to lock the set on this node around the operation
 *	ep	 - error return
 *
 * Returns 0 or -1.  Note that two paths leave without recording an error
 * in ep: a multi-node set (returns -1), and a failed sdssc_delete_begin()
 * on a set that is not an auto-take set (ep is cleared and rval is left
 * unchanged).
 */
int
meta_set_destroy(
	mdsetname_t	*sp,
	int		lock_set,
	md_error_t	*ep
)
{
	int		i;
	med_rec_t	medr;
	md_set_desc	*sd;
	md_drive_desc	*dd, *p, *p1;
	mddrivename_t	*dnp;
	mdname_t	*np;
	mdnamelist_t	*nlp = NULL;
	int		num_users = 0;
	int		has_set;
	side_t		mysideno;
	sigset_t	oldsigs;
	md_error_t	xep = mdnullerror;
	md_setkey_t	*cl_sk;
	int		rval = 0;
	int		delete_end = 1;

	/* Make sure we are blocking all signals */
	if (procsigs(TRUE, &oldsigs, ep) < 0)
		return (-1);

	/* A NULL set descriptor is only treated as a failure if ep is set */
	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
		if (! mdisok(ep))
			rval = -1;
		goto out;
	}

	/*
	 * meta_set_destroy should not be called for a MN diskset.
	 * This routine destroys a set without communicating this information
	 * to the other nodes which would lead to an inconsistency in
	 * the MN diskset.
	 */
	if (MD_MNSET_DESC(sd)) {
		rval = -1;
		goto out;
	}

	/* Continue if a traditional diskset */

	/*
	 * Check to see who has the set.  If we are not the last user of the
	 * set, we will not touch the replicas.
	 */
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Skip empty slots */
		if (sd->sd_nodes[i][0] == '\0')
			continue;

		has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
		    ep);

		/* Only a negative return is excluded from the user count */
		if (has_set < 0) {
			mdclrerror(ep);
		} else
			num_users++;
	}

	/* dd may legitimately be NULL (no error in ep): set has no drives */
	if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
		if (! mdisok(ep)) {
			rval = -1;
			goto out;
		}
	}

	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
		rval = -1;
		goto out;
	}

	if (lock_set == TRUE) {
		/* Lock the set on our side */
		if (clnt_lock_set(mynode(), sp, ep)) {
			rval = -1;
			goto out;
		}
	}

	/*
	 * A traditional diskset has no diskset stale information to send
	 * since there can only be one owner node at a time.
	 */
	if (snarf_set(sp, FALSE, ep))
		mdclrerror(ep);

	if (dd != NULL) {
		/*
		 * Make sure that no drives are in use as parts of metadrives
		 * or hot spare pools, this is one of the few error conditions
		 * that will stop this routine, unless the environment has
		 * META_DESTROY_SET_OK set, in which case, the operation will
		 * proceed.
		 */
		if (getenv("META_DESTROY_SET_OK") == NULL) {
			for (p = dd; p != NULL; p = p->dd_next) {
				dnp = p->dd_dnp;

				i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
				if (i == -1) {
					/*
					 * The in-use error stays in ep for the
					 * caller; every exit from this branch
					 * returns -1.
					 */
					/* need xep - wire calls clear error */
					i = metaget_setownership(sp, &xep);
					if (i == -1) {
						rval = -1;
						goto out;
					}

					mysideno = getmyside(sp, &xep);

					if (mysideno == MD_SIDEWILD) {
						rval = -1;
						goto out;
					}

					/* Halt the set if this node does not own it */
					if (sd->sd_isown[mysideno] == FALSE)
						if (halt_set(sp, &xep)) {
							rval = -1;
							goto out;
						}

					rval = -1;
					goto out;
				}
			}
		}

		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Skip non local nodes */
			if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
				continue;

			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
				mdclrerror(ep);
		}

		/*
		 * Go thru each drive and individually delete the replicas.
		 * This way we can ignore individual errors.
		 */
		for (p = dd; p != NULL; p = p->dd_next) {
			uint_t	rep_slice;

			dnp = p->dd_dnp;
			/*
			 * Fail if the replica slice cannot be determined or if
			 * neither that slice nor slice 0 can be named.
			 */
			if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
			    (((np = metaslicename(dnp, rep_slice, ep))
			    == NULL) &&
			    ((np = metaslicename(dnp, MD_SLICE0, ep))
			    == NULL))) {
				rval = -1;
				goto out;
			}

			/*
			 * Same lookup again; when the fallback to slice 0 is
			 * taken, the error left in ep by the failed replica
			 * slice lookup is cleared here.
			 */
			if ((np = metaslicename(dnp,
			    rep_slice, ep)) == NULL) {
				if ((np = metaslicename(dnp,
				    MD_SLICE0, ep)) == NULL) {
					rval = -1;
					goto out;
				}
				mdclrerror(ep);
			}

			/*
			 * Yes this is UGLY!!!
			 * Temporarily cut the list after p so that
			 * rel_own_bydd() is handed only this one drive.
			 */
			p1 = p->dd_next;
			p->dd_next = NULL;
			if (rel_own_bydd(sp, p, FALSE, ep))
				mdclrerror(ep);
			p->dd_next = p1;

			if (p->dd_dbcnt == 0)
				continue;

			/*
			 * Skip the replica removal if we are not the last user
			 */
			if (num_users != 1)
				continue;

			/* Detach via a one-element name list; errors are ignored */
			nlp = NULL;
			(void) metanamelist_append(&nlp, np);
			if (meta_db_detach(sp, nlp,
			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
				mdclrerror(ep);
			metafreenamelist(nlp);
		}
	}

	if (halt_set(sp, ep)) {
		rval = -1;
		goto out;
	}

	/* Setup the mediator record */
	(void) memset(&medr, '\0', sizeof (med_rec_t));
	medr.med_rec_mag = MED_REC_MAGIC;
	medr.med_rec_rev = MED_REC_REV;
	medr.med_rec_fl  = 0;
	medr.med_rec_sn  = sp->setno;
	(void) strcpy(medr.med_rec_snm, sp->setname);
	medr.med_rec_meds = sd->sd_med;	/* structure assignment */
	(void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
	medr.med_rec_foff = 0;

	/*
	 * If we are the last remaining user, then remove the mediator hosts
	 */
	if (num_users == 1) {
		for (i = 0; i < MED_MAX_HOSTS; i++) {
			if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
				SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
				    SVM_TAG_MEDIATOR, sp->setno, i);
			(void) memset(&medr.med_rec_meds.n_lst[i], '\0',
			    sizeof (md_h_t));
		}
		medr.med_rec_meds.n_cnt = 0;
	} else { 	/* Remove this host from the mediator node list. */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Copy non local node */
			if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
				(void) strcpy(medr.med_rec_nodes[i],
				    sd->sd_nodes[i]);
				continue;
			}

			/* Clear local node */
			(void) memset(&medr.med_rec_nodes[i], '\0',
			    sizeof (md_node_nm_t));
		}
	}

	/* Checksum the finished record before it is sent out */
	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);

	/*
	 * If the client is part of a cluster put the DCS service
	 * into a deleting state.
	 *
	 * On failure: for an auto-take set carry on but skip the matching
	 * sdssc_delete_end() call; otherwise clear ep and leave with rval
	 * unchanged.
	 */
	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
		if (metad_isautotakebyname(sp->setname)) {
			delete_end = 0;
		} else {
			mdclrerror(ep);
			goto out;
		}
	}

	/* Inform the mediator hosts of the new information */
	for (i = 0; i < MED_MAX_HOSTS; i++) {
		if (sd->sd_med.n_lst[i].a_cnt == 0)
			continue;

		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
			mdclrerror(ep);
	}

	/* Delete the set locally */
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Skip empty slots */
		if (sd->sd_nodes[i][0] == '\0')
			continue;

		/* Skip non local nodes */
		if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
			continue;

		if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
			mdclrerror(ep);
	}
	if (delete_end &&
	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
		rval = -1;

out:
	/*
	 * Common exit.  Only the first error is kept in ep: an error in xep
	 * is stolen into ep only while rval is still 0.
	 */
	/* release signals back to what they were on entry */
	if (procsigs(FALSE, &oldsigs, &xep) < 0) {
		if (rval == 0)
			(void) mdstealerror(ep, &xep);
		rval = -1;
	}

	/*
	 * The unlock is attempted whenever lock_set is TRUE, including on
	 * paths that jumped here before clnt_lock_set() was called.
	 */
	if (lock_set == TRUE) {
		cl_sk = cl_get_setkey(sp->setno, sp->setname);
		if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
			if (rval == 0)
				(void) mdstealerror(ep, &xep);
			rval = -1;
		}
		cl_set_setkey(NULL);
	}

	metaflushsetname(sp);
	return (rval);
}
1511 
/*
 * meta_set_purge
 *  Purge a diskset from this node: lock the set on the nodes that know
 *  about it, call clnt_delset() for this node, ask the other nodes to drop
 *  this host via clnt_delhosts(), then unlock the set again.
 *
 * Parameters:
 *	sp		- set to purge
 *	bypass_cluster	- non-zero to skip the sdssc_delete_begin/end calls
 *	forceflg	- non-zero to ignore RPC errors from lock/unlock calls
 *	ep		- error return
 *
 * Return values:
 *	0 - no failure recorded
 *	1 - the set description could not be obtained
 *	2 - failed to lock the set on a node
 *	3 - sdssc_delete_begin() failed and the set is not an auto-take set
 *	4 - sdssc_delete_end(SDSSC_COMMIT) failed
 *	5 - failed to unlock the set on a node (overrides any earlier value)
 *
 * NOTE(review): a clnt_delset() failure prints a message, clears ep and
 * jumps to out1 without changing rval, so 0 can be returned in that case;
 * confirm against callers whether that is intended.
 */
int
meta_set_purge(
	mdsetname_t	*sp,
	int		bypass_cluster,
	int		forceflg,
	md_error_t	*ep
)
{
	char		*thishost = mynode();
	md_set_desc	*sd;
	md_setkey_t	*cl_sk;
	md_error_t	xep = mdnullerror;
	int		rval = 0;
	int		i, num_hosts = 0;
	int		has_set = 0;
	int		max_node = 0;
	int		delete_end = 1;
	md_mnnode_desc	*nd;

	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
		/* unable to find set description */
		rval = 1;
		return (rval);
	}

	if (MD_MNSET_DESC(sd)) {
		/*
		 * Get a count of the hosts in the set and also lock the set
		 * on those hosts that know about it.
		 */
		nd = sd->sd_nodelist;
		while (nd) {
			/*
			 * Only deal with those nodes that are members of
			 * the set (MD_MN_NODE_ALIVE) or the node on which
			 * the purge is being run. We must lock the set
			 * on the purging node because the delset call
			 * requires the lock to be set.
			 */
			if (!(nd->nd_flags & MD_MN_NODE_ALIVE) &&
			    nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
				nd = nd->nd_next;
				continue;
			}
			has_set = nodehasset(sp, nd->nd_nodename,
			    NHS_NST_EQ, ep);

			/*
			 * The host is not aware of this set (has_set < 0) or
			 * the set does not match (has_set == 0). This check
			 * prevents the code getting confused by an apparent
			 * inconsistency in the set's state, this is in the
			 * purge code so something is broken in any case and
			 * this is just trying to fix the brokenness.
			 */
			if (has_set <= 0) {
				mdclrerror(ep);
				nd->nd_flags |= MD_MN_NODE_NOSET;
			} else {
				num_hosts++;
				if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
					/*
					 * If the force flag is set then
					 * ignore any RPC failures because we
					 * are only really interested with
					 * the set on local node.
					 */
					if (forceflg && mdanyrpcerror(ep)) {
						mdclrerror(ep);
					} else {
						/*
						 * set max_node so that in the
						 * unlock code nodes in the
						 * set that have not been
						 * locked are not unlocked.
						 */
						max_node = nd->nd_nodeid;
						rval = 2;
						goto out1;
					}
				}

			}
			nd = nd->nd_next;
		}
		/* Every lock attempt was accepted: no stop point for unlock */
		max_node = 0;
	} else {
		/*
		 * Get a count of the hosts in the set and also lock the set
		 * on those hosts that know about it.
		 */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			has_set = nodehasset(sp, sd->sd_nodes[i],
			    NHS_NST_EQ, ep);

			/*
			 * The host is not aware of this set (has_set < 0) or
			 * the set does not match (has_set == 0). This check
			 * prevents the code getting confused by an apparent
			 * inconsistency in the set's state, this is in the
			 * purge code so something is broken in any case and
			 * this is just trying to fix the brokenness.
			 */
			if (has_set <= 0) {
				mdclrerror(ep);
				/*
				 * set the node to NULL to prevent further
				 * requests to this unresponsive node.
				 */
				sd->sd_nodes[i][0] = '\0';
			} else {
				num_hosts++;
				if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
					/*
					 * If the force flag is set then
					 * ignore any RPC failures because we
					 * are only really interested with
					 * the set on local node.
					 */
					if (forceflg && mdanyrpcerror(ep)) {
						mdclrerror(ep);
					} else {
						rval = 2;
						/*
						 * set max_node so that in the
						 * unlock code nodes in the
						 * set that have not been
						 * locked are not unlocked.
						 */
						max_node = i;
						goto out1;
					}
				}
			}
		}
		max_node = i;	/* now MD_MAXSIDES */
	}
	if (!bypass_cluster) {
		/*
		 * If there is only one host associated with the
		 * set then remove the set from the cluster.
		 */
		if (num_hosts == 1) {
			if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
				/*
				 * For an auto-take set carry on, but skip the
				 * SDSSC_COMMIT call made further down.
				 */
				if (metad_isautotakebyname(sp->setname)) {
					delete_end = 0;
				} else {
					mdclrerror(ep);
					rval = 3;
					goto out1;
				}
			}
		}
	}

	if (MD_MNSET_DESC(sd)) {
		nd = sd->sd_nodelist;
		while (nd) {
			if (nd->nd_nodeid == sd->sd_mn_mynode->nd_nodeid) {
				/*
				 * This is the node on which the purge is
				 * being run. We do not care if it is
				 * alive or not, just want to get rid of
				 * the set.
				 */
				if (clnt_delset(nd->nd_nodename, sp,
				    ep) == -1) {
					/* rval is not changed on this path */
					md_perror(dgettext(TEXT_DOMAIN,
					    "delset"));
					if (!bypass_cluster && num_hosts == 1)
						(void) sdssc_delete_end(
						    sp->setname, SDSSC_CLEANUP);
					mdclrerror(ep);
					goto out1;
				}
				nd = nd->nd_next;
				continue;
			}

			/*
			 * Only contact those nodes that are members of
			 * the set.
			 */
			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
				nd = nd->nd_next;
				continue;
			}

			/*
			 * Tell the remote node to remove this node
			 */
			if (clnt_delhosts(nd->nd_nodename, sp, 1, &thishost,
			    ep) == -1) {
				/*
				 * If we fail to delete ourselves
				 * from the remote host it does not
				 * really matter because the set is
				 * being "purged" from this node. The
				 * set can be purged from the other
				 * node at a later time.
				 */
				mdclrerror(ep);
			}
			nd = nd->nd_next;
		}
	} else {
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;
			if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
				/*
				 * Tell the remote node to remove this node
				 */
				if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
				    &thishost, ep) == -1) {
					/*
					 * If we fail to delete ourselves
					 * from the remote host it does not
					 * really matter because the set is
					 * being "purged" from this node. The
					 * set can be purged from the other
					 * node at a later time.
					 */
					mdclrerror(ep);
				}
				continue;
			}

			/* remove the set from this host */
			if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
				/* rval is not changed on this path */
				md_perror(dgettext(TEXT_DOMAIN, "delset"));
				if (!bypass_cluster && num_hosts == 1)
					(void) sdssc_delete_end(sp->setname,
					    SDSSC_CLEANUP);
				mdclrerror(ep);
				goto out1;
			}
		}
	}

	if (!bypass_cluster && num_hosts == 1) {
		if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
		    SDSSC_ERROR) {
			rval = 4;
		}
	}

out1:

	cl_sk = cl_get_setkey(sp->setno, sp->setname);

	/*
	 * Remove the set lock on those nodes that had the set locked
	 * max_node will either be MD_MAXSIDES or array index of the last
	 * node contacted (or rather failed to contact) for traditional
	 * diskset.  For a MN diskset, max_node is the node_id of the node
	 * that failed the lock.
	 */
	if (MD_MNSET_DESC(sd)) {
		nd = sd->sd_nodelist;
		while (nd) {
			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
				nd = nd->nd_next;
				continue;
			}
			/* Stop at the node whose lock attempt failed */
			if (nd->nd_nodeid == max_node)
				break;
			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
				if (forceflg && mdanyrpcerror(&xep)) {
					mdclrerror(&xep);
					nd = nd->nd_next;
					continue;
				}
				/* Keep only the first error in ep */
				if (rval == 0)
					(void) mdstealerror(ep, &xep);
				rval = 5;
			}
			nd = nd->nd_next;
		}
	} else {
		for (i = 0; i < max_node; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
				if (forceflg && mdanyrpcerror(&xep)) {
					mdclrerror(&xep);
					continue;
				}
				/* Keep only the first error in ep */
				if (rval == 0)
					(void) mdstealerror(ep, &xep);
				rval = 5;
			}
		}
	}

	cl_set_setkey(NULL);

	return (rval);
}
1818 
1819 int
1820 meta_set_query(
1821 	mdsetname_t		*sp,
1822 	mddb_dtag_lst_t		**dtlpp,
1823 	md_error_t		*ep
1824 )
1825 {
1826 	mddb_dtag_get_parm_t	dtgp;
1827 
1828 	(void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
1829 	dtgp.dtgp_setno = sp->setno;
1830 
1831 	/*CONSTCOND*/
1832 	while (1) {
1833 		if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
1834 			if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
1835 			    *dtlpp == NULL)
1836 				return (mdstealerror(ep, &dtgp.dtgp_mde));
1837 			else
1838 				break;
1839 
1840 		/*
1841 		 * Run to the end of the list
1842 		 */
1843 		for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
1844 			/* void */;
1845 
1846 		*dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
1847 
1848 		(void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
1849 		    sizeof (mddb_dtag_t));
1850 
1851 		dtgp.dtgp_dt.dt_id++;
1852 	}
1853 	return (0);
1854 }
1855 
1856 /*
1857  * return drivename get by key
1858  */
1859 mddrivename_t *
1860 metadrivename_withdrkey(
1861 	mdsetname_t	*sp,
1862 	side_t		sideno,
1863 	mdkey_t		key,
1864 	int		flags,
1865 	md_error_t	*ep
1866 )
1867 {
1868 	char		*nm;
1869 	mdname_t	*np;
1870 	mddrivename_t	*dnp;
1871 	ddi_devid_t	devidp;
1872 	md_set_desc	*sd;
1873 
1874 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1875 		return (NULL);
1876 	}
1877 
1878 	/*
1879 	 * Get the devid associated with the key.
1880 	 *
1881 	 * If a devid was returned, it MUST be valid even in
1882 	 * the case where a device id has been "updated". The
1883 	 * "update" of the device id may have occured due to
1884 	 * a firmware upgrade.
1885 	 */
1886 	if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
1887 	    != NULL) {
1888 		/*
1889 		 * Look for the correct dnp using the devid for comparison.
1890 		 */
1891 		dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
1892 		free(devidp);
1893 
1894 		/* dnp could be NULL if the devid could not be decoded. */
1895 		if (dnp == NULL) {
1896 			return (NULL);
1897 		}
1898 		dnp->side_names_key = key;
1899 	} else {
1900 		/*
1901 		 * We didn't get a devid. We'll try for a dnp using the
1902 		 * name. If we have a MN diskset or if the dnp is a did
1903 		 * device, we're done because then we don't have devids.
1904 		 * Otherwise we'll try to set the devid
1905 		 * and get the dnp via devid again.
1906 		 * We also need to clear the ep structure. When the
1907 		 * above call to meta_getdidbykey returned a null, it
1908 		 * also put an error code into ep. In this case, the null
1909 		 * return is actually OK and any errors can be ignored. The
1910 		 * reason it is OK is because this could be a MN set or
1911 		 * we could  be running without devids (ex cluster).
1912 		 */
1913 		mdclrerror(ep);
1914 
1915 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key,
1916 		    ep)) == NULL)
1917 			return (NULL);
1918 		/* get device name */
1919 		if (flags & PRINT_FAST) {
1920 			if ((np = metaname_fast(&sp, nm,
1921 			    LOGICAL_DEVICE, ep)) == NULL) {
1922 				Free(nm);
1923 				return (NULL);
1924 			}
1925 		} else {
1926 			if ((np = metaname(&sp, nm, LOGICAL_DEVICE,
1927 			    ep)) == NULL) {
1928 				Free(nm);
1929 				return (NULL);
1930 			}
1931 		}
1932 		Free(nm);
1933 		/* make sure it's OK */
1934 		if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np,
1935 		    ep) != 0))
1936 			return (NULL);
1937 
1938 		/* get drivename */
1939 		dnp = np->drivenamep;
1940 		dnp->side_names_key = key;
1941 		/*
1942 		 * Skip the devid set/check for the following cases:
1943 		 * 1) If MN diskset, there are no devid's
1944 		 * 2) if dnp is did device
1945 		 * The device id is disabled for did device due to the
1946 		 * lack of minor name support in the did driver. The following
1947 		 * devid code path can set and propagate the error and
1948 		 * eventually prevent did disks from being added to the
1949 		 * diskset under SunCluster systems
1950 		 *
1951 		 * Note that this code can be called through rpc.mdcommd.
1952 		 * sdssc_version cannot be used because the library won't
1953 		 * be bound.
1954 		 */
1955 		if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/"))
1956 		    == 0) || (MD_MNSET_DESC(sd)))
1957 			goto out;
1958 
1959 		/*
1960 		 * It is okay if replica is not in devid mode
1961 		 */
1962 		if (mdissyserror(ep, MDDB_F_NODEVID)) {
1963 			mdclrerror(ep);
1964 			goto out;
1965 		}
1966 
1967 		/*
1968 		 * We're not MN or did devices but
1969 		 * devid is missing so this means that we have
1970 		 * just upgraded from a configuration where
1971 		 * devid's were not used so try to add in
1972 		 * the devid and requery. If the devid still isn't there,
1973 		 * that's OK. dnp->devid will be null as it is in any
1974 		 * configuration with no devids.
1975 		 */
1976 		if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, ep) < 0)
1977 			return (NULL);
1978 		if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
1979 		    sideno+SKEW, key, ep)) != NULL) {
1980 			/*
1981 			 * Found a devid so look for the dnp using the
1982 			 * devid as the search mechanism.
1983 			 */
1984 			dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
1985 			free(devidp);
1986 			if (dnp == NULL) {
1987 				return (NULL);
1988 			}
1989 			dnp->side_names_key = key;
1990 		}
1991 	}
1992 
1993 
1994 
1995 out:
1996 	if (flags & MD_BYPASS_DAEMON)
1997 		return (dnp);
1998 
1999 	if (get_sidenmlist(sp, dnp, ep))
2000 		return (NULL);
2001 
2002 	/* return success */
2003 	return (dnp);
2004 }
2005 
2006 void
2007 metafreedrivedesc(md_drive_desc **dd)
2008 {
2009 	md_drive_desc	*p, *next = NULL;
2010 
2011 	for (p = *dd; p != NULL; p = next) {
2012 		next = p->dd_next;
2013 		Free(p);
2014 	}
2015 	*dd = NULL;
2016 }
2017 
2018 md_drive_desc *
2019 metaget_drivedesc(
2020 	mdsetname_t	*sp,
2021 	int		flags,
2022 	md_error_t	*ep
2023 )
2024 {
2025 	side_t		sideno = MD_SIDEWILD;
2026 
2027 	assert(! (flags & MD_BYPASS_DAEMON));
2028 
2029 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
2030 		return (NULL);
2031 
2032 	return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
2033 }
2034 
2035 md_drive_desc *
2036 metaget_drivedesc_fromnamelist(
2037 	mdsetname_t	*sp,
2038 	mdnamelist_t	*nlp,
2039 	md_error_t	*ep
2040 )
2041 {
2042 	md_set_desc		*sd;
2043 	mdnamelist_t		*p;
2044 	md_drive_desc		*dd = NULL;
2045 
2046 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2047 		return (NULL);
2048 
2049 	for (p = nlp; p != NULL; p = p->next)
2050 		(void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
2051 		    sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
2052 
2053 	return (dd);
2054 }
2055 
2056 md_drive_desc *
2057 metaget_drivedesc_sideno(
2058 	mdsetname_t *sp,
2059 	side_t sideno,
2060 	int flags,
2061 	md_error_t *ep
2062 )
2063 {
2064 	md_set_desc	*sd = NULL;
2065 
2066 	assert(! (flags & MD_BYPASS_DAEMON));
2067 
2068 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2069 		return (NULL);
2070 
2071 	if (sd->sd_drvs)
2072 		return (sd->sd_drvs);
2073 
2074 	if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
2075 		return (NULL);
2076 
2077 	return (sd->sd_drvs);
2078 }
2079 
2080 int
2081 metaget_setownership(
2082 	mdsetname_t	*sp,
2083 	md_error_t	*ep
2084 )
2085 {
2086 	md_set_desc	*sd;
2087 	int		bool;
2088 	int		i;
2089 	md_mnnode_desc	*nd;
2090 
2091 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2092 		return (-1);
2093 
2094 	if (MD_MNSET_DESC(sd)) {
2095 		nd = sd->sd_nodelist;
2096 		while (nd) {
2097 			/* If node isn't alive, can't own diskset */
2098 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2099 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2100 				nd = nd->nd_next;
2101 				continue;
2102 			}
2103 			/*
2104 			 * If can't communicate with rpc.metad, then mark
2105 			 * this node as not an owner.  That node may
2106 			 * in fact, be an owner, but without rpc.metad running
2107 			 * that node can't do much.
2108 			 */
2109 			if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
2110 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2111 			} else if (bool == TRUE) {
2112 				nd->nd_flags |= MD_MN_NODE_OWN;
2113 			} else {
2114 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2115 			}
2116 			nd = nd->nd_next;
2117 		}
2118 		return (0);
2119 	}
2120 
2121 	/* Rest of code handles traditional disksets */
2122 
2123 	for (i = 0; i < MD_MAXSIDES; i++)
2124 		sd->sd_isown[i] = 0;
2125 
2126 	if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
2127 		return (-1);
2128 
2129 	if (bool == TRUE)
2130 		sd->sd_isown[getmyside(sp, ep)] = 1;
2131 
2132 	return (0);
2133 }
2134 
2135 char *
2136 mynode(void)
2137 {
2138 	static struct utsname	myuname;
2139 	static int		done = 0;
2140 
2141 	if (! done) {
2142 		if (uname(&myuname) == -1) {
2143 			md_perror(dgettext(TEXT_DOMAIN, "uname"));
2144 			assert(0);
2145 		}
2146 		done = 1;
2147 	}
2148 	return (myuname.nodename);
2149 }
2150 
/*
 * strinlst
 *  Report whether str is equal to one of the first cnt strings of lst.
 *
 * Returns TRUE if a match is found, FALSE otherwise (including when cnt
 * is zero or negative).
 */
int
strinlst(char *str, int cnt, char **lst)
{
	char	**cur = lst;
	int	remaining;

	for (remaining = cnt; remaining > 0; remaining--, cur++) {
		if (strcmp(*cur, str) == 0)
			return (TRUE);
	}

	return (FALSE);
}
2162 
2163 /*
2164  * meta_get_reserved_names
2165  *  returns an mdnamelist_t of reserved slices
2166  *  reserved slices are those that are used but don't necessarily
2167  *  show up as metadevices (ex. reserved slice for db in sets, logs)
2168  */
2169 
2170 /*ARGSUSED*/
2171 int
2172 meta_get_reserved_names(
2173 	mdsetname_t	*sp,
2174 	mdnamelist_t	**nlpp,
2175 	int		options,
2176 	md_error_t	*ep)
2177 {
2178 	int		 count		= 0;
2179 	mdname_t	*np		= NULL;
2180 	mdnamelist_t	*transnlp	= NULL;
2181 	mdnamelist_t	**tailpp 	= nlpp;
2182 	mdnamelist_t	*nlp;
2183 	md_drive_desc	*dd, *di;
2184 
2185 	if (metaislocalset(sp))
2186 		goto out;
2187 
2188 	if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
2189 		count = -1;
2190 		goto out;
2191 	}
2192 
2193 	/* db in for sets on reserved slice */
2194 	for (di = dd; di && count >= 0; di = di->dd_next) {
2195 		uint_t	rep_slice;
2196 
2197 		/*
2198 		 * Add the name struct to the end of the
2199 		 * namelist but keep a pointer to the last
2200 		 * element so that we don't incur the overhead
2201 		 * of traversing the list each time
2202 		 */
2203 		if (di->dd_dnp &&
2204 		    (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
2205 		    (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
2206 		    (tailpp = meta_namelist_append_wrapper(tailpp, np)))
2207 			count++;
2208 		else
2209 			count = -1;
2210 	}
2211 
2212 	/* now find logs */
2213 	if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
2214 		count = -1;
2215 		goto out;
2216 	}
2217 
2218 	for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
2219 		mdname_t	*transnp = nlp->namep;
2220 		md_trans_t	*transp;
2221 
2222 		if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
2223 			count = -1;
2224 			goto out;
2225 		}
2226 		if (transp->lognamep) {
2227 			/*
2228 			 * Add the name struct to the end of the
2229 			 * namelist but keep a pointer to the last
2230 			 * element so that we don't incur the overhead
2231 			 * of traversing the list each time
2232 			 */
2233 			tailpp = meta_namelist_append_wrapper(
2234 			    tailpp, transp->lognamep);
2235 		}
2236 	}
2237 out:
2238 	metafreenamelist(transnlp);
2239 	return (count);
2240 }
2241 
2242 /*
2243  * Entry point to join a node to MultiNode diskset.
2244  *
2245  * Validate host in diskset.
2246  *	- Should be in membership list from API
2247  *	- Should not already be joined into diskset.
2248  *	- Set must have drives
2249  * Assume valid configuration is stored in the set/drive/node records
2250  * in the local mddb since no node or drive can be added to the MNset
2251  * unless all drives and nodes are available.  Reconfig steps will
2252  * resync all ALIVE nodes in case of panic in critical areas.
2253  *
2254  * Lock down the set.
2255  * Verify host is a member of this diskset.
2256  * If drives exist in the configuration, load the mddbs.
2257  * Set this node to active by notifying master if one exists.
2258  * If this is the first node active in the diskset, this node
2259  * 	becomes the master.
2260  * Unlock the set.
2261  *
2262  * Mirror Resync:
2263  * If this node is the last node to join the set and clustering
2264  * isn't running, then start the 'metasync -r' type resync
2265  * on all mirrors in this diskset.
2266  * If clustering is running, this resync operation will
2267  * be handled by the reconfig steps and should NOT
2268  * be handled during a join operation.
2269  *
2270  * There are multiple return values in order to assist
2271  * the join operation of all sets in the metaset command.
2272  *
2273  * Return values:
2274  *	0  - Node successfully joined to set.
2275  *	-1 - Join attempted but failed
2276  *		- any failure from libmeta calls
2277  *		- node not in the member list
2278  *	-2 - Join not attempted since
2279  *		- this set had no drives in set
2280  *		- this node already joined to set
2281  *		- set is not a multinode set
2282  *	-3 - Node joined to STALE set.
2283  */
2284 extern int
2285 meta_set_join(
2286 	mdsetname_t	*sp,
2287 	md_error_t	*ep
2288 )
2289 {
2290 	md_set_desc		*sd;
2291 	md_drive_desc		*dd;
2292 	md_mnnode_desc		*nd, *nd2, my_nd;
2293 	int			rval = 0;
2294 	md_setkey_t		*cl_sk;
2295 	md_error_t		xep = mdnullerror;
2296 	md_error_t		ep_snarf = mdnullerror;
2297 	int			master_flag = 0;
2298 	md_mnset_record		*mas_mnsr = NULL;
2299 	int			clear_nr_flags = 0;
2300 	md_mnnode_record	*nr;
2301 	int			stale_set = 0;
2302 	int			rb_flags = 0;
2303 	int			stale_bool = FALSE;
2304 	int			suspendall_flag = 0;
2305 	int			suspend1_flag = 0;
2306 	sigset_t		oldsigs;
2307 	int			send_reinit = 0;
2308 
2309 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2310 		return (-1);
2311 	}
2312 
2313 	/* Must be a multinode diskset */
2314 	if (!MD_MNSET_DESC(sd)) {
2315 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2316 		return (-2);
2317 	}
2318 
2319 	/* Verify that the node is ALIVE (i.e. is in the API membership list) */
2320 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
2321 		(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
2322 		    sd->sd_mn_mynode->nd_nodename, NULL, sp->setname);
2323 		return (-1);
2324 	}
2325 
2326 	/* Make sure we are blocking all signals */
2327 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2328 		mdclrerror(&xep);
2329 
2330 	/*
2331 	 * Lock the set on current set members.
2332 	 * For MN diskset lock_set and SUSPEND are used to protect against
2333 	 * other meta* commands running on the other nodes.
2334 	 */
2335 	nd = sd->sd_nodelist;
2336 	while (nd) {
2337 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2338 			nd = nd->nd_next;
2339 			continue;
2340 		}
2341 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2342 			rval = -1;
2343 			goto out;
2344 		}
2345 		nd = nd->nd_next;
2346 	}
2347 
2348 	/*
2349 	 * Lock out other meta* commands by suspending
2350 	 * class 1 messages across the diskset.
2351 	 */
2352 	nd = sd->sd_nodelist;
2353 	while (nd) {
2354 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2355 			nd = nd->nd_next;
2356 			continue;
2357 		}
2358 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2359 		    sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2360 			rval = -1;
2361 			goto out;
2362 		}
2363 		suspend1_flag = 1;
2364 		nd = nd->nd_next;
2365 	}
2366 
2367 	/*
2368 	 * Verify that this host is a member (in the host list) of the set.
2369 	 */
2370 	nd = sd->sd_nodelist;
2371 	while (nd) {
2372 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2373 			break;
2374 		}
2375 		nd = nd->nd_next;
2376 	}
2377 	if (!nd) {
2378 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2379 		    sd->sd_mn_mynode->nd_nodename, NULL,
2380 		    sp->setname);
2381 		rval = -1;
2382 		goto out;
2383 	}
2384 
2385 	/*
2386 	 * Need to return failure if host is already 'joined'
2387 	 * into the set.  This is done so that if later the user
2388 	 * issues a command to join all sets and a failure is
2389 	 * encountered - that the resulting cleanup effort
2390 	 * (withdrawing from all sets that were joined
2391 	 * during that command) won't withdraw from this set.
2392 	 */
2393 	if (nd->nd_flags & MD_MN_NODE_OWN) {
2394 		rval = -2;
2395 		goto out2;
2396 	}
2397 
2398 	/*
2399 	 * Call metaget_setownership that calls each node in diskset and
2400 	 * marks in set descriptor if node is an owner of the set or not.
2401 	 * metaget_setownership checks to see if a node is an owner by
2402 	 * checking to see if that node's kernel has the mddb loaded.
2403 	 * If a node had panic'd during a reconfig or an
2404 	 * add/delete/join/withdraw operation, the other nodes' node
2405 	 * records may not reflect the current state of the diskset,
2406 	 * so calling metaget_setownership is the safest thing to do.
2407 	 */
2408 	if (metaget_setownership(sp, ep) == -1) {
2409 		rval = -1;
2410 		goto out;
2411 	}
2412 
2413 	/* If first active member of diskset, become the master. */
2414 	nd = sd->sd_nodelist;
2415 	while (nd) {
2416 		if (nd->nd_flags & MD_MN_NODE_OWN)
2417 			break;
2418 		nd = nd->nd_next;
2419 	}
2420 	if (nd == NULL)
2421 		master_flag = 1;
2422 
2423 	/*
2424 	 * If not first active member of diskset, then get the
2425 	 * master information from a node that is already joined
2426 	 * and set the master information for this node.  Be sure
2427 	 * that this node (the already joined node) has its own
2428 	 * join flag set.  If not, then this diskset isn't currently
2429 	 * consistent and shouldn't allow a node to join.  This diskset
2430 	 * inconsistency should only occur when a node has panic'd in
2431 	 * the set while doing a metaset operation and the sysadmin is
2432 	 * attempting to join a node into the set.  This inconsistency
2433 	 * will be fixed during a reconfig cycle which should be occurring
2434 	 * soon since a node panic'd.
2435 	 *
2436 	 * If unable to get this information from an owning node, then
2437 	 * this diskset isn't currently consistent and shouldn't
2438 	 * allow a node to join.
2439 	 */
2440 	if (!master_flag) {
2441 		/* get master information from an owner (joined) node */
2442 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
2443 		    sp->setno, &mas_mnsr, ep) == -1) {
2444 			rval = -1;
2445 			goto out;
2446 		}
2447 
2448 		/* Verify that owner (joined) node has its own JOIN flag set */
2449 		nr = mas_mnsr->sr_nodechain;
2450 		while (nr) {
2451 			if ((nd->nd_nodeid == nr->nr_nodeid) &&
2452 			    ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
2453 				(void) mddserror(ep, MDE_DS_NODENOSET,
2454 				    sp->setno, nd->nd_nodename, NULL,
2455 				    nd->nd_nodename);
2456 				free_sr((md_set_record *)mas_mnsr);
2457 				rval = -1;
2458 				goto out;
2459 			}
2460 			nr = nr->nr_next;
2461 		}
2462 
2463 		/*
2464 		 * Does master have set marked as STALE?
2465 		 * If so, need to pass this down to kernel when
2466 		 * this node snarfs the set.
2467 		 */
2468 		if (clnt_mn_is_stale(nd->nd_nodename, sp,
2469 		    &stale_bool, ep) == -1) {
2470 			rval = -1;
2471 			goto out;
2472 		}
2473 
2474 		/* set master information in my rpc.metad's set record */
2475 		if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
2476 		    mas_mnsr->sr_master_nodeid, ep)) {
2477 			free_sr((md_set_record *)mas_mnsr);
2478 			rval = -1;
2479 			goto out;
2480 		}
2481 
2482 		/* set master information in my cached set desc */
2483 		(void) strcpy(sd->sd_mn_master_nodenm,
2484 		    mas_mnsr->sr_master_nodenm);
2485 		sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
2486 		nd2 = sd->sd_nodelist;
2487 		while (nd2) {
2488 			if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
2489 				sd->sd_mn_masternode = nd2;
2490 				break;
2491 			}
2492 			nd2 = nd2->nd_next;
2493 		}
2494 		free_sr((md_set_record *)mas_mnsr);
2495 
2496 		/*
2497 		 * Set the node flags in mynode's rpc.metad node records for
2498 		 * the nodes that are in the diskset.  Can use my sd
2499 		 * since earlier call to metaget_setownership set the
2500 		 * owner flags based on whether that node had snarfed
2501 		 * the MN diskset mddb.  Reconfig steps guarantee that
2502 		 * return of metaget_setownership will match the owning
2503 		 * node's owner list except in the case where a node
2504 		 * has just panic'd and in this case, a reconfig will
2505 		 * be starting immediately and the owner lists will
2506 		 * be sync'd up by the reconfig.
2507 		 *
2508 		 * Flag of SET means to take no action except to
2509 		 * set the node flags as given in the nodelist linked list.
2510 		 */
2511 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
2512 		    MD_NR_SET, NULL, ep)) {
2513 			rval = -1;
2514 			goto out;
2515 		}
2516 	}
2517 
2518 	/*
2519 	 * Read in the mddb if there are drives in the set.
2520 	 */
2521 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2522 	    ep)) == NULL) {
2523 		/* No drives in list */
2524 		if (! mdisok(ep)) {
2525 			rval = -1;
2526 			goto out;
2527 		}
2528 		rval = -2;
2529 		goto out;
2530 	}
2531 
2532 	/*
2533 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2534 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2535 	 * then change the nodelist followed by a reinit and resume.
2536 	 */
2537 	nd = sd->sd_nodelist;
2538 	while (nd) {
2539 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2540 			nd = nd->nd_next;
2541 			continue;
2542 		}
2543 
2544 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
2545 		    MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2546 			rval = -1;
2547 			goto out;
2548 		}
2549 		suspendall_flag = 1;
2550 		nd = nd->nd_next;
2551 	}
2552 
2553 	/* Set master in my set record in rpc.metad */
2554 	if (master_flag) {
2555 		if (clnt_mnsetmaster(mynode(), sp,
2556 		    sd->sd_mn_mynode->nd_nodename,
2557 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
2558 			rval = -1;
2559 			goto out;
2560 		}
2561 	}
2562 	/*
2563 	 * Causes mddbs to be loaded into the kernel.
2564 	 * Set the force flag so that replica locations can be
2565 	 * loaded into the kernel even if a mediator node was
2566 	 * unavailable.  This allows a node to join an MO
2567 	 * diskset when there are sufficient replicas available,
2568 	 * but a mediator node in unavailable.
2569 	 */
2570 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
2571 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2572 		    "Host not able to start diskset."));
2573 		rval = -1;
2574 		goto out;
2575 	}
2576 
2577 	if (! mdisok(ep)) {
2578 		rval = -1;
2579 		goto out;
2580 	}
2581 
2582 	/*
2583 	 * Set rollback flags to 1 so that halt_set is called if a failure
2584 	 * is seen after this point.  If snarf_set fails, still need to
2585 	 * call halt_set to cleanup the diskset.
2586 	 */
2587 	rb_flags = 1;
2588 
2589 	/* Starts the set */
2590 	if (snarf_set(sp, stale_bool, ep) != 0) {
2591 		if (mdismddberror(ep, MDE_DB_STALE)) {
2592 			/*
2593 			 * Don't fail join, STALE means that set has
2594 			 * < 50% mddbs.
2595 			 */
2596 			(void) mdstealerror(&ep_snarf, ep);
2597 			stale_set = 1;
2598 		} else if (mdisok(ep)) {
2599 			/* If snarf failed, but no error was set - set it */
2600 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2601 			    sp->setno, 0, NULL);
2602 				rval = -1;
2603 				goto out;
2604 		} else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
2605 			/*
2606 			 * Don't fail join if ACCOK; ACCOK means that mediator
2607 			 * provided extra vote.
2608 			 */
2609 			rval = -1;
2610 			goto out;
2611 		}
2612 	}
2613 
2614 	/* Did set really get snarfed? */
2615 	if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
2616 		if (mdisok(ep)) {
2617 			/* If snarf failed, but no error was set - set it */
2618 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2619 			    sp->setno, 0, NULL);
2620 		}
2621 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2622 		    "Host not able to start diskset."));
2623 		rval = -1;
2624 		goto out;
2625 	}
2626 
2627 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2628 	send_reinit = 1;
2629 
2630 	/* If first node to enter set, setup master and clear change log */
2631 	if (master_flag) {
2632 		/* Set master in my locally cached set descriptor */
2633 		(void) strcpy(sd->sd_mn_master_nodenm,
2634 		    sd->sd_mn_mynode->nd_nodename);
2635 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
2636 		sd->sd_mn_am_i_master = 1;
2637 
2638 		/*
2639 		 * If first node to join set, then clear out change log
2640 		 * entries.  Change log entries are only needed when a
2641 		 * change of master is occurring in a diskset that has
2642 		 * multiple owners.   Since this node is the first owner
2643 		 * of the diskset, clear the entries.
2644 		 *
2645 		 * Only do this if we are in a single node non-SC3.x
2646 		 * situation.
2647 		 */
2648 		if (meta_mn_singlenode() &&
2649 		    mdmn_reset_changelog(sp, ep,  MDMN_CLF_RESETLOG) != 0) {
2650 			mde_perror(ep, dgettext(TEXT_DOMAIN,
2651 			    "Unable to reset changelog."));
2652 			rval = -1;
2653 			goto out;
2654 		}
2655 	}
2656 
2657 	/* Set my locally cached flag */
2658 	sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
2659 
2660 	/*
2661 	 * Set this node's own flag on all joined nodes in the set
2662 	 * (including my node).
2663 	 */
2664 	clear_nr_flags = 1;
2665 
2666 	my_nd = *(sd->sd_mn_mynode);
2667 	my_nd.nd_next = NULL;
2668 	nd = sd->sd_nodelist;
2669 	while (nd) {
2670 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2671 			nd = nd->nd_next;
2672 			continue;
2673 		}
2674 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2675 		    MD_NR_JOIN, NULL, ep)) {
2676 			rval = -1;
2677 			goto out;
2678 		}
2679 		nd = nd->nd_next;
2680 	}
2681 
2682 out:
2683 	if (rval != NULL) {
2684 		/*
2685 		 * If rollback flag is 1, then node was joined to set.
2686 		 * Since an error occurred, withdraw node from set in
2687 		 * order to rollback to before command was run.
2688 		 * Need to preserve ep so that calling function can
2689 		 * get error information.
2690 		 */
2691 		if (rb_flags == 1) {
2692 			if (halt_set(sp, &xep)) {
2693 				mdclrerror(&xep);
2694 			}
2695 		}
2696 
2697 		/*
2698 		 * If error, reset master to INVALID.
2699 		 * Ignore error since (next) first node to successfully join
2700 		 * will set master on all nodes.
2701 		 */
2702 		(void) clnt_mnsetmaster(mynode(), sp, "",
2703 		    MD_MN_INVALID_NID, &xep);
2704 		mdclrerror(&xep);
2705 		/* Reset master in my locally cached set descriptor */
2706 		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
2707 		sd->sd_mn_am_i_master = 0;
2708 
2709 		/*
2710 		 * If nr flags set on other nodes, reset them.
2711 		 */
2712 		if (clear_nr_flags) {
2713 			nd = sd->sd_nodelist;
2714 			while (nd) {
2715 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2716 					nd = nd->nd_next;
2717 					continue;
2718 				}
2719 				(void) clnt_upd_nr_flags(nd->nd_nodename, sp,
2720 				    &my_nd, MD_NR_WITHDRAW, NULL, &xep);
2721 				mdclrerror(&xep);
2722 				nd = nd->nd_next;
2723 			}
2724 			/* Reset my locally cached flag */
2725 			sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
2726 		}
2727 	}
2728 
2729 	/*
2730 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2731 	 * Send reinit command to mdcommd which forces it to get
2732 	 * fresh set description.
2733 	 */
2734 	if (send_reinit) {
2735 		/* Send reinit */
2736 		nd = sd->sd_nodelist;
2737 		while (nd) {
2738 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2739 				nd = nd->nd_next;
2740 				continue;
2741 			}
2742 
2743 			/* Class is ignored for REINIT */
2744 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2745 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2746 				/*
2747 				 * We are here because we failed to resume
2748 				 * rpc.mdcommd.  However we potentially have
2749 				 * an error from the previous call
2750 				 * If the previous call did fail,  we capture
2751 				 * that error and generate a perror with
2752 				 * the string, "Unable to resume...".
2753 				 * Setting rval to -1 ensures that in the
2754 				 * next iteration of the loop, ep is not
2755 				 * clobbered.
2756 				 */
2757 				if (rval == 0)
2758 					(void) mdstealerror(ep, &xep);
2759 				else
2760 					mdclrerror(&xep);
2761 				rval = -1;
2762 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2763 				    "Unable to reinit rpc.mdcommd."));
2764 			}
2765 			nd = nd->nd_next;
2766 		}
2767 
2768 	}
2769 
2770 out2:
2771 	/*
2772 	 * Unlock diskset by resuming messages across the diskset.
2773 	 * Just resume all classes so that resume is the same whether
2774 	 * just one class was locked or all classes were locked.
2775 	 */
2776 	if ((suspend1_flag) || (suspendall_flag)) {
2777 		nd = sd->sd_nodelist;
2778 		while (nd) {
2779 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2780 				nd = nd->nd_next;
2781 				continue;
2782 			}
2783 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2784 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2785 				/*
2786 				 * We are here because we failed to resume
2787 				 * rpc.mdcommd.  However we potentially have
2788 				 * an error from the previous call
2789 				 * If the previous call did fail,  we capture
2790 				 * that error and generate a perror with
2791 				 * the string, "Unable to resume...".
2792 				 * Setting rval to -1 ensures that in the
2793 				 * next iteration of the loop, ep is not
2794 				 * clobbered.
2795 				 */
2796 				if (rval == 0)
2797 					(void) mdstealerror(ep, &xep);
2798 				else
2799 					mdclrerror(&xep);
2800 				rval = -1;
2801 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2802 				    "Unable to resume rpc.mdcommd."));
2803 			}
2804 			nd = nd->nd_next;
2805 		}
2806 		meta_ping_mnset(sp->setno);
2807 	}
2808 
2809 	/*
2810 	 * Unlock set.  This flushes the caches on the servers.
2811 	 */
2812 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2813 	nd = sd->sd_nodelist;
2814 	while (nd) {
2815 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2816 			nd = nd->nd_next;
2817 			continue;
2818 		}
2819 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2820 			if (rval == 0)
2821 				(void) mdstealerror(ep, &xep);
2822 			else
2823 				mdclrerror(&xep);
2824 			rval = -1;
2825 		}
2826 		nd = nd->nd_next;
2827 	}
2828 
2829 	/*
2830 	 * If this node is the last to join the diskset and clustering isn't
2831 	 * running, then resync the mirrors in the diskset. We have to wait
2832 	 * until all nodes are joined so that the status gets propagated to
2833 	 * all of the members of the set.
2834 	 * Ignore any error from the resync as the join function shouldn't fail
2835 	 * because the mirror resync had a problem.
2836 	 *
2837 	 * Don't start resync if set is stale.
2838 	 */
2839 	if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
2840 	    (stale_set != 1)) {
2841 		nd = sd->sd_nodelist;
2842 		while (nd) {
2843 			if (!(nd->nd_flags & MD_MN_NODE_OWN))
2844 				break;
2845 			nd = nd->nd_next;
2846 		}
2847 		/*
2848 		 * nd set to NULL means that we have no nodes in the set that
2849 		 * haven't joined. In this case we start the resync.
2850 		 */
2851 		if (nd == NULL) {
2852 			(void) meta_mirror_resync_all(sp, 0, &xep);
2853 			mdclrerror(&xep);
2854 		}
2855 	}
2856 
2857 	/* Update ABR state for all soft partitions */
2858 	(void) meta_sp_update_abr(sp, &xep);
2859 	mdclrerror(&xep);
2860 
2861 	/*
2862 	 * call metaflushsetnames to reset local cache for master and
2863 	 * node information.
2864 	 */
2865 	metaflushsetname(sp);
2866 
2867 	/* release signals back to what they were on entry */
2868 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2869 		mdclrerror(&xep);
2870 
2871 	/*
2872 	 * If no error and stale_set is set, then set ep back
2873 	 * to ep from snarf_set call and return -3.  If another error
2874 	 * occurred and rval is not 0, then that error would have
2875 	 * caused the node to be withdrawn from the set and would
2876 	 * have set ep to that error information.
2877 	 */
2878 	if ((rval == 0) && (stale_set)) {
2879 		(void) mdstealerror(ep, &ep_snarf);
2880 		return (-3);
2881 	}
2882 
2883 	return (rval);
2884 }
2885 
2886 /*
2887  * Entry point to withdraw a node from MultiNode diskset.
2888  *
2889  * Validate host in diskset.
2890  *	- Should be joined into diskset.
2891  * Assume valid configuration is stored in the set/drive/node records
2892  * in the local mddb since no node or drive can be added to the MNset
2893  * unless all drives and nodes are available.  Reconfig steps will
2894  * resync all ALIVE nodes in case of panic in critical areas.
2895  *
2896  * Lock down the set.
2897  * Verify that drives exist in configuration.
2898  * Verify host is a member of this diskset.
2899  * Verify host is an owner of the diskset (host is joined to diskset).
2900  * Only allow withdrawal of master node if master node is the only joined
2901  * in the diskset.
2902  * Halt the diskset on this node.
2903  * Reset Master on this node.
2904  * Updated node flags that this node with withdrawn.
2905  * Unlock the set.
2906  *
2907  * Return values:
2908  *	0  - Node successfully withdrew from set.
2909  *	-1 - Withdrawal attempted but failed
2910  *		- any failure from libmeta calls
2911  *		- node not in the member list
2912  *	-2 - Withdrawal not attempted since
2913  *		- this set had no drives in set
2914  *		- this node not joined to set
2915  *		- set is not a multinode set
2916  */
2917 extern int
2918 meta_set_withdraw(
2919 	mdsetname_t	*sp,
2920 	md_error_t	*ep
2921 )
2922 {
2923 	md_set_desc		*sd;
2924 	md_drive_desc		*dd = 0;
2925 	md_mnnode_desc		*nd, my_nd;
2926 	int			rval = 0;
2927 	md_setkey_t		*cl_sk;
2928 	md_error_t		xep = mdnullerror;
2929 	int			set_halted = 0;
2930 	int			suspendall_flag = 0;
2931 	int			suspend1_flag = 0;
2932 	bool_t			stale_bool = FALSE;
2933 	mddb_config_t		c;
2934 	int			node_id_list[1];
2935 	sigset_t		oldsigs;
2936 	int			send_reinit = 0;
2937 
2938 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2939 		return (-1);
2940 	}
2941 
2942 	/* Must be a multinode diskset */
2943 	if (!MD_MNSET_DESC(sd)) {
2944 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2945 		return (-1);
2946 	}
2947 
2948 	/* Make sure we are blocking all signals */
2949 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2950 		mdclrerror(&xep);
2951 
2952 	/*
2953 	 * Lock the set on current set members.
2954 	 * For MN diskset lock_set and SUSPEND are used to protect against
2955 	 * other meta* commands running on the other nodes.
2956 	 */
2957 	nd = sd->sd_nodelist;
2958 	while (nd) {
2959 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2960 			nd = nd->nd_next;
2961 			continue;
2962 		}
2963 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2964 			rval = -1;
2965 			goto out;
2966 		}
2967 		nd = nd->nd_next;
2968 	}
2969 	/*
2970 	 * Lock out other meta* commands by suspending
2971 	 * class 1 messages across the diskset.
2972 	 */
2973 	nd = sd->sd_nodelist;
2974 	while (nd) {
2975 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2976 			nd = nd->nd_next;
2977 			continue;
2978 		}
2979 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2980 		    sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2981 			rval = -1;
2982 			goto out;
2983 		}
2984 		suspend1_flag = 1;
2985 		nd = nd->nd_next;
2986 	}
2987 
2988 	/* Get list of drives - needed in case of failure */
2989 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2990 	    ep)) == NULL) {
2991 		/* Error getting drives in list */
2992 		if (! mdisok(ep)) {
2993 			rval = -1;
2994 			goto out2;
2995 		}
2996 		/* no drives in list */
2997 		rval = -2;
2998 		goto out2;
2999 	}
3000 
3001 	/*
3002 	 * Verify that this host is a member (in the host list) of the set.
3003 	 */
3004 	nd = sd->sd_nodelist;
3005 	while (nd) {
3006 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
3007 			break;
3008 		}
3009 		nd = nd->nd_next;
3010 	}
3011 	if (!nd) {
3012 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
3013 		    sd->sd_mn_mynode->nd_nodename, NULL,
3014 		    sp->setname);
3015 		rval = -1;
3016 		goto out2;
3017 	}
3018 
3019 	/*
3020 	 * Call metaget_setownership that calls each node in diskset and
3021 	 * marks in set descriptor if node is an owner of the set or not.
3022 	 * metaget_setownership checks to see if a node is an owner by
3023 	 * checking to see if that node's kernel has the mddb loaded.
3024 	 * If a node had panic'd during a reconfig or an
3025 	 * add/delete/join/withdraw operation, the other nodes' node
3026 	 * records may not reflect the current state of the diskset,
3027 	 * so calling metaget_setownership is the safest thing to do.
3028 	 */
3029 	if (metaget_setownership(sp, ep) == -1) {
3030 		rval = -1;
3031 		goto out2;
3032 	}
3033 
3034 	/*
3035 	 * Verify that this node is joined
3036 	 * to diskset (i.e. is an owner of the diskset).
3037 	 */
3038 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
3039 		rval = -2;
3040 		goto out2;
3041 	}
3042 
3043 	/*
3044 	 * For a MN diskset, only withdraw master if it is
3045 	 * the only joined node.
3046 	 */
3047 	if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
3048 		nd = sd->sd_nodelist;
3049 		while (nd) {
3050 			/* Skip my node since checking for other owners */
3051 			if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
3052 				nd = nd->nd_next;
3053 				continue;
3054 			}
3055 			/* If another owner node if found, error */
3056 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3057 				(void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
3058 				    sp->setno,
3059 				    sd->sd_mn_mynode->nd_nodename, NULL,
3060 				    sp->setname);
3061 				rval = -1;
3062 				goto out2;
3063 			}
3064 			nd = nd->nd_next;
3065 		}
3066 	}
3067 
3068 	/*
3069 	 * Is current set STALE?
3070 	 */
3071 	(void) memset(&c, 0, sizeof (c));
3072 	c.c_id = 0;
3073 	c.c_setno = sp->setno;
3074 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
3075 		(void) mdstealerror(ep, &c.c_mde);
3076 		rval = -1;
3077 		goto out;
3078 	}
3079 	if (c.c_flags & MDDB_C_STALE) {
3080 		stale_bool = TRUE;
3081 	}
3082 
3083 	/*
3084 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3085 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
3086 	 * then change the nodelist followed by a reinit and resume.
3087 	 */
3088 	nd = sd->sd_nodelist;
3089 	while (nd) {
3090 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3091 			nd = nd->nd_next;
3092 			continue;
3093 		}
3094 
3095 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
3096 		    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
3097 			rval = -1;
3098 			goto out;
3099 		}
3100 		suspendall_flag = 1;
3101 		nd = nd->nd_next;
3102 	}
3103 
3104 	/*
3105 	 * Withdraw the set - halt set.
3106 	 * This will fail if any I/O is occuring to any metadevice which
3107 	 * includes a resync to a mirror metadevice.
3108 	 */
3109 	set_halted = 1;
3110 	if (halt_set(sp, ep)) {
3111 		/* Was set actually halted? */
3112 		if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
3113 			set_halted = 0;
3114 		}
3115 		rval = -1;
3116 		goto out;
3117 	}
3118 
3119 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
3120 	send_reinit = 1;
3121 
3122 	/* Reset master on withdrawn node */
3123 	if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
3124 	    MD_MN_INVALID_NID, ep)) {
3125 		rval = -1;
3126 		goto out;
3127 	}
3128 
3129 	/* Mark my node as withdrawn and send to other nodes */
3130 	nd = sd->sd_nodelist;
3131 	my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3132 	my_nd.nd_next = NULL;
3133 	while (nd) {
3134 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3135 			nd = nd->nd_next;
3136 			continue;
3137 		}
3138 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3139 		    MD_NR_WITHDRAW, NULL, ep)) {
3140 			rval = -1;
3141 			goto out;
3142 		}
3143 		nd = nd->nd_next;
3144 	}
3145 
3146 	/*
3147 	 * If withdrawn node is a mirror owner, reset mirror owner
3148 	 * to NULL.  If an error occurs, print a warning and continue.
3149 	 * Don't fail metaset because of mirror owner reset problem since
3150 	 * next node to grab mirror will resolve this issue.
3151 	 * Before next node grabs mirrors, metaset will show the withdrawn
3152 	 * node as owner which is why an attempt to reset the mirror owner
3153 	 * is made.
3154 	 */
3155 	node_id_list[0] = sd->sd_mn_mynode->nd_nodeid;	/* Setup my nodeid */
3156 	nd = sd->sd_nodelist;
3157 	while (nd) {
3158 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3159 			nd = nd->nd_next;
3160 			continue;
3161 		}
3162 		if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
3163 		    1, &node_id_list[0], &xep) == 01) {
3164 			mde_perror(&xep, dgettext(TEXT_DOMAIN,
3165 			    "Unable to reset mirror owner on node %s"),
3166 			    nd->nd_nodename);
3167 			mdclrerror(&xep);
3168 		}
3169 		nd = nd->nd_next;
3170 	}
3171 
3172 out:
3173 	if (rval == -1) {
3174 		/* Rejoin node - Mark node as joined and send to other nodes */
3175 		nd = sd->sd_nodelist;
3176 		my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3177 		my_nd.nd_next = NULL;
3178 		while (nd) {
3179 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3180 				nd = nd->nd_next;
3181 				continue;
3182 			}
3183 			if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3184 			    MD_NR_JOIN, NULL, &xep)) {
3185 				mdclrerror(&xep);
3186 			}
3187 			nd = nd->nd_next;
3188 		}
3189 
3190 		/* Set master on withdrawn node */
3191 		if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
3192 		    sd->sd_mn_master_nodenm,
3193 		    sd->sd_mn_master_nodeid, &xep)) {
3194 			mdclrerror(&xep);
3195 		}
3196 
3197 		/* Join set if halt_set had succeeded */
3198 		if (set_halted) {
3199 			/*
3200 			 * Causes mddbs to be loaded into the kernel.
3201 			 * Set the force flag so that replica locations can be
3202 			 * loaded into the kernel even if a mediator node was
3203 			 * unavailable.  This allows a node to join an MO
3204 			 * diskset when there are sufficient replicas available,
3205 			 * but a mediator node in unavailable.
3206 			 */
3207 			if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) {
3208 				mdclrerror(&xep);
3209 			}
3210 			/* If set previously stale - make it so at re-join */
3211 			if (snarf_set(sp, stale_bool, &xep) != 0) {
3212 				mdclrerror(&xep);
3213 				(void) halt_set(sp, &xep);
3214 				mdclrerror(&xep);
3215 			}
3216 		}
3217 	}
3218 
3219 	/*
3220 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3221 	 * Send reinit command to mdcommd which forces it to get
3222 	 * fresh set description.
3223 	 */
3224 	if (send_reinit) {
3225 		/* Send reinit */
3226 		nd = sd->sd_nodelist;
3227 		while (nd) {
3228 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3229 				nd = nd->nd_next;
3230 				continue;
3231 			}
3232 
3233 			/* Class is ignored for REINIT */
3234 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3235 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3236 				/*
3237 				 * We are here because we failed to resume
3238 				 * rpc.mdcommd.  However we potentially have
3239 				 * an error from the previous call.
3240 				 * If the previous call did fail,  we
3241 				 * capture that error and generate a perror
3242 				 * withthe string,  "Unable to resume...".
3243 				 * Setting rval to -1 ensures that in the
3244 				 * next iteration of the loop, ep is not
3245 				 * clobbered.
3246 				 */
3247 				if (rval == 0)
3248 					(void) mdstealerror(ep, &xep);
3249 				else
3250 					mdclrerror(&xep);
3251 				rval = -1;
3252 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3253 				    "Unable to reinit rpc.mdcommd."));
3254 			}
3255 			nd = nd->nd_next;
3256 		}
3257 	}
3258 
3259 out2:
3260 	/*
3261 	 * Unlock diskset by resuming messages across the diskset.
3262 	 * Just resume all classes so that resume is the same whether
3263 	 * just one class was locked or all classes were locked.
3264 	 */
3265 	if ((suspend1_flag) || (suspendall_flag)) {
3266 		nd = sd->sd_nodelist;
3267 		while (nd) {
3268 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3269 				nd = nd->nd_next;
3270 				continue;
3271 			}
3272 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3273 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
3274 				/*
3275 				 * We are here because we failed to resume
3276 				 * rpc.mdcommd.  However we potentially have
3277 				 * an error from the previous call
3278 				 * If the previous call did fail,  we capture
3279 				 * that error and generate a perror with
3280 				 * the string, "Unable to resume...".
3281 				 * Setting rval to -1 ensures that in the
3282 				 * next iteration of the loop, ep is not
3283 				 * clobbered.
3284 				 */
3285 				if (rval == 0)
3286 					(void) mdstealerror(ep, &xep);
3287 				else
3288 					mdclrerror(&xep);
3289 				rval = -1;
3290 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3291 				    "Unable to resume rpc.mdcommd."));
3292 			}
3293 			nd = nd->nd_next;
3294 		}
3295 		meta_ping_mnset(sp->setno);
3296 	}
3297 
3298 	/*
3299 	 * Unlock set.  This flushes the caches on the servers.
3300 	 */
3301 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3302 	nd = sd->sd_nodelist;
3303 	while (nd) {
3304 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3305 			nd = nd->nd_next;
3306 			continue;
3307 		}
3308 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
3309 			if (rval == 0)
3310 				(void) mdstealerror(ep, &xep);
3311 			else
3312 				mdclrerror(&xep);
3313 			rval = -1;
3314 		}
3315 		nd = nd->nd_next;
3316 	}
3317 
3318 	/*
3319 	 * call metaflushsetnames to reset local cache for master and
3320 	 * node information.
3321 	 */
3322 	metaflushsetname(sp);
3323 
3324 	/* release signals back to what they were on entry */
3325 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3326 		mdclrerror(&xep);
3327 
3328 	return (rval);
3329 
3330 }
3331 
3332 /*
3333  * Update nodelist with cluster member information.
3334  * A node not in the member list will be marked
3335  * as not ALIVE and not OWN.
3336  * A node in the member list will be marked ALIVE, but
3337  * the OWN bit will not be changed.
3338  *
3339  * If mynode isn't in the membership list, fail causing
3340  * another reconfig cycle to be started since a non-member
3341  * node shouldn't be taking part in the reconfig cycle.
3342  *
3343  * Return values:
3344  *	0 - No problem.
3345  *	1 - Any failure including RPC failure to my node.
3346  */
3347 int
3348 meta_reconfig_update_nodelist(
3349 	mdsetname_t			*sp,
3350 	mndiskset_membershiplist_t	*nl,
3351 	md_set_desc			*sd,
3352 	md_error_t			*ep
3353 )
3354 {
3355 	mndiskset_membershiplist_t	*nl2;
3356 	md_mnnode_desc			*nd;
3357 	md_error_t			xep = mdnullerror;
3358 	int				rval = 0;
3359 
3360 	/*
3361 	 * Walk through nodelist, checking to see if each
3362 	 * node is in the member list.
3363 	 * If node is not a member, reset ALIVE and OWN node flag.
3364 	 * If node is a member, set ALIVE.
3365 	 * If mynode's OWN flag gets reset, then halt the diskset on this node.
3366 	 */
3367 	nd = sd->sd_nodelist;
3368 	while (nd) {
3369 		nl2 = nl;
3370 		while (nl2) {
3371 			/* If node is in member list, set ALIVE */
3372 			if (nl2->msl_node_id == nd->nd_nodeid) {
3373 				nd->nd_flags |= MD_MN_NODE_ALIVE;
3374 				break;
3375 			} else {
3376 				nl2 = nl2->next;
3377 			}
3378 			/* node is not in member list, mark !ALIVE and !OWN */
3379 			if (nl2 == NULL) {
3380 				/* If node is mynode, then halt set if needed */
3381 				if (strcmp(mynode(), nd->nd_nodename) == 0) {
3382 					/*
3383 					 * This shouldn't happen, but just
3384 					 * in case...  Any node not in the
3385 					 * membership list should be dead and
3386 					 * not running reconfig step1.
3387 					 */
3388 					if (nd->nd_flags & MD_MN_NODE_OWN) {
3389 						if (halt_set(sp, &xep)) {
3390 							mde_perror(&xep, "");
3391 							mdclrerror(&xep);
3392 						}
3393 					}
3394 					/*
3395 					 * Return failure since this node
3396 					 * (mynode) is not in the membership
3397 					 * list, but process the rest of the
3398 					 * nodelist first so that rpc.metad
3399 					 * can be updated with the latest
3400 					 * membership information.
3401 					 */
3402 					(void) mddserror(ep,
3403 					    MDE_DS_NOTINMEMBERLIST,
3404 					    sp->setno, nd->nd_nodename, NULL,
3405 					    sp->setname);
3406 					rval = 1;
3407 				}
3408 				nd->nd_flags &= ~MD_MN_NODE_ALIVE;
3409 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3410 			}
3411 		}
3412 		nd = nd->nd_next;
3413 	}
3414 
3415 	/* Send this information to rpc.metad */
3416 	if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
3417 	    MD_NR_SET,  MNSET_IN_RECONFIG, &xep)) {
3418 		/* Return failure if can't send node flags to rpc.metad */
3419 		if (rval == 0) {
3420 			(void) mdstealerror(ep, &xep);
3421 			rval = 1;
3422 		}
3423 	}
3424 	return (rval);
3425 }
3426 
3427 /*
3428  * Choose master determines the master for a diskset.
3429  * Each node determines the master on its own and
3430  * adds this information to its local rpc.metad nodelist
3431  * and also sends it to the kernel.
3432  *
3433  * Nodelist in set descriptor (sd) is sorted in
3434  * monotonically increasing sequence of nodeid.
3435  *
3436  * Return values:
3437  *	0 - No problem.
3438  *	205 - There was an RPC problem to another node.
3439  *	-1 - There was an error.  This could be an RPC error to my node.
3440  *		This is a catastrophic failure causing node to panic.
3441  */
3442 int
3443 meta_reconfig_choose_master_for_set(
3444 	mdsetname_t	*sp,
3445 	md_set_desc	*sd,
3446 	md_error_t	*ep
3447 )
3448 {
3449 	int			is_owner;
3450 	md_mnset_record		*mnsr = NULL;
3451 	int			lowest_alive_nodeid = 0;
3452 	uint_t			master_nodeid;
3453 	md_mnnode_desc		*nd, *nd2;
3454 	md_mnnode_record	*nr;
3455 	md_drive_desc		*dd;
3456 	md_setkey_t		*cl_sk;
3457 	int			rval = 0;
3458 	md_error_t		xep = mdnullerror;
3459 	mddb_setflags_config_t	sf;
3460 
3461 	/*
3462 	 * Is current node joined to diskset?
3463 	 * Don't trust flags, really check to see if mddb is snarfed.
3464 	 */
3465 	if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
3466 		/*
3467 		 * If a node is joined to the diskset, this node checks
3468 		 * to see if the current master of the diskset is valid and
3469 		 * is still in the membership list (ALIVE) and is
3470 		 * still joined (OWN).  Need to verify if master is
3471 		 * really joined - don't trust the flags.  (Can trust
3472 		 * ALIVE since set during earlier part of reconfig cycle.)
3473 		 * If the current master is valid, still in the membership
3474 		 * list and joined, then master is not changed on this node.
3475 		 * Just return.
3476 		 *
3477 		 * Verify that nodeid is valid before accessing masternode.
3478 		 */
3479 		if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
3480 		    (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
3481 			if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
3482 			    &is_owner, ep) == -1) {
3483 				/* If RPC failure to another node return 205 */
3484 				if ((mdanyrpcerror(ep)) &&
3485 				    (sd->sd_mn_mynode->nd_nodeid !=
3486 				    sd->sd_mn_master_nodeid)) {
3487 					return (205);
3488 				} else {
3489 					/* Any other failure */
3490 					return (-1);
3491 				}
3492 			} else {
3493 				if (is_owner == TRUE) {
3494 
3495 					meta_mc_log(MC_LOG5, dgettext(
3496 					    TEXT_DOMAIN, "Set %s previous "
3497 					    "master chosen %s (%d): %s"),
3498 					    sp->setname,
3499 					    sd->sd_mn_master_nodenm,
3500 					    sd->sd_mn_master_nodeid,
3501 					    meta_print_hrtime(gethrtime() -
3502 					    start_time));
3503 
3504 					/* Previous master is ok - done */
3505 					return (0);
3506 				}
3507 			}
3508 		}
3509 
3510 		/*
3511 		 * If current master is no longer in the membership list or
3512 		 * is no longer joined, then this node uses the following
3513 		 * algorithm:
3514 		 * - node calls RPC routine clnt_ownset to get latest
3515 		 *	information on which nodes are owners of diskset.
3516 		 * 	clnt_ownset checks on each node to see if its kernel
3517 		 *	has that diskset snarfed.
3518 		 */
3519 		nd = sd->sd_nodelist;
3520 		while (nd) {
3521 			/* Don't consider node that isn't in member list */
3522 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3523 				nd = nd->nd_next;
3524 				continue;
3525 			}
3526 
3527 			if (clnt_ownset(nd->nd_nodename, sp,
3528 			    &is_owner, ep) == -1) {
3529 				/* If RPC failure to another node return 205 */
3530 				if ((mdanyrpcerror(ep)) &&
3531 				    (sd->sd_mn_mynode->nd_nodeid !=
3532 				    nd->nd_nodeid)) {
3533 					return (205);
3534 				} else {
3535 					/* Any other failure */
3536 					return (-1);
3537 				}
3538 			}
3539 
3540 			/*
3541 			 * Set owner flag for each node based on whether
3542 			 * that node really has a diskset mddb snarfed in
3543 			 * or not.
3544 			 */
3545 			if (is_owner == TRUE)
3546 				nd->nd_flags |= MD_MN_NODE_OWN;
3547 			else
3548 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3549 
3550 			nd = nd->nd_next;
3551 		}
3552 
3553 		/*
3554 		 * - node walks through nodelist looking for nodes that are
3555 		 *	owners of the diskset that are in the membership list.
3556 		 * - for each owner, node calls RPC routine clnt_getset to
3557 		 *	 see if that node has its node record set to OK.
3558 		 * - If so, master is chosen to be this owner node.
3559 		 */
3560 		nd = sd->sd_nodelist;
3561 		while (nd) {
3562 			/* Don't consider node that isn't in member list */
3563 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3564 				nd = nd->nd_next;
3565 				continue;
3566 			}
3567 
3568 			/* Don't consider a node that isn't an owner */
3569 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3570 				nd = nd->nd_next;
3571 				continue;
3572 			}
3573 
3574 			/* Does node has its own node record set to OK? */
3575 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3576 			    MD_SET_BAD, &mnsr, ep) == -1) {
3577 				/* If RPC failure to another node return 205 */
3578 				if ((mdanyrpcerror(ep)) &&
3579 				    (sd->sd_mn_mynode->nd_nodeid !=
3580 				    nd->nd_nodeid)) {
3581 					return (205);
3582 				} else {
3583 					/* Any other failure */
3584 					return (-1);
3585 				}
3586 			}
3587 			nr = mnsr->sr_nodechain;
3588 			while (nr) {
3589 				if (nd->nd_nodeid == nr->nr_nodeid) {
3590 					if (nr->nr_flags & MD_MN_NODE_OK) {
3591 						/* Found a master */
3592 						free_sr(
3593 						    (md_set_record *)mnsr);
3594 						goto found_master;
3595 					}
3596 				}
3597 				nr = nr->nr_next;
3598 			}
3599 			free_sr((md_set_record *)mnsr);
3600 			nd = nd->nd_next;
3601 		}
3602 
3603 		/*
3604 		 * - If no owner node has its own node record on its own node
3605 		 *	set to OK, then this node checks all of the non-owner
3606 		 * 	nodes that are in the membership list.
3607 		 * - for each non-owner, node calls RPC routine clnt_getset to
3608 		 *	 see if that node has its node record set to OK.
3609 		 * - If set doesn't exist, don't choose node for master.
3610 		 * - If so, master is chosen to be this non-owner node.
3611 		 *
3612 		 */
3613 		nd = sd->sd_nodelist;
3614 		while (nd) {
3615 			/* Don't consider node that isn't in member list */
3616 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3617 				nd = nd->nd_next;
3618 				continue;
3619 			}
3620 
3621 			/* Only checking non-owner nodes this time around */
3622 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3623 				nd = nd->nd_next;
3624 				continue;
3625 			}
3626 
3627 			/* Does node has its own node record set to OK? */
3628 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3629 			    MD_SET_BAD, &mnsr, ep) == -1) {
3630 				/*
3631 				 * If set doesn't exist on non-owner node,
3632 				 * don't consider this node for master.
3633 				 */
3634 				if (mdiserror(ep, MDE_NO_SET)) {
3635 					nd = nd->nd_next;
3636 					continue;
3637 				} else if ((mdanyrpcerror(ep)) &&
3638 				    (sd->sd_mn_mynode->nd_nodeid !=
3639 				    nd->nd_nodeid)) {
3640 					/* RPC failure to another node */
3641 					return (205);
3642 				} else {
3643 					/* Any other failure */
3644 					return (-1);
3645 				}
3646 			}
3647 			nr = mnsr->sr_nodechain;
3648 			while (nr) {
3649 				if (nd->nd_nodeid == nr->nr_nodeid) {
3650 					if (nr->nr_flags & MD_MN_NODE_OK) {
3651 						/* Found a master */
3652 						free_sr(
3653 						    (md_set_record *)mnsr);
3654 						goto found_master;
3655 					}
3656 				}
3657 				nr = nr->nr_next;
3658 			}
3659 			free_sr((md_set_record *)mnsr);
3660 			nd = nd->nd_next;
3661 		}
3662 
3663 		/*
3664 		 * - If no node can be found that has its own node record on
3665 		 *	its node to be set to OK, then all alive nodes
3666 		 * 	were in the process of being added to or deleted
3667 		 *	from set.  Each alive node will remove all
3668 		 *	information pertaining to this set from its node.
3669 		 *
3670 		 * If all nodes in set are ALIVE, then call sdssc end routines
3671 		 * since set was truly being initially created or destroyed.
3672 		 */
3673 		goto delete_set;
3674 	} else {
3675 
3676 		/*
3677 		 * If node is not joined to diskset, then this
3678 		 * node uses the following algorithm:
3679 		 * - If unjoined node doesn't have a node record for itself,
3680 		 *	just delete the diskset since diskset was in the
3681 		 *	process of being created.
3682 		 * - node needs to find master of diskset before
3683 		 *	reconfig cycle, if a master existed.
3684 		 * - node calls RPC routine clnt_ownset to get latest
3685 		 * 	information on which nodes are owners of diskset.
3686 		 *	clnt_ownset checks on each node to see if its
3687 		 *	kernel has that diskset snarfed.
3688 		 */
3689 
3690 		/*
3691 		 * Is my node in the set description?
3692 		 * If not, delete the set from this node.
3693 		 * sr2setdesc sets sd_mn_mynode pointer to the node
3694 		 * descriptor for this node if there was a node
3695 		 * record for this node.
3696 		 *
3697 		 */
3698 		if (sd->sd_mn_mynode == NULL) {
3699 			goto delete_set;
3700 		}
3701 
3702 		nd = sd->sd_nodelist;
3703 		while (nd) {
3704 			/* Don't consider node that isn't in member list */
3705 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3706 				nd = nd->nd_next;
3707 				continue;
3708 			}
3709 
3710 			if (clnt_ownset(nd->nd_nodename, sp,
3711 			    &is_owner, ep) == -1) {
3712 				/* If RPC failure to another node return 205 */
3713 				if ((mdanyrpcerror(ep)) &&
3714 				    (sd->sd_mn_mynode->nd_nodeid !=
3715 				    nd->nd_nodeid)) {
3716 					return (205);
3717 				} else {
3718 					/* Any other failure */
3719 					return (-1);
3720 				}
3721 			}
3722 
3723 			/*
3724 			 * Set owner flag for each node based on whether
3725 			 * that node really has a diskset mddb snarfed in
3726 			 * or not.
3727 			 */
3728 			if (is_owner == TRUE)
3729 				nd->nd_flags |= MD_MN_NODE_OWN;
3730 			else
3731 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3732 
3733 			nd = nd->nd_next;
3734 		}
3735 
3736 		/*
3737 		 * - node walks through nodelist looking for nodes that
3738 		 *	are owners of the diskset that are in
3739 		 *	the membership list.
3740 		 * - for each owner, node calls RPC routine clnt_getset to
3741 		 *	see if that node has a master set and to get the
3742 		 *	diskset description.
3743 		 * - If the owner node has a set description that doesn't
3744 		 *	include the non-joined node in the nodelist, this node
3745 		 *	removes its set description of that diskset
3746 		 *	(i.e. removes the set from its local mddbs).  This is
3747 		 *	handling the case of when a node was removed from a
3748 		 *	diskset while it was not in the cluster membership
3749 		 *	list.
3750 		 * - If that node has a master set and the master is in the
3751 		 *	membership list and is an owner, then either this was
3752 		 *	the master from before the reconfig cycle or this
3753 		 *	node has already chosen a new master - either way,
3754 		 *	the master value is valid as long as it is in the
3755 		 *	membership list and is an owner
3756 		 * - master is chosen to be owner node's master
3757 		 */
3758 		nd = sd->sd_nodelist;
3759 		while (nd) {
3760 			/* Don't consider node that isn't in member list */
3761 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3762 				nd = nd->nd_next;
3763 				continue;
3764 			}
3765 
3766 			/* Don't consider a node that isn't an owner */
3767 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3768 				nd = nd->nd_next;
3769 				continue;
3770 			}
3771 
3772 			/* Get owner node's set record */
3773 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3774 			    MD_SET_BAD, &mnsr, ep) == -1) {
3775 				/* If RPC failure to another node return 205 */
3776 				if ((mdanyrpcerror(ep)) &&
3777 				    (sd->sd_mn_mynode->nd_nodeid !=
3778 				    nd->nd_nodeid)) {
3779 					return (205);
3780 				} else {
3781 					/* Any other failure */
3782 					return (-1);
3783 				}
3784 			}
3785 
3786 			/* Is this node in the owner node's set record */
3787 			nr = mnsr->sr_nodechain;
3788 			while (nr) {
3789 				if (sd->sd_mn_mynode->nd_nodeid ==
3790 				    nr->nr_nodeid) {
3791 					break;
3792 				}
3793 				nr = nr->nr_next;
3794 			}
3795 			if (nr == NULL) {
3796 				/* my node not found - delete set */
3797 				free_sr((md_set_record *)mnsr);
3798 				goto delete_set;
3799 			}
3800 
3801 			/* Is owner's node's master valid? */
3802 			master_nodeid = mnsr->sr_master_nodeid;
3803 			free_sr((md_set_record *)mnsr);
3804 			if (master_nodeid == MD_MN_INVALID_NID) {
3805 				nd = nd->nd_next;
3806 				continue;
3807 			}
3808 
3809 			nd2 = sd->sd_nodelist;
3810 			while (nd2) {
3811 				if ((nd2->nd_nodeid == master_nodeid) &&
3812 				    (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
3813 				    (nd2->nd_flags & MD_MN_NODE_OWN)) {
3814 						nd = nd2;
3815 						goto found_master;
3816 				}
3817 				nd2 = nd2->nd_next;
3818 			}
3819 			nd = nd->nd_next;
3820 		}
3821 
3822 		/*
3823 		 * - If no owner node has a valid master, then follow
3824 		 * 	algorithm of when a node is joined to the diskset.
3825 		 * - node walks through nodelist looking for nodes that are
3826 		 *	owners of the diskset that are in the membership list.
3827 		 * - for each owner, node calls RPC routine clnt_getset to
3828 		 *	 see if that node has its node record set to OK.
3829 		 * - If so, master is chosen to be this owner node.
3830 		 */
3831 		nd = sd->sd_nodelist;
3832 		while (nd) {
3833 			/* Don't consider node that isn't in member list */
3834 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3835 				nd = nd->nd_next;
3836 				continue;
3837 			}
3838 
3839 			/* Don't consider a node that isn't an owner */
3840 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3841 				nd = nd->nd_next;
3842 				continue;
3843 			}
3844 
3845 			/* Does node has its own node record set to OK? */
3846 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3847 			    MD_SET_BAD, &mnsr, ep) == -1) {
3848 				/* If RPC failure to another node return 205 */
3849 				if ((mdanyrpcerror(ep)) &&
3850 				    (sd->sd_mn_mynode->nd_nodeid !=
3851 				    nd->nd_nodeid)) {
3852 					return (205);
3853 				} else {
3854 					/* Any other failure */
3855 					return (-1);
3856 				}
3857 			}
3858 			nr = mnsr->sr_nodechain;
3859 			while (nr) {
3860 				if (nd->nd_nodeid == nr->nr_nodeid) {
3861 					if (nr->nr_flags & MD_MN_NODE_OK) {
3862 						/* Found a master */
3863 						free_sr(
3864 						    (md_set_record *)mnsr);
3865 						goto found_master;
3866 					}
3867 				}
3868 				nr = nr->nr_next;
3869 			}
3870 			free_sr((md_set_record *)mnsr);
3871 			nd = nd->nd_next;
3872 		}
3873 
3874 		/*
3875 		 * - If no owner node has its own node record on its own node
3876 		 *	set to OK, then this node checks all of the non-owner
3877 		 *	nodes that are in the membership list.
3878 		 * - for each non-owner, node calls RPC routine clnt_getset to
3879 		 *	see if that node has its node record set to OK.
3880 		 * - If set doesn't exist, don't choose node for master.
3881 		 * - If this node doesn't exist in the nodelist on any of the
3882 		 *	non-owner nodes, this node removes its set description
3883 		 *	of that diskset (i.e. removes the set from its local
3884 		 *	mddbs). This is handling the case of when a node was
3885 		 *	removed from a diskset while it was not in the
3886 		 *	cluster membership list.
3887 		 * - If non-owner node has its node record set to OK and if
3888 		 *	this node hasn't removed this diskset (step directly
3889 		 *	before this one), then the master is chosen to be this
3890 		 *	non-owner node.
3891 		 */
3892 		nd = sd->sd_nodelist;
3893 		while (nd) {
3894 			/* Don't consider node that isn't in member list */
3895 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3896 				nd->nd_flags |= MD_MN_NODE_DEL;
3897 				nd = nd->nd_next;
3898 				continue;
3899 			}
3900 
3901 			/* Don't consider owner nodes since none are OK */
3902 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3903 				nd->nd_flags |= MD_MN_NODE_DEL;
3904 				nd = nd->nd_next;
3905 				continue;
3906 			}
3907 
3908 			/*
3909 			 * Don't need to get nodelist from my node since
3910 			 * this is where sd_nodelist was obtained.
3911 			 */
3912 			if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3913 				nd = nd->nd_next;
3914 				continue;
3915 			}
3916 
3917 			/*
3918 			 * If node has already been decided against for
3919 			 * master, then skip it.
3920 			 */
3921 			if (nd->nd_flags & MD_MN_NODE_DEL) {
3922 				nd = nd->nd_next;
3923 				continue;
3924 			}
3925 
3926 			/*
3927 			 * Does node in my nodelist have its own node
3928 			 * record marked OK on its node?  And does node
3929 			 * in my nodelist exist on all other nodes?
3930 			 * Don't want to choose a node for master unless
3931 			 * that node is marked OK on its own node and that
3932 			 * node exists on all other alive nodes.
3933 			 *
3934 			 * This is guarding against the case when several
3935 			 * nodes are down and one of the downed nodes is
3936 			 * deleted from the diskset.  When the down nodes
3937 			 * are rebooted into the cluster, you don't want
3938 			 * any node to pick the deleted node as the master.
3939 			 */
3940 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3941 			    MD_SET_BAD, &mnsr, ep) == -1) {
3942 				/*
3943 				 * If set doesn't exist on non-owner node,
3944 				 * don't consider this node for master.
3945 				 */
3946 				if (mdiserror(ep, MDE_NO_SET)) {
3947 					nd->nd_flags |= MD_MN_NODE_DEL;
3948 					nd = nd->nd_next;
3949 					continue;
3950 				} else if (mdanyrpcerror(ep)) {
3951 					/* RPC failure to another node */
3952 					return (205);
3953 				} else {
3954 					/* Any other failure */
3955 					return (-1);
3956 				}
3957 			}
3958 			/*
3959 			 * Is my node in the nodelist gotten from the other
3960 			 * node?  If not, then remove the set from my node
3961 			 * since set was deleted from my node while my node
3962 			 * was out of the cluster.
3963 			 */
3964 			nr = mnsr->sr_nodechain;
3965 			while (nr) {
3966 				if (sd->sd_mn_mynode->nd_nodeid ==
3967 				    nr->nr_nodeid) {
3968 					break;
3969 				}
3970 				nr = nr->nr_next;
3971 			}
3972 			if (nr == NULL) {
3973 				/* my node not found - delete set */
3974 				free_sr((md_set_record *)mnsr);
3975 				goto delete_set;
3976 			}
3977 
3978 			/* Is node being checked marked OK on its own node? */
3979 			nr = mnsr->sr_nodechain;
3980 			while (nr) {
3981 				if (nd->nd_nodeid == nr->nr_nodeid) {
3982 					if (!(nr->nr_flags & MD_MN_NODE_OK)) {
3983 						nd->nd_flags |= MD_MN_NODE_DEL;
3984 					}
3985 					break;
3986 				}
3987 				nr = nr->nr_next;
3988 			}
3989 			/*
3990 			 * If node being checked doesn't exist on its
3991 			 * own node - don't choose it as master.
3992 			 */
3993 			if (nr == NULL) {
3994 				nd->nd_flags |= MD_MN_NODE_DEL;
3995 			}
3996 
3997 			/*
3998 			 * Check every node in my node's nodelist against
3999 			 * the nodelist gotten from the other node.
4000 			 * If a node in my node's nodelist is not found in the
4001 			 * other node's nodelist, then set the DEL flag.
4002 			 */
4003 			nd2 = sd->sd_nodelist;
4004 			while (nd2) {
4005 				nr = mnsr->sr_nodechain;
4006 				while (nr) {
4007 					if (nd2->nd_nodeid == nr->nr_nodeid) {
4008 						break;
4009 					}
4010 					nr = nr->nr_next;
4011 				}
4012 				/* nd2 not found in other node's nodelist */
4013 				if (nr == NULL) {
4014 					nd2->nd_flags |= MD_MN_NODE_DEL;
4015 				}
4016 				nd2 = nd2->nd_next;
4017 			}
4018 
4019 			free_sr((md_set_record *)mnsr);
4020 			nd = nd->nd_next;
4021 		}
4022 
4023 		/*
4024 		 * Rescan list look for node that has not been marked DEL.
4025 		 * First node found is the master.
4026 		 */
4027 		nd = sd->sd_nodelist;
4028 		while (nd) {
4029 			if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4030 				break;
4031 			}
4032 			nd = nd->nd_next;
4033 			continue;
4034 		}
4035 		if (nd) {
4036 			/* Found a master */
4037 			goto found_master;
4038 		}
4039 
4040 		/*
4041 		 * - If no node can be found that has its own node record on
4042 		 *	its node to be set to OK, then all alive nodes
4043 		 * 	were in the process of being added to or deleted
4044 		 *	from set.  Each alive node will remove all
4045 		 *	information pertaining to this set from its node.
4046 		 *
4047 		 * If all nodes in set are ALIVE, then call sdssc end routines
4048 		 * since set was truly being initially created or destroyed.
4049 		 */
4050 		goto delete_set;
4051 	}
4052 
4053 found_master:
4054 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4055 	    "Set %s master chosen %s (%d): %s"),
4056 	    sp->setname, nd->nd_nodename, nd->nd_nodeid,
4057 	    meta_print_hrtime(gethrtime() - start_time));
4058 
4059 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
4060 		return (-1);
4061 	}
4062 
4063 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4064 
4065 	if (clnt_mnsetmaster(mynode(), sp,
4066 	    nd->nd_nodename, nd->nd_nodeid, ep)) {
4067 		rval = -1;
4068 	} else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
4069 		/* If this node is new master, set flag in this node's kernel */
4070 		(void) memset(&sf, 0, sizeof (sf));
4071 		sf.sf_setno = sp->setno;
4072 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
4073 		/* Use magic to help protect ioctl against attack. */
4074 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4075 		sf.sf_flags = MDDB_NM_SET;
4076 
4077 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4078 		    "Setting new master flag for set %s: %s"),
4079 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4080 
4081 		/*
4082 		 * Fail reconfig cycle if ioctl fails since it is critical
4083 		 * to set new master flag.
4084 		 */
4085 		if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
4086 		    NULL) != NULL) {
4087 			(void) mdstealerror(ep, &sf.sf_mde);
4088 			rval = -1;
4089 		}
4090 	}
4091 
4092 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4093 		if (rval == 0) {
4094 			(void) mdstealerror(ep, &xep);
4095 			rval = -1;
4096 		}
4097 	}
4098 
4099 	cl_set_setkey(NULL);
4100 
4101 	metaflushsetname(sp);
4102 
4103 	return (rval);
4104 
4105 delete_set:
4106 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4107 	    "Master not chosen, deleting set %s: %s"),
4108 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4109 
4110 	/*
4111 	 * Remove all set information from this node:
4112 	 *	- node records for this set
4113 	 *	- drive records for this set
4114 	 *	- set record for this set
4115 	 * (Only do this on this node since each node
4116 	 * will do it for its own local mddb.)
4117 	 *
4118 	 * If all nodes in set are ALIVE, then
4119 	 * the lowest numbered ALIVE nodeid in set
4120 	 * (irregardless of whether an owner node or not) will
4121 	 * call the DCS service to cleanup for create/delete of set.
4122 	 *   sdssc_create_end(cleanup) if set was being created or
4123 	 *   sdssc_delete_end(cleanup) if set was being deleted.
4124 	 * A node record with flag ADD denotes a set being
4125 	 * created.  A node record with flag DEL denotes a
4126 	 * set being deleted.
4127 	 */
4128 	nd = sd->sd_nodelist;
4129 	while (nd) {
4130 		/* Found a node that isn't alive */
4131 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
4132 			break;
4133 
4134 		/* Is my node the lowest numbered ALIVE node? */
4135 		if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
4136 			break;
4137 		}
4138 		nd = nd->nd_next;
4139 	}
4140 	if (nd == NULL) {
4141 		/* All nodes ALIVE and this is the lowest nodeid */
4142 		lowest_alive_nodeid = 1;
4143 	}
4144 
4145 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
4146 		return (-1);
4147 	}
4148 
4149 
4150 	/*
4151 	 * If this node had been joined, withdraw and reset master.
4152 	 *
4153 	 * This could happen if a node was being added to or removed
4154 	 * from a diskset and the node doing the add/delete operation and
4155 	 * all other nodes in the diskset have left the cluster.
4156 	 */
4157 	if (sd->sd_mn_mynode) {
4158 		nd = sd->sd_mn_mynode;
4159 		if (nd->nd_flags & MD_MN_NODE_OWN) {
4160 			if (clnt_withdrawset(mynode(), sp, ep)) {
4161 				rval = -1;
4162 				goto out;
4163 			}
4164 			if (clnt_mnsetmaster(mynode(), sp, "",
4165 			    MD_MN_INVALID_NID, ep)) {
4166 				rval = -1;
4167 				goto out;
4168 			}
4169 		}
4170 	}
4171 
4172 	/*
4173 	 * Remove side records for this node (side) from local mddb
4174 	 * (clnt_deldrvs does this) if there are drives in the set.
4175 	 *
4176 	 * Don't need to mark this node as DEL since already marked as
4177 	 * ADD or DEL (or this node would have been chosen as master).
4178 	 * Don't need to mark other node records, drive records or
4179 	 * set records as DEL.  If a panic occurs during clnt_delset,
4180 	 * these records will be deleted the next time this node
4181 	 * becomes a member and goes through the reconfig cycle.
4182 	 */
4183 	/* Get the drive descriptors for this set */
4184 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4185 	    ep)) == NULL) {
4186 		if (! mdisok(ep)) {
4187 			/*
4188 			 * Ignore and clear out any failures from
4189 			 * metaget_drivedesc since a panic could have
4190 			 * occurred when a node was partially added to a set.
4191 			 */
4192 			mdclrerror(ep);
4193 		}
4194 	} else {
4195 		if (clnt_deldrvs(mynode(), sp, dd, ep)) {
4196 			rval = -1;
4197 			goto out;
4198 		}
4199 	}
4200 
4201 	/*
4202 	 * Now, delete the set - this removes the node, drive
4203 	 * and set records from the local mddb.
4204 	 */
4205 	if (clnt_delset(mynode(), sp, ep)) {
4206 		rval = -1;
4207 		goto out;
4208 	}
4209 
4210 out:
4211 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4212 
4213 	/*
4214 	 * Ignore errors from unlock of set since set is no longer
4215 	 * known (if clnt_delset worked).
4216 	 */
4217 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4218 		mdclrerror(&xep);
4219 	}
4220 
4221 	cl_set_setkey(NULL);
4222 
4223 	metaflushsetname(sp);
4224 
4225 	/*
4226 	 * If this node is the lowest numbered nodeid then
4227 	 * call sdssc_create/delete_end depending on whether
4228 	 * this node is marked as ADD or DEL in the node record.
4229 	 */
4230 	if (lowest_alive_nodeid) {
4231 		if (nd->nd_flags & MD_MN_NODE_ADD)
4232 			sdssc_create_end(sp->setname, SDSSC_CLEANUP);
4233 		else if (nd->nd_flags & MD_MN_NODE_DEL)
4234 			sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
4235 	}
4236 
4237 	/* Finished with this set -- return */
4238 	return (rval);
4239 }
4240 
4241 /*
4242  * Reconfig step to choose a new master for all MN disksets.
4243  * Return values:
4244  *	0 - Everything is great.
4245  *	1 - This node failed to reconfig.
4246  *	205 - Cause another reconfig due to a nodelist problem
4247  *		or RPC failure to another node
4248  */
4249 int
4250 meta_reconfig_choose_master(
4251 	long		timeout,
4252 	md_error_t	*ep
4253 )
4254 {
4255 	set_t				max_sets, setno;
4256 	int				nodecnt;
4257 	mndiskset_membershiplist_t	*nl;
4258 	md_set_desc			*sd;
4259 	mdsetname_t			*sp;
4260 	int				rval = 0;
4261 	mddb_setflags_config_t		sf;
4262 	int				start_node_delayed = 0;
4263 
4264 	if ((max_sets = get_max_sets(ep)) == 0) {
4265 		mde_perror(ep, dgettext(TEXT_DOMAIN,
4266 		    "Unable to get number of sets"));
4267 		return (1);
4268 	}
4269 
4270 	/*
4271 	 * Get membershiplist from API routine.  If there's
4272 	 * an error, return a 205 to cause another reconfig.
4273 	 */
4274 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
4275 		mde_perror(ep, "");
4276 		return (205);
4277 	}
4278 
4279 	for (setno = 1; setno < max_sets; setno++) {
4280 		if ((sp = metasetnosetname(setno, ep)) == NULL) {
4281 			if (mdiserror(ep, MDE_NO_SET)) {
4282 				/* No set for this setno - continue */
4283 				mdclrerror(ep);
4284 				continue;
4285 			} else {
4286 				/*
4287 				 * If encountered an RPC error from my node,
4288 				 * then immediately fail.
4289 				 */
4290 				if (mdanyrpcerror(ep)) {
4291 					mde_perror(ep, "");
4292 					return (1);
4293 				}
4294 				/* Can't get set information */
4295 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4296 				    "Unable to get information for "
4297 				    "set number %d"), setno);
4298 				mdclrerror(ep);
4299 				continue;
4300 			}
4301 		}
4302 
4303 		/* If setname is there, set desc should exist. */
4304 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4305 			/*
4306 			 * If encountered an RPC error from my node,
4307 			 * then immediately fail.
4308 			 */
4309 			if (mdanyrpcerror(ep)) {
4310 				mde_perror(ep, "");
4311 				return (1);
4312 			}
4313 			mde_perror(ep, dgettext(TEXT_DOMAIN,
4314 			    "Unable to get set %s desc information"),
4315 			    sp->setname);
4316 			mdclrerror(ep);
4317 			continue;
4318 		}
4319 
4320 		/* Only reconfig MN disksets */
4321 		if (!MD_MNSET_DESC(sd)) {
4322 			continue;
4323 		}
4324 
4325 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4326 		    "Begin choose master for set %s: %s"),
4327 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4328 
4329 		/* Update nodelist with member information. */
4330 		if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
4331 			/*
4332 			 * If encountered an RPC error from my node,
4333 			 * then immediately fail.
4334 			 */
4335 			if (mdanyrpcerror(ep)) {
4336 				mde_perror(ep, "");
4337 				return (1);
4338 			}
4339 			mde_perror(ep, "");
4340 			mdclrerror(ep);
4341 			continue;
4342 		}
4343 
4344 		/*
4345 		 * If all nodes in a cluster are starting, then
4346 		 * all nodes will attempt to contact all other nodes
4347 		 * to determine a master node.  This can lead to a
4348 		 * problem where node 1 is trying to contact the rpc.metad
4349 		 * node 2 and node 2 is trying to contact the rpc.metad
4350 		 * on node 1 -- and this causes the rpc call to fail
4351 		 * on both nodes and causes a new reconfig cycle.
4352 		 *
4353 		 * In order to break this problem, a newly starting node
4354 		 * will delay a small amount of time (nodeid mod 4 seconds)
4355 		 * and will then run the code to choose a master for the
4356 		 * first set.  Delay will only be done once regardless of the
4357 		 * number of sets.
4358 		 */
4359 		if (start_node_delayed == 0) {
4360 			(void) memset(&sf, 0, sizeof (sf));
4361 			sf.sf_setno = sp->setno;
4362 			sf.sf_flags = MDDB_NM_GET;
4363 			/* Use magic to help protect ioctl against attack. */
4364 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4365 			if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
4366 			    &sf.sf_mde, NULL) == 0) &&
4367 			    ((sf.sf_setflags & MD_SET_MN_START_RC) ==
4368 			    MD_SET_MN_START_RC)) {
4369 				(void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
4370 			}
4371 			start_node_delayed = 1;
4372 		}
4373 
4374 		/* Choose master for this set */
4375 		rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
4376 		if (rval == -1) {
4377 			mde_perror(ep, "");
4378 			return (1);
4379 		} else if (rval == 205) {
4380 			mde_perror(ep, "");
4381 			return (205);
4382 		}
4383 
4384 		/* reinit rpc.mdcommd with new nodelist */
4385 		if (mdmn_reinit_set(sp->setno, timeout)) {
4386 			md_eprintf(dgettext(TEXT_DOMAIN,
4387 			    "Could not re-initialise rpc.mdcommd for "
4388 			    "set %s\n"), sp->setname);
4389 			return (1);
4390 		}
4391 
4392 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4393 		    "Choose master for set %s completed: %s"),
4394 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4395 	}
4396 
4397 	/*
4398 	 * Each node turns on I/Os for all MN disksets.
4399 	 * This is to recover from the situation where the master died
4400 	 * during a previous reconfig cycle when I/Os were suspended
4401 	 * for a MN diskset.
4402 	 * If a failure occurs return a 1 which will force this node to
4403 	 * panic.  Cannot leave node in the situation where I/Os are
4404 	 * not resumed.
4405 	 */
4406 	setno = 0; /* 0 means all MN sets */
4407 	if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
4408 		mde_perror(ep, "");
4409 		return (1);
4410 	}
4411 
4412 	/* Free the nodelist */
4413 	if (nodecnt)
4414 		meta_free_nodelist(nl);
4415 
4416 	return (0);
4417 }
4418 
4419 /*
4420  * meta_mnsync_user_records will synchronize the diskset user records across
4421  * all nodes in the diskset.  The diskset user records are stored in
4422  * each node's local set mddb.
4423  *
4424  * This needs to be done even if there is no master change during the
4425  * reconfig cycle since this routine should clean up any mess left by
4426  * the untimely termination of a metaset or metadb command (due to a
4427  * node panic or to user intervention).
4428  *
4429  * Caller is the Master node.
4430  *
4431  * Returns	 0 - Success
4432  *		205 - Failure during RPC to another node
4433  *		-1 - Any other failure and ep is filled in.
4434  */
4435 int
4436 meta_mnsync_user_records(
4437 	mdsetname_t	*sp,
4438 	md_error_t	*ep
4439 )
4440 {
4441 	md_set_desc		*sd;
4442 	md_mnnode_desc		*master_nodelist, *nd, *nd2, *ndtail;
4443 	md_mnset_record		*mnsr;
4444 	md_mnsr_node_t		*master_mnsr_node = NULL, *mnsr_node = NULL;
4445 	md_mnnode_record	*nr;
4446 	md_drive_record		*dr;
4447 	int			dr_cnt, dd_cnt;
4448 	int			found_my_nr;
4449 	md_drive_desc		*dd, *dd_prev, *master_dd, *other_dd;
4450 	int			all_drives_ok;
4451 	int			rval = 0;
4452 	int			max_genid = 0;
4453 	int			num_alive_nodes, num_alive_nodes_del = 0;
4454 	int			set_locked = 0;
4455 	md_setkey_t		*cl_sk;
4456 	md_error_t		xep = mdnullerror;
4457 	char			*anode[1];
4458 	mddb_setflags_config_t	sf;
4459 
4460 	/*
4461 	 * Sync up node records first.
4462 	 * Construct a master nodelist using the nodelist from this
4463 	 * node's rpc.metad node records and then setting the state of each
4464 	 * node following these rules:
4465 	 *	- If a node record is marked OK on its node, mark it OK
4466 	 *		in the master nodelist (and later OK on all nodes)
4467 	 *		If a node record is also marked OWN on its node,
4468 	 *		mark it OWN in the master nodelist.
4469 	 *	- If a node record is not marked OK on its node, then mark
4470 	 *		it as DEL in the master list (later deleting it)
4471 	 *	- If node record doesn't exist on that node, then mark it DEL
4472 	 *		(later deleting it)
4473 	 *	- If set record doesn't exist on that node, mark node as DEL
4474 	 *	- If a node record doesn't exist on all nodes, then mark it DEL
4475 	 *	- If a node is not ALIVE, then
4476 	 *		- If that node marked DEL on any node - mark it DEL
4477 	 *			in master list but leave in nodelist
4478 	 *		- If that node is marked as ADD on any node, mark it
4479 	 *			ADD in the master list but leave in nodelist
4480 	 *		- When that node returns to the living, the DEL
4481 	 *			node record will be removed and the ADD node
4482 	 *			record may be removed if marked ADD on that
4483 	 *			node.
4484 	 * The key rule is to not remove a node from the nodelist until
4485 	 * that node record is removed from its own node.  Do not want to
4486 	 * remove a node's record from all other nodes and then have
4487 	 * that node have its own record marked OK so that a node will pick
4488 	 * a different master than the other nodes.
4489 	 *
4490 	 * Next,
4491 	 * If node is ALIVE and node record is marked DEL in master nodelist,
4492 	 * remove node from set.
4493 	 * If node is ALIVE and node record is marked OK in master nodelist,
4494 	 * mark it OK on all other nodes.
4495 	 * If node is not ALIVE and node record is marked DEL in master
4496 	 * nodelist, mark it DEL on all other nodes.
4497 	 * If node is not ALIVE and node record is marked ADD in master
4498 	 * nodelist, mark it ADD on all other nodes.
4499 	 */
4500 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4501 		return (-1);
4502 	}
4503 	master_nodelist = sd->sd_nodelist;
4504 
4505 	/*
4506 	 * Walk through nodelist creating a master nodelist.
4507 	 */
4508 	num_alive_nodes = 0;
4509 	nd = master_nodelist;
4510 	while (nd) {
4511 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4512 			nd = nd->nd_next;
4513 			continue;
4514 		}
4515 		num_alive_nodes++;
4516 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
4517 		    MD_SET_BAD, &mnsr, ep) == -1) {
4518 			if (mdiserror(ep, MDE_NO_SET)) {
4519 				/* set doesn't exist, mark node as DEL */
4520 				nd->nd_flags &= ~MD_MN_NODE_OK;
4521 				nd->nd_flags &= ~MD_MN_NODE_ADD;
4522 				nd->nd_flags |= MD_MN_NODE_DEL;
4523 				nd->nd_flags |= MD_MN_NODE_NOSET;
4524 				nd = nd->nd_next;
4525 				continue;
4526 			} else {
4527 				/* If RPC failure to another node return 205 */
4528 				if ((mdanyrpcerror(ep)) &&
4529 				    (sd->sd_mn_mynode->nd_nodeid !=
4530 				    nd->nd_nodeid)) {
4531 					rval = 205;
4532 				} else {
4533 					/* Any other failure */
4534 					rval = -1;
4535 				}
4536 				goto out;
4537 			}
4538 		}
4539 		/* Find biggest genid in records for this diskset */
4540 		if (mnsr->sr_genid > max_genid)
4541 			max_genid = mnsr->sr_genid;
4542 
4543 		dr = mnsr->sr_drivechain;
4544 		while (dr) {
4545 			/* Find biggest genid in records for this diskset */
4546 			if (dr->dr_genid > max_genid) {
4547 				max_genid = dr->dr_genid;
4548 			}
4549 			dr = dr->dr_next;
4550 		}
4551 
4552 		found_my_nr = 0;
4553 		nr = mnsr->sr_nodechain;
4554 		/* nr is the list of node recs from nd_nodename node */
4555 		while (nr) {
4556 			/* Find biggest genid in records for this diskset */
4557 			if (nr->nr_genid > max_genid)
4558 				max_genid = nr->nr_genid;
4559 			nd2 = master_nodelist;
4560 			ndtail = NULL;
4561 			/* For each node record, is it in master list? */
4562 			while (nd2) {
4563 				if (nd2->nd_nodeid == nr->nr_nodeid)
4564 					break;
4565 				if (nd2->nd_next == NULL)
4566 					ndtail = nd2;
4567 				nd2 = nd2->nd_next;
4568 			}
4569 			/*
4570 			 * Found node record not in master list -- add it
4571 			 * to list marking it as DEL since node record
4572 			 * should exist on all nodes unless a panic occurred
4573 			 * during addition or deletion of host to diskset.
4574 			 */
4575 			if (nd2 == NULL) {
4576 				nd2 = Zalloc(sizeof (*nd2));
4577 				(void) strcpy(nd2->nd_nodename,
4578 				    nr->nr_nodename);
4579 				nd2->nd_flags = nr->nr_flags;
4580 				nd2->nd_flags |= MD_MN_NODE_DEL;
4581 				nd2->nd_nodeid = nr->nr_nodeid;
4582 				nd2->nd_next = NULL;
4583 				ndtail->nd_next = nd2;
4584 				nd2 = NULL;
4585 				nr = nr->nr_next;
4586 				continue;
4587 			}
4588 			/*
4589 			 * Is this the node record for the node that
4590 			 * we requested the set desc from?
4591 			 * If so, check if node has its own node record
4592 			 * marked OK. If marked OK, check for the OWN bit.
4593 			 */
4594 			if (nr->nr_nodeid == nd->nd_nodeid) {
4595 				found_my_nr = 1;
4596 				if (nr->nr_flags & MD_MN_NODE_OK) {
4597 					/*
4598 					 * If node record is marked OK
4599 					 * on its own node, then mark it OK
4600 					 * in the master list.  Node record
4601 					 * would have to exist on all nodes
4602 					 * in the ADD state before it could
4603 					 * be put into the OK state.
4604 					 */
4605 					nd->nd_flags |= MD_MN_NODE_OK;
4606 					nd->nd_flags &=
4607 					    ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
4608 					/*
4609 					 * Mark own in master list as marked
4610 					 * on own node.
4611 					 */
4612 					if (nr->nr_flags & MD_MN_NODE_OWN)
4613 						nd->nd_flags |= MD_MN_NODE_OWN;
4614 					else
4615 						nd->nd_flags &= ~MD_MN_NODE_OWN;
4616 				} else {
4617 					/* Otherwise, mark node as DEL */
4618 					nd->nd_flags &= ~MD_MN_NODE_OK;
4619 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4620 					nd->nd_flags |= MD_MN_NODE_DEL;
4621 				}
4622 			}
4623 			/*
4624 			 * If node is not ALIVE and marked DEL
4625 			 * on any node, make it DEL in master list.
4626 			 * If node is not ALIVE and marked ADD
4627 			 * on any node, make it ADD in master list
4628 			 * unless node record has already been marked DEL.
4629 			 */
4630 			if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
4631 				if (nr->nr_flags & MD_MN_NODE_ADD) {
4632 					if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4633 						/* If not DEL - mark it ADD */
4634 						nd->nd_flags |= MD_MN_NODE_ADD;
4635 						nd->nd_flags &= ~MD_MN_NODE_OK;
4636 					}
4637 				}
4638 				if (nr->nr_flags & MD_MN_NODE_DEL) {
4639 					nd->nd_flags |= MD_MN_NODE_DEL;
4640 					nd->nd_flags &= ~MD_MN_NODE_OK;
4641 					/* Could already be ADD - make it DEL */
4642 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4643 				}
4644 			}
4645 			nr = nr->nr_next;
4646 		}
4647 		/*
4648 		 * If a node record doesn't exist on its own node,
4649 		 * then mark node as DEL.
4650 		 */
4651 		if (found_my_nr == 0) {
4652 			nd->nd_flags &= ~MD_MN_NODE_OK;
4653 			nd->nd_flags |= MD_MN_NODE_DEL;
4654 		}
4655 
4656 		/*
4657 		 * If node is OK - put mnsr onto master_mnsr_node list for
4658 		 * later use when syncing up the drive records in the set.
4659 		 */
4660 		if (nd->nd_flags & MD_MN_NODE_OK) {
4661 			mnsr_node = Zalloc(sizeof (*mnsr_node));
4662 			mnsr_node->mmn_mnsr = mnsr;
4663 			(void) strncpy(mnsr_node->mmn_nodename,
4664 			    nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
4665 			mnsr_node->mmn_next = master_mnsr_node;
4666 			master_mnsr_node = mnsr_node;
4667 		} else {
4668 			free_sr((struct md_set_record *)mnsr);
4669 		}
4670 
4671 		nd = nd->nd_next;
4672 	}
4673 
4674 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4675 	    "Master nodelist created for set %s: %s"),
4676 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4677 
4678 	/*
4679 	 * Send master nodelist to the rpc.metad on all nodes (including
4680 	 * myself) and each node will update itself.  This will set the
4681 	 * ADD and DEL flags on each node as setup in the master nodelist.
4682 	 * Don't send nodelist to node where set doesn't exist.
4683 	 */
4684 	nd = master_nodelist;
4685 	while (nd) {
4686 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4687 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4688 			nd = nd->nd_next;
4689 			continue;
4690 		}
4691 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4692 		    master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
4693 			/* If RPC failure to another node return 205 */
4694 			if ((mdanyrpcerror(ep)) &&
4695 			    (sd->sd_mn_mynode->nd_nodeid !=
4696 			    nd->nd_nodeid)) {
4697 				rval = 205;
4698 			} else {
4699 				/* Any other failure */
4700 				rval = -1;
4701 			}
4702 			goto out;
4703 		}
4704 		nd = nd->nd_next;
4705 	}
4706 
4707 	/*
4708 	 * Now, delete nodes that need to be deleted.
4709 	 */
4710 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4711 	    ep))  == NULL) {
4712 		if (! mdisok(ep)) {
4713 			rval = -1;
4714 			goto out;
4715 		}
4716 	}
4717 
4718 	/*
4719 	 * May be doing lots of RPC commands to the nodes, so lock the
4720 	 * ALIVE members of the set since most of the rpc.metad routines
4721 	 * require this for security reasons.
4722 	 */
4723 	nd = master_nodelist;
4724 	while (nd) {
4725 		/* Skip non-alive nodes and node without set */
4726 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4727 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4728 			nd = nd->nd_next;
4729 			continue;
4730 		}
4731 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
4732 			/* If RPC failure to another node return 205 */
4733 			if ((mdanyrpcerror(ep)) &&
4734 			    (sd->sd_mn_mynode->nd_nodeid !=
4735 			    nd->nd_nodeid)) {
4736 				rval = 205;
4737 			} else {
4738 				/* Any other failure */
4739 				rval = -1;
4740 			}
4741 			goto out;
4742 		}
4743 		set_locked = 1;
4744 		nd = nd->nd_next;
4745 	}
4746 
4747 	nd = master_nodelist;
4748 	while (nd) {
4749 		/* Skip non-alive nodes */
4750 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4751 			nd = nd->nd_next;
4752 			continue;
4753 		}
4754 		if (nd->nd_flags & MD_MN_NODE_DEL) {
4755 			num_alive_nodes_del++;
4756 			/*
4757 			 * Delete this node rec from all ALIVE nodes in diskset.
4758 			 */
4759 			nd2 = master_nodelist;
4760 			while (nd2) {
4761 				/* Skip non-alive nodes and node without set */
4762 				if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
4763 				    (nd2->nd_flags & MD_MN_NODE_NOSET)) {
4764 					nd2 = nd2->nd_next;
4765 					continue;
4766 				}
4767 
4768 				/* This is a node being deleted from set */
4769 				if (nd2->nd_nodeid == nd->nd_nodeid) {
4770 					/* Mark set record as DEL */
4771 					if (clnt_upd_sr_flags(nd->nd_nodename,
4772 					    sp, MD_SR_DEL, ep)) {
4773 						/* RPC failure to !my node */
4774 						if ((mdanyrpcerror(ep)) &&
4775 						    (sd->sd_mn_mynode->
4776 						    nd_nodeid
4777 						    != nd->nd_nodeid)) {
4778 							rval = 205;
4779 						} else {
4780 							/* Any other failure */
4781 							rval = -1;
4782 						}
4783 						goto out;
4784 					}
4785 					if (clnt_deldrvs(nd->nd_nodename, sp,
4786 					    dd, ep)) {
4787 						/* RPC failure to !my node */
4788 						if ((mdanyrpcerror(ep)) &&
4789 						    (sd->sd_mn_mynode->
4790 						    nd_nodeid
4791 						    != nd->nd_nodeid)) {
4792 							rval = 205;
4793 						} else {
4794 							/* Any other failure */
4795 							rval = -1;
4796 						}
4797 						goto out;
4798 					}
4799 					if (clnt_delset(nd->nd_nodename, sp,
4800 					    ep) == -1) {
4801 						/* RPC failure to !my node */
4802 						if ((mdanyrpcerror(ep)) &&
4803 						    (sd->sd_mn_mynode->
4804 						    nd_nodeid
4805 						    != nd->nd_nodeid)) {
4806 							rval = 205;
4807 						} else {
4808 							/* Any other failure */
4809 							rval = -1;
4810 						}
4811 						goto out;
4812 					}
4813 				} else {
4814 					/*
4815 					 * Delete host from sets on hosts
4816 					 * not being deleted.
4817 					 */
4818 					anode[0] = Strdup(nd->nd_nodename);
4819 					if (clnt_delhosts(nd2->nd_nodename, sp,
4820 					    1, anode, ep) == -1) {
4821 						Free(anode[0]);
4822 						/* RPC failure to !my node */
4823 						if ((mdanyrpcerror(ep)) &&
4824 						    (sd->sd_mn_mynode->
4825 						    nd_nodeid
4826 						    != nd2->nd_nodeid)) {
4827 							rval = 205;
4828 						} else {
4829 							/* Any other failure */
4830 							rval = -1;
4831 						}
4832 						goto out;
4833 					}
4834 
4835 					meta_mc_log(MC_LOG5,
4836 					    dgettext(TEXT_DOMAIN,
4837 					    "Deleted node %s (%d) on node %s "
4838 					    "from set %s: %s"),
4839 					    nd->nd_nodename, nd->nd_nodeid,
4840 					    nd2->nd_nodename,
4841 					    sp->setname,
4842 					    meta_print_hrtime(
4843 					    gethrtime() - start_time));
4844 
4845 					Free(anode[0]);
4846 				}
4847 				nd2 = nd2->nd_next;
4848 			}
4849 		}
4850 		nd = nd->nd_next;
4851 	}
4852 
4853 	nd = master_nodelist;
4854 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4855 	while (nd) {
4856 		/* Skip non-alive nodes and node without set */
4857 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4858 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4859 			nd = nd->nd_next;
4860 			continue;
4861 		}
4862 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
4863 			/* If RPC failure to another node return 205 */
4864 			if ((mdanyrpcerror(ep)) &&
4865 			    (sd->sd_mn_mynode->nd_nodeid !=
4866 			    nd->nd_nodeid)) {
4867 				rval = 205;
4868 			} else {
4869 				/* Any other failure */
4870 				rval = -1;
4871 			}
4872 			goto out;
4873 		}
4874 		nd = nd->nd_next;
4875 	}
4876 	cl_set_setkey(NULL);
4877 	set_locked = 0;
4878 
4879 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4880 	    "Nodelist syncronization complete for set %s: %s"),
4881 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4882 
4883 	metaflushsetname(sp);
4884 
4885 	/*
4886 	 * If all alive nodes have been deleted from set, just
4887 	 * return since nothing else can be done until non-alive
4888 	 * nodes (if there are any) rejoin the cluster.
4889 	 */
4890 	if (num_alive_nodes == num_alive_nodes_del) {
4891 		rval = 0;
4892 		goto out;
4893 	}
4894 
4895 	/*
4896 	 * Sync up drive records.
4897 	 *
4898 	 * If a node panic'd (or metaset command was killed) during the
4899 	 * addition or deletion of a drive to the diskset, the nodes
4900 	 * may have a different view of the drive list.  During cleanup
4901 	 * of the drive list during reconfig, a drive will be deleted
4902 	 * from the list if the master node sees that the drive has been
4903 	 * marked in the ADD state on any node or is marked in the DEL state
4904 	 * on all nodes.
4905 	 * This cleanup must occur even if all nodes in the cluster are
4906 	 * not part of the cluster so that all nodes have the same view
4907 	 * of the drivelist.
4908 	 * Then if the entire cluster goes down and comes back up, the
4909 	 * new master node could be a node that wasn't in the cluster when
4910 	 * the node was deleted.  This could lead to a situation where the
4911 	 * master node thinks that a drive is OK, but this drive isn't
4912 	 * known to the other nodes.
4913 	 * This situation can also occur during the addition of a drive
4914 	 * where a node has the drive marked OK, but the node executing the
4915 	 * metaset command encountered a failure before marking that drive OK
4916 	 * on the rest of the nodes.  If the node with the OK drive then
4917 	 * panics, then rest of the nodes will remove that drive marked ADD
4918 	 * and when the node with the OK drive rejoins the cluster, it will
4919 	 * have a drive marked OK that is unknown by the other nodes.
4920 	 *
4921 	 * There are 2 situations to consider:
4922 	 * A) Master knows about a drive that other nodes don't know about.
4923 	 * B) At least one slave node knows about a drive that the master
4924 	 *    node doesn't know about.
4925 	 *
4926 	 * To handle these situations the following steps are followed:
4927 	 * 1) Count number of drives known by this master node and the
4928 	 *    other slave nodes.
4929 	 *    If all nodes have the same number of drives and the master has
4930 	 *    all drives marked OK, then skip to step4.
4931 	 *
4932 	 * 2) If a node has fewer drives listed than the master, the master
4933 	 *    must get the drive descriptor list from that node so that
4934 	 *    master can determine which drive it needs to delete from that
4935 	 *    node.  Master must get the drive descriptor list since the
4936 	 *    drive record list does not contain the name of the drive, but
4937 	 *    only a key and the key can only be interpreted on that other
4938 	 *    node.
4939 	 *
4940 	 * 3) The master will then create the master drive list by doing:
4941 	 *	- Master starts with drive list known by master.
4942 	 *	- Any drive marked ADD will be removed from the list.
4943 	 *	- Any drive not known by another node (from step2) will be
4944 	 *	removed from the drive list.
4945 	 *	- If a drive is marked DEL on the master, the master must
4946 	 *	verify that the drive record is marked DEL on all nodes.
4947 	 *	If any node has the drive record marked OK, mark it OK
4948 	 *	on the master.  (The reason why is described below).
4949 	 *
4950 	 * 4) The master sends out the master drive list and the slave
4951 	 *    nodes will force their drive lists to match the master
4952 	 *    drive list by deleting drives, if necessary and by changing
4953 	 *    the drive record states from ADD->OK if master has drive
4954 	 *    marked OK and slave has drive marked ADD.
4955 	 *
4956 	 * Interesting scenarios:
4957 	 *
4958 	 * 1) System has 4 nodes with node 1 as the master.  Node 3 starts
4959 	 *    to delete a drive record (drive record on node 1 is marked DEL),
4960 	 *    but is stopped when node 3 panics.  Node 1 also panics.
4961 	 *    During reconfig cycle, node 2 is picked as master and the drive
4962 	 *    record is left alone since all nodes in the cluster have it
4963 	 *    marked OK.  User now sees drive as part of diskset.
4964 	 *    Now, entire cluster is rebooted and node 1 rejoins the cluster.
4965 	 *    Node 1 is picked as the master and node 1 has drive record
4966 	 *    marked DEL.  Node 1 contacts all other nodes in the cluster
4967 	 *    and since at least one node has the drive record marked OK,
4968 	 *    the master marks the drive record OK.
4969 	 *    User continues to see the drive as part of the diskset.
4970 	 */
4971 
4972 	/* Reget set descriptor since flushed above */
4973 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4974 		rval = -1;
4975 		goto out;
4976 	}
4977 
4978 	/* Has side effect of setting sd->sd_drvs to same as master_dd */
4979 	if ((master_dd = metaget_drivedesc_sideno(sp,
4980 	    sd->sd_mn_mynode->nd_nodeid,
4981 	    (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
4982 		/* No drives in list */
4983 		if (!mdisok(ep)) {
4984 			/*
4985 			 * Can't get drive list for this node, so
4986 			 * return -1 causing this node to be removed
4987 			 * cluster config and fixed.
4988 			 * from cluster config and fixed.
4989 			rval = -1;
4990 			goto out;
4991 		}
4992 	}
4993 
4994 	/* Count the number of drives for all nodes */
4995 	mnsr_node = master_mnsr_node;
4996 	while (mnsr_node) {
4997 		dr_cnt = 0;
4998 		dr = mnsr_node->mmn_mnsr->sr_drivechain;
4999 		while (dr) {
5000 			dr_cnt++;
5001 			dr = dr->dr_next;
5002 		}
5003 		mnsr_node->mmn_numdrives = dr_cnt;
5004 		mnsr_node = mnsr_node->mmn_next;
5005 	}
5006 
5007 	/* Count the number of drives for the master; also check flags */
5008 	all_drives_ok = 1;
5009 	dd_cnt = 0;
5010 	dd = master_dd;
5011 	while (dd) {
5012 		dd_cnt++;
5013 		if (!(dd->dd_flags & MD_DR_OK))
5014 			all_drives_ok = 0;
5015 		dd = dd->dd_next;
5016 	}
5017 
5018 	/* If all drives are ok, do quick check against number of drives */
5019 	if (all_drives_ok) {
5020 		/* If all nodes have same number of drives, almost done */
5021 		mnsr_node = master_mnsr_node;
5022 		while (mnsr_node) {
5023 			if (mnsr_node->mmn_numdrives != dd_cnt)
5024 				break;
5025 			mnsr_node = mnsr_node->mmn_next;
5026 		}
5027 		/* All nodes have same number of drives, just send flags */
5028 		if (mnsr_node == NULL) {
5029 			goto send_drive_list;
5030 		}
5031 	}
5032 
5033 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5034 	    "Begin detailed drive synchronization for set %s: %s"),
5035 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5036 
5037 	/* Detailed check required  */
5038 	mnsr_node = master_mnsr_node;
5039 	while (mnsr_node) {
5040 		/* Does slave node have less drives than master? */
5041 		if (mnsr_node->mmn_numdrives < dd_cnt) {
5042 			/* Yes - must determine which drive is missing */
5043 			if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
5044 			    &other_dd, ep)) {
5045 				/* RPC failure to !my node */
5046 				if ((mdanyrpcerror(ep)) &&
5047 				    (strcmp(mynode(), mnsr_node->mmn_nodename)
5048 				    != 0)) {
5049 					rval = 205;
5050 				} else {
5051 					/* Any other failure */
5052 					rval = -1;
5053 				}
5054 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5055 				    "Master node %s unable to "
5056 				    "retrieve drive list from node %s"),
5057 				    mynode(), mnsr_node->mmn_nodename);
5058 				goto out;
5059 			}
5060 			mnsr_node->mmn_dd = other_dd;
5061 			dd = master_dd;
5062 			while (dd) {
5063 				if (!(dd->dd_flags & MD_DR_OK)) {
5064 					dd = dd->dd_next;
5065 					continue;
5066 				}
5067 				other_dd = mnsr_node->mmn_dd;
5068 				while (other_dd) {
5069 					/* Convert to devids, when available */
5070 					if (strcmp(other_dd->dd_dnp->cname,
5071 					    dd->dd_dnp->cname) == 0) {
5072 						break;
5073 					}
5074 					other_dd = other_dd->dd_next;
5075 				}
5076 				/*
5077 				 * dd not found on slave so mark it
5078 				 * ADD for later deletion (drives in ADD
5079 				 * state are deleted later in this routine).
5080 				 */
5081 				if (other_dd == NULL) {
5082 					dd->dd_flags = MD_DR_ADD;
5083 				}
5084 				dd = dd->dd_next;
5085 			}
5086 
5087 		}
5088 		mnsr_node = mnsr_node->mmn_next;
5089 	}
5090 
5091 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5092 	    "Drive check completed for set %s: %s"),
5093 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5094 
5095 	dd = master_dd;
5096 	dd_prev = 0;
5097 	while (dd) {
5098 		/* Remove any ADD drives from list */
5099 		if (dd->dd_flags & MD_DR_ADD) {
5100 			if (dd_prev) {
5101 				dd_prev->dd_next = dd->dd_next;
5102 				dd->dd_next = NULL;
5103 				metafreedrivedesc(&dd);
5104 				dd = dd_prev->dd_next;
5105 			} else {
5106 				/*
5107 				 * If removing drive descriptor from head
5108 				 * of linked list, also change sd->sd_drvs.
5109 				 */
5110 				master_dd = sd->sd_drvs = dd->dd_next;
5111 				dd->dd_next = NULL;
5112 				metafreedrivedesc(&dd);
5113 				dd = master_dd;
5114 			}
5115 			/* dd setup in if/else above */
5116 			continue;
5117 		}
5118 		/*
5119 		 * If drive is marked DEL, check all other nodes.
5120 		 * If drive on another node is marked OK, mark drive OK
5121 		 * in master list.  If drive is marked DEL or doesn't exist
5122 		 * on all nodes, remove drive from list.
5123 		 */
5124 		if (dd->dd_flags & MD_DR_DEL) {
5125 			mnsr_node = master_mnsr_node;
5126 			while (mnsr_node) {
5127 				if (mnsr_node->mmn_dd == NULL) {
5128 					if (clnt_getdrivedesc(
5129 					    mnsr_node->mmn_nodename, sp,
5130 					    &other_dd, ep)) {
5131 						/* RPC failure to !my node */
5132 						if ((mdanyrpcerror(ep)) &&
5133 						    (strcmp(mynode(),
5134 						    mnsr_node->mmn_nodename)
5135 						    != 0)) {
5136 							rval = 205;
5137 						} else {
5138 							/* Any other failure */
5139 							rval = -1;
5140 						}
5141 						mde_perror(ep,
5142 						    dgettext(TEXT_DOMAIN,
5143 						    "Master node %s unable "
5144 						    "to retrieve drive list "
5145 						    "from node %s"), mynode(),
5146 						    mnsr_node->mmn_nodename);
5147 						goto out;
5148 					}
5149 					mnsr_node->mmn_dd = other_dd;
5150 				}
5151 				other_dd = mnsr_node->mmn_dd;
5152 				while (other_dd) {
5153 					/* Found drive (OK) from other node */
5154 					if (strcmp(dd->dd_dnp->cname,
5155 					    other_dd->dd_dnp->cname)
5156 					    == 0) {
5157 						/* Drive marked OK */
5158 						if (other_dd->dd_flags &
5159 						    MD_DR_OK) {
5160 							dd->dd_flags = MD_DR_OK;
5161 						}
5162 						break;
5163 					}
5164 					other_dd = other_dd->dd_next;
5165 				}
5166 				if (dd->dd_flags == MD_DR_OK)
5167 					break;
5168 
5169 				mnsr_node = mnsr_node->mmn_next;
5170 			}
5171 			/*
5172 			 * If no node had this drive marked OK, delete it.
5173 			 */
5174 			if (dd->dd_flags & MD_DR_DEL) {
5175 				if (dd_prev) {
5176 					dd_prev->dd_next = dd->dd_next;
5177 					dd->dd_next = NULL;
5178 					metafreedrivedesc(&dd);
5179 					dd = dd_prev->dd_next;
5180 				} else {
5181 					/*
5182 					 * If removing drive descriptor from
5183 					 * head of linked list, also change
5184 					 * sd->sd_drvs.
5185 					 */
5186 					master_dd = sd->sd_drvs = dd->dd_next;
5187 					dd->dd_next = NULL;
5188 					metafreedrivedesc(&dd);
5189 					dd = master_dd;
5190 				}
5191 				/* dd setup in if/else above */
5192 				continue;
5193 			}
5194 		}
5195 		dd_prev = dd;
5196 		dd = dd->dd_next;
5197 	}
5198 
5199 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5200 	    "Setting drive states completed for set %s: %s"),
5201 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5202 
5203 send_drive_list:
5204 	/*
5205 	 * Set genid on all drives to be the highest value seen.
5206 	 */
5207 	dd = master_dd;
5208 	while (dd) {
5209 		dd->dd_genid = max_genid;
5210 		dd = dd->dd_next;
5211 	}
5212 	/*
5213 	 * Send updated drive list to all alive nodes.
5214 	 * Will also set genid on set and node records to have same
5215 	 * as the drive records.
5216 	 */
5217 	nd = sd->sd_nodelist;
5218 	while (nd) {
5219 		/* Skip non-alive nodes */
5220 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5221 			nd = nd->nd_next;
5222 			continue;
5223 		}
5224 		if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
5225 			/* RPC failure to another node */
5226 			if ((mdanyrpcerror(ep)) &&
5227 			    (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
5228 				rval = 205;
5229 			} else {
5230 				/* Any other failure */
5231 				rval = -1;
5232 			}
5233 			goto out;
5234 		}
5235 		nd = nd->nd_next;
5236 	}
5237 
5238 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5239 	    "Sent drive list to all nodes for set %s: %s"),
5240 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5241 
5242 	/*
5243 	 * If no drive records left in set and nodes had been joined,
5244 	 * withdraw the nodes.  Always reset the master and mark
5245 	 * all nodes as withdrawn on all nodes.
5246 	 */
5247 	if (master_dd == NULL) {
5248 		/* Reset new master flag since no longer master */
5249 		(void) memset(&sf, 0, sizeof (sf));
5250 		sf.sf_setno = sp->setno;
5251 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5252 		sf.sf_flags = MDDB_NM_RESET;
5253 		/* Use magic to help protect ioctl against attack. */
5254 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5255 		/* Ignore failure, failure to reset flag isn't catastrophic */
5256 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5257 		    &sf.sf_mde, NULL);
5258 
5259 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5260 		    "Reset new master flag for " "set %s: %s"),
5261 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5262 
5263 		nd = sd->sd_nodelist;
5264 		while (nd) {
5265 			/* Skip non-alive nodes  */
5266 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5267 				nd = nd->nd_next;
5268 				continue;
5269 			}
5270 
5271 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
5272 				/* RPC failure to another node */
5273 				if ((mdanyrpcerror(ep)) &&
5274 				    (sd->sd_mn_mynode->nd_nodeid !=
5275 				    nd->nd_nodeid)) {
5276 					rval = 205;
5277 				} else {
5278 					/* Any other failure */
5279 					rval = -1;
5280 				}
5281 				goto out;
5282 			}
5283 			set_locked = 1;
5284 
5285 			/* Withdraw node from set if owner */
5286 			if ((nd->nd_flags & MD_MN_NODE_OWN) &&
5287 			    (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
5288 				/* RPC failure to another node */
5289 				if ((mdanyrpcerror(ep)) &&
5290 				    (sd->sd_mn_mynode->nd_nodeid !=
5291 				    nd->nd_nodeid)) {
5292 					rval = 205;
5293 				} else {
5294 					/* Any other failure */
5295 					rval = -1;
5296 				}
5297 				goto out;
5298 			}
5299 
5300 			/* Mark all nodes as withdrawn on this node */
5301 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5302 			    sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
5303 				/* RPC failure to another node */
5304 				if ((mdanyrpcerror(ep)) &&
5305 				    (sd->sd_mn_mynode->nd_nodeid !=
5306 				    nd->nd_nodeid)) {
5307 					rval = 205;
5308 				} else {
5309 					/* Any other failure */
5310 					rval = -1;
5311 				}
5312 				goto out;
5313 			}
5314 
5315 			/* Resets master to no-master on this node */
5316 			if (clnt_mnsetmaster(nd->nd_nodename, sp,
5317 			    "", MD_MN_INVALID_NID, ep)) {
5318 				/* RPC failure to another node */
5319 				if ((mdanyrpcerror(ep)) &&
5320 				    (sd->sd_mn_mynode->nd_nodeid !=
5321 				    nd->nd_nodeid)) {
5322 					rval = 205;
5323 				} else {
5324 					/* Any other failure */
5325 					rval = -1;
5326 				}
5327 				goto out;
5328 			}
5329 
5330 			cl_sk = cl_get_setkey(sp->setno, sp->setname);
5331 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
5332 				/* RPC failure to another node */
5333 				if ((mdanyrpcerror(ep)) &&
5334 				    (sd->sd_mn_mynode->nd_nodeid !=
5335 				    nd->nd_nodeid)) {
5336 					rval = 205;
5337 				} else {
5338 					/* Any other failure */
5339 					rval = -1;
5340 				}
5341 				goto out;
5342 			}
5343 			set_locked = 0;
5344 			nd = nd->nd_next;
5345 		}
5346 	}
5347 
5348 out:
5349 	/*
5350 	 * If got here and set is still locked, then an error has
5351 	 * occurred and master_nodelist is still valid.
5352 	 * If error is not an RPC error, then unlock.
5353 	 * If error is an RPC error, skip unlocks since this could cause
5354 	 * yet another RPC timeout if a node has failed.
5355 	 * Ignore failures in unlock since unlock is just trying to
5356 	 * clean things up.
5357 	 */
5358 	if ((set_locked) && !(mdanyrpcerror(ep))) {
5359 		nd = master_nodelist;
5360 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
5361 		while (nd) {
5362 			/* Skip non-alive nodes */
5363 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5364 				nd = nd->nd_next;
5365 				continue;
5366 			}
5367 			/*
5368 			 * If clnt_unlock fails, just break out since next
5369 			 * reconfig cycle will reset the locks anyway.
5370 			 */
5371 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
5372 				break;
5373 			}
5374 			nd = nd->nd_next;
5375 		}
5376 		cl_set_setkey(NULL);
5377 	}
5378 	/* Free master_mnsr and drive descs */
5379 	mnsr_node = master_mnsr_node;
5380 	while (mnsr_node) {
5381 		master_mnsr_node = mnsr_node->mmn_next;
5382 		free_sr((md_set_record *)mnsr_node->mmn_mnsr);
5383 		free_rem_dd(mnsr_node->mmn_dd);
5384 		Free(mnsr_node);
5385 		mnsr_node = master_mnsr_node;
5386 	}
5387 
5388 	/* Frees sd->sd_drvs (which is also master_dd) */
5389 	metaflushsetname(sp);
5390 	return (rval);
5391 }
5392 
5393 /*
5394  * meta_mnsync_diskset_mddbs
5395  * Calling node is guaranteed to be an owner node.
5396  * Calling node is the master node.
5397  *
5398  * Master node verifies that ondisk mddb format matches its incore format.
5399  * If no nodes are joined to set, remove the change log entries.
5400  * If a node is joined to set, play the change log.
5401  *
5402  * Returns	 0 - Success
5403  *		 1 - Master unable to join to set.
5404  *		205 - Failure during RPC to another node
5405  *		-1 - Any other failure and ep is filled in.
5406  *			-1 return will eventually cause node to panic
5407  *			in a SunCluster environment.
5408  */
5409 int
5410 meta_mnsync_diskset_mddbs(
5411 	mdsetname_t	*sp,
5412 	md_error_t	*ep
5413 )
5414 {
5415 	md_set_desc		*sd;
5416 	mddb_config_t		c;
5417 	md_mn_msgclass_t	class;
5418 	mddb_setflags_config_t	sf;
5419 	md_mnnode_desc		*nd, *nd2;
5420 	md_error_t		xep = mdnullerror;
5421 	int			stale_set = 0;
5422 
5423 	/* If setname is there, set desc should exist. */
5424 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5425 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5426 		    "Unable to get set %s desc information"), sp->setname);
5427 		return (-1);
5428 	}
5429 
5430 	/* Are there drives in the set? */
5431 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5432 	    ep) == NULL) {
5433 		if (! mdisok(ep)) {
5434 			return (-1);
5435 		}
5436 		/* No drives in set -- nothing to sync up */
5437 		return (0);
5438 	}
5439 
5440 	/*
5441 	 * Is master node (which is this node) joined to set?
5442 	 * If master node isn't joined (which means that no nodes
5443 	 * are joined to diskset), remove the change log entries
5444 	 * since no need to replay them - all nodes will have same
5445 	 * view of mddbs since all nodes are reading in the mddbs
5446 	 * from disk.
5447 	 * There is also no need to sync up the master and ondisk mddbs
5448 	 * since master has no incore knowledge.
5449 	 * Need to join master to set in order to flush the change
5450 	 * log entries. Don't need to block I/O during join of master
5451 	 * to set since no other nodes are joined to set and so no I/O
5452 	 * can be occurring.
5453 	 */
5454 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
5455 		/* Join master to set */
5456 		if (clnt_joinset(mynode(), sp,
5457 		    MNSET_IN_RECONFIG, ep)) {
5458 			if (mdismddberror(ep, MDE_DB_STALE)) {
5459 				/*
5460 				 * If STALE, print message and continue on.
5461 				 * Don't do any writes or reads to mddbs
5462 				 * so don't clear change log.
5463 				 */
5464 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5465 				    "Join of master node to STALE set %s"),
5466 				    sp->setname);
5467 				stale_set = 1;
5468 				mdclrerror(ep);
5469 			} else if (mdismddberror(ep, MDE_DB_ACCOK)) {
5470 				/* ACCOK means mediator provided extra vote */
5471 				mdclrerror(ep);
5472 			} else {
5473 				/*
5474 				 * If master is unable to join set, print an
5475 				 * error message.  Don't return failure or node
5476 				 * will panic during cluster reconfig cycle.
5477 				 * Also, withdraw node from set in order to
5478 				 * cleanup from failed join attempt.
5479 				 */
5480 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5481 				    "Join of master node in set %s failed"),
5482 				    sp->setname);
5483 				if (clnt_withdrawset(mynode(), sp, &xep))
5484 					mdclrerror(&xep);
5485 				return (1);
5486 			}
5487 		}
5488 		/*
5489 		 * Master node successfully joined.
5490 		 * Set local copy of flags to OWN and
5491 		 * send owner flag to rpc.metad. If not stale,
5492 		 * flush the change log.
5493 		 */
5494 		sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
5495 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
5496 		    MNSET_IN_RECONFIG, ep)) {
5497 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5498 			    "Flag update of master node join in set %s failed"),
5499 			    sp->setname);
5500 			return (-1);
5501 		}
5502 
5503 		if (!stale_set) {
5504 			if (mdmn_reset_changelog(sp, ep,
5505 			    MDMN_CLF_RESETLOG) != 0) {
5506 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5507 				    "Unable to reset changelog."));
5508 				return (-1);
5509 			}
5510 			meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5511 			    "Removed changelog entries for set %s: %s"),
5512 			    sp->setname,
5513 			    meta_print_hrtime(gethrtime() - start_time));
5514 		}
5515 		/* Reset new master flag before return */
5516 		(void) memset(&sf, 0, sizeof (sf));
5517 		sf.sf_setno = sp->setno;
5518 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5519 		sf.sf_flags = MDDB_NM_RESET;
5520 		/* Use magic to help protect ioctl against attack. */
5521 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5522 		/* Ignore failure, failure to reset flag isn't catastrophic */
5523 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5524 		    &sf.sf_mde, NULL);
5525 
5526 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5527 		    "Reset new master flag for set %s: %s"),
5528 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5529 
5530 		return (0);
5531 	}
5532 
5533 	/*
5534 	 * Is master already joined to STALE set (< 50% mddbs avail)?
5535 	 * If so, can make no config changes to mddbs so don't check or play
5536 	 * changelog and don't sync master node to ondisk mddbs.
5537 	 * To get out of the stale state all nodes must be withdrawn
5538 	 * from set.  Then as nodes are re-joined, all nodes will
5539 	 * have same view of mddbs since all nodes are reading the
5540 	 * mddbs from disk.
5541 	 */
5542 	(void) memset(&c, 0, sizeof (c));
5543 	c.c_id = 0;
5544 	c.c_setno = sp->setno;
5545 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
5546 		(void) mdstealerror(ep, &c.c_mde);
5547 		return (-1);
5548 	}
5549 	if (c.c_flags & MDDB_C_STALE) {
5550 		return (0);
5551 	}
5552 
5553 	/*
5554 	 * If this node is NOT a newly chosen master, then there's
5555 	 * nothing else to do since the change log should be empty and
5556 	 * the ondisk and incore mddbs are already consistent.
5557 	 *
5558 	 * A newly chosen master is a node that was not the master
5559 	 * at the beginning of the reconfig cycle.  If a node is a new
5560 	 * master, then the new master state is reset after the ondisk
5561 	 * and incore mddbs are consistent and the change log has
5562 	 * been replayed.
5563 	 */
5564 	(void) memset(&sf, 0, sizeof (sf));
5565 	sf.sf_setno = sp->setno;
5566 	sf.sf_flags = MDDB_NM_GET;
5567 	/* Use magic to help protect ioctl against attack. */
5568 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5569 	if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
5570 	    ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
5571 		return (0);
5572 	}
5573 
5574 	/*
5575 	 * Now, sync up incore master view to ondisk mddbs.
5576 	 * This is needed in the case where a master node
5577 	 * had made a change to the mddb, but this change
5578 	 * may not have been relayed to the slaves yet.
5579 	 * So, the new master needs to verify that the ondisk
5580 	 * mddbs match what the new master has incore -
5581 	 * if different, new master rewrites all of the mddbs.
5582 	 * Then the new master will replay the changelog and the
5583 	 * new master will then execute what the old master had
5584 	 * done.
5585 	 *
5586 	 * Block all I/Os to disks in this diskset on all nodes in
5587 	 * the diskset.  This will allow the rewriting of the mddbs
5588 	 * (if needed), to proceed in a timely manner.
5589 	 *
5590 	 * If block of I/Os fail, return a -1.
5591 	 */
5592 
5593 	nd = sd->sd_nodelist;
5594 	while (nd) {
5595 		/* Skip non-alive and non-owner nodes  */
5596 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5597 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5598 			nd = nd->nd_next;
5599 			continue;
5600 		}
5601 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5602 		    MN_SUSP_IO, ep)) {
5603 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5604 			    "Unable to suspend I/O on node %s in set %s"),
5605 			    nd->nd_nodename, sp->setname);
5606 
5607 			/*
5608 			 * Resume all other nodes that had been suspended.
5609 			 * (Reconfig return step also resumes I/Os
5610 			 * for all sets.)
5611 			 */
5612 			nd2 = sd->sd_nodelist;
5613 			while (nd2) {
5614 				/* Stop when reaching failed node */
5615 				if (nd2->nd_nodeid == nd->nd_nodeid)
5616 					break;
5617 				/* Skip non-alive and non-owner nodes  */
5618 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5619 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5620 					nd2 = nd2->nd_next;
5621 					continue;
5622 				}
5623 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5624 				    sp->setno, MN_RES_IO, &xep));
5625 				nd2 = nd2->nd_next;
5626 			}
5627 
5628 			/*
5629 			 * If an RPC failure on another node, return a 205.
5630 			 * Otherwise, exit with failure.
5631 			 */
5632 			if ((mdanyrpcerror(ep)) &&
5633 			    (sd->sd_mn_mynode->nd_nodeid !=
5634 			    nd->nd_nodeid)) {
5635 				return (205);
5636 			} else {
5637 				return (-1);
5638 			}
5639 
5640 		}
5641 		nd = nd->nd_next;
5642 	}
5643 
5644 	(void) memset(&c, 0, sizeof (c));
5645 	c.c_id = 0;
5646 	c.c_setno = sp->setno;
5647 	/* Master can't sync up to ondisk mddbs?  Kick it out of cluster */
5648 	if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
5649 		return (-1);
5650 
5651 	/*
5652 	 * Resume I/Os that were suspended above.
5653 	 */
5654 	nd = sd->sd_nodelist;
5655 	while (nd) {
5656 		/* Skip non-alive and non-owner nodes  */
5657 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5658 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5659 			nd = nd->nd_next;
5660 			continue;
5661 		}
5662 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5663 		    MN_RES_IO, ep)) {
5664 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5665 			    "Unable to resume I/O on node %s in set %s"),
5666 			    nd->nd_nodename, sp->setname);
5667 
5668 			/*
5669 			 * If an RPC failure then don't do any
5670 			 * more RPC calls, since one timeout is enough
5671 			 * to endure.  If RPC failure to another node, return
5672 			 * 205.  If RPC failure to my node, return -1.
5673 			 * If not an RPC failure, continue resuming the
5674 			 * rest of the nodes and then return -1.
5675 			 */
5676 			if (mdanyrpcerror(ep)) {
5677 				if (sd->sd_mn_mynode->nd_nodeid ==
5678 				    nd->nd_nodeid) {
5679 					return (-1);
5680 				} else {
5681 					return (205);
5682 				}
5683 			}
5684 
5685 			/*
5686 			 * If not an RPC error, continue resuming rest of
5687 			 * nodes, ignoring any failures except for an
5688 			 * RPC failure which constitutes an immediate exit.
5689 			 * Start in middle of list with failing node.
5690 			 */
5691 			nd2 = nd->nd_next;
5692 			while (nd2) {
5693 				/* Skip non-alive and non-owner nodes  */
5694 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5695 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5696 					nd2 = nd2->nd_next;
5697 					continue;
5698 				}
5699 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5700 				    sp->setno, MN_RES_IO, &xep));
5701 				if (mdanyrpcerror(&xep)) {
5702 					return (-1);
5703 				}
5704 				nd2 = nd2->nd_next;
5705 			}
5706 		}
5707 		nd = nd->nd_next;
5708 	}
5709 
5710 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
5711 	    "checking/writing the mddb for set %s: %s"), sp->setname,
5712 	    meta_print_hrtime(gethrtime() - start_time));
5713 
5714 	/*
5715 	 * Send (aka replay) all messages we find in the changelog.
5716 	 * Flag the messages with
5717 	 *   MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
5718 	 *   MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
5719 	 */
5720 	for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
5721 		mdmn_changelog_record_t	*lr;
5722 		md_error_t	xep = mdnullerror;
5723 		md_mn_result_t	*resultp = NULL;
5724 		int		ret;
5725 
5726 		lr = mdmn_get_changelogrec(sp->setno, class);
5727 		if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
5728 			/* no entry for this class */
5729 			continue;
5730 		}
5731 
5732 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5733 		    "replaying message ID=(%d, 0x%llx-%d)\n"),
5734 		    MSGID_ELEMS(lr->lr_msg.msg_msgid));
5735 
5736 		ret = mdmn_send_message_with_msgid(
5737 		    lr->lr_msg.msg_setno,
5738 		    lr->lr_msg.msg_type,
5739 		    lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG |
5740 		    MD_MSGF_OVERRIDE_SUSPEND,
5741 		    lr->lr_msg.msg_recipient,
5742 		    lr->lr_msg.msg_event_data,
5743 		    lr->lr_msg.msg_event_size,
5744 		    &resultp,
5745 		    &lr->lr_msg.msg_msgid,
5746 		    &xep);
5747 
5748 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5749 		    "mdmn_send_message returned %d\n"), ret);
5750 
5751 		if (resultp)
5752 			free_result(resultp);
5753 	}
5754 
5755 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5756 	    "Playing changelog completed for set %s: %s"),
5757 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5758 
5759 	/*
5760 	 * Now that new master has ondisk and incore mddbs in sync, reset
5761 	 * this node's new master kernel flag (for this set).  If this node
5762 	 * re-enters another reconfig cycle before the completion of this
5763 	 * reconfig cycle, this master node won't need to check if the ondisk
5764 	 * and incore mddbs are in sync since this node won't be considered
5765 	 * a new master (since this flag is being reset here in the middle of
5766 	 * step2).  This will save time during any subsequent reconfig
5767 	 * cycles as long as this node continues to be master.
5768 	 */
5769 	(void) memset(&sf, 0, sizeof (sf));
5770 	sf.sf_setno = sp->setno;
5771 	sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5772 	sf.sf_flags = MDDB_NM_RESET;
5773 	/* Use magic to help protect ioctl against attack. */
5774 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5775 	/* Ignore failure, since failure to reset flag isn't catastrophic */
5776 	(void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
5777 
5778 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5779 	    "Reset new master flag for set %s: %s"),
5780 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5781 
5782 	return (0);
5783 }
5784 
5785 /*
5786  * meta_mnjoin_all will join all starting nodes in the diskset.
5787  * A starting node is considered to be any node that is not
5788  * an owner of the set but is a member of the cluster.
5789  * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
5790  *
5791  * Caller is the Master node.
5792  *
5793  * Returns	 0 - Success
5794  *		205 - Failure during RPC to another node
5795  *		-1 - Any other failure and ep is filled in.
5796  */
5797 int
5798 meta_mnjoin_all(
5799 	mdsetname_t	*sp,
5800 	md_error_t	*ep
5801 )
5802 {
5803 	md_set_desc		*sd;
5804 	md_mnnode_desc		*nd, *nd2;
5805 	int			rval = 0;
5806 	int			stale_flag = 0;
5807 	mddb_config_t		c;
5808 	int			susp_res_flag = 0;
5809 	md_error_t		xep = mdnullerror;
5810 
5811 	/* If setname is there, set desc should exist. */
5812 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5813 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5814 		    "Unable to get set %s desc information"), sp->setname);
5815 		return (-1);
5816 	}
5817 
5818 	/* Are there drives in the set? */
5819 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5820 	    ep) == NULL) {
5821 		if (! mdisok(ep)) {
5822 			return (-1);
5823 		}
5824 		/* No drives in set -- nothing to join */
5825 		return (0);
5826 	}
5827 
5828 	/*
5829 	 * Is set currently stale?
5830 	 */
5831 	(void) memset(&c, 0, sizeof (c));
5832 	c.c_id = 0;
5833 	c.c_setno = sp->setno;
5834 	/* Ignore failure since master node may not be joined yet */
5835 	(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
5836 	if (c.c_flags & MDDB_C_STALE) {
5837 		stale_flag = MNSET_IS_STALE;
5838 	}
5839 
5840 	/*
5841 	 * If any nodes are going to be joined to diskset, then
5842 	 * suspend I/O to all disks in diskset so that nodes can join
5843 	 * (read in mddbs) in a reasonable amount of time even under
5844 	 * high I/O load.  Don't need to do this if set is STALE since
5845 	 * no I/O can be occurring to a STALE set.
5846 	 */
5847 	if (stale_flag != MNSET_IS_STALE) {
5848 		nd = sd->sd_nodelist;
5849 		while (nd) {
5850 			/* Found a node that will be joined to diskset */
5851 			if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5852 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5853 				/* Set flag that diskset should be suspended */
5854 				susp_res_flag = 1;
5855 				break;
5856 			}
5857 			nd = nd->nd_next;
5858 		}
5859 	}
5860 
5861 	if (susp_res_flag) {
5862 		/*
5863 		 * Block all I/Os to disks in this diskset on all joined
5864 		 * nodes in the diskset.
5865 		 * If block of I/Os fails due to an RPC failure on another
5866 		 * node, return 205; otherwise, return -1.
5867 		 */
5868 		nd = sd->sd_nodelist;
5869 		while (nd) {
5870 			/* Skip non-alive and non-owner nodes  */
5871 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5872 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5873 				nd = nd->nd_next;
5874 				continue;
5875 			}
5876 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5877 			    MN_SUSP_IO, ep)) {
5878 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5879 				    "Unable to suspend I/O on node %s"
5880 				    " in set %s"), nd->nd_nodename,
5881 				    sp->setname);
5882 				/*
5883 				 * Resume other nodes that had been suspended.
5884 				 * (Reconfig return step also resumes I/Os
5885 				 * for all sets.)
5886 				 */
5887 				nd2 = sd->sd_nodelist;
5888 				while (nd2) {
5889 					/* Stop when reaching failed node */
5890 					if (nd2->nd_nodeid == nd->nd_nodeid)
5891 						break;
5892 					/* Skip non-alive/non-owner nodes  */
5893 					if ((!(nd2->nd_flags &
5894 					    MD_MN_NODE_ALIVE)) ||
5895 					    (!(nd2->nd_flags &
5896 					    MD_MN_NODE_OWN))) {
5897 						nd2 = nd2->nd_next;
5898 						continue;
5899 					}
5900 					(void) (clnt_mn_susp_res_io(
5901 					    nd2->nd_nodename, sp->setno,
5902 					    MN_RES_IO, &xep));
5903 					nd2 = nd2->nd_next;
5904 				}
5905 
5906 				/*
5907 				 * If the suspend failed due to an
5908 				 * RPC failure on another node, return
5909 				 * a 205.
5910 				 * Otherwise, exit with failure.
5911 				 * The return reconfig step will resume
5912 				 * I/Os for all disksets.
5913 				 */
5914 				if ((mdanyrpcerror(ep)) &&
5915 				    (sd->sd_mn_mynode->nd_nodeid !=
5916 				    nd->nd_nodeid)) {
5917 					return (205);
5918 				} else {
5919 					return (-1);
5920 				}
5921 			}
5922 			nd = nd->nd_next;
5923 		}
5924 	}
5925 
5926 	nd = sd->sd_nodelist;
5927 	while (nd) {
5928 		/*
5929 		 * If a node is in the membership list but isn't joined
5930 		 * to the set, try to join the node.
5931 		 */
5932 		if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5933 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5934 			if (clnt_joinset(nd->nd_nodename, sp,
5935 			    (MNSET_IN_RECONFIG | stale_flag), ep)) {
5936 				/*
5937 				 * If RPC failure to another node
5938 				 * then exit without attempting anything else.
5939 				 * (Reconfig return step will resume I/Os
5940 				 * for all sets.)
5941 				 */
5942 				if (mdanyrpcerror(ep)) {
5943 					mde_perror(ep, "");
5944 					return (205);
5945 				}
5946 				/*
5947 				 * STALE and ACCOK failures aren't true
5948 				 * failures.  STALE means that <50% mddbs
5949 				 * are available. ACCOK means that the
5950 				 * mediator provided the extra vote.
5951 				 * If a true failure, then print messasge
5952 				 * and withdraw node from set in order to
5953 				 * cleanup from failed join attempt.
5954 				 */
5955 				if ((!mdismddberror(ep, MDE_DB_STALE)) &&
5956 				    (!mdismddberror(ep, MDE_DB_ACCOK))) {
5957 					mde_perror(ep,
5958 					    "WARNING: Unable to join node %s "
5959 					    "to set %s", nd->nd_nodename,
5960 					    sp->setname);
5961 					mdclrerror(ep);
5962 					if (clnt_withdrawset(nd->nd_nodename,
5963 					    sp, &xep))
5964 						mdclrerror(&xep);
5965 					nd = nd->nd_next;
5966 					continue;
5967 				}
5968 			}
5969 			/* Set owner flag even if STALE or ACCOK */
5970 			nd->nd_flags |= MD_MN_NODE_OWN;
5971 		}
5972 		nd = nd->nd_next;
5973 	}
5974 	/*
5975 	 * Resume I/Os if suspended above.
5976 	 */
5977 	if (susp_res_flag) {
5978 		nd = sd->sd_nodelist;
5979 		while (nd) {
5980 			/*
5981 			 * Skip non-alive and non-owner nodes
5982 			 * (this list doesn't include any of
5983 			 * the nodes that were joined).
5984 			 */
5985 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5986 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5987 				nd = nd->nd_next;
5988 				continue;
5989 			}
5990 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5991 			    MN_RES_IO, ep)) {
5992 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5993 				    "Unable to resume I/O on node %s"
5994 				    " in set %s"), nd->nd_nodename,
5995 				    sp->setname);
5996 
5997 				/*
5998 				 * If an RPC failure then don't do any
5999 				 * more RPC calls, since one timeout is enough
6000 				 * to endure.  If RPC failure to another node,
6001 				 * return 205.  If RPC failure to my node,
6002 				 * return -1.
6003 				 * (Reconfig return step will resume I/Os
6004 				 * for all sets.)
6005 				 * If not an RPC failure, continue resuming the
6006 				 * rest of the nodes and then return -1.
6007 				 */
6008 				if (mdanyrpcerror(ep)) {
6009 					if (sd->sd_mn_mynode->nd_nodeid ==
6010 					    nd->nd_nodeid) {
6011 						return (-1);
6012 					} else {
6013 						return (205);
6014 					}
6015 				}
6016 
6017 				/*
6018 				 * If not an RPC error, continue resuming rest
6019 				 * of nodes, ignoring any failures except for
6020 				 * an RPC failure which constitutes an
6021 				 * immediate exit.
6022 				 * Start in middle of list with failing node.
6023 				 */
6024 				nd2 = nd->nd_next;
6025 				while (nd2) {
6026 					/* Skip non-owner nodes  */
6027 					if ((!(nd2->nd_flags &
6028 					    MD_MN_NODE_ALIVE)) ||
6029 					    (!(nd2->nd_flags &
6030 					    MD_MN_NODE_OWN))) {
6031 						nd2 = nd2->nd_next;
6032 						continue;
6033 					}
6034 					(void) (clnt_mn_susp_res_io(
6035 					    nd2->nd_nodename, sp->setno,
6036 					    MN_RES_IO, &xep));
6037 					if (mdanyrpcerror(&xep)) {
6038 						return (-1);
6039 					}
6040 					nd2 = nd2->nd_next;
6041 				}
6042 			}
6043 			nd = nd->nd_next;
6044 		}
6045 	}
6046 
6047 	nd = sd->sd_nodelist;
6048 	while (nd) {
6049 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
6050 			nd = nd->nd_next;
6051 			continue;
6052 		}
6053 		/*
6054 		 * If 1 node fails - go ahead and update the rest except
6055 		 * in the case of an RPC failure, fail immediately.
6056 		 */
6057 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
6058 		    sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
6059 			/* RPC failure to another node */
6060 			if (mdanyrpcerror(ep)) {
6061 				return (205);
6062 			}
6063 			nd = nd->nd_next;
6064 			rval = -1;
6065 			continue;
6066 		}
6067 		nd = nd->nd_next;
6068 	}
6069 
6070 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
6071 	    "Join of all nodes completed for set %s: %s"),
6072 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
6073 
6074 	return (rval);
6075 }
6076