xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_set.c (revision b0fc0e77220f1fa4c933fd58a4e1dedcd650b0f1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 /*
37  * Metadevice diskset interfaces
38  */
39 
40 #include "meta_set_prv.h"
41 #include <meta.h>
42 #include <metad.h>
43 #include <mdmn_changelog.h>
44 #include <sys/lvm/md_crc.h>
45 #include <sys/utsname.h>
46 #include <sdssc.h>
47 
48 #include <sys/sysevent/eventdefs.h>
49 #include <sys/sysevent/svm.h>
50 extern	char	*blkname(char *);
51 
52 static md_drive_desc *
53 dr2drivedesc(
54 	mdsetname_t	*sp,
55 	side_t		sideno,
56 	int		flags,
57 	md_error_t	*ep
58 )
59 {
60 	md_set_record	*sr;
61 	md_drive_record	*dr;
62 	mddrivename_t	*dnp;
63 	md_drive_desc	*dd_head = NULL;
64 	md_set_desc	*sd;
65 
66 	if (flags & MD_BYPASS_DAEMON) {
67 		if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
68 			return (NULL);
69 		sd = metaget_setdesc(sp, ep);
70 		sideno = getnodeside(mynode(), sd);
71 		sp = metafakesetname(sp->setno, sr->sr_setname);
72 	} else {
73 		if ((sr = getsetbyname(sp->setname, ep)) == NULL)
74 			return (NULL);
75 	}
76 
77 	assert(sideno != MD_SIDEWILD);
78 
79 	/*
80 	 * WARNING:
81 	 * The act of getting the dnp from the namespace means that we
82 	 * will get the devid of the disk as recorded in the namespace.
83 	 * This devid has the potential to be stale if the disk is being
84 	 * replaced via a rebind, this means that any code that relies
85 	 * on any of the dnp information should take the appropriate action
86 	 * to preserve that information. For example in the rebind code the
87 	 * devid of the new disk is saved off and then copied back in once
88 	 * the code that has called this function has completed.
89 	 */
90 	for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
91 		if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
92 		    flags, ep)) == NULL) {
93 			if (!(flags & MD_BYPASS_DAEMON))
94 				free_sr(sr);
95 			metafreedrivedesc(&dd_head);
96 			return (NULL);
97 		}
98 
99 		(void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
100 		    dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
101 	}
102 
103 	if (!(flags & MD_BYPASS_DAEMON)) {
104 		free_sr(sr);
105 	}
106 	return (dd_head);
107 }
108 
109 static int
110 get_sidenmlist(
111 	mdsetname_t	*sp,
112 	mddrivename_t	*dnp,
113 	md_error_t	*ep
114 )
115 {
116 	md_set_desc	*sd;
117 	mdsidenames_t	*sn, **sn_next;
118 	int		i;
119 
120 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
121 		return (-1);
122 
123 	metaflushsidenames(dnp);
124 	sn_next = &dnp->side_names;
125 	if (MD_MNSET_DESC(sd)) {
126 		/*
127 		 * Only get sidenames for this node since
128 		 * that is the only side information stored in
129 		 * the local mddb for a multi-node diskset.
130 		 */
131 		if (sd->sd_mn_mynode) {
132 			sn = Zalloc(sizeof (*sn));
133 			sn->sideno = sd->sd_mn_mynode->nd_nodeid;
134 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
135 			    sn->sideno, dnp->side_names_key, &sn->dname,
136 			    &sn->mnum, NULL, ep)) == NULL) {
137 				if (sn->dname != NULL)
138 					Free(sn->dname);
139 				Free(sn);
140 				return (-1);
141 			}
142 
143 			/* Add to the end of the linked list */
144 			assert(*sn_next == NULL);
145 			*sn_next = sn;
146 			sn_next = &sn->next;
147 		}
148 	} else {
149 		for (i = 0; i < MD_MAXSIDES; i++) {
150 			/* Skip empty slots */
151 			if (sd->sd_nodes[i][0] == '\0')
152 				continue;
153 
154 			sn = Zalloc(sizeof (*sn));
155 			sn->sideno = i;
156 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
157 			    i+SKEW, dnp->side_names_key, &sn->dname,
158 			    &sn->mnum, NULL, ep)) == NULL) {
159 				/*
160 				 * It is possible that during the add of a
161 				 * host to have a 'missing' side as the side
162 				 * for this disk will be added later. So ignore
163 				 * the error. The 'missing' side will be added
164 				 * once the addhosts process has completed.
165 				 */
166 				if (mdissyserror(ep, ENOENT)) {
167 					mdclrerror(ep);
168 					Free(sn);
169 					continue;
170 				}
171 
172 				if (sn->dname != NULL)
173 					Free(sn->dname);
174 				Free(sn);
175 				return (-1);
176 			}
177 
178 			/* Add to the end of the linked list */
179 			assert(*sn_next == NULL);
180 			*sn_next = sn;
181 			sn_next = &sn->next;
182 		}
183 	}
184 
185 	return (0);
186 }
187 
188 static md_drive_desc *
189 rl_to_dd(
190 	mdsetname_t		*sp,
191 	md_replicalist_t	*rlp,
192 	md_error_t		*ep
193 )
194 {
195 	md_replicalist_t	*rl;
196 	md_replica_t		*r;
197 	md_drive_desc		*dd = NULL;
198 	md_drive_desc		*d;
199 	int			found;
200 	md_set_desc		*sd;
201 	daddr_t			nblks = 0;
202 
203 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
204 		return (NULL);
205 
206 	/* find the smallest existing replica */
207 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
208 		r = rl->rl_repp;
209 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
210 	}
211 
212 	if (nblks <= 0)
213 		nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
214 
215 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
216 		r = rl->rl_repp;
217 
218 		found = 0;
219 		for (d = dd; d != NULL; d = d->dd_next) {
220 			if (strcmp(r->r_namep->drivenamep->cname,
221 			    d->dd_dnp->cname) == 0) {
222 				found = 1;
223 				dd->dd_dbcnt++;
224 				break;
225 			}
226 		}
227 
228 		if (! found)
229 			(void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
230 			    1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
231 	}
232 
233 	return (dd);
234 }
235 
236 /*
237  * Exported Entry Points
238  */
239 
240 set_t
241 get_max_sets(md_error_t *ep)
242 {
243 
244 	static set_t		max_sets = 0;
245 
246 	if (max_sets == 0)
247 		if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
248 			return (0);
249 
250 	return (max_sets);
251 }
252 
253 int
254 get_max_meds(md_error_t *ep)
255 {
256 	static int		max_meds = 0;
257 
258 	if (max_meds == 0)
259 		if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
260 			return (0);
261 
262 	return (max_meds);
263 }
264 
265 side_t
266 getmyside(mdsetname_t *sp, md_error_t *ep)
267 {
268 	md_set_desc		*sd;
269 	char 			*node = NULL;
270 	side_t			sideno;
271 
272 	if (sp->setno == 0)
273 		return (0);
274 
275 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
276 		return (MD_SIDEWILD);
277 
278 	node = mynode();
279 
280 	assert(node != NULL);
281 
282 	sideno = getnodeside(node, sd);
283 
284 	if (sideno != MD_SIDEWILD)
285 		return (sideno);
286 
287 	return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
288 }
289 
290 /*
291  * get set info from name
292  */
293 md_set_record *
294 getsetbyname(char *setname, md_error_t *ep)
295 {
296 	md_set_record		*sr = NULL;
297 	md_mnset_record		*mnsr = NULL;
298 	char			*p;
299 	size_t			len;
300 
301 	/* get set info from daemon */
302 	if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
303 		return (NULL);
304 	if (sr != NULL) {
305 		/*
306 		 * Returned record could be for a multi-node set or a
307 		 * non-multi-node set.
308 		 */
309 		if (MD_MNSET_REC(sr)) {
310 			/*
311 			 * Record is for a multi-node set.  Reissue call
312 			 * to get mnset information.  Need to free
313 			 * record as if a non-multi-node set record since
314 			 * that is what clnt_getset gave us.  If in
315 			 * the daemon, don't free since this is a pointer
316 			 * into the setrecords array.
317 			 */
318 			if (! md_in_daemon) {
319 				sr->sr_flags &= ~MD_SR_MN;
320 				free_sr(sr);
321 			}
322 			if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
323 			    ep) == -1)
324 				return (NULL);
325 			if (mnsr != NULL)
326 				return ((struct md_set_record *)mnsr);
327 		} else {
328 			return (sr);
329 		}
330 	}
331 
332 	/* no such set */
333 	len = strlen(setname) + 30;
334 	p = Malloc(len);
335 	(void) snprintf(p, len, "setname \"%s\"", setname);
336 	(void) mderror(ep, MDE_NO_SET, p);
337 	Free(p);
338 	return (NULL);
339 }
340 
341 /*
342  * get set info from number
343  */
344 md_set_record *
345 getsetbynum(set_t setno, md_error_t *ep)
346 {
347 	md_set_record		*sr;
348 	md_mnset_record		*mnsr = NULL;
349 	char			buf[100];
350 
351 	if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
352 		return (NULL);
353 
354 	if (sr != NULL) {
355 		/*
356 		 * Record is for a multi-node set.  Reissue call
357 		 * to get mnset information.  Need to free
358 		 * record as if a non-multi-node set record since
359 		 * that is what clnt_getset gave us.  If in
360 		 * the daemon, don't free since this is a pointer
361 		 * into the setrecords array.
362 		 */
363 		if (MD_MNSET_REC(sr)) {
364 			/*
365 			 * Record is for a multi-node set.  Reissue call
366 			 * to get mnset information.
367 			 */
368 			if (! md_in_daemon) {
369 				sr->sr_flags &= ~MD_SR_MN;
370 				free_sr(sr);
371 			}
372 			if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
373 			    ep) == -1)
374 				return (NULL);
375 			if (mnsr != NULL)
376 				return ((struct md_set_record *)mnsr);
377 		} else {
378 			return (sr);
379 		}
380 	}
381 
382 	(void) sprintf(buf, "setno %u", setno);
383 	(void) mderror(ep, MDE_NO_SET, buf);
384 	return (NULL);
385 }
386 
387 int
388 meta_check_drive_inuse(
389 	mdsetname_t	*sp,
390 	mddrivename_t	*dnp,
391 	int		check_db,
392 	md_error_t	*ep
393 )
394 {
395 	mdnamelist_t	*nlp = NULL;
396 	mdnamelist_t	*p;
397 	int		rval = 0;
398 
399 	/* get all underlying partitions */
400 	if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
401 		return (-1);
402 
403 	/* search for drive */
404 	for (p = nlp; (p != NULL); p = p->next) {
405 		mdname_t	*np = p->namep;
406 
407 		if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
408 			rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
409 			    NULL, dnp->cname, sp->setname));
410 			break;
411 		}
412 	}
413 
414 	/* cleanup, return success */
415 	metafreenamelist(nlp);
416 	return (rval);
417 }
418 
419 /*
420  * simple check for ownership
421  */
422 int
423 meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
424 {
425 	int			ownset;
426 	md_set_desc		*sd;
427 	md_drive_desc		*dd;
428 	md_replicalist_t	*rlp = NULL;
429 	md_error_t		xep = mdnullerror;
430 
431 	if (metaislocalset(sp))
432 		return (0);
433 
434 	ownset = own_set(sp, NULL, TRUE, ep);
435 	if (! mdisok(ep))
436 		return (-1);
437 
438 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
439 		return (-1);
440 
441 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
442 	if (! mdisok(ep))
443 		return (-1);
444 
445 	/* If we have no drive descriptors, check for no ownership */
446 	if (dd == NULL) {
447 		if (ownset == MD_SETOWNER_NONE)
448 			return (0);
449 
450 		/* If ownership somehow has come to exist, we must clean up */
451 
452 		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
453 		    &xep) < 0)
454 			mdclrerror(&xep);
455 
456 		if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
457 			if (! mdisok(&xep))
458 				mdclrerror(&xep);
459 
460 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
461 			if (rel_own_bydd(sp, dd, TRUE, &xep))
462 				mdclrerror(&xep);
463 		}
464 
465 		if (halt_set(sp, &xep))
466 			mdclrerror(&xep);
467 
468 		metafreereplicalist(rlp);
469 
470 		metafreedrivedesc(&dd);
471 
472 		return (0);
473 	}
474 
475 	metafreedrivedesc(&sd->sd_drvs);
476 
477 	if (ownset == MD_SETOWNER_YES)
478 		return (0);
479 
480 	return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
481 	    sp->setname));
482 }
483 
484 /*
485  * simple check for ownership
486  */
487 int
488 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
489 {
490 	md_set_desc	*sd;
491 	md_drive_desc	*dd;
492 	int		bool;
493 
494 	if (metaislocalset(sp))
495 		return (0);
496 
497 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
498 		return (-1);
499 
500 	if (getnodeside(hostname, sd) == MD_SIDEWILD)
501 		return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
502 		    hostname, NULL, sp->setname));
503 
504 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
505 	if (! mdisok(ep))
506 		return (-1);
507 
508 	if (clnt_ownset(hostname, sp, &bool, ep) == -1)
509 		return (-1);
510 
511 	if (dd == NULL)
512 		return (0);
513 
514 	metafreedrivedesc(&sd->sd_drvs);
515 
516 	if (bool == TRUE)
517 		return (0);
518 
519 	return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
520 	    sp->setname));
521 }
522 
523 /*
524  * Function that determines if a node is in the multinode diskset
525  * membership list.  Calling node passes in node to be checked and
526  * the nodelist as returned from meta_read_nodelist.  This routine
527  * anticipates being called many times using the same diskset membership
528  * list which is why the alloc and free of the diskset membership list
529  * is left to the calling routine.
530  * Returns:
531  *	1 - if a member
532  *	0 - not a member
533  */
534 int
535 meta_is_member(
536 	char				*node_name,
537 	md_mn_nodeid_t			node_id,
538 	mndiskset_membershiplist_t	*nl
539 )
540 {
541 	mndiskset_membershiplist_t	*nl2;
542 	int				flag_check_name;
543 
544 	if (node_id != 0)
545 		flag_check_name = 0;
546 	else if (node_name != NULL)
547 		flag_check_name = 1;
548 	else
549 		return (0);
550 
551 	nl2 = nl;
552 	while (nl2) {
553 		if (flag_check_name) {
554 			/* Compare given name against name in member list */
555 			if (strcmp(nl2->msl_node_name, node_name) == 0)
556 				break;
557 		} else {
558 			/* Compare given nodeid against nodeid in member list */
559 			if (nl2->msl_node_id == node_id)
560 				break;
561 		}
562 		nl2 = nl2->next;
563 	}
564 	/* No match found in member list */
565 	if (nl2 == NULL) {
566 		return (0);
567 	}
568 	/* Return 1 if node is in member list */
569 	return (1);
570 }
571 
572 /*
573  * meta_getnext_devinfo should go to the host that
574  * has the device, to return the device name, driver name, minor num.
575  * We can take the big cheat for now, since it is a requirement
576  * that the device names and device numbers are the same, and
577  * just get the info locally.
578  *
579  * This routine is very similar to meta_getnextside_devinfo except
580  * that the specific side to be used is being passed in.
581  *
582  * Exit status:
583  *	 0 - No more side info to return
584  *	 1 - More side info's to return
585  *	-1 - An error has been detected
586  */
587 /*ARGSUSED*/
588 int
589 meta_getside_devinfo(
590 	mdsetname_t	*sp,		/* for this set */
591 	char		*bname,		/* local block name (myside) */
592 	side_t		sideno,		/* sideno */
593 	char		**ret_bname,	/* block device name of returned side */
594 	char		**ret_dname,	/* driver name of returned side */
595 	minor_t		*ret_mnum,	/* minor number of returned side */
596 	md_error_t	*ep
597 )
598 {
599 	mdname_t	*np;
600 
601 	if (ret_bname != NULL)
602 		*ret_bname = NULL;
603 	if (ret_dname != NULL)
604 		*ret_dname = NULL;
605 	if (ret_mnum != NULL)
606 		*ret_mnum = NODEV32;
607 
608 
609 	if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL)
610 		return (-1);
611 
612 /*
613  * NOTE (future) - There will be more work here once devids are integrated
614  * into disksets.  Then the side should be used to find the correct
615  * host and the b/d names should be gotten from that host.
616  */
617 
618 	/*
619 	 * Return the side info.
620 	 */
621 	if (ret_bname != NULL)
622 		*ret_bname = Strdup(np->bname);
623 
624 	if (ret_dname != NULL) {
625 		mdcinfo_t	*cinfo;
626 
627 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
628 			return (-1);
629 
630 		*ret_dname = Strdup(cinfo->dname);
631 	}
632 
633 	if (ret_mnum != NULL)
634 		*ret_mnum = meta_getminor(np->dev);
635 
636 	return (1);
637 }
638 
639 /*
640  * Get the information on the device from the remote node using the devid
641  * of the disk.
642  *
643  * Exit status:
644  *	 0 - No more side info to return
645  *	 1 - More side info's to return
646  *	-1 - An error has been detected
647  */
648 int
649 meta_getnextside_devinfo(
650 	mdsetname_t	*sp,		/* for this set */
651 	char		*bname,		/* local block name (myside) */
652 	side_t		*sideno,	/* previous sideno & returned sideno */
653 	char		**ret_bname,	/* block device name of returned side */
654 	char		**ret_dname,	/* driver name of returned side */
655 	minor_t		*ret_mnum,	/* minor number of returned side */
656 	md_error_t	*ep
657 )
658 {
659 	md_set_desc	*sd;
660 	int		i;
661 	mdname_t	*np;
662 	mddrivename_t	*dnp;
663 	char		*devidstr = NULL;
664 	int		devidstrlen;
665 	md_dev64_t	retdev = NODEV64;
666 	char		*ret_devname = NULL;
667 	char		*ret_blkdevname = NULL;
668 	char		*ret_driver = NULL;
669 	char		*nodename;
670 	int		fd;
671 	int		ret = -1;
672 	char		*minor_name = NULL;
673 	md_mnnode_desc	*nd;
674 
675 
676 	if (ret_bname != NULL)
677 		*ret_bname = NULL;
678 	if (ret_dname != NULL)
679 		*ret_dname = NULL;
680 	if (ret_mnum != NULL)
681 		*ret_mnum = NODEV32;
682 
683 	if (metaislocalset(sp)) {
684 		/* no more sides - we are done */
685 		if (*sideno != MD_SIDEWILD)
686 			return (0);
687 
688 		/* First time through -  set up return sideno */
689 		*sideno = 0;
690 	} else {
691 
692 		/*
693 		 * Find the next sideno, starting after the one given.
694 		 */
695 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
696 			return (-1);
697 
698 		if (MD_MNSET_DESC(sd)) {
699 			nd = sd->sd_nodelist;
700 			if ((*sideno == MD_SIDEWILD) &&
701 			    (nd != (struct md_mnnode_desc *)NULL)) {
702 				*sideno = nd->nd_nodeid;
703 			} else {
704 				while (nd) {
705 					/*
706 					 * Found given sideno, now find
707 					 * next sideno, if there is one.
708 					 */
709 					if ((*sideno == nd->nd_nodeid) &&
710 					    (nd->nd_next !=
711 					    (struct md_mnnode_desc *)NULL)) {
712 						*sideno =
713 						    nd->nd_next->nd_nodeid;
714 						break;
715 					}
716 					nd = nd->nd_next;
717 				}
718 				if (nd == NULL) {
719 					return (0);
720 				}
721 			}
722 			if (*sideno == MD_SIDEWILD)
723 				return (0);
724 		} else {
725 			for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
726 				/* Find next full slot */
727 				if (sd->sd_nodes[i][0] != '\0')
728 					break;
729 
730 			/* No more sides - we are done */
731 			if (i == MD_MAXSIDES)
732 				return (0);
733 
734 			/* Set up the return sideno */
735 			*sideno = i;
736 			nodename = (char *)sd->sd_nodes[i];
737 		}
738 	}
739 
740 	/*
741 	 * Need to pass the node the devid of the disk and get it to
742 	 * send back the details of the disk from that side.
743 	 */
744 	if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL)
745 		return (-1);
746 
747 	dnp = np->drivenamep;
748 
749 	/*
750 	 * By default, set up the parameters so that they are copied out.
751 	 */
752 	if (ret_bname != NULL)
753 		*ret_bname = Strdup(np->bname);
754 
755 	if (ret_dname != NULL) {
756 		mdcinfo_t	*cinfo;
757 
758 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
759 			return (-1);
760 
761 		*ret_dname = Strdup(cinfo->dname);
762 	}
763 
764 	if (ret_mnum != NULL)
765 		*ret_mnum = meta_getminor(np->dev);
766 
767 	/*
768 	 * Try some optimization. If this is the local set or the device
769 	 * is a metadevice then just copy the information. If the device
770 	 * does not have a devid (due to not having a minor name) then
771 	 * fall back to the pre-devid behaviour of copying the information
772 	 * on the device: this is okay because the sanity checks before this
773 	 * call would have found any issues with the device. If it's a
774 	 * multi-node diskset also just return ie. copy.
775 	 */
776 	if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
777 	    (MD_MNSET_DESC(sd)))
778 		return (1);
779 
780 	if (np->minor_name == (char *)NULL) {
781 		/*
782 		 * Have to get the minor name then. The slice should exist
783 		 * on the disk because it will have already been repartitioned
784 		 * up prior to getting to this point.
785 		 */
786 		if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
787 			(void) mdsyserror(ep, errno, np->bname);
788 			return (-1);
789 		}
790 		(void) devid_get_minor_name(fd, &minor_name);
791 		np->minor_name = Strdup(minor_name);
792 		devid_str_free(minor_name);
793 		(void) close(fd);
794 	}
795 
796 	/* allocate extra space for "/" and NULL hence +2 */
797 	devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
798 	devidstr = (char *)Malloc(devidstrlen);
799 
800 	/*
801 	 * As a minor name is supplied then the ret_devname will be
802 	 * appropriate to that minor_name and in this case it will be
803 	 * a block device ie /dev/dsk.
804 	 */
805 	(void) snprintf(devidstr, devidstrlen,
806 		"%s/%s", dnp->devid, np->minor_name);
807 
808 	ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
809 	    np->bname, &ret_devname, &ret_driver, ep);
810 
811 	Free(devidstr);
812 
813 	/*
814 	 * If the other side is not running device id in disksets,
815 	 * 'ret' is set to ENOTSUP in which case we fallback to
816 	 * the existing behaviour
817 	 */
818 	if (ret == ENOTSUP)
819 		return (1);
820 	else if (ret == -1)
821 		return (-1);
822 
823 	/*
824 	 * ret_devname comes from the rpc call and is a
825 	 * raw device name. We need to make this into a
826 	 * block device via blkname for further processing.
827 	 * Unfortunately, when our device id isn't found in
828 	 * the system, the rpc call will return a " " in
829 	 * ret_devname in which case we need to fill that in
830 	 * as ret_blkname because blkname of " " returns NULL.
831 	 */
832 	if (ret_bname != NULL && ret_devname != NULL) {
833 		ret_blkdevname = blkname(ret_devname);
834 		if (ret_blkdevname == NULL)
835 			*ret_bname = Strdup(ret_devname);
836 		else
837 			*ret_bname = Strdup(ret_blkdevname);
838 	}
839 
840 	if (ret_dname != NULL && ret_driver != NULL)
841 		*ret_dname = Strdup(ret_driver);
842 
843 	if (ret_mnum != NULL)
844 		*ret_mnum = meta_getminor(retdev);
845 
846 	return (1);
847 }
848 
849 int
850 meta_is_drive_in_anyset(
851 	mddrivename_t	*dnp,
852 	mdsetname_t	**spp,
853 	int		bypass_daemon,
854 	md_error_t 	*ep
855 )
856 {
857 	set_t		setno;
858 	mdsetname_t	*this_sp;
859 	int		is_it;
860 	set_t		max_sets;
861 
862 	if ((max_sets = get_max_sets(ep)) == 0)
863 		return (-1);
864 
865 	assert(spp != NULL);
866 	*spp = NULL;
867 
868 	for (setno = 1; setno < max_sets; setno++) {
869 		if (!bypass_daemon) {
870 			if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
871 				if (mdismddberror(ep, MDE_DB_NODB)) {
872 					mdclrerror(ep);
873 					return (0);
874 				}
875 				if (mdiserror(ep, MDE_NO_SET)) {
876 					mdclrerror(ep);
877 					continue;
878 				}
879 				return (-1);
880 			}
881 		} else
882 			this_sp = metafakesetname(setno, NULL);
883 
884 		if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
885 		    bypass_daemon, ep)) == -1) {
886 			if (mdiserror(ep, MDE_NO_SET)) {
887 				mdclrerror(ep);
888 				continue;
889 			}
890 			return (-1);
891 		}
892 		if (is_it) {
893 			*spp = this_sp;
894 			return (0);
895 		}
896 	}
897 	return (0);
898 }
899 
900 int
901 meta_is_drive_in_thisset(
902 	mdsetname_t	*sp,
903 	mddrivename_t	*dnp,
904 	int		bypass_daemon,
905 	md_error_t	*ep
906 )
907 {
908 	md_drive_desc	*dd, *p;
909 
910 	if (bypass_daemon)
911 		dd = dr2drivedesc(sp, MD_SIDEWILD,
912 		    (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
913 	else
914 		dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
915 
916 	if (dd == NULL) {
917 		if (! mdisok(ep))
918 			return (-1);
919 		return (0);
920 	}
921 
922 
923 	for (p = dd; p != NULL; p = p->dd_next)
924 		if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
925 			return (1);
926 	return (0);
927 }
928 
929 /*
930  * Check to see if devid is in use in any diskset.
931  * This is used in the case when a partial diskset is being imported
932  * to make sure that the unvailable drive isn't already in use in an
933  * already imported partial diskset.  Can't check on the cname since the
934  * unavailable disk's cname is from the previous system and may collide
935  * with a cname on this system.
936  * Return values:
937  *	1: devid has been found in a diskset
938  *	0: devid not found in any diskset
939  */
940 int
941 meta_is_devid_in_anyset(
942 	void		*devid,
943 	mdsetname_t	**spp,
944 	md_error_t 	*ep
945 )
946 {
947 	set_t		setno;
948 	mdsetname_t	*this_sp;
949 	int		is_it;
950 	set_t		max_sets;
951 
952 	if ((max_sets = get_max_sets(ep)) == 0)
953 		return (-1);
954 
955 	assert(spp != NULL);
956 	*spp = NULL;
957 
958 	for (setno = 1; setno < max_sets; setno++) {
959 		if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
960 			if (mdismddberror(ep, MDE_DB_NODB)) {
961 				mdclrerror(ep);
962 				return (0);
963 			}
964 			if (mdiserror(ep, MDE_NO_SET)) {
965 				mdclrerror(ep);
966 				continue;
967 			}
968 			return (-1);
969 		}
970 
971 		if ((is_it = meta_is_devid_in_thisset(this_sp,
972 		    devid, ep)) == -1) {
973 			if (mdiserror(ep, MDE_NO_SET)) {
974 				mdclrerror(ep);
975 				continue;
976 			}
977 			return (-1);
978 		}
979 		if (is_it) {
980 			*spp = this_sp;
981 			return (0);
982 		}
983 	}
984 	return (0);
985 }
986 
987 int
988 meta_is_devid_in_thisset(
989 	mdsetname_t	*sp,
990 	void		*devid,
991 	md_error_t	*ep
992 )
993 {
994 	md_drive_desc	*dd, *p;
995 	ddi_devid_t	dd_devid;
996 
997 	dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
998 	if (dd == NULL) {
999 		if (! mdisok(ep))
1000 			return (-1);
1001 		return (0);
1002 	}
1003 
1004 	for (p = dd; p != NULL; p = p->dd_next) {
1005 		if (p->dd_dnp->devid == NULL)
1006 			continue;
1007 		(void) devid_str_decode(p->dd_dnp->devid,
1008 		    &dd_devid, NULL);
1009 		if (dd_devid == NULL)
1010 			continue;
1011 		if (devid_compare(devid, dd_devid) == 0) {
1012 			devid_free(dd_devid);
1013 			return (1);
1014 		}
1015 		devid_free(dd_devid);
1016 	}
1017 	return (0);
1018 }
1019 
1020 int
1021 meta_set_balance(
1022 	mdsetname_t		*sp,
1023 	md_error_t		*ep
1024 )
1025 {
1026 	md_set_desc		*sd;
1027 	md_drive_desc		*dd, *curdd;
1028 	daddr_t			dbsize;
1029 	daddr_t			nblks;
1030 	int			i;
1031 	int			rval = 0;
1032 	sigset_t		oldsigs;
1033 	md_setkey_t		*cl_sk;
1034 	md_error_t		xep = mdnullerror;
1035 	md_mnnode_desc		*nd;
1036 	int			suspend1_flag = 0;
1037 
1038 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1039 		return (-1);
1040 
1041 	dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
1042 
1043 	/* Make sure we own the set */
1044 	if (meta_check_ownership(sp, ep) != 0)
1045 		return (-1);
1046 
1047 	/* END CHECK CODE */
1048 
1049 	/*
1050 	 * Get drive descriptors for the drives that are currently in the set.
1051 	 */
1052 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
1053 
1054 	if (! mdisok(ep))
1055 		return (-1);
1056 
1057 	/* Find the minimum replica size in use is or use the default */
1058 	if ((nblks = meta_db_minreplica(sp, ep)) < 0)
1059 		mdclrerror(ep);
1060 	else
1061 		dbsize = nblks;	/* adjust replica size */
1062 
1063 	/* Make sure we are blocking all signals */
1064 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
1065 		mdclrerror(&xep);
1066 
1067 	/*
1068 	 * Lock the set on current set members.
1069 	 * For MN diskset lock_set and SUSPEND are used to protect against
1070 	 * other meta* commands running on the other nodes.
1071 	 */
1072 	if (MD_MNSET_DESC(sd)) {
1073 		nd = sd->sd_nodelist;
1074 		while (nd) {
1075 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1076 				nd = nd->nd_next;
1077 				continue;
1078 			}
1079 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1080 				rval = -1;
1081 				goto out;
1082 			}
1083 			nd = nd->nd_next;
1084 		}
1085 		/*
1086 		 * Lock out other meta* commands by suspending
1087 		 * class 1 messages across the diskset.
1088 		 */
1089 		nd = sd->sd_nodelist;
1090 		while (nd) {
1091 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1092 				nd = nd->nd_next;
1093 				continue;
1094 			}
1095 			if (clnt_mdcommdctl(nd->nd_nodename,
1096 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1097 			    MD_MSCF_NO_FLAGS, ep)) {
1098 				rval = -1;
1099 				goto out;
1100 			}
1101 			suspend1_flag = 1;
1102 			nd = nd->nd_next;
1103 		}
1104 	} else {
1105 		for (i = 0; i < MD_MAXSIDES; i++) {
1106 			/* Skip empty slots */
1107 			if (sd->sd_nodes[i][0] == '\0') continue;
1108 
1109 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1110 				rval = -1;
1111 				goto out;
1112 			}
1113 		}
1114 	}
1115 
1116 	/* We are not adding or deleting any drives, just balancing */
1117 	dd = NULL;
1118 
1119 	/*
1120 	 * Balance the DB's according to the list of existing drives and the
1121 	 * list of added drives.
1122 	 */
1123 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
1124 		goto out;
1125 
1126 out:
1127 	/*
1128 	 * Unlock diskset by resuming class 1 messages across the diskset.
1129 	 * Just resume all classes so that resume is the same whether
1130 	 * just one class was locked or all classes were locked.
1131 	 */
1132 	if (suspend1_flag) {
1133 		nd = sd->sd_nodelist;
1134 		while (nd) {
1135 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1136 				nd = nd->nd_next;
1137 				continue;
1138 			}
1139 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1140 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1141 				/*
1142 				 * We are here because we failed to resume
1143 				 * rpc.mdcommd.  However we potentially have
1144 				 * an error from the previous call
1145 				 * (meta_db_balance). If the previous call
1146 				 * did fail,  we capture that error and
1147 				 * generate a perror withthe string,
1148 				 * "Unable to resume...".
1149 				 * Setting rval to -1 ensures that in the
1150 				 * next iteration of the loop, ep is not
1151 				 * clobbered.
1152 				 */
1153 				if (rval == 0)
1154 					(void) mdstealerror(ep, &xep);
1155 				else
1156 					mdclrerror(&xep);
1157 				rval = -1;
1158 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1159 				    "Unable to resume rpc.mdcommd."));
1160 			}
1161 			nd = nd->nd_next;
1162 		}
1163 	}
1164 
1165 	/* Unlock the set */
1166 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1167 	if (MD_MNSET_DESC(sd)) {
1168 		nd = sd->sd_nodelist;
1169 		while (nd) {
1170 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1171 				nd = nd->nd_next;
1172 				continue;
1173 			}
1174 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1175 				if (rval == 0)
1176 					(void) mdstealerror(ep, &xep);
1177 				else
1178 					mdclrerror(&xep);
1179 				rval = -1;
1180 			}
1181 			nd = nd->nd_next;
1182 		}
1183 	} else {
1184 		for (i = 0; i < MD_MAXSIDES; i++) {
1185 			/* Skip empty slots */
1186 			if (sd->sd_nodes[i][0] == '\0')
1187 				continue;
1188 
1189 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1190 				if (rval == 0)
1191 					(void) mdstealerror(ep, &xep);
1192 				rval = -1;
1193 			}
1194 		}
1195 	}
1196 
1197 	/* release signals back to what they were on entry */
1198 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1199 		mdclrerror(&xep);
1200 
1201 	cl_set_setkey(NULL);
1202 
1203 	metaflushsetname(sp);
1204 
1205 	return (rval);
1206 }
1207 
1208 int
1209 meta_set_destroy(
1210 	mdsetname_t	*sp,
1211 	int		lock_set,
1212 	md_error_t	*ep
1213 )
1214 {
1215 	int		i;
1216 	med_rec_t	medr;
1217 	md_set_desc	*sd;
1218 	md_drive_desc	*dd, *p, *p1;
1219 	mddrivename_t	*dnp;
1220 	mdname_t	*np;
1221 	mdnamelist_t	*nlp = NULL;
1222 	int		num_users = 0;
1223 	int		has_set;
1224 	side_t		mysideno;
1225 	sigset_t	oldsigs;
1226 	md_error_t	xep = mdnullerror;
1227 	md_setkey_t	*cl_sk;
1228 	int		rval = 0;
1229 	int		delete_end = 1;
1230 
1231 	/* Make sure we are blocking all signals */
1232 	if (procsigs(TRUE, &oldsigs, ep) < 0)
1233 		return (-1);
1234 
1235 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1236 		if (! mdisok(ep))
1237 			rval = -1;
1238 		goto out;
1239 	}
1240 
1241 	/*
1242 	 * meta_set_destroy should not be called for a MN diskset.
1243 	 * This routine destroys a set without communicating this information
1244 	 * to the other nodes which would lead to an inconsistency in
1245 	 * the MN diskset.
1246 	 */
1247 	if (MD_MNSET_DESC(sd)) {
1248 		rval = -1;
1249 		goto out;
1250 	}
1251 
1252 	/* Continue if a traditional diskset */
1253 
1254 	/*
1255 	 * Check to see who has the set.  If we are not the last user of the
1256 	 * set, we will not touch the replicas.
1257 	 */
1258 	for (i = 0; i < MD_MAXSIDES; i++) {
1259 		/* Skip empty slots */
1260 		if (sd->sd_nodes[i][0] == '\0')
1261 			continue;
1262 
1263 		has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
1264 		    ep);
1265 
1266 		if (has_set < 0) {
1267 			mdclrerror(ep);
1268 		} else
1269 			num_users++;
1270 	}
1271 
1272 	if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
1273 		if (! mdisok(ep)) {
1274 			rval = -1;
1275 			goto out;
1276 		}
1277 	}
1278 
1279 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
1280 		rval = -1;
1281 		goto out;
1282 	}
1283 
1284 	if (lock_set == TRUE) {
1285 		/* Lock the set on our side */
1286 		if (clnt_lock_set(mynode(), sp, ep)) {
1287 			rval = -1;
1288 			goto out;
1289 		}
1290 	}
1291 
1292 	/*
1293 	 * A traditional diskset has no diskset stale information to send
1294 	 * since there can only be one owner node at a time.
1295 	 */
1296 	if (snarf_set(sp, FALSE, ep))
1297 		mdclrerror(ep);
1298 
1299 	if (dd != NULL) {
1300 		/*
1301 		 * Make sure that no drives are in use as parts of metadrives
1302 		 * or hot spare pools, this is one of the few error conditions
1303 		 * that will stop this routine, unless the environment has
1304 		 * META_DESTROY_SET_OK set, in which case, the operation will
1305 		 * proceed.
1306 		 */
1307 		if (getenv("META_DESTROY_SET_OK") == NULL) {
1308 			for (p = dd; p != NULL; p = p->dd_next) {
1309 				dnp = p->dd_dnp;
1310 
1311 				i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
1312 				if (i == -1) {
1313 					/* need xep - wire calls clear error */
1314 					i = metaget_setownership(sp, &xep);
1315 					if (i == -1) {
1316 						rval = -1;
1317 						goto out;
1318 					}
1319 
1320 					mysideno = getmyside(sp, &xep);
1321 
1322 					if (mysideno == MD_SIDEWILD) {
1323 						rval = -1;
1324 						goto out;
1325 					}
1326 
1327 					if (sd->sd_isown[mysideno] == FALSE)
1328 						if (halt_set(sp, &xep)) {
1329 							rval = -1;
1330 							goto out;
1331 						}
1332 
1333 					rval = -1;
1334 					goto out;
1335 				}
1336 			}
1337 		}
1338 
1339 		for (i = 0; i < MD_MAXSIDES; i++) {
1340 			/* Skip empty slots */
1341 			if (sd->sd_nodes[i][0] == '\0')
1342 				continue;
1343 
1344 			/* Skip non local nodes */
1345 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1346 				continue;
1347 
1348 			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
1349 				mdclrerror(ep);
1350 		}
1351 
1352 		/*
1353 		 * Go thru each drive and individually delete the replicas.
1354 		 * This way we can ignore individual errors.
1355 		 */
1356 		for (p = dd; p != NULL; p = p->dd_next) {
1357 			uint_t	rep_slice;
1358 
1359 			dnp = p->dd_dnp;
1360 			if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
1361 			    (((np = metaslicename(dnp, rep_slice, ep))
1362 				== NULL) &&
1363 				((np = metaslicename(dnp, MD_SLICE0, ep))
1364 				    == NULL))) {
1365 				rval = -1;
1366 				goto out;
1367 			}
1368 
1369 			if ((np = metaslicename(dnp,
1370 			    rep_slice, ep)) == NULL) {
1371 				if ((np = metaslicename(dnp,
1372 				    MD_SLICE0, ep)) == NULL) {
1373 					rval = -1;
1374 					goto out;
1375 				}
1376 				mdclrerror(ep);
1377 			}
1378 
1379 			/* Yes this is UGLY!!! */
1380 			p1 = p->dd_next;
1381 			p->dd_next = NULL;
1382 			if (rel_own_bydd(sp, p, FALSE, ep))
1383 				mdclrerror(ep);
1384 			p->dd_next = p1;
1385 
1386 			if (p->dd_dbcnt == 0)
1387 				continue;
1388 
1389 			/*
1390 			 * Skip the replica removal if we are not the last user
1391 			 */
1392 			if (num_users != 1)
1393 				continue;
1394 
1395 			nlp = NULL;
1396 			(void) metanamelist_append(&nlp, np);
1397 			if (meta_db_detach(sp, nlp,
1398 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
1399 				mdclrerror(ep);
1400 			metafreenamelist(nlp);
1401 		}
1402 	}
1403 
1404 	if (halt_set(sp, ep)) {
1405 		rval = -1;
1406 		goto out;
1407 	}
1408 
1409 	/* Setup the mediator record */
1410 	(void) memset(&medr, '\0', sizeof (med_rec_t));
1411 	medr.med_rec_mag = MED_REC_MAGIC;
1412 	medr.med_rec_rev = MED_REC_REV;
1413 	medr.med_rec_fl  = 0;
1414 	medr.med_rec_sn  = sp->setno;
1415 	(void) strcpy(medr.med_rec_snm, sp->setname);
1416 	medr.med_rec_meds = sd->sd_med;	/* structure assigment */
1417 	(void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
1418 	medr.med_rec_foff = 0;
1419 
1420 	/*
1421 	 * If we are the last remaining user, then remove the mediator hosts
1422 	 */
1423 	if (num_users == 1) {
1424 		for (i = 0; i < MED_MAX_HOSTS; i++) {
1425 			if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
1426 				SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
1427 				    SVM_TAG_MEDIATOR, sp->setno, i);
1428 			(void) memset(&medr.med_rec_meds.n_lst[i], '\0',
1429 			    sizeof (md_h_t));
1430 		}
1431 		medr.med_rec_meds.n_cnt = 0;
1432 	} else { 	/* Remove this host from the mediator node list. */
1433 		for (i = 0; i < MD_MAXSIDES; i++) {
1434 			/* Skip empty slots */
1435 			if (sd->sd_nodes[i][0] == '\0')
1436 				continue;
1437 
1438 			/* Copy non local node */
1439 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
1440 				(void) strcpy(medr.med_rec_nodes[i],
1441 				    sd->sd_nodes[i]);
1442 				continue;
1443 			}
1444 
1445 			/* Clear local node */
1446 			(void) memset(&medr.med_rec_nodes[i], '\0',
1447 			    sizeof (md_node_nm_t));
1448 		}
1449 	}
1450 
1451 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
1452 
1453 	/*
1454 	 * If the client is part of a cluster put the DCS service
1455 	 * into a deleteing state.
1456 	 */
1457 	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1458 		if (metad_isautotakebyname(sp->setname)) {
1459 			delete_end = 0;
1460 		} else {
1461 			mdclrerror(ep);
1462 			goto out;
1463 		}
1464 	}
1465 
1466 	/* Inform the mediator hosts of the new information */
1467 	for (i = 0; i < MED_MAX_HOSTS; i++) {
1468 		if (sd->sd_med.n_lst[i].a_cnt == 0)
1469 			continue;
1470 
1471 		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
1472 			mdclrerror(ep);
1473 	}
1474 
1475 	/* Delete the set locally */
1476 	for (i = 0; i < MD_MAXSIDES; i++) {
1477 		/* Skip empty slots */
1478 		if (sd->sd_nodes[i][0] == '\0')
1479 			continue;
1480 
1481 		/* Skip non local nodes */
1482 		if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1483 			continue;
1484 
1485 		if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
1486 			mdclrerror(ep);
1487 	}
1488 	if (delete_end &&
1489 	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1490 		rval = -1;
1491 
1492 out:
1493 	/* release signals back to what they were on entry */
1494 	if (procsigs(FALSE, &oldsigs, &xep) < 0) {
1495 		if (rval == 0)
1496 			(void) mdstealerror(ep, &xep);
1497 		rval = -1;
1498 	}
1499 
1500 	if (lock_set == TRUE) {
1501 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1502 		if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1503 			if (rval == 0)
1504 				(void) mdstealerror(ep, &xep);
1505 			rval = -1;
1506 		}
1507 		cl_set_setkey(NULL);
1508 	}
1509 
1510 	metaflushsetname(sp);
1511 	return (rval);
1512 }
1513 
1514 int
1515 meta_set_purge(
1516 	mdsetname_t	*sp,
1517 	int		bypass_cluster,
1518 	int		forceflg,
1519 	md_error_t	*ep
1520 )
1521 {
1522 	char		*thishost = mynode();
1523 	md_set_desc	*sd;
1524 	md_setkey_t	*cl_sk;
1525 	md_error_t	xep = mdnullerror;
1526 	int		rval = 0;
1527 	int		i, num_hosts = 0;
1528 	int		has_set = 0;
1529 	int		max_node = 0;
1530 	int		delete_end = 1;
1531 	md_mnnode_desc	*nd;
1532 
1533 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1534 		/* unable to find set description */
1535 		rval = 1;
1536 		return (rval);
1537 	}
1538 
1539 	if (MD_MNSET_DESC(sd)) {
1540 		/*
1541 		 * Get a count of the hosts in the set and also lock the set
1542 		 * on those hosts that know about it.
1543 		 */
1544 		nd = sd->sd_nodelist;
1545 		while (nd) {
1546 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1547 				nd = nd->nd_next;
1548 				continue;
1549 			}
1550 			has_set = nodehasset(sp, nd->nd_nodename,
1551 				NHS_NST_EQ, ep);
1552 
1553 			/*
1554 			 * The host is not aware of this set (has_set < 0) or
1555 			 * the set does not match (has_set == 0). This check
1556 			 * prevents the code getting confused by an apparent
1557 			 * inconsistancy in the set's state, this is in the
1558 			 * purge code so something is broken in any case and
1559 			 * this is just trying to fix the brokeness.
1560 			 */
1561 			if (has_set <= 0) {
1562 				mdclrerror(ep);
1563 				nd->nd_flags |= MD_MN_NODE_NOSET;
1564 			} else {
1565 				num_hosts++;
1566 				if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1567 					/*
1568 					 * If the force flag is set then
1569 					 * ignore any RPC failures because we
1570 					 * are only really interested with
1571 					 * the set on local node.
1572 					 */
1573 					if (forceflg && mdanyrpcerror(ep)) {
1574 						mdclrerror(ep);
1575 					} else {
1576 						/*
1577 						 * set max_node so that in the
1578 						 * unlock code nodes in the
1579 						 * set that have not been
1580 						 * locked are not unlocked.
1581 						 */
1582 						max_node = nd->nd_nodeid;
1583 						rval = 2;
1584 						goto out1;
1585 					}
1586 				}
1587 
1588 			}
1589 			nd = nd->nd_next;
1590 		}
1591 		max_node = 0;
1592 	} else {
1593 		/*
1594 		 * Get a count of the hosts in the set and also lock the set
1595 		 * on those hosts that know about it.
1596 		 */
1597 		for (i = 0; i < MD_MAXSIDES; i++) {
1598 			/* Skip empty slots */
1599 			if (sd->sd_nodes[i][0] == '\0')
1600 				continue;
1601 
1602 			has_set = nodehasset(sp, sd->sd_nodes[i],
1603 				NHS_NST_EQ, ep);
1604 
1605 			/*
1606 			 * The host is not aware of this set (has_set < 0) or
1607 			 * the set does not match (has_set == 0). This check
1608 			 * prevents the code getting confused by an apparent
1609 			 * inconsistancy in the set's state, this is in the
1610 			 * purge code so something is broken in any case and
1611 			 * this is just trying to fix the brokeness.
1612 			 */
1613 			if (has_set <= 0) {
1614 				mdclrerror(ep);
1615 				/*
1616 				 * set the node to NULL to prevent further
1617 				 * requests to this unresponsive node.
1618 				 */
1619 				sd->sd_nodes[i][0] = '\0';
1620 			} else {
1621 				num_hosts++;
1622 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1623 					/*
1624 					 * If the force flag is set then
1625 					 * ignore any RPC failures because we
1626 					 * are only really interested with
1627 					 * the set on local node.
1628 					 */
1629 					if (forceflg && mdanyrpcerror(ep)) {
1630 						mdclrerror(ep);
1631 					} else {
1632 						rval = 2;
1633 						/*
1634 						 * set max_node so that in the
1635 						 * unlock code nodes in the
1636 						 * set that have not been
1637 						 * locked are not unlocked.
1638 						 */
1639 						max_node = i;
1640 						goto out1;
1641 					}
1642 				}
1643 			}
1644 		}
1645 		max_node = i;	/* now MD_MAXSIDES */
1646 	}
1647 	if (!bypass_cluster) {
1648 		/*
1649 		 * If there is only one host associated with the
1650 		 * set then remove the set from the cluster.
1651 		 */
1652 		if (num_hosts == 1) {
1653 			if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1654 				if (metad_isautotakebyname(sp->setname)) {
1655 					delete_end = 0;
1656 				} else {
1657 					mdclrerror(ep);
1658 					rval = 3;
1659 					goto out1;
1660 				}
1661 			}
1662 		}
1663 	}
1664 
1665 	if (MD_MNSET_DESC(sd)) {
1666 		/*
1667 		 * Get a count of the hosts in the set and also lock the set
1668 		 * on those hosts that know about it.
1669 		 */
1670 		nd = sd->sd_nodelist;
1671 		while (nd) {
1672 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1673 				nd = nd->nd_next;
1674 				continue;
1675 			}
1676 			if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
1677 				/*
1678 				 * Tell the remote node to remove this node
1679 				 */
1680 				if (clnt_delhosts(nd->nd_nodename, sp, 1,
1681 					&thishost, ep) == -1) {
1682 					/*
1683 					 * If we fail to delete ourselves
1684 					 * from the remote host it does not
1685 					 * really matter because the set is
1686 					 * being "purged" from this node. The
1687 					 * set can be purged from the other
1688 					 * node at a later time.
1689 					 */
1690 					mdclrerror(ep);
1691 				}
1692 				nd = nd->nd_next;
1693 				continue;
1694 			}
1695 			/* remove the set from this host */
1696 			if (clnt_delset(nd->nd_nodename, sp, ep) == -1) {
1697 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1698 				if (!bypass_cluster && num_hosts == 1)
1699 					(void) sdssc_delete_end(sp->setname,
1700 					    SDSSC_CLEANUP);
1701 				mdclrerror(ep);
1702 				goto out1;
1703 			}
1704 			nd = nd->nd_next;
1705 		}
1706 	} else {
1707 		for (i = 0; i < MD_MAXSIDES; i++) {
1708 			/* Skip empty slots */
1709 			if (sd->sd_nodes[i][0] == '\0')
1710 				continue;
1711 			if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
1712 				/*
1713 				 * Tell the remote node to remove this node
1714 				 */
1715 				if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
1716 				    &thishost, ep) == -1) {
1717 					/*
1718 					 * If we fail to delete ourselves
1719 					 * from the remote host it does not
1720 					 * really matter because the set is
1721 					 * being "purged" from this node. The
1722 					 * set can be purged from the other
1723 					 * node at a later time.
1724 					 */
1725 					mdclrerror(ep);
1726 				}
1727 				continue;
1728 			}
1729 
1730 			/* remove the set from this host */
1731 			if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
1732 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1733 				if (!bypass_cluster && num_hosts == 1)
1734 					(void) sdssc_delete_end(sp->setname,
1735 					    SDSSC_CLEANUP);
1736 				mdclrerror(ep);
1737 				goto out1;
1738 			}
1739 		}
1740 	}
1741 
1742 	if (!bypass_cluster && num_hosts == 1) {
1743 		if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
1744 		    SDSSC_ERROR) {
1745 			rval = 4;
1746 		}
1747 	}
1748 
1749 out1:
1750 
1751 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1752 
1753 	/*
1754 	 * Remove the set lock on those nodes that had the set locked
1755 	 * max_node will either be MD_MAXSIDES or array index of the last
1756 	 * node contacted (or rather failed to contact) for traditional
1757 	 * diskset.  For a MN diskset, max_node is the node_id of the node
1758 	 * that failed the lock.
1759 	 */
1760 	if (MD_MNSET_DESC(sd)) {
1761 		nd = sd->sd_nodelist;
1762 		while (nd) {
1763 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1764 				nd = nd->nd_next;
1765 				continue;
1766 			}
1767 			if (nd->nd_nodeid == max_node)
1768 				break;
1769 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1770 				if (forceflg && mdanyrpcerror(&xep)) {
1771 					mdclrerror(&xep);
1772 					nd = nd->nd_next;
1773 					continue;
1774 				}
1775 				if (rval == 0)
1776 					(void) mdstealerror(ep, &xep);
1777 				rval = 5;
1778 			}
1779 			nd = nd->nd_next;
1780 		}
1781 	} else {
1782 		for (i = 0; i < max_node; i++) {
1783 			/* Skip empty slots */
1784 			if (sd->sd_nodes[i][0] == '\0')
1785 				continue;
1786 
1787 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1788 				if (forceflg && mdanyrpcerror(&xep)) {
1789 					mdclrerror(&xep);
1790 					continue;
1791 				}
1792 				if (rval == 0)
1793 					(void) mdstealerror(ep, &xep);
1794 				rval = 5;
1795 			}
1796 		}
1797 	}
1798 
1799 	cl_set_setkey(NULL);
1800 
1801 	return (rval);
1802 }
1803 
1804 int
1805 meta_set_query(
1806 	mdsetname_t		*sp,
1807 	mddb_dtag_lst_t		**dtlpp,
1808 	md_error_t		*ep
1809 )
1810 {
1811 	mddb_dtag_get_parm_t	dtgp;
1812 
1813 	(void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
1814 	dtgp.dtgp_setno = sp->setno;
1815 
1816 	/*CONSTCOND*/
1817 	while (1) {
1818 		if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
1819 			if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
1820 			    *dtlpp == NULL)
1821 				return (mdstealerror(ep, &dtgp.dtgp_mde));
1822 			else
1823 				break;
1824 
1825 		/*
1826 		 * Run to the end of the list
1827 		 */
1828 		for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
1829 			/* void */;
1830 
1831 		*dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
1832 
1833 		(void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
1834 		    sizeof (mddb_dtag_t));
1835 
1836 		dtgp.dtgp_dt.dt_id++;
1837 	}
1838 	return (0);
1839 }
1840 
1841 /*
1842  * return drivename get by key
1843  */
1844 mddrivename_t *
1845 metadrivename_withdrkey(
1846 	mdsetname_t	*sp,
1847 	side_t		sideno,
1848 	mdkey_t		key,
1849 	int		flags,
1850 	md_error_t	*ep
1851 )
1852 {
1853 	char		*nm;
1854 	mdname_t	*np;
1855 	mddrivename_t	*dnp;
1856 	ddi_devid_t	devidp;
1857 	md_set_desc	*sd;
1858 
1859 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1860 		return (NULL);
1861 	}
1862 
1863 
1864 	/*
1865 	 * Get the devid associated with the key.
1866 	 *
1867 	 * If a devid was returned, it MUST be valid even in
1868 	 * the case where a device id has been "updated". The
1869 	 * "update" of the device id may have occured due to
1870 	 * a firmware upgrade.
1871 	 */
1872 	if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
1873 	    != NULL) {
1874 		/*
1875 		 * Look for the correct dnp using the devid for comparison.
1876 		 */
1877 		dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
1878 		free(devidp);
1879 		dnp->side_names_key = key;
1880 	} else {
1881 		/*
1882 		 * We didn't get a devid. We'll try for a dnp using the
1883 		 * name. If we have a MN diskset or if the dnp is a did
1884 		 * device, we're done because then we don't have devids.
1885 		 * Otherwise we'll try to set the devid
1886 		 * and get the dnp via devid again.
1887 		 * We also need to clear the ep structure. When the
1888 		 * above call to meta_getdidbykey returned a null, it
1889 		 * also put an error code into ep. In this case, the null
1890 		 * return is actually OK and any errors can be ignored. The
1891 		 * reason it is OK is because this could be a MN set or
1892 		 * we could  be running without devids (ex cluster).
1893 		 */
1894 		mdclrerror(ep);
1895 
1896 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key,
1897 		    ep)) == NULL)
1898 			return (NULL);
1899 		/* get device name */
1900 		if (flags & PRINT_FAST) {
1901 			if ((np = metaname_fast(&sp, nm,
1902 			    LOGICAL_DEVICE, ep)) == NULL) {
1903 				Free(nm);
1904 				return (NULL);
1905 			}
1906 		} else {
1907 			if ((np = metaname(&sp, nm, LOGICAL_DEVICE,
1908 			    ep)) == NULL) {
1909 				Free(nm);
1910 				return (NULL);
1911 			}
1912 		}
1913 		Free(nm);
1914 		/* make sure it's OK */
1915 		if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np,
1916 		    ep) != 0))
1917 			return (NULL);
1918 
1919 		/* get drivename */
1920 		dnp = np->drivenamep;
1921 		dnp->side_names_key = key;
1922 		/*
1923 		 * Skip the devid set/check for the following cases:
1924 		 * 1) If MN diskset, there are no devid's
1925 		 * 2) if dnp is did device
1926 		 * The device id is disabled for did device due to the
1927 		 * lack of minor name support in the did driver. The following
1928 		 * devid code path can set and propagate the error and
1929 		 * eventually prevent did disks from being added to the
1930 		 * diskset under SunCluster systems
1931 		 */
1932 		if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/"))
1933 		    == 0) || (MD_MNSET_DESC(sd)))
1934 			goto out;
1935 
1936 		/*
1937 		 * It is okay if replica is not in devid mode
1938 		 */
1939 		if (mdissyserror(ep, MDDB_F_NODEVID)) {
1940 			mdclrerror(ep);
1941 			goto out;
1942 		}
1943 
1944 		/*
1945 		 * We're not MN or did devices but
1946 		 * devid is missing so this means that we have
1947 		 * just upgraded from a configuration where
1948 		 * devid's were not used so try to add in
1949 		 * the devid and requery. If the devid still isn't there,
1950 		 * that's OK. dnp->devid will be null as it is in any
1951 		 * configuration with no devids.
1952 		 */
1953 		if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key,
1954 		    ep) < 0)
1955 			return (NULL);
1956 		if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
1957 		    sideno+SKEW, key, ep)) != NULL) {
1958 			/*
1959 			 * Found a devid so look for the dnp using the
1960 			 * devid as the search mechanism.
1961 			 */
1962 			dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
1963 			free(devidp);
1964 			dnp->side_names_key = key;
1965 		}
1966 	}
1967 
1968 
1969 
1970 out:
1971 	if (flags & MD_BYPASS_DAEMON)
1972 		return (dnp);
1973 
1974 	if (get_sidenmlist(sp, dnp, ep))
1975 		return (NULL);
1976 
1977 	/* return success */
1978 	return (dnp);
1979 }
1980 
1981 void
1982 metafreedrivedesc(md_drive_desc **dd)
1983 {
1984 	md_drive_desc	*p, *next = NULL;
1985 
1986 	for (p = *dd; p != NULL; p = next) {
1987 		next = p->dd_next;
1988 		Free(p);
1989 	}
1990 	*dd = NULL;
1991 }
1992 
1993 md_drive_desc *
1994 metaget_drivedesc(
1995 	mdsetname_t	*sp,
1996 	int		flags,
1997 	md_error_t	*ep
1998 )
1999 {
2000 	side_t		sideno = MD_SIDEWILD;
2001 
2002 	assert(! (flags & MD_BYPASS_DAEMON));
2003 
2004 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
2005 		return (NULL);
2006 
2007 	return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
2008 }
2009 
2010 md_drive_desc *
2011 metaget_drivedesc_fromnamelist(
2012 	mdsetname_t	*sp,
2013 	mdnamelist_t	*nlp,
2014 	md_error_t	*ep
2015 )
2016 {
2017 	md_set_desc		*sd;
2018 	mdnamelist_t		*p;
2019 	md_drive_desc		*dd = NULL;
2020 
2021 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2022 		return (NULL);
2023 
2024 	for (p = nlp; p != NULL; p = p->next)
2025 		(void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
2026 		    sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
2027 
2028 	return (dd);
2029 }
2030 
2031 md_drive_desc *
2032 metaget_drivedesc_sideno(
2033 	mdsetname_t *sp,
2034 	side_t sideno,
2035 	int flags,
2036 	md_error_t *ep
2037 )
2038 {
2039 	md_set_desc	*sd = NULL;
2040 
2041 	assert(! (flags & MD_BYPASS_DAEMON));
2042 
2043 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2044 		return (NULL);
2045 
2046 	if (sd->sd_drvs)
2047 		return (sd->sd_drvs);
2048 
2049 	if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
2050 		return (NULL);
2051 
2052 	return (sd->sd_drvs);
2053 }
2054 
2055 int
2056 metaget_setownership(
2057 	mdsetname_t	*sp,
2058 	md_error_t	*ep
2059 )
2060 {
2061 	md_set_desc	*sd;
2062 	int		bool;
2063 	int		i;
2064 	md_mnnode_desc	*nd;
2065 
2066 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2067 		return (-1);
2068 
2069 	if (MD_MNSET_DESC(sd)) {
2070 		nd = sd->sd_nodelist;
2071 		while (nd) {
2072 			/* If node isn't alive, can't own diskset */
2073 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2074 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2075 				nd = nd->nd_next;
2076 				continue;
2077 			}
2078 			/*
2079 			 * If can't communicate with rpc.metad, then mark
2080 			 * this node as not an owner.  That node may
2081 			 * in fact, be an owner, but without rpc.metad running
2082 			 * that node can't do much.
2083 			 */
2084 			if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
2085 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2086 			} else if (bool == TRUE) {
2087 				nd->nd_flags |= MD_MN_NODE_OWN;
2088 			} else {
2089 				nd->nd_flags &= ~MD_MN_NODE_OWN;
2090 			}
2091 			nd = nd->nd_next;
2092 		}
2093 		return (0);
2094 	}
2095 
2096 	/* Rest of code handles traditional disksets */
2097 
2098 	for (i = 0; i < MD_MAXSIDES; i++)
2099 		sd->sd_isown[i] = 0;
2100 
2101 	if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
2102 		return (-1);
2103 
2104 	if (bool == TRUE)
2105 		sd->sd_isown[getmyside(sp, ep)] = 1;
2106 
2107 	return (0);
2108 }
2109 
2110 char *
2111 mynode(void)
2112 {
2113 	static struct utsname	myuname;
2114 	static int		done = 0;
2115 
2116 	if (! done) {
2117 		if (uname(&myuname) == -1) {
2118 			md_perror(dgettext(TEXT_DOMAIN, "uname"));
2119 			assert(0);
2120 		}
2121 		done = 1;
2122 	}
2123 	return (myuname.nodename);
2124 }
2125 
/*
 * strinlst
 *  Return TRUE if str compares equal (strcmp) to one of the first cnt
 *  strings in lst, FALSE otherwise.  A cnt of zero or less always
 *  yields FALSE.
 */
int
strinlst(char *str, int cnt, char **lst)
{
	char	**cur = lst;
	int	left = cnt;

	while (left > 0) {
		if (strcmp(*cur, str) == 0)
			return (TRUE);
		cur++;
		left--;
	}

	return (FALSE);
}
2137 
2138 /*
2139  * meta_get_reserved_names
2140  *  returns an mdnamelist_t of reserved slices
2141  *  reserved slices are those that are used but don't necessarily
2142  *  show up as metadevices (ex. reserved slice for db in sets, logs)
2143  */
2144 
2145 /*ARGSUSED*/
2146 int
2147 meta_get_reserved_names(
2148 	mdsetname_t	*sp,
2149 	mdnamelist_t	**nlpp,
2150 	int		options,
2151 	md_error_t	*ep)
2152 {
2153 	int		 count		= 0;
2154 	mdname_t	*np		= NULL;
2155 	mdnamelist_t	*transnlp	= NULL;
2156 	mdnamelist_t	**tailpp 	= nlpp;
2157 	mdnamelist_t	*nlp;
2158 	md_drive_desc	*dd, *di;
2159 
2160 	if (metaislocalset(sp))
2161 		goto out;
2162 
2163 	if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
2164 		count = -1;
2165 		goto out;
2166 	}
2167 
2168 	/* db in for sets on reserved slice */
2169 	for (di = dd; di && count >= 0; di = di->dd_next) {
2170 		uint_t	rep_slice;
2171 
2172 		/*
2173 		 * Add the name struct to the end of the
2174 		 * namelist but keep a pointer to the last
2175 		 * element so that we don't incur the overhead
2176 		 * of traversing the list each time
2177 		 */
2178 		if (di->dd_dnp &&
2179 		    (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
2180 		    (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
2181 		    (tailpp = meta_namelist_append_wrapper(tailpp, np)))
2182 			count++;
2183 		else
2184 			count = -1;
2185 	}
2186 
2187 	/* now find logs */
2188 	if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
2189 		count = -1;
2190 		goto out;
2191 	}
2192 
2193 	for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
2194 		mdname_t	*transnp = nlp->namep;
2195 		md_trans_t	*transp;
2196 
2197 		if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
2198 			count = -1;
2199 			goto out;
2200 		}
2201 		if (transp->lognamep) {
2202 			/*
2203 			 * Add the name struct to the end of the
2204 			 * namelist but keep a pointer to the last
2205 			 * element so that we don't incur the overhead
2206 			 * of traversing the list each time
2207 			 */
2208 			tailpp = meta_namelist_append_wrapper(
2209 			    tailpp, transp->lognamep);
2210 		}
2211 	}
2212 out:
2213 	metafreenamelist(transnlp);
2214 	return (count);
2215 }
2216 
2217 /*
2218  * Entry point to join a node to MultiNode diskset.
2219  *
2220  * Validate host in diskset.
2221  *	- Should be in membership list from API
2222  *	- Should not already be joined into diskset.
2223  *	- Set must have drives
2224  * Assume valid configuration is stored in the set/drive/node records
2225  * in the local mddb since no node or drive can be added to the MNset
2226  * unless all drives and nodes are available.  Reconfig steps will
2227  * resync all ALIVE nodes in case of panic in critical areas.
2228  *
2229  * Lock down the set.
2230  * Verify host is a member of this diskset.
2231  * If drives exist in the configuration, load the mddbs.
2232  * Set this node to active by notifying master if one exists.
2233  * If this is the first node active in the diskset, this node
2234  * 	becomes the master.
2235  * Unlock the set.
2236  *
2237  * Mirror Resync:
2238  * If this node is the last node to join the set and clustering
2239  * isn't running, then start the 'metasync -r' type resync
2240  * on all mirrors in this diskset.
2241  * If clustering is running, this resync operation will
2242  * be handled by the reconfig steps and should NOT
2243  * be handled during a join operation.
2244  *
2245  * There are multiple return values in order to assist
2246  * the join operation of all sets in the metaset command.
2247  *
2248  * Return values:
2249  *	0  - Node successfully joined to set.
2250  *	-1 - Join attempted but failed
2251  *		- any failure from libmeta calls
2252  *		- node not in the member list
2253  *	-2 - Join not attempted since
2254  *		- this set had no drives in set
2255  *		- this node already joined to set
2256  *		- set is not a multinode set
2257  *	-3 - Node joined to STALE set.
2258  */
2259 extern int
2260 meta_set_join(
2261 	mdsetname_t	*sp,
2262 	md_error_t	*ep
2263 )
2264 {
2265 	md_set_desc		*sd;
2266 	md_drive_desc		*dd;
2267 	md_mnnode_desc		*nd, *nd2, my_nd;
2268 	int			rval = 0;
2269 	md_setkey_t		*cl_sk;
2270 	md_error_t		xep = mdnullerror;
2271 	md_error_t		ep_snarf = mdnullerror;
2272 	int			master_flag = 0;
2273 	md_mnset_record		*mas_mnsr = NULL;
2274 	int			clear_nr_flags = 0;
2275 	md_mnnode_record	*nr;
2276 	int			stale_set = 0;
2277 	int			rb_flags = 0;
2278 	int			stale_bool = FALSE;
2279 	int			suspendall_flag = 0;
2280 	int			suspend1_flag = 0;
2281 	sigset_t		oldsigs;
2282 	int			send_reinit = 0;
2283 
2284 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2285 		return (-1);
2286 	}
2287 
2288 	/* Must be a multinode diskset */
2289 	if (!MD_MNSET_DESC(sd)) {
2290 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2291 		return (-2);
2292 	}
2293 
2294 	/* Verify that the node is ALIVE (i.e. is in the API membership list) */
2295 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
2296 		(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
2297 			sd->sd_mn_mynode->nd_nodename, NULL,
2298 			sp->setname);
2299 		return (-1);
2300 	}
2301 
2302 	/* Make sure we are blocking all signals */
2303 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2304 		mdclrerror(&xep);
2305 
2306 	/*
2307 	 * Lock the set on current set members.
2308 	 * For MN diskset lock_set and SUSPEND are used to protect against
2309 	 * other meta* commands running on the other nodes.
2310 	 */
2311 	nd = sd->sd_nodelist;
2312 	while (nd) {
2313 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2314 			nd = nd->nd_next;
2315 			continue;
2316 		}
2317 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2318 			rval = -1;
2319 			goto out;
2320 		}
2321 		nd = nd->nd_next;
2322 	}
2323 
2324 	/*
2325 	 * Lock out other meta* commands by suspending
2326 	 * class 1 messages across the diskset.
2327 	 */
2328 	nd = sd->sd_nodelist;
2329 	while (nd) {
2330 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2331 			nd = nd->nd_next;
2332 			continue;
2333 		}
2334 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2335 			    sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2336 			rval = -1;
2337 			goto out;
2338 		}
2339 		suspend1_flag = 1;
2340 		nd = nd->nd_next;
2341 	}
2342 
2343 	/*
2344 	 * Verify that this host is a member (in the host list) of the set.
2345 	 */
2346 	nd = sd->sd_nodelist;
2347 	while (nd) {
2348 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2349 			break;
2350 		}
2351 		nd = nd->nd_next;
2352 	}
2353 	if (!nd) {
2354 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2355 			sd->sd_mn_mynode->nd_nodename, NULL,
2356 			sp->setname);
2357 		rval = -1;
2358 		goto out;
2359 	}
2360 
2361 	/*
2362 	 * Need to return failure if host is already 'joined'
2363 	 * into the set.  This is done so that if later the user
2364 	 * issues a command to join all sets and a failure is
2365 	 * encountered - that the resulting cleanup effort
2366 	 * (withdrawing from all sets that were joined
2367 	 * during that command) won't withdraw from this set.
2368 	 */
2369 	if (nd->nd_flags & MD_MN_NODE_OWN) {
2370 		rval = -2;
2371 		goto out2;
2372 	}
2373 
2374 	/*
2375 	 * Call metaget_setownership that calls each node in diskset and
2376 	 * marks in set descriptor if node is an owner of the set or not.
2377 	 * metaget_setownership checks to see if a node is an owner by
2378 	 * checking to see if that node's kernel has the mddb loaded.
2379 	 * If a node had panic'd during a reconfig or an
2380 	 * add/delete/join/withdraw operation, the other nodes' node
2381 	 * records may not reflect the current state of the diskset,
2382 	 * so calling metaget_setownership is the safest thing to do.
2383 	 */
2384 	if (metaget_setownership(sp, ep) == -1) {
2385 		rval = -1;
2386 		goto out;
2387 	}
2388 
2389 	/* If first active member of diskset, become the master. */
2390 	nd = sd->sd_nodelist;
2391 	while (nd) {
2392 		if (nd->nd_flags & MD_MN_NODE_OWN)
2393 			break;
2394 		nd = nd->nd_next;
2395 	}
2396 	if (nd == NULL)
2397 		master_flag = 1;
2398 
2399 	/*
2400 	 * If not first active member of diskset, then get the
2401 	 * master information from a node that is already joined
2402 	 * and set the master information for this node.  Be sure
2403 	 * that this node (the already joined node) has its own
2404 	 * join flag set.  If not, then this diskset isn't currently
2405 	 * consistent and shouldn't allow a node to join.  This diskset
2406 	 * inconsistency should only occur when a node has panic'd in
2407 	 * the set while doing a metaset operation and the sysadmin is
2408 	 * attempting to join a node into the set.  This inconsistency
2409 	 * will be fixed during a reconfig cycle which should be occurring
2410 	 * soon since a node panic'd.
2411 	 *
2412 	 * If unable to get this information from an owning node, then
2413 	 * this diskset isn't currently consistent and shouldn't
2414 	 * allow a node to join.
2415 	 */
2416 	if (!master_flag) {
2417 		/* get master information from an owner (joined) node */
2418 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
2419 		    sp->setno, &mas_mnsr, ep) == -1) {
2420 			rval = -1;
2421 			goto out;
2422 		}
2423 
2424 		/* Verify that owner (joined) node has its own JOIN flag set */
2425 		nr = mas_mnsr->sr_nodechain;
2426 		while (nr) {
2427 			if ((nd->nd_nodeid == nr->nr_nodeid) &&
2428 			    ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
2429 				(void) mddserror(ep, MDE_DS_NODENOSET,
2430 				    sp->setno, nd->nd_nodename, NULL,
2431 				    nd->nd_nodename);
2432 				free_sr((md_set_record *)mas_mnsr);
2433 				rval = -1;
2434 				goto out;
2435 			}
2436 			nr = nr->nr_next;
2437 		}
2438 
2439 		/*
2440 		 * Does master have set marked as STALE?
2441 		 * If so, need to pass this down to kernel when
2442 		 * this node snarfs the set.
2443 		 */
2444 		if (clnt_mn_is_stale(nd->nd_nodename, sp,
2445 		    &stale_bool, ep) == -1) {
2446 			rval = -1;
2447 			goto out;
2448 		}
2449 
2450 		/* set master information in my rpc.metad's set record */
2451 		if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
2452 		    mas_mnsr->sr_master_nodeid, ep)) {
2453 			free_sr((md_set_record *)mas_mnsr);
2454 			rval = -1;
2455 			goto out;
2456 		}
2457 
2458 		/* set master information in my cached set desc */
2459 		(void) strcpy(sd->sd_mn_master_nodenm,
2460 		    mas_mnsr->sr_master_nodenm);
2461 		sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
2462 		nd2 = sd->sd_nodelist;
2463 		while (nd2) {
2464 		    if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
2465 			sd->sd_mn_masternode = nd2;
2466 			break;
2467 		    }
2468 		    nd2 = nd2->nd_next;
2469 		}
2470 		free_sr((md_set_record *)mas_mnsr);
2471 
2472 		/*
2473 		 * Set the node flags in mynode's rpc.metad node records for
2474 		 * the nodes that are in the diskset.  Can use my sd
2475 		 * since earlier call to metaget_setownership set the
2476 		 * owner flags based on whether that node had snarfed
2477 		 * the MN diskset mddb.  Reconfig steps guarantee that
2478 		 * return of metaget_setownership will match the owning
2479 		 * node's owner list except in the case where a node
2480 		 * has just panic'd and in this case, a reconfig will
2481 		 * be starting immediately and the owner lists will
2482 		 * be sync'd up by the reconfig.
2483 		 *
2484 		 * Flag of SET means to take no action except to
2485 		 * set the node flags as given in the nodelist linked list.
2486 		 */
2487 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
2488 		    MD_NR_SET, NULL, ep)) {
2489 			rval = -1;
2490 			goto out;
2491 		}
2492 	}
2493 
2494 	/*
2495 	 * Read in the mddb if there are drives in the set.
2496 	 */
2497 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2498 	    ep)) == NULL) {
2499 		/* No drives in list */
2500 		if (! mdisok(ep)) {
2501 			rval = -1;
2502 			goto out;
2503 		}
2504 		rval = -2;
2505 		goto out;
2506 	}
2507 
2508 	/*
2509 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2510 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2511 	 * then change the nodelist followed by a reinit and resume.
2512 	 */
2513 	nd = sd->sd_nodelist;
2514 	while (nd) {
2515 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2516 			nd = nd->nd_next;
2517 			continue;
2518 		}
2519 
2520 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
2521 		    MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2522 			rval = -1;
2523 			goto out;
2524 		}
2525 		suspendall_flag = 1;
2526 		nd = nd->nd_next;
2527 	}
2528 
2529 	/* Set master in my set record in rpc.metad */
2530 	if (master_flag) {
2531 		if (clnt_mnsetmaster(mynode(), sp,
2532 		    sd->sd_mn_mynode->nd_nodename,
2533 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
2534 			rval = -1;
2535 			goto out;
2536 		}
2537 	}
2538 	/*
2539 	 * Causes mddbs to be loaded into the kernel.
2540 	 * Set the force flag so that replica locations can be
2541 	 * loaded into the kernel even if a mediator node was
2542 	 * unavailable.  This allows a node to join an MO
2543 	 * diskset when there are sufficient replicas available,
2544 	 * but a mediator node in unavailable.
2545 	 */
2546 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
2547 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2548 		    "Host not able to start diskset."));
2549 		rval = -1;
2550 		goto out;
2551 	}
2552 
2553 	if (! mdisok(ep)) {
2554 		rval = -1;
2555 		goto out;
2556 	}
2557 
2558 	/*
2559 	 * Set rollback flags to 1 so that halt_set is called if a failure
2560 	 * is seen after this point.  If snarf_set fails, still need to
2561 	 * call halt_set to cleanup the diskset.
2562 	 */
2563 	rb_flags = 1;
2564 
2565 	/* Starts the set */
2566 	if (snarf_set(sp, stale_bool, ep) != 0) {
2567 		if (mdismddberror(ep, MDE_DB_STALE)) {
2568 			/*
2569 			 * Don't fail join, STALE means that set has
2570 			 * < 50% mddbs.
2571 			 */
2572 			(void) mdstealerror(&ep_snarf, ep);
2573 			stale_set = 1;
2574 		} else if (mdisok(ep)) {
2575 			/* If snarf failed, but no error was set - set it */
2576 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2577 			    sp->setno, 0, NULL);
2578 				rval = -1;
2579 				goto out;
2580 		} else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
2581 			/*
2582 			 * Don't fail join if ACCOK; ACCOK means that mediator
2583 			 * provided extra vote.
2584 			 */
2585 			rval = -1;
2586 			goto out;
2587 		}
2588 	}
2589 
2590 	/* Did set really get snarfed? */
2591 	if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
2592 		if (mdisok(ep)) {
2593 			/* If snarf failed, but no error was set - set it */
2594 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2595 				sp->setno, 0, NULL);
2596 		}
2597 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2598 		    "Host not able to start diskset."));
2599 		rval = -1;
2600 		goto out;
2601 	}
2602 
2603 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2604 	send_reinit = 1;
2605 
2606 	/* If first node to enter set, setup master and clear change log */
2607 	if (master_flag) {
2608 		/* Set master in my locally cached set descriptor */
2609 		(void) strcpy(sd->sd_mn_master_nodenm,
2610 		    sd->sd_mn_mynode->nd_nodename);
2611 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
2612 		sd->sd_mn_am_i_master = 1;
2613 
2614 		/*
2615 		 * If first node to join set, then clear out change log
2616 		 * entries.  Change log entries are only needed when a
2617 		 * change of master is occurring in a diskset that has
2618 		 * multiple owners.   Since this node is the first owner
2619 		 * of the diskset, clear the entries.
2620 		 *
2621 		 * Only do this if we are in a single node non-SC3.x
2622 		 * situation.
2623 		 */
2624 		if (meta_mn_singlenode() &&
2625 			mdmn_reset_changelog(sp, ep,  MDMN_CLF_RESETLOG) != 0) {
2626 			mde_perror(ep, dgettext(TEXT_DOMAIN,
2627 			    "Unable to reset changelog."));
2628 			rval = -1;
2629 			goto out;
2630 		}
2631 	}
2632 
2633 	/* Set my locally cached flag */
2634 	sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
2635 
2636 	/*
2637 	 * Set this node's own flag on all joined nodes in the set
2638 	 * (including my node).
2639 	 */
2640 	clear_nr_flags = 1;
2641 
2642 	my_nd = *(sd->sd_mn_mynode);
2643 	my_nd.nd_next = NULL;
2644 	nd = sd->sd_nodelist;
2645 	while (nd) {
2646 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2647 			nd = nd->nd_next;
2648 			continue;
2649 		}
2650 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2651 		    MD_NR_JOIN, NULL, ep)) {
2652 			rval = -1;
2653 			goto out;
2654 		}
2655 		nd = nd->nd_next;
2656 	}
2657 
2658 out:
2659 	if (rval != NULL) {
2660 		/*
2661 		 * If rollback flag is 1, then node was joined to set.
2662 		 * Since an error occurred, withdraw node from set in
2663 		 * order to rollback to before command was run.
2664 		 * Need to preserve ep so that calling function can
2665 		 * get error information.
2666 		 */
2667 		if (rb_flags == 1) {
2668 			if (halt_set(sp, &xep)) {
2669 				mdclrerror(&xep);
2670 			}
2671 		}
2672 
2673 		/*
2674 		 * If error, reset master to INVALID.
2675 		 * Ignore error since (next) first node to successfully join
2676 		 * will set master on all nodes.
2677 		 */
2678 		(void) clnt_mnsetmaster(mynode(), sp, "",
2679 			MD_MN_INVALID_NID, &xep);
2680 		mdclrerror(&xep);
2681 		/* Reset master in my locally cached set descriptor */
2682 		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
2683 		sd->sd_mn_am_i_master = 0;
2684 
2685 		/*
2686 		 * If nr flags set on other nodes, reset them.
2687 		 */
2688 		if (clear_nr_flags) {
2689 			nd = sd->sd_nodelist;
2690 			while (nd) {
2691 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2692 					nd = nd->nd_next;
2693 					continue;
2694 				}
2695 				(void) clnt_upd_nr_flags(nd->nd_nodename, sp,
2696 					&my_nd, MD_NR_WITHDRAW, NULL, &xep);
2697 				mdclrerror(&xep);
2698 				nd = nd->nd_next;
2699 			}
2700 			/* Reset my locally cached flag */
2701 			sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
2702 		}
2703 	}
2704 
2705 	/*
2706 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2707 	 * Send reinit command to mdcommd which forces it to get
2708 	 * fresh set description.
2709 	 */
2710 	if (send_reinit) {
2711 		/* Send reinit */
2712 		nd = sd->sd_nodelist;
2713 		while (nd) {
2714 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2715 				nd = nd->nd_next;
2716 				continue;
2717 			}
2718 
2719 			/* Class is ignored for REINIT */
2720 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2721 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2722 				/*
2723 				 * We are here because we failed to resume
2724 				 * rpc.mdcommd.  However we potentially have
2725 				 * an error from the previous call
2726 				 * If the previous call did fail,  we capture
2727 				 * that error and generate a perror with
2728 				 * the string, "Unable to resume...".
2729 				 * Setting rval to -1 ensures that in the
2730 				 * next iteration of the loop, ep is not
2731 				 * clobbered.
2732 				 */
2733 				if (rval == 0)
2734 					(void) mdstealerror(ep, &xep);
2735 				else
2736 					mdclrerror(&xep);
2737 				rval = -1;
2738 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2739 				    "Unable to reinit rpc.mdcommd."));
2740 			}
2741 			nd = nd->nd_next;
2742 		}
2743 
2744 	}
2745 
2746 out2:
2747 	/*
2748 	 * Unlock diskset by resuming messages across the diskset.
2749 	 * Just resume all classes so that resume is the same whether
2750 	 * just one class was locked or all classes were locked.
2751 	 */
2752 	if ((suspend1_flag) || (suspendall_flag)) {
2753 		nd = sd->sd_nodelist;
2754 		while (nd) {
2755 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2756 				nd = nd->nd_next;
2757 				continue;
2758 			}
2759 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2760 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2761 				/*
2762 				 * We are here because we failed to resume
2763 				 * rpc.mdcommd.  However we potentially have
2764 				 * an error from the previous call
2765 				 * If the previous call did fail,  we capture
2766 				 * that error and generate a perror with
2767 				 * the string, "Unable to resume...".
2768 				 * Setting rval to -1 ensures that in the
2769 				 * next iteration of the loop, ep is not
2770 				 * clobbered.
2771 				 */
2772 				if (rval == 0)
2773 					(void) mdstealerror(ep, &xep);
2774 				else
2775 					mdclrerror(&xep);
2776 				rval = -1;
2777 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2778 				    "Unable to resume rpc.mdcommd."));
2779 			}
2780 			nd = nd->nd_next;
2781 		}
2782 		meta_ping_mnset(sp->setno);
2783 	}
2784 
2785 	/*
2786 	 * Unlock set.  This flushes the caches on the servers.
2787 	 */
2788 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2789 	nd = sd->sd_nodelist;
2790 	while (nd) {
2791 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2792 			nd = nd->nd_next;
2793 			continue;
2794 		}
2795 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2796 			if (rval == 0)
2797 				(void) mdstealerror(ep, &xep);
2798 			else
2799 				mdclrerror(&xep);
2800 			rval = -1;
2801 		}
2802 		nd = nd->nd_next;
2803 	}
2804 
2805 	/*
2806 	 * If this node is the last to join the diskset and clustering isn't
2807 	 * running, then resync the mirrors in the diskset. We have to wait
2808 	 * until all nodes are joined so that the status gets propagated to
2809 	 * all of the members of the set.
2810 	 * Ignore any error from the resync as the join function shouldn't fail
2811 	 * because the mirror resync had a problem.
2812 	 *
2813 	 * Don't start resync if set is stale.
2814 	 */
2815 	if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
2816 	    (stale_set != 1)) {
2817 		nd = sd->sd_nodelist;
2818 		while (nd) {
2819 			if (!(nd->nd_flags & MD_MN_NODE_OWN))
2820 				break;
2821 			nd = nd->nd_next;
2822 		}
2823 		/*
2824 		 * nd set to NULL means that we have no nodes in the set that
2825 		 * haven't joined. In this case we start the resync.
2826 		 */
2827 		if (nd == NULL) {
2828 			(void) meta_mirror_resync_all(sp, 0, &xep);
2829 			mdclrerror(&xep);
2830 		}
2831 	}
2832 
2833 	/* Update ABR state for all soft partitions */
2834 	(void) meta_sp_update_abr(sp, &xep);
2835 	mdclrerror(&xep);
2836 
2837 	/*
2838 	 * call metaflushsetnames to reset local cache for master and
2839 	 * node information.
2840 	 */
2841 	metaflushsetname(sp);
2842 
2843 	/* release signals back to what they were on entry */
2844 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2845 		mdclrerror(&xep);
2846 
2847 	/*
2848 	 * If no error and stale_set is set, then set ep back
2849 	 * to ep from snarf_set call and return -3.  If another error
2850 	 * occurred and rval is not 0, then that error would have
2851 	 * caused the node to be withdrawn from the set and would
2852 	 * have set ep to that error information.
2853 	 */
2854 	if ((rval == 0) && (stale_set)) {
2855 		(void) mdstealerror(ep, &ep_snarf);
2856 		return (-3);
2857 	}
2858 
2859 	return (rval);
2860 }
2861 
2862 /*
2863  * Entry point to withdraw a node from MultiNode diskset.
2864  *
2865  * Validate host in diskset.
2866  *	- Should be joined into diskset.
2867  * Assume valid configuration is stored in the set/drive/node records
2868  * in the local mddb since no node or drive can be added to the MNset
2869  * unless all drives and nodes are available.  Reconfig steps will
2870  * resync all ALIVE nodes in case of panic in critical areas.
2871  *
2872  * Lock down the set.
2873  * Verify that drives exist in configuration.
2874  * Verify host is a member of this diskset.
2875  * Verify host is an owner of the diskset (host is joined to diskset).
2876  * Only allow withdrawal of master node if master node is the only joined
2877  * in the diskset.
2878  * Halt the diskset on this node.
2879  * Reset Master on this node.
2880  * Updated node flags that this node with withdrawn.
2881  * Unlock the set.
2882  *
2883  * Return values:
2884  *	0  - Node successfully withdrew from set.
2885  *	-1 - Withdrawal attempted but failed
2886  *		- any failure from libmeta calls
2887  *		- node not in the member list
2888  *	-2 - Withdrawal not attempted since
2889  *		- this set had no drives in set
2890  *		- this node not joined to set
2891  *		- set is not a multinode set
2892  */
2893 extern int
2894 meta_set_withdraw(
2895 	mdsetname_t	*sp,
2896 	md_error_t	*ep
2897 )
2898 {
2899 	md_set_desc		*sd;
2900 	md_drive_desc		*dd = 0;
2901 	md_mnnode_desc		*nd, my_nd;
2902 	int			rval = 0;
2903 	md_setkey_t		*cl_sk;
2904 	md_error_t		xep = mdnullerror;
2905 	int			set_halted = 0;
2906 	int			suspendall_flag = 0;
2907 	int			suspend1_flag = 0;
2908 	bool_t			stale_bool = FALSE;
2909 	mddb_config_t		c;
2910 	int			node_id_list[1];
2911 	sigset_t		oldsigs;
2912 	int			send_reinit = 0;
2913 
2914 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2915 		return (-1);
2916 	}
2917 
2918 	/* Must be a multinode diskset */
2919 	if (!MD_MNSET_DESC(sd)) {
2920 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2921 		return (-1);
2922 	}
2923 
2924 	/* Make sure we are blocking all signals */
2925 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2926 		mdclrerror(&xep);
2927 
2928 	/*
2929 	 * Lock the set on current set members.
2930 	 * For MN diskset lock_set and SUSPEND are used to protect against
2931 	 * other meta* commands running on the other nodes.
2932 	 */
2933 	nd = sd->sd_nodelist;
2934 	while (nd) {
2935 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2936 			nd = nd->nd_next;
2937 			continue;
2938 		}
2939 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2940 			rval = -1;
2941 			goto out;
2942 		}
2943 		nd = nd->nd_next;
2944 	}
2945 	/*
2946 	 * Lock out other meta* commands by suspending
2947 	 * class 1 messages across the diskset.
2948 	 */
2949 	nd = sd->sd_nodelist;
2950 	while (nd) {
2951 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2952 			nd = nd->nd_next;
2953 			continue;
2954 		}
2955 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2956 			sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2957 			rval = -1;
2958 			goto out;
2959 		}
2960 		suspend1_flag = 1;
2961 		nd = nd->nd_next;
2962 	}
2963 
2964 	/* Get list of drives - needed in case of failure */
2965 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2966 	    ep)) == NULL) {
2967 		/* Error getting drives in list */
2968 		if (! mdisok(ep)) {
2969 			rval = -1;
2970 			goto out2;
2971 		}
2972 		/* no drives in list */
2973 		rval = -2;
2974 		goto out2;
2975 	}
2976 
2977 	/*
2978 	 * Verify that this host is a member (in the host list) of the set.
2979 	 */
2980 	nd = sd->sd_nodelist;
2981 	while (nd) {
2982 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2983 			break;
2984 		}
2985 		nd = nd->nd_next;
2986 	}
2987 	if (!nd) {
2988 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2989 			sd->sd_mn_mynode->nd_nodename, NULL,
2990 			sp->setname);
2991 		rval = -1;
2992 		goto out2;
2993 	}
2994 
2995 	/*
2996 	 * Call metaget_setownership that calls each node in diskset and
2997 	 * marks in set descriptor if node is an owner of the set or not.
2998 	 * metaget_setownership checks to see if a node is an owner by
2999 	 * checking to see if that node's kernel has the mddb loaded.
3000 	 * If a node had panic'd during a reconfig or an
3001 	 * add/delete/join/withdraw operation, the other nodes' node
3002 	 * records may not reflect the current state of the diskset,
3003 	 * so calling metaget_setownership is the safest thing to do.
3004 	 */
3005 	if (metaget_setownership(sp, ep) == -1) {
3006 		rval = -1;
3007 		goto out2;
3008 	}
3009 
3010 	/*
3011 	 * Verify that this node is joined
3012 	 * to diskset (i.e. is an owner of the diskset).
3013 	 */
3014 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
3015 		rval = -2;
3016 		goto out2;
3017 	}
3018 
3019 	/*
3020 	 * For a MN diskset, only withdraw master if it is
3021 	 * the only joined node.
3022 	 */
3023 	if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
3024 		nd = sd->sd_nodelist;
3025 		while (nd) {
3026 			/* Skip my node since checking for other owners */
3027 			if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
3028 				nd = nd->nd_next;
3029 				continue;
3030 			}
3031 			/* If another owner node if found, error */
3032 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3033 				(void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
3034 					sp->setno,
3035 					sd->sd_mn_mynode->nd_nodename, NULL,
3036 					sp->setname);
3037 				rval = -1;
3038 				goto out2;
3039 			}
3040 			nd = nd->nd_next;
3041 		}
3042 	}
3043 
3044 	/*
3045 	 * Is current set STALE?
3046 	 */
3047 	(void) memset(&c, 0, sizeof (c));
3048 	c.c_id = 0;
3049 	c.c_setno = sp->setno;
3050 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
3051 		(void) mdstealerror(ep, &c.c_mde);
3052 		rval = -1;
3053 		goto out;
3054 	}
3055 	if (c.c_flags & MDDB_C_STALE) {
3056 		stale_bool = TRUE;
3057 	}
3058 
3059 	/*
3060 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3061 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
3062 	 * then change the nodelist followed by a reinit and resume.
3063 	 */
3064 	nd = sd->sd_nodelist;
3065 	while (nd) {
3066 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3067 			nd = nd->nd_next;
3068 			continue;
3069 		}
3070 
3071 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
3072 		    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
3073 			rval = -1;
3074 			goto out;
3075 		}
3076 		suspendall_flag = 1;
3077 		nd = nd->nd_next;
3078 	}
3079 
3080 	/*
3081 	 * Withdraw the set - halt set.
3082 	 * This will fail if any I/O is occuring to any metadevice which
3083 	 * includes a resync to a mirror metadevice.
3084 	 */
3085 	set_halted = 1;
3086 	if (halt_set(sp, ep)) {
3087 		/* Was set actually halted? */
3088 		if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
3089 			set_halted = 0;
3090 		}
3091 		rval = -1;
3092 		goto out;
3093 	}
3094 
3095 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
3096 	send_reinit = 1;
3097 
3098 	/* Reset master on withdrawn node */
3099 	if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
3100 	    MD_MN_INVALID_NID, ep)) {
3101 		rval = -1;
3102 		goto out;
3103 	}
3104 
3105 	/* Mark my node as withdrawn and send to other nodes */
3106 	nd = sd->sd_nodelist;
3107 	my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3108 	my_nd.nd_next = NULL;
3109 	while (nd) {
3110 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3111 			nd = nd->nd_next;
3112 			continue;
3113 		}
3114 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3115 		    MD_NR_WITHDRAW, NULL, ep)) {
3116 			rval = -1;
3117 			goto out;
3118 		}
3119 		nd = nd->nd_next;
3120 	}
3121 
3122 	/*
3123 	 * If withdrawn node is a mirror owner, reset mirror owner
3124 	 * to NULL.  If an error occurs, print a warning and continue.
3125 	 * Don't fail metaset because of mirror owner reset problem since
3126 	 * next node to grab mirror will resolve this issue.
3127 	 * Before next node grabs mirrors, metaset will show the withdrawn
3128 	 * node as owner which is why an attempt to reset the mirror owner
3129 	 * is made.
3130 	 */
3131 	node_id_list[0] = sd->sd_mn_mynode->nd_nodeid;	/* Setup my nodeid */
3132 	nd = sd->sd_nodelist;
3133 	while (nd) {
3134 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3135 			nd = nd->nd_next;
3136 			continue;
3137 		}
3138 		if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
3139 		    1, &node_id_list[0], &xep) == 01) {
3140 			mde_perror(&xep, dgettext(TEXT_DOMAIN,
3141 			    "Unable to reset mirror owner on node %s"),
3142 			    nd->nd_nodename);
3143 			mdclrerror(&xep);
3144 		}
3145 		nd = nd->nd_next;
3146 	}
3147 
3148 out:
3149 	if (rval == -1) {
3150 		/* Rejoin node - Mark node as joined and send to other nodes */
3151 		nd = sd->sd_nodelist;
3152 		my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3153 		my_nd.nd_next = NULL;
3154 		while (nd) {
3155 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3156 				nd = nd->nd_next;
3157 				continue;
3158 			}
3159 			if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3160 			    MD_NR_JOIN, NULL, &xep)) {
3161 				mdclrerror(&xep);
3162 			}
3163 			nd = nd->nd_next;
3164 		}
3165 
3166 		/* Set master on withdrawn node */
3167 		if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
3168 		    sd->sd_mn_master_nodenm,
3169 		    sd->sd_mn_master_nodeid, &xep)) {
3170 			mdclrerror(&xep);
3171 		}
3172 
3173 		/* Join set if halt_set had succeeded */
3174 		if (set_halted) {
3175 			/*
3176 			 * Causes mddbs to be loaded into the kernel.
3177 			 * Set the force flag so that replica locations can be
3178 			 * loaded into the kernel even if a mediator node was
3179 			 * unavailable.  This allows a node to join an MO
3180 			 * diskset when there are sufficient replicas available,
3181 			 * but a mediator node in unavailable.
3182 			 */
3183 			if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) {
3184 				mdclrerror(&xep);
3185 			}
3186 			/* If set previously stale - make it so at re-join */
3187 			if (snarf_set(sp, stale_bool, &xep) != 0) {
3188 				mdclrerror(&xep);
3189 				(void) halt_set(sp, &xep);
3190 				mdclrerror(&xep);
3191 			}
3192 		}
3193 	}
3194 
3195 	/*
3196 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3197 	 * Send reinit command to mdcommd which forces it to get
3198 	 * fresh set description.
3199 	 */
3200 	if (send_reinit) {
3201 		/* Send reinit */
3202 		nd = sd->sd_nodelist;
3203 		while (nd) {
3204 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3205 				nd = nd->nd_next;
3206 				continue;
3207 			}
3208 
3209 			/* Class is ignored for REINIT */
3210 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3211 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3212 				/*
3213 				 * We are here because we failed to resume
3214 				 * rpc.mdcommd.  However we potentially have
3215 				 * an error from the previous call.
3216 				 * If the previous call did fail,  we
3217 				 * capture that error and generate a perror
3218 				 * withthe string,  "Unable to resume...".
3219 				 * Setting rval to -1 ensures that in the
3220 				 * next iteration of the loop, ep is not
3221 				 * clobbered.
3222 				 */
3223 				if (rval == 0)
3224 					(void) mdstealerror(ep, &xep);
3225 				else
3226 					mdclrerror(&xep);
3227 				rval = -1;
3228 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3229 				    "Unable to reinit rpc.mdcommd."));
3230 			}
3231 			nd = nd->nd_next;
3232 		}
3233 	}
3234 
3235 out2:
3236 	/*
3237 	 * Unlock diskset by resuming messages across the diskset.
3238 	 * Just resume all classes so that resume is the same whether
3239 	 * just one class was locked or all classes were locked.
3240 	 */
3241 	if ((suspend1_flag) || (suspendall_flag)) {
3242 		nd = sd->sd_nodelist;
3243 		while (nd) {
3244 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3245 				nd = nd->nd_next;
3246 				continue;
3247 			}
3248 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3249 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
3250 				/*
3251 				 * We are here because we failed to resume
3252 				 * rpc.mdcommd.  However we potentially have
3253 				 * an error from the previous call
3254 				 * If the previous call did fail,  we capture
3255 				 * that error and generate a perror with
3256 				 * the string, "Unable to resume...".
3257 				 * Setting rval to -1 ensures that in the
3258 				 * next iteration of the loop, ep is not
3259 				 * clobbered.
3260 				 */
3261 				if (rval == 0)
3262 					(void) mdstealerror(ep, &xep);
3263 				else
3264 					mdclrerror(&xep);
3265 				rval = -1;
3266 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3267 				    "Unable to resume rpc.mdcommd."));
3268 			}
3269 			nd = nd->nd_next;
3270 		}
3271 		meta_ping_mnset(sp->setno);
3272 	}
3273 
3274 	/*
3275 	 * Unlock set.  This flushes the caches on the servers.
3276 	 */
3277 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3278 	nd = sd->sd_nodelist;
3279 	while (nd) {
3280 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3281 			nd = nd->nd_next;
3282 			continue;
3283 		}
3284 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
3285 			if (rval == 0)
3286 				(void) mdstealerror(ep, &xep);
3287 			else
3288 				mdclrerror(&xep);
3289 			rval = -1;
3290 		}
3291 		nd = nd->nd_next;
3292 	}
3293 
3294 	/*
3295 	 * call metaflushsetnames to reset local cache for master and
3296 	 * node information.
3297 	 */
3298 	metaflushsetname(sp);
3299 
3300 	/* release signals back to what they were on entry */
3301 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3302 		mdclrerror(&xep);
3303 
3304 	return (rval);
3305 
3306 }
3307 
3308 /*
3309  * Update nodelist with cluster member information.
3310  * A node not in the member list will be marked
3311  * as not ALIVE and not OWN.
3312  * A node in the member list will be marked ALIVE, but
3313  * the OWN bit will not be changed.
3314  *
3315  * If mynode isn't in the membership list, fail causing
3316  * another reconfig cycle to be started since a non-member
3317  * node shouldn't be taking part in the reconfig cycle.
3318  *
3319  * Return values:
3320  *	0 - No problem.
3321  *	1 - Any failure including RPC failure to my node.
3322  */
3323 int
3324 meta_reconfig_update_nodelist(
3325 	mdsetname_t			*sp,
3326 	mndiskset_membershiplist_t	*nl,
3327 	md_set_desc			*sd,
3328 	md_error_t			*ep
3329 )
3330 {
3331 	mndiskset_membershiplist_t	*nl2;
3332 	md_mnnode_desc			*nd;
3333 	md_error_t			xep = mdnullerror;
3334 	int				rval = 0;
3335 
3336 	/*
3337 	 * Walk through nodelist, checking to see if each
3338 	 * node is in the member list.
3339 	 * If node is not a member, reset ALIVE and OWN node flag.
3340 	 * If node is a member, set ALIVE.
3341 	 * If mynode's OWN flag gets reset, then halt the diskset on this node.
3342 	 */
3343 	nd = sd->sd_nodelist;
3344 	while (nd) {
3345 		nl2 = nl;
3346 		while (nl2) {
3347 			/* If node is in member list, set ALIVE */
3348 			if (nl2->msl_node_id == nd->nd_nodeid) {
3349 				nd->nd_flags |= MD_MN_NODE_ALIVE;
3350 				break;
3351 			} else {
3352 				nl2 = nl2->next;
3353 			}
3354 			/* node is not in member list, mark !ALIVE and !OWN */
3355 			if (nl2 == NULL) {
3356 				/* If node is mynode, then halt set if needed */
3357 				if (strcmp(mynode(), nd->nd_nodename) == 0) {
3358 					/*
3359 					 * This shouldn't happen, but just
3360 					 * in case...  Any node not in the
3361 					 * membership list should be dead and
3362 					 * not running reconfig step1.
3363 					 */
3364 					if (nd->nd_flags & MD_MN_NODE_OWN) {
3365 						if (halt_set(sp, &xep)) {
3366 							mde_perror(&xep, "");
3367 							mdclrerror(&xep);
3368 						}
3369 					}
3370 					/*
3371 					 * Return failure since this node
3372 					 * (mynode) is not in the membership
3373 					 * list, but process the rest of the
3374 					 * nodelist first so that rpc.metad
3375 					 * can be updated with the latest
3376 					 * membership information.
3377 					 */
3378 					(void) mddserror(ep,
3379 					    MDE_DS_NOTINMEMBERLIST,
3380 					    sp->setno, nd->nd_nodename, NULL,
3381 					    sp->setname);
3382 					rval = 1;
3383 				}
3384 				nd->nd_flags &= ~MD_MN_NODE_ALIVE;
3385 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3386 			}
3387 		}
3388 		nd = nd->nd_next;
3389 	}
3390 
3391 	/* Send this information to rpc.metad */
3392 	if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
3393 	    MD_NR_SET,  MNSET_IN_RECONFIG, &xep)) {
3394 		/* Return failure if can't send node flags to rpc.metad */
3395 		if (rval == 0) {
3396 			(void) mdstealerror(ep, &xep);
3397 			rval = 1;
3398 		}
3399 	}
3400 	return (rval);
3401 }
3402 
3403 /*
3404  * Choose master determines the master for a diskset.
3405  * Each node determines the master on its own and
3406  * adds this information to its local rpc.metad nodelist
3407  * and also sends it to the kernel.
3408  *
3409  * Nodelist in set descriptor (sd) is sorted in
3410  * monotonically increasing sequence of nodeid.
3411  *
3412  * Return values:
3413  *	0 - No problem.
3414  *	205 - There was an RPC problem to another node.
3415  *	-1 - There was an error.  This could be an RPC error to my node.
3416  *		This is a catastrophic failure causing node to panic.
3417  */
3418 int
3419 meta_reconfig_choose_master_for_set(
3420 	mdsetname_t	*sp,
3421 	md_set_desc	*sd,
3422 	md_error_t	*ep
3423 )
3424 {
3425 	int			is_owner;
3426 	md_mnset_record		*mnsr = NULL;
3427 	int			lowest_alive_nodeid = 0;
3428 	uint_t			master_nodeid;
3429 	md_mnnode_desc		*nd, *nd2;
3430 	md_mnnode_record	*nr;
3431 	md_drive_desc		*dd;
3432 	md_setkey_t		*cl_sk;
3433 	int			rval = 0;
3434 	md_error_t		xep = mdnullerror;
3435 	mddb_setflags_config_t	sf;
3436 
3437 	/*
3438 	 * Is current node joined to diskset?
3439 	 * Don't trust flags, really check to see if mddb is snarfed.
3440 	 */
3441 	if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
3442 		/*
3443 		 * If a node is joined to the diskset, this node checks
3444 		 * to see if the current master of the diskset is valid and
3445 		 * is still in the membership list (ALIVE) and is
3446 		 * still joined (OWN).  Need to verify if master is
3447 		 * really joined - don't trust the flags.  (Can trust
3448 		 * ALIVE since set during earlier part of reconfig cycle.)
3449 		 * If the current master is valid, still in the membership
3450 		 * list and joined, then master is not changed on this node.
3451 		 * Just return.
3452 		 *
3453 		 * Verify that nodeid is valid before accessing masternode.
3454 		 */
3455 		if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
3456 		    (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
3457 			if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
3458 			    &is_owner, ep) == -1) {
3459 				/* If RPC failure to another node return 205 */
3460 				if ((mdanyrpcerror(ep)) &&
3461 				    (sd->sd_mn_mynode->nd_nodeid !=
3462 				    sd->sd_mn_master_nodeid)) {
3463 					return (205);
3464 				} else {
3465 					/* Any other failure */
3466 					return (-1);
3467 				}
3468 			} else {
3469 				if (is_owner == TRUE) {
3470 
3471 					meta_mc_log(MC_LOG5, dgettext(
3472 					    TEXT_DOMAIN, "Set %s previous "
3473 					    "master chosen %s (%d): %s"),
3474 					    sp->setname,
3475 					    sd->sd_mn_master_nodenm,
3476 					    sd->sd_mn_master_nodeid,
3477 					    meta_print_hrtime(gethrtime() -
3478 					    start_time));
3479 
3480 					/* Previous master is ok - done */
3481 					return (0);
3482 				}
3483 			}
3484 		}
3485 
3486 		/*
3487 		 * If current master is no longer in the membership list or
3488 		 * is no longer joined, then this node uses the following
3489 		 * algorithm:
3490 		 * - node calls RPC routine clnt_ownset to get latest
3491 		 *	information on which nodes are owners of diskset.
3492 		 * 	clnt_ownset checks on each node to see if its kernel
3493 		 *	has that diskset snarfed.
3494 		 */
3495 		nd = sd->sd_nodelist;
3496 		while (nd) {
3497 			/* Don't consider node that isn't in member list */
3498 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3499 				nd = nd->nd_next;
3500 				continue;
3501 			}
3502 
3503 			if (clnt_ownset(nd->nd_nodename, sp,
3504 			    &is_owner, ep) == -1) {
3505 				/* If RPC failure to another node return 205 */
3506 				if ((mdanyrpcerror(ep)) &&
3507 				    (sd->sd_mn_mynode->nd_nodeid !=
3508 				    nd->nd_nodeid)) {
3509 					return (205);
3510 				} else {
3511 					/* Any other failure */
3512 					return (-1);
3513 				}
3514 			}
3515 
3516 			/*
3517 			 * Set owner flag for each node based on whether
3518 			 * that node really has a diskset mddb snarfed in
3519 			 * or not.
3520 			 */
3521 			if (is_owner == TRUE)
3522 				nd->nd_flags |= MD_MN_NODE_OWN;
3523 			else
3524 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3525 
3526 			nd = nd->nd_next;
3527 		}
3528 
3529 		/*
3530 		 * - node walks through nodelist looking for nodes that are
3531 		 *	owners of the diskset that are in the membership list.
3532 		 * - for each owner, node calls RPC routine clnt_getset to
3533 		 *	 see if that node has its node record set to OK.
3534 		 * - If so, master is chosen to be this owner node.
3535 		 */
3536 		nd = sd->sd_nodelist;
3537 		while (nd) {
3538 			/* Don't consider node that isn't in member list */
3539 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3540 				nd = nd->nd_next;
3541 				continue;
3542 			}
3543 
3544 			/* Don't consider a node that isn't an owner */
3545 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3546 				nd = nd->nd_next;
3547 				continue;
3548 			}
3549 
3550 			/* Does node has its own node record set to OK? */
3551 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3552 			    MD_SET_BAD, &mnsr, ep) == -1) {
3553 				/* If RPC failure to another node return 205 */
3554 				if ((mdanyrpcerror(ep)) &&
3555 				    (sd->sd_mn_mynode->nd_nodeid !=
3556 				    nd->nd_nodeid)) {
3557 					return (205);
3558 				} else {
3559 					/* Any other failure */
3560 					return (-1);
3561 				}
3562 			}
3563 			nr = mnsr->sr_nodechain;
3564 			while (nr) {
3565 				if (nd->nd_nodeid == nr->nr_nodeid) {
3566 					if (nr->nr_flags & MD_MN_NODE_OK) {
3567 						/* Found a master */
3568 						free_sr(
3569 						    (md_set_record *)mnsr);
3570 						goto found_master;
3571 					}
3572 				}
3573 				nr = nr->nr_next;
3574 			}
3575 			free_sr((md_set_record *)mnsr);
3576 			nd = nd->nd_next;
3577 		}
3578 
3579 		/*
3580 		 * - If no owner node has its own node record on its own node
3581 		 *	set to OK, then this node checks all of the non-owner
3582 		 * 	nodes that are in the membership list.
3583 		 * - for each non-owner, node calls RPC routine clnt_getset to
3584 		 *	 see if that node has its node record set to OK.
3585 		 * - If set doesn't exist, don't choose node for master.
3586 		 * - If so, master is chosen to be this non-owner node.
3587 		 *
3588 		 */
3589 		nd = sd->sd_nodelist;
3590 		while (nd) {
3591 			/* Don't consider node that isn't in member list */
3592 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3593 				nd = nd->nd_next;
3594 				continue;
3595 			}
3596 
3597 			/* Only checking non-owner nodes this time around */
3598 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3599 				nd = nd->nd_next;
3600 				continue;
3601 			}
3602 
3603 			/* Does node has its own node record set to OK? */
3604 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3605 			    MD_SET_BAD, &mnsr, ep) == -1) {
3606 				/*
3607 				 * If set doesn't exist on non-owner node,
3608 				 * don't consider this node for master.
3609 				 */
3610 				if (mdiserror(ep, MDE_NO_SET)) {
3611 					nd = nd->nd_next;
3612 					continue;
3613 				} else if ((mdanyrpcerror(ep)) &&
3614 				    (sd->sd_mn_mynode->nd_nodeid !=
3615 				    nd->nd_nodeid)) {
3616 					/* RPC failure to another node */
3617 					return (205);
3618 				} else {
3619 					/* Any other failure */
3620 					return (-1);
3621 				}
3622 			}
3623 			nr = mnsr->sr_nodechain;
3624 			while (nr) {
3625 				if (nd->nd_nodeid == nr->nr_nodeid) {
3626 					if (nr->nr_flags & MD_MN_NODE_OK) {
3627 						/* Found a master */
3628 						free_sr(
3629 						    (md_set_record *)mnsr);
3630 						goto found_master;
3631 					}
3632 				}
3633 				nr = nr->nr_next;
3634 			}
3635 			free_sr((md_set_record *)mnsr);
3636 			nd = nd->nd_next;
3637 		}
3638 
3639 		/*
3640 		 * - If no node can be found that has its own node record on
3641 		 *	its node to be set to OK, then all alive nodes
3642 		 * 	were in the process of being added to or deleted
3643 		 *	from set.  Each alive node will remove all
3644 		 *	information pertaining to this set from its node.
3645 		 *
3646 		 * If all nodes in set are ALIVE, then call sdssc end routines
3647 		 * since set was truly being initially created or destroyed.
3648 		 */
3649 		goto delete_set;
3650 	} else {
3651 
3652 		/*
3653 		 * If node is not joined to diskset, then this
3654 		 * node uses the following algorithm:
3655 		 * - If unjoined node doesn't have a node record for itself,
3656 		 *	just delete the diskset since diskset was in the
3657 		 *	process of being created.
3658 		 * - node needs to find master of diskset before
3659 		 *	reconfig cycle, if a master existed.
3660 		 * - node calls RPC routine clnt_ownset to get latest
3661 		 * 	information on which nodes are owners of diskset.
3662 		 *	clnt_ownset checks on each node to see if its
3663 		 *	kernel has that diskset snarfed.
3664 		 */
3665 
3666 		/*
3667 		 * Is my node in the set description?
3668 		 * If not, delete the set from this node.
3669 		 * sr2setdesc sets sd_mn_mynode pointer to the node
3670 		 * descriptor for this node if there was a node
3671 		 * record for this node.
3672 		 *
3673 		 */
3674 		if (sd->sd_mn_mynode == NULL) {
3675 			goto delete_set;
3676 		}
3677 
3678 		nd = sd->sd_nodelist;
3679 		while (nd) {
3680 			/* Don't consider node that isn't in member list */
3681 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3682 				nd = nd->nd_next;
3683 				continue;
3684 			}
3685 
3686 			if (clnt_ownset(nd->nd_nodename, sp,
3687 			    &is_owner, ep) == -1) {
3688 				/* If RPC failure to another node return 205 */
3689 				if ((mdanyrpcerror(ep)) &&
3690 				    (sd->sd_mn_mynode->nd_nodeid !=
3691 				    nd->nd_nodeid)) {
3692 					return (205);
3693 				} else {
3694 					/* Any other failure */
3695 					return (-1);
3696 				}
3697 			}
3698 
3699 			/*
3700 			 * Set owner flag for each node based on whether
3701 			 * that node really has a diskset mddb snarfed in
3702 			 * or not.
3703 			 */
3704 			if (is_owner == TRUE)
3705 				nd->nd_flags |= MD_MN_NODE_OWN;
3706 			else
3707 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3708 
3709 			nd = nd->nd_next;
3710 		}
3711 
3712 		/*
3713 		 * - node walks through nodelist looking for nodes that
3714 		 *	are owners of the diskset that are in
3715 		 *	the membership list.
3716 		 * - for each owner, node calls RPC routine clnt_getset to
3717 		 *	see if that node has a master set and to get the
3718 		 *	diskset description.
3719 		 * - If the owner node has a set description that doesn't
3720 		 *	include the non-joined node in the nodelist, this node
3721 		 *	removes its set description of that diskset
3722 		 *	(i.e. removes the set from its local mddbs).  This is
3723 		 *	handling the case of when a node was removed from a
3724 		 *	diskset while it was not in the cluster membership
3725 		 *	list.
3726 		 * - If that node has a master set and the master is in the
3727 		 *	membership list and is an owner, then either this was
3728 		 *	the master from before the reconfig cycle or this
3729 		 *	node has already chosen a new master - either way,
3730 		 *	the master value is valid as long as it is in the
3731 		 *	membership list and is an owner
3732 		 * - master is chosen to be owner node's master
3733 		 */
3734 		nd = sd->sd_nodelist;
3735 		while (nd) {
3736 			/* Don't consider node that isn't in member list */
3737 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3738 				nd = nd->nd_next;
3739 				continue;
3740 			}
3741 
3742 			/* Don't consider a node that isn't an owner */
3743 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3744 				nd = nd->nd_next;
3745 				continue;
3746 			}
3747 
3748 			/* Get owner node's set record */
3749 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3750 			    MD_SET_BAD, &mnsr, ep) == -1) {
3751 				/* If RPC failure to another node return 205 */
3752 				if ((mdanyrpcerror(ep)) &&
3753 				    (sd->sd_mn_mynode->nd_nodeid !=
3754 				    nd->nd_nodeid)) {
3755 					return (205);
3756 				} else {
3757 					/* Any other failure */
3758 					return (-1);
3759 				}
3760 			}
3761 
3762 			/* Is this node in the owner node's set record */
3763 			nr = mnsr->sr_nodechain;
3764 			while (nr) {
3765 				if (sd->sd_mn_mynode->nd_nodeid ==
3766 				    nr->nr_nodeid) {
3767 					break;
3768 				}
3769 				nr = nr->nr_next;
3770 			}
3771 			if (nr == NULL) {
3772 				/* my node not found - delete set */
3773 				free_sr((md_set_record *)mnsr);
3774 				goto delete_set;
3775 			}
3776 
3777 			/* Is owner's node's master valid? */
3778 			master_nodeid = mnsr->sr_master_nodeid;
3779 			free_sr((md_set_record *)mnsr);
3780 			if (master_nodeid == MD_MN_INVALID_NID) {
3781 				nd = nd->nd_next;
3782 				continue;
3783 			}
3784 
3785 			nd2 = sd->sd_nodelist;
3786 			while (nd2) {
3787 				if ((nd2->nd_nodeid == master_nodeid) &&
3788 				    (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
3789 				    (nd2->nd_flags & MD_MN_NODE_OWN)) {
3790 						nd = nd2;
3791 						goto found_master;
3792 				}
3793 				nd2 = nd2->nd_next;
3794 			}
3795 			nd = nd->nd_next;
3796 		}
3797 
3798 		/*
3799 		 * - If no owner node has a valid master, then follow
3800 		 * 	algorithm of when a node is joined to the diskset.
3801 		 * - node walks through nodelist looking for nodes that are
3802 		 *	owners of the diskset that are in the membership list.
3803 		 * - for each owner, node calls RPC routine clnt_getset to
3804 		 *	 see if that node has its node record set to OK.
3805 		 * - If so, master is chosen to be this owner node.
3806 		 */
3807 		nd = sd->sd_nodelist;
3808 		while (nd) {
3809 			/* Don't consider node that isn't in member list */
3810 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3811 				nd = nd->nd_next;
3812 				continue;
3813 			}
3814 
3815 			/* Don't consider a node that isn't an owner */
3816 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3817 				nd = nd->nd_next;
3818 				continue;
3819 			}
3820 
3821 			/* Does node has its own node record set to OK? */
3822 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3823 			    MD_SET_BAD, &mnsr, ep) == -1) {
3824 				/* If RPC failure to another node return 205 */
3825 				if ((mdanyrpcerror(ep)) &&
3826 				    (sd->sd_mn_mynode->nd_nodeid !=
3827 				    nd->nd_nodeid)) {
3828 					return (205);
3829 				} else {
3830 					/* Any other failure */
3831 					return (-1);
3832 				}
3833 			}
3834 			nr = mnsr->sr_nodechain;
3835 			while (nr) {
3836 				if (nd->nd_nodeid == nr->nr_nodeid) {
3837 					if (nr->nr_flags & MD_MN_NODE_OK) {
3838 						/* Found a master */
3839 						free_sr(
3840 						    (md_set_record *)mnsr);
3841 						goto found_master;
3842 					}
3843 				}
3844 				nr = nr->nr_next;
3845 			}
3846 			free_sr((md_set_record *)mnsr);
3847 			nd = nd->nd_next;
3848 		}
3849 
3850 		/*
3851 		 * - If no owner node has its own node record on its own node
3852 		 *	set to OK, then this node checks all of the non-owner
3853 		 *	nodes that are in the membership list.
3854 		 * - for each non-owner, node calls RPC routine clnt_getset to
3855 		 *	see if that node has its node record set to OK.
3856 		 * - If set doesn't exist, don't choose node for master.
3857 		 * - If this node doesn't exist in the nodelist on any of the
3858 		 *	non-owner nodes, this node removes its set description
3859 		 *	of that diskset (i.e. removes the set from its local
3860 		 *	mddbs). This is handling the case of when a node was
3861 		 *	removed from a diskset while it was not in the
3862 		 *	cluster membership list.
3863 		 * - If non-owner node has its node record set to OK and if
3864 		 *	this node hasn't removed this diskset (step directly
3865 		 *	before this one), then the master is chosen to be this
3866 		 *	non-owner node.
3867 		 */
3868 		nd = sd->sd_nodelist;
3869 		while (nd) {
3870 			/* Don't consider node that isn't in member list */
3871 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3872 				nd->nd_flags |= MD_MN_NODE_DEL;
3873 				nd = nd->nd_next;
3874 				continue;
3875 			}
3876 
3877 			/* Don't consider owner nodes since none are OK */
3878 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3879 				nd->nd_flags |= MD_MN_NODE_DEL;
3880 				nd = nd->nd_next;
3881 				continue;
3882 			}
3883 
3884 			/*
3885 			 * Don't need to get nodelist from my node since
3886 			 * this is where sd_nodelist was obtained.
3887 			 */
3888 			if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3889 				nd = nd->nd_next;
3890 				continue;
3891 			}
3892 
3893 			/*
3894 			 * If node has already been decided against for
3895 			 * master, then skip it.
3896 			 */
3897 			if (nd->nd_flags & MD_MN_NODE_DEL) {
3898 				nd = nd->nd_next;
3899 				continue;
3900 			}
3901 
3902 			/*
3903 			 * Does node in my nodelist have its own node
3904 			 * record marked OK on its node?  And does node
3905 			 * in my nodelist exist on all other nodes?
3906 			 * Don't want to choose a node for master unless
3907 			 * that node is marked OK on its own node and that
3908 			 * node exists on all other alive nodes.
3909 			 *
3910 			 * This is guarding against the case when several
3911 			 * nodes are down and one of the downed nodes is
3912 			 * deleted from the diskset.  When the down nodes
3913 			 * are rebooted into the cluster, you don't want
3914 			 * any node to pick the deleted node as the master.
3915 			 */
3916 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3917 			    MD_SET_BAD, &mnsr, ep) == -1) {
3918 				/*
3919 				 * If set doesn't exist on non-owner node,
3920 				 * don't consider this node for master.
3921 				 */
3922 				if (mdiserror(ep, MDE_NO_SET)) {
3923 					nd->nd_flags |= MD_MN_NODE_DEL;
3924 					nd = nd->nd_next;
3925 					continue;
3926 				} else if (mdanyrpcerror(ep)) {
3927 					/* RPC failure to another node */
3928 					return (205);
3929 				} else {
3930 					/* Any other failure */
3931 					return (-1);
3932 				}
3933 			}
3934 			/*
3935 			 * Is my node in the nodelist gotten from the other
3936 			 * node?  If not, then remove the set from my node
3937 			 * since set was deleted from my node while my node
3938 			 * was out of the cluster.
3939 			 */
3940 			nr = mnsr->sr_nodechain;
3941 			while (nr) {
3942 				if (sd->sd_mn_mynode->nd_nodeid ==
3943 				    nr->nr_nodeid) {
3944 					break;
3945 				}
3946 				nr = nr->nr_next;
3947 			}
3948 			if (nr == NULL) {
3949 				/* my node not found - delete set */
3950 				free_sr((md_set_record *)mnsr);
3951 				goto delete_set;
3952 			}
3953 
3954 			/* Is node being checked marked OK on its own node? */
3955 			nr = mnsr->sr_nodechain;
3956 			while (nr) {
3957 				if (nd->nd_nodeid == nr->nr_nodeid) {
3958 					if (!(nr->nr_flags & MD_MN_NODE_OK)) {
3959 						nd->nd_flags |= MD_MN_NODE_DEL;
3960 					}
3961 					break;
3962 				}
3963 				nr = nr->nr_next;
3964 			}
3965 			/*
3966 			 * If node being checked doesn't exist on its
3967 			 * own node - don't choose it as master.
3968 			 */
3969 			if (nr == NULL) {
3970 				nd->nd_flags |= MD_MN_NODE_DEL;
3971 			}
3972 
3973 			/*
3974 			 * Check every node in my node's nodelist against
3975 			 * the nodelist gotten from the other node.
3976 			 * If a node in my node's nodelist is not found in the
3977 			 * other node's nodelist, then set the DEL flag.
3978 			 */
3979 			nd2 = sd->sd_nodelist;
3980 			while (nd2) {
3981 				nr = mnsr->sr_nodechain;
3982 				while (nr) {
3983 					if (nd2->nd_nodeid == nr->nr_nodeid) {
3984 						break;
3985 					}
3986 					nr = nr->nr_next;
3987 				}
3988 				/* nd2 not found in other node's nodelist */
3989 				if (nr == NULL) {
3990 					nd2->nd_flags |= MD_MN_NODE_DEL;
3991 				}
3992 				nd2 = nd2->nd_next;
3993 			}
3994 
3995 			free_sr((md_set_record *)mnsr);
3996 			nd = nd->nd_next;
3997 		}
3998 
3999 		/*
4000 		 * Rescan list look for node that has not been marked DEL.
4001 		 * First node found is the master.
4002 		 */
4003 		nd = sd->sd_nodelist;
4004 		while (nd) {
4005 			if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4006 				break;
4007 			}
4008 			nd = nd->nd_next;
4009 			continue;
4010 		}
4011 		if (nd) {
4012 			/* Found a master */
4013 			goto found_master;
4014 		}
4015 
4016 		/*
4017 		 * - If no node can be found that has its own node record on
4018 		 *	its node to be set to OK, then all alive nodes
4019 		 * 	were in the process of being added to or deleted
4020 		 *	from set.  Each alive node will remove all
4021 		 *	information pertaining to this set from its node.
4022 		 *
4023 		 * If all nodes in set are ALIVE, then call sdssc end routines
4024 		 * since set was truly being initially created or destroyed.
4025 		 */
4026 		goto delete_set;
4027 	}
4028 
4029 found_master:
4030 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4031 	    "Set %s master chosen %s (%d): %s"),
4032 	    sp->setname, nd->nd_nodename, nd->nd_nodeid,
4033 	    meta_print_hrtime(gethrtime() - start_time));
4034 
4035 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
4036 		return (-1);
4037 	}
4038 
4039 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4040 
4041 	if (clnt_mnsetmaster(mynode(), sp,
4042 	    nd->nd_nodename, nd->nd_nodeid, ep)) {
4043 		rval = -1;
4044 	} else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
4045 		/* If this node is new master, set flag in this node's kernel */
4046 		(void) memset(&sf, 0, sizeof (sf));
4047 		sf.sf_setno = sp->setno;
4048 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
4049 		/* Use magic to help protect ioctl against attack. */
4050 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4051 		sf.sf_flags = MDDB_NM_SET;
4052 
4053 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4054 		    "Setting new master flag for set %s: %s"),
4055 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4056 
4057 		/*
4058 		 * Fail reconfig cycle if ioctl fails since it is critical
4059 		 * to set new master flag.
4060 		 */
4061 		if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
4062 		    NULL) != NULL) {
4063 			(void) mdstealerror(ep, &sf.sf_mde);
4064 			rval = -1;
4065 		}
4066 	}
4067 
4068 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4069 		if (rval == 0) {
4070 			(void) mdstealerror(ep, &xep);
4071 			rval = -1;
4072 		}
4073 	}
4074 
4075 	cl_set_setkey(NULL);
4076 
4077 	metaflushsetname(sp);
4078 
4079 	return (rval);
4080 
4081 delete_set:
4082 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4083 	    "Master not chosen, deleting set %s: %s"),
4084 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4085 
4086 	/*
4087 	 * Remove all set information from this node:
4088 	 *	- node records for this set
4089 	 *	- drive records for this set
4090 	 *	- set record for this set
4091 	 * (Only do this on this node since each node
4092 	 * will do it for its own local mddb.)
4093 	 *
4094 	 * If all nodes in set are ALIVE, then
4095 	 * the lowest numbered ALIVE nodeid in set
4096 	 * (irregardless of whether an owner node or not) will
4097 	 * call the DCS service to cleanup for create/delete of set.
4098 	 *   sdssc_create_end(cleanup) if set was being created or
4099 	 *   sdssc_delete_end(cleanup) if set was being deleted.
4100 	 * A node record with flag ADD denotes a set being
4101 	 * created.  A node record with flag DEL denotes a
4102 	 * set being deleted.
4103 	 */
4104 	nd = sd->sd_nodelist;
4105 	while (nd) {
4106 		/* Found a node that isn't alive */
4107 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
4108 			break;
4109 
4110 		/* Is my node the lowest numbered ALIVE node? */
4111 		if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
4112 			break;
4113 		}
4114 		nd = nd->nd_next;
4115 	}
4116 	if (nd == NULL) {
4117 		/* All nodes ALIVE and this is the lowest nodeid */
4118 		lowest_alive_nodeid = 1;
4119 	}
4120 
4121 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
4122 		return (-1);
4123 	}
4124 
4125 
4126 	/*
4127 	 * If this node had been joined, withdraw and reset master.
4128 	 *
4129 	 * This could happen if a node was being added to or removed
4130 	 * from a diskset and the node doing the add/delete operation and
4131 	 * all other nodes in the diskset have left the cluster.
4132 	 */
4133 	if (sd->sd_mn_mynode) {
4134 		nd = sd->sd_mn_mynode;
4135 		if (nd->nd_flags & MD_MN_NODE_OWN) {
4136 			if (clnt_withdrawset(mynode(), sp, ep)) {
4137 				rval = -1;
4138 				goto out;
4139 			}
4140 			if (clnt_mnsetmaster(mynode(), sp, "",
4141 			    MD_MN_INVALID_NID, ep)) {
4142 				rval = -1;
4143 				goto out;
4144 			}
4145 		}
4146 	}
4147 
4148 	/*
4149 	 * Remove side records for this node (side) from local mddb
4150 	 * (clnt_deldrvs does this) if there are drives in the set.
4151 	 *
4152 	 * Don't need to mark this node as DEL since already marked as
4153 	 * ADD or DEL (or this node would have been chosen as master).
4154 	 * Don't need to mark other node records, drive records or
4155 	 * set records as DEL.  If a panic occurs during clnt_delset,
4156 	 * these records will be deleted the next time this node
4157 	 * becomes a member and goes through the reconfig cycle.
4158 	 */
4159 	/* Get the drive descriptors for this set */
4160 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4161 	    ep)) == NULL) {
4162 		if (! mdisok(ep)) {
4163 			/*
4164 			 * Ignore and clear out any failures from
4165 			 * metaget_drivedesc since a panic could have
4166 			 * occurred when a node was partially added to a set.
4167 			 */
4168 			mdclrerror(ep);
4169 		}
4170 	} else {
4171 		if (clnt_deldrvs(mynode(), sp, dd, ep)) {
4172 			rval = -1;
4173 			goto out;
4174 		}
4175 	}
4176 
4177 	/*
4178 	 * Now, delete the set - this removes the node, drive
4179 	 * and set records from the local mddb.
4180 	 */
4181 	if (clnt_delset(mynode(), sp, ep)) {
4182 		rval = -1;
4183 		goto out;
4184 	}
4185 
4186 out:
4187 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4188 
4189 	/*
4190 	 * Ignore errors from unlock of set since set is no longer
4191 	 * known (if clnt_delset worked).
4192 	 */
4193 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4194 		mdclrerror(&xep);
4195 	}
4196 
4197 	cl_set_setkey(NULL);
4198 
4199 	metaflushsetname(sp);
4200 
4201 	/*
4202 	 * If this node is the lowest numbered nodeid then
4203 	 * call sdssc_create/delete_end depending on whether
4204 	 * this node is marked as ADD or DEL in the node record.
4205 	 */
4206 	if (lowest_alive_nodeid) {
4207 		if (nd->nd_flags & MD_MN_NODE_ADD)
4208 			sdssc_create_end(sp->setname, SDSSC_CLEANUP);
4209 		else if (nd->nd_flags & MD_MN_NODE_DEL)
4210 			sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
4211 	}
4212 
4213 	/* Finished with this set -- return */
4214 	return (rval);
4215 }
4216 
4217 /*
4218  * Reconfig step to choose a new master for all MN disksets.
4219  * Return values:
4220  *	0 - Everything is great.
4221  *	1 - This node failed to reconfig.
4222  *	205 - Cause another reconfig due to a nodelist problem
4223  *		or RPC failure to another node
4224  */
4225 int
4226 meta_reconfig_choose_master(
4227 	md_error_t	*ep
4228 )
4229 {
4230 	set_t				max_sets, setno;
4231 	int				nodecnt;
4232 	mndiskset_membershiplist_t	*nl;
4233 	md_set_desc			*sd;
4234 	mdsetname_t			*sp;
4235 	int				rval = 0;
4236 	mddb_setflags_config_t		sf;
4237 	int				start_node_delayed = 0;
4238 
4239 	if ((max_sets = get_max_sets(ep)) == 0) {
4240 		mde_perror(ep, dgettext(TEXT_DOMAIN,
4241 		    "Unable to get number of sets"));
4242 		return (1);
4243 	}
4244 
4245 	/*
4246 	 * Get membershiplist from API routine.  If there's
4247 	 * an error, return a 205 to cause another reconfig.
4248 	 */
4249 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
4250 		mde_perror(ep, "");
4251 		return (205);
4252 	}
4253 
4254 	for (setno = 1; setno < max_sets; setno++) {
4255 		if ((sp = metasetnosetname(setno, ep)) == NULL) {
4256 			if (mdiserror(ep, MDE_NO_SET)) {
4257 				/* No set for this setno - continue */
4258 				mdclrerror(ep);
4259 				continue;
4260 			} else {
4261 				/*
4262 				 * If encountered an RPC error from my node,
4263 				 * then immediately fail.
4264 				 */
4265 				if (mdanyrpcerror(ep)) {
4266 					mde_perror(ep, "");
4267 					return (1);
4268 				}
4269 				/* Can't get set information */
4270 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4271 					"Unable to get information for "
4272 					"set number %d"), setno);
4273 				mdclrerror(ep);
4274 				continue;
4275 			}
4276 		}
4277 
4278 		/* If setname is there, set desc should exist. */
4279 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4280 			/*
4281 			 * If encountered an RPC error from my node,
4282 			 * then immediately fail.
4283 			 */
4284 			if (mdanyrpcerror(ep)) {
4285 				mde_perror(ep, "");
4286 				return (1);
4287 			}
4288 			mde_perror(ep, dgettext(TEXT_DOMAIN,
4289 				"Unable to get set %s desc information"),
4290 				sp->setname);
4291 			mdclrerror(ep);
4292 			continue;
4293 		}
4294 
4295 		/* Only reconfig MN disksets */
4296 		if (!MD_MNSET_DESC(sd)) {
4297 			continue;
4298 		}
4299 
4300 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4301 		    "Begin choose master for set %s: %s"),
4302 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4303 
4304 		/* Update nodelist with member information. */
4305 		if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
4306 			/*
4307 			 * If encountered an RPC error from my node,
4308 			 * then immediately fail.
4309 			 */
4310 			if (mdanyrpcerror(ep)) {
4311 				mde_perror(ep, "");
4312 				return (1);
4313 			}
4314 			mde_perror(ep, "");
4315 			mdclrerror(ep);
4316 			continue;
4317 		}
4318 
4319 		/*
4320 		 * If all nodes in a cluster are starting, then
4321 		 * all nodes will attempt to contact all other nodes
4322 		 * to determine a master node.  This can lead to a
4323 		 * problem where node 1 is trying to contact the rpc.metad
4324 		 * node 2 and node 2 is trying to contact the rpc.metad
4325 		 * on node 1 -- and this causes the rpc call to fail
4326 		 * on both nodes and causes a new reconfig cycle.
4327 		 *
4328 		 * In order to break this problem, a newly starting node
4329 		 * will delay a small amount of time (nodeid mod 4 seconds)
4330 		 * and will then run the code to choose a master for the
4331 		 * first set.  Delay will only be done once regardless of the
4332 		 * number of sets.
4333 		 */
4334 		if (start_node_delayed == 0) {
4335 			(void) memset(&sf, 0, sizeof (sf));
4336 			sf.sf_setno = sp->setno;
4337 			sf.sf_flags = MDDB_NM_GET;
4338 			/* Use magic to help protect ioctl against attack. */
4339 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4340 			if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
4341 			    &sf.sf_mde, NULL) == 0) &&
4342 			    ((sf.sf_setflags & MD_SET_MN_START_RC) ==
4343 			    MD_SET_MN_START_RC)) {
4344 				(void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
4345 			}
4346 			start_node_delayed = 1;
4347 		}
4348 
4349 		/* Choose master for this set */
4350 		rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
4351 		if (rval == -1) {
4352 			mde_perror(ep, "");
4353 			return (1);
4354 		} else if (rval == 205) {
4355 			mde_perror(ep, "");
4356 			return (205);
4357 		}
4358 
4359 		/* Send new nodelist to rpc.mdcommd */
4360 		(void) mdmn_reinit_set(sp->setno);
4361 
4362 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4363 		    "Choose master for set %s completed: %s"),
4364 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4365 	}
4366 
4367 	/*
4368 	 * Each node turns on I/Os for all MN disksets.
4369 	 * This is to recover from the situation where the master died
4370 	 * during a previous reconfig cycle when I/Os were suspended
4371 	 * for a MN diskset.
4372 	 * If a failure occurs return a 1 which will force this node to
4373 	 * panic.  Cannot leave node in the situation where I/Os are
4374 	 * not resumed.
4375 	 */
4376 	setno = 0; /* 0 means all MN sets */
4377 	if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
4378 		mde_perror(ep, "");
4379 		return (1);
4380 	}
4381 
4382 	/* Free the nodelist */
4383 	if (nodecnt)
4384 		meta_free_nodelist(nl);
4385 
4386 	return (0);
4387 }
4388 
4389 /*
4390  * meta_mnsync_user_records will synchronize the diskset user records across
4391  * all nodes in the diskset.  The diskset user records are stored in
4392  * each node's local set mddb.
4393  *
4394  * This needs to be done even if there is no master change during the
4395  * reconfig cycle since this routine should clean up any mess left by
4396  * the untimely termination of a metaset or metadb command (due to a
4397  * node panic or to user intervention).
4398  *
4399  * Caller is the Master node.
4400  *
4401  * Returns	 0 - Success
4402  *		205 - Failure during RPC to another node
4403  *		-1 - Any other failure and ep is filled in.
4404  */
4405 int
4406 meta_mnsync_user_records(
4407 	mdsetname_t	*sp,
4408 	md_error_t	*ep
4409 )
4410 {
4411 	md_set_desc		*sd;
4412 	md_mnnode_desc		*master_nodelist, *nd, *nd2, *ndtail;
4413 	md_mnset_record		*mnsr;
4414 	md_mnsr_node_t		*master_mnsr_node = NULL, *mnsr_node = NULL;
4415 	md_mnnode_record	*nr;
4416 	md_drive_record		*dr;
4417 	int			dr_cnt, dd_cnt;
4418 	int			found_my_nr;
4419 	md_drive_desc		*dd, *dd_prev, *master_dd, *other_dd;
4420 	int			all_drives_ok;
4421 	int			rval = 0;
4422 	int			max_genid = 0;
4423 	int			num_alive_nodes, num_alive_nodes_del = 0;
4424 	int			set_locked = 0;
4425 	md_setkey_t		*cl_sk;
4426 	md_error_t		xep = mdnullerror;
4427 	char			*anode[1];
4428 	mddb_setflags_config_t	sf;
4429 
4430 	/*
4431 	 * Sync up node records first.
4432 	 * Construct a master nodelist using the nodelist from this
4433 	 * node's rpc.metad node records and then setting the state of each
4434 	 * node following these rules:
4435 	 *	- If a node record is marked OK on its node, mark it OK
4436 	 *		in the master nodelist (and later OK on all nodes)
4437 	 *		If a node record is also marked OWN on its node,
4438 	 *		mark it OWN in the master nodelist.
4439 	 *	- If a node record is not marked OK on its node, then mark
4440 	 *		it as DEL in the master list (later deleting it)
4441 	 *	- If node record doesn't exist on that node, then mark it DEL
4442 	 *		(later deleting it)
4443 	 *	- If set record doesn't exist on that node, mark node as DEL
4444 	 *	- If a node record doesn't exist on all nodes, then mark it DEL
4445 	 *	- If a node is not ALIVE, then
4446 	 *		- If that node marked DEL on any node - mark it DEL
4447 	 *			in master list but leave in nodelist
4448 	 *		- If that node is marked as ADD on any node, mark it
4449 	 *			ADD in the master list but leave in nodelist
4450 	 *		- When that node returns to the living, the DEL
4451 	 *			node record will be removed and the ADD node
4452 	 *			record may be removed if marked ADD on that
4453 	 *			node.
4454 	 * The key rule is to not remove a node from the nodelist until
4455 	 * that node record is removed from its own node.  Do not want to
4456 	 * remove a node's record from all other nodes and then have
4457 	 * that node have its own record marked OK so that a node will pick
4458 	 * a different master than the other nodes.
4459 	 *
4460 	 * Next,
4461 	 * If node is ALIVE and node record is marked DEL in master nodelist,
4462 	 * remove node from set.
4463 	 * If node is ALIVE and node record is marked OK in master nodelist,
4464 	 * mark it OK on all other nodes.
4465 	 * If node is not ALIVE and node record is marked DEL in master
4466 	 * nodelist, mark it DEL on all other nodes.
4467 	 * If node is not ALIVE and node record is marked ADD in master
4468 	 * nodelist, mark it ADD on all other nodes.
4469 	 */
4470 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4471 		return (-1);
4472 	}
4473 	master_nodelist = sd->sd_nodelist;
4474 
4475 	/*
4476 	 * Walk through nodelist creating a master nodelist.
4477 	 */
4478 	num_alive_nodes = 0;
4479 	nd = master_nodelist;
4480 	while (nd) {
4481 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4482 			nd = nd->nd_next;
4483 			continue;
4484 		}
4485 		num_alive_nodes++;
4486 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
4487 		    MD_SET_BAD, &mnsr, ep) == -1) {
4488 			if (mdiserror(ep, MDE_NO_SET)) {
4489 				/* set doesn't exist, mark node as DEL */
4490 				nd->nd_flags &= ~MD_MN_NODE_OK;
4491 				nd->nd_flags &= ~MD_MN_NODE_ADD;
4492 				nd->nd_flags |= MD_MN_NODE_DEL;
4493 				nd->nd_flags |= MD_MN_NODE_NOSET;
4494 				nd = nd->nd_next;
4495 				continue;
4496 			} else {
4497 				/* If RPC failure to another node return 205 */
4498 				if ((mdanyrpcerror(ep)) &&
4499 				    (sd->sd_mn_mynode->nd_nodeid !=
4500 				    nd->nd_nodeid)) {
4501 					rval = 205;
4502 				} else {
4503 					/* Any other failure */
4504 					rval = -1;
4505 				}
4506 				goto out;
4507 			}
4508 		}
4509 		/* Find biggest genid in records for this diskset */
4510 		if (mnsr->sr_genid > max_genid)
4511 			max_genid = mnsr->sr_genid;
4512 
4513 		dr = mnsr->sr_drivechain;
4514 		while (dr) {
4515 			/* Find biggest genid in records for this diskset */
4516 			if (dr->dr_genid > max_genid) {
4517 				max_genid = dr->dr_genid;
4518 			}
4519 			dr = dr->dr_next;
4520 		}
4521 
4522 		found_my_nr = 0;
4523 		nr = mnsr->sr_nodechain;
4524 		/* nr is the list of node recs from nd_nodename node */
4525 		while (nr) {
4526 			/* Find biggest genid in records for this diskset */
4527 			if (nr->nr_genid > max_genid)
4528 				max_genid = nr->nr_genid;
4529 			nd2 = master_nodelist;
4530 			ndtail = NULL;
4531 			/* For each node record, is it in master list? */
4532 			while (nd2) {
4533 				if (nd2->nd_nodeid == nr->nr_nodeid)
4534 					break;
4535 				if (nd2->nd_next == NULL)
4536 					ndtail = nd2;
4537 				nd2 = nd2->nd_next;
4538 			}
4539 			/*
4540 			 * Found node record not in master list -- add it
4541 			 * to list marking it as DEL since node record
4542 			 * should exist on all nodes unless a panic occurred
4543 			 * during addition or deletion of host to diskset.
4544 			 */
4545 			if (nd2 == NULL) {
4546 				nd2 = Zalloc(sizeof (*nd2));
4547 				(void) strcpy(nd2->nd_nodename,
4548 				    nr->nr_nodename);
4549 				nd2->nd_flags = nr->nr_flags;
4550 				nd2->nd_flags |= MD_MN_NODE_DEL;
4551 				nd2->nd_nodeid = nr->nr_nodeid;
4552 				nd2->nd_next = NULL;
4553 				ndtail->nd_next = nd2;
4554 				nd2 = NULL;
4555 				nr = nr->nr_next;
4556 				continue;
4557 			}
4558 			/*
4559 			 * Is this the node record for the node that
4560 			 * we requested the set desc from?
4561 			 * If so, check if node has its own node record
4562 			 * marked OK. If marked OK, check for the OWN bit.
4563 			 */
4564 			if (nr->nr_nodeid == nd->nd_nodeid) {
4565 				found_my_nr = 1;
4566 				if (nr->nr_flags & MD_MN_NODE_OK) {
4567 					/*
4568 					 * If node record is marked OK
4569 					 * on its own node, then mark it OK
4570 					 * in the master list.  Node record
4571 					 * would have to exist on all nodes
4572 					 * in the ADD state before it could
4573 					 * be put into the OK state.
4574 					 */
4575 					nd->nd_flags |= MD_MN_NODE_OK;
4576 					nd->nd_flags &=
4577 					    ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
4578 					/*
4579 					 * Mark own in master list as marked
4580 					 * on own node.
4581 					 */
4582 					if (nr->nr_flags & MD_MN_NODE_OWN)
4583 						nd->nd_flags |= MD_MN_NODE_OWN;
4584 					else
4585 						nd->nd_flags &= ~MD_MN_NODE_OWN;
4586 				} else {
4587 					/* Otherwise, mark node as DEL */
4588 					nd->nd_flags &= ~MD_MN_NODE_OK;
4589 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4590 					nd->nd_flags |= MD_MN_NODE_DEL;
4591 				}
4592 			}
4593 			/*
4594 			 * If node is not ALIVE and marked DEL
4595 			 * on any node, make it DEL in master list.
4596 			 * If node is not ALIVE and marked ADD
4597 			 * on any node, make it ADD in master list
4598 			 * unless node record has already been marked DEL.
4599 			 */
4600 			if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
4601 				if (nr->nr_flags & MD_MN_NODE_ADD) {
4602 					if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4603 						/* If not DEL - mark it ADD */
4604 						nd->nd_flags |= MD_MN_NODE_ADD;
4605 						nd->nd_flags &= ~MD_MN_NODE_OK;
4606 					}
4607 				}
4608 				if (nr->nr_flags & MD_MN_NODE_DEL) {
4609 					nd->nd_flags |= MD_MN_NODE_DEL;
4610 					nd->nd_flags &= ~MD_MN_NODE_OK;
4611 					/* Could already be ADD - make it DEL */
4612 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4613 				}
4614 			}
4615 			nr = nr->nr_next;
4616 		}
4617 		/*
4618 		 * If a node record doesn't exist on its own node,
4619 		 * then mark node as DEL.
4620 		 */
4621 		if (found_my_nr == 0) {
4622 			nd->nd_flags &= ~MD_MN_NODE_OK;
4623 			nd->nd_flags |= MD_MN_NODE_DEL;
4624 		}
4625 
4626 		/*
4627 		 * If node is OK - put mnsr onto master_mnsr_node list for
4628 		 * later use when syncing up the drive records in the set.
4629 		 */
4630 		if (nd->nd_flags & MD_MN_NODE_OK) {
4631 			mnsr_node = Zalloc(sizeof (*mnsr_node));
4632 			mnsr_node->mmn_mnsr = mnsr;
4633 			(void) strncpy(mnsr_node->mmn_nodename,
4634 				nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
4635 			mnsr_node->mmn_next = master_mnsr_node;
4636 			master_mnsr_node = mnsr_node;
4637 		} else {
4638 			free_sr((struct md_set_record *)mnsr);
4639 		}
4640 
4641 		nd = nd->nd_next;
4642 	}
4643 
4644 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4645 	    "Master nodelist created for set %s: %s"),
4646 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4647 
4648 	/*
4649 	 * Send master nodelist to the rpc.metad on all nodes (including
4650 	 * myself) and each node will update itself.  This will set the
4651 	 * ADD and DEL flags on each node as setup in the master nodelist.
4652 	 * Don't send nodelist to node where set doesn't exist.
4653 	 */
4654 	nd = master_nodelist;
4655 	while (nd) {
4656 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4657 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4658 			nd = nd->nd_next;
4659 			continue;
4660 		}
4661 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4662 		    master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
4663 			/* If RPC failure to another node return 205 */
4664 			if ((mdanyrpcerror(ep)) &&
4665 			    (sd->sd_mn_mynode->nd_nodeid !=
4666 			    nd->nd_nodeid)) {
4667 				rval = 205;
4668 			} else {
4669 				/* Any other failure */
4670 				rval = -1;
4671 			}
4672 			goto out;
4673 		}
4674 		nd = nd->nd_next;
4675 	}
4676 
4677 	/*
4678 	 * Now, delete nodes that need to be deleted.
4679 	 */
4680 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4681 	    ep))  == NULL) {
4682 		if (! mdisok(ep)) {
4683 			rval = -1;
4684 			goto out;
4685 		}
4686 	}
4687 
4688 	/*
4689 	 * May be doing lots of RPC commands to the nodes, so lock the
4690 	 * ALIVE members of the set since most of the rpc.metad routines
4691 	 * require this for security reasons.
4692 	 */
4693 	nd = master_nodelist;
4694 	while (nd) {
4695 		/* Skip non-alive nodes and node without set */
4696 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4697 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4698 			nd = nd->nd_next;
4699 			continue;
4700 		}
4701 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
4702 			/* If RPC failure to another node return 205 */
4703 			if ((mdanyrpcerror(ep)) &&
4704 			    (sd->sd_mn_mynode->nd_nodeid !=
4705 			    nd->nd_nodeid)) {
4706 				rval = 205;
4707 			} else {
4708 				/* Any other failure */
4709 				rval = -1;
4710 			}
4711 			goto out;
4712 		}
4713 		set_locked = 1;
4714 		nd = nd->nd_next;
4715 	}
4716 
4717 	nd = master_nodelist;
4718 	while (nd) {
4719 		/* Skip non-alive nodes */
4720 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4721 			nd = nd->nd_next;
4722 			continue;
4723 		}
4724 		if (nd->nd_flags & MD_MN_NODE_DEL) {
4725 			num_alive_nodes_del++;
4726 			/*
4727 			 * Delete this node rec from all ALIVE nodes in diskset.
4728 			 */
4729 			nd2 = master_nodelist;
4730 			while (nd2) {
4731 				/* Skip non-alive nodes and node without set */
4732 				if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
4733 				    (nd2->nd_flags & MD_MN_NODE_NOSET)) {
4734 					nd2 = nd2->nd_next;
4735 					continue;
4736 				}
4737 
4738 				/* This is a node being deleted from set */
4739 				if (nd2->nd_nodeid == nd->nd_nodeid) {
4740 					/* Mark set record as DEL */
4741 					if (clnt_upd_sr_flags(nd->nd_nodename,
4742 					    sp, MD_SR_DEL, ep)) {
4743 						/* RPC failure to !my node */
4744 						if ((mdanyrpcerror(ep)) &&
4745 						    (sd->sd_mn_mynode->
4746 						    nd_nodeid
4747 						    != nd->nd_nodeid)) {
4748 							rval = 205;
4749 						} else {
4750 							/* Any other failure */
4751 							rval = -1;
4752 						}
4753 						goto out;
4754 					}
4755 					if (clnt_deldrvs(nd->nd_nodename, sp,
4756 					    dd, ep)) {
4757 						/* RPC failure to !my node */
4758 						if ((mdanyrpcerror(ep)) &&
4759 						    (sd->sd_mn_mynode->
4760 						    nd_nodeid
4761 						    != nd->nd_nodeid)) {
4762 							rval = 205;
4763 						} else {
4764 							/* Any other failure */
4765 							rval = -1;
4766 						}
4767 						goto out;
4768 					}
4769 					if (clnt_delset(nd->nd_nodename, sp,
4770 					    ep) == -1) {
4771 						/* RPC failure to !my node */
4772 						if ((mdanyrpcerror(ep)) &&
4773 						    (sd->sd_mn_mynode->
4774 						    nd_nodeid
4775 						    != nd->nd_nodeid)) {
4776 							rval = 205;
4777 						} else {
4778 							/* Any other failure */
4779 							rval = -1;
4780 						}
4781 						goto out;
4782 					}
4783 				} else {
4784 					/*
4785 					 * Delete host from sets on hosts
4786 					 * not being deleted.
4787 					 */
4788 					anode[0] = Strdup(nd->nd_nodename);
4789 					if (clnt_delhosts(nd2->nd_nodename, sp,
4790 					    1, anode, ep) == -1) {
4791 						Free(anode[0]);
4792 						/* RPC failure to !my node */
4793 						if ((mdanyrpcerror(ep)) &&
4794 						    (sd->sd_mn_mynode->
4795 						    nd_nodeid
4796 						    != nd2->nd_nodeid)) {
4797 							rval = 205;
4798 						} else {
4799 							/* Any other failure */
4800 							rval = -1;
4801 						}
4802 						goto out;
4803 					}
4804 
4805 					meta_mc_log(MC_LOG5,
4806 					    dgettext(TEXT_DOMAIN,
4807 					    "Deleted node %s (%d) on node %s "
4808 					    "from set %s: %s"),
4809 					    nd->nd_nodename, nd->nd_nodeid,
4810 					    nd2->nd_nodename,
4811 					    sp->setname,
4812 					    meta_print_hrtime(
4813 					    gethrtime() - start_time));
4814 
4815 					Free(anode[0]);
4816 				}
4817 				nd2 = nd2->nd_next;
4818 			}
4819 		}
4820 		nd = nd->nd_next;
4821 	}
4822 
4823 	nd = master_nodelist;
4824 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4825 	while (nd) {
4826 		/* Skip non-alive nodes and node without set */
4827 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4828 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4829 			nd = nd->nd_next;
4830 			continue;
4831 		}
4832 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
4833 			/* If RPC failure to another node return 205 */
4834 			if ((mdanyrpcerror(ep)) &&
4835 			    (sd->sd_mn_mynode->nd_nodeid !=
4836 			    nd->nd_nodeid)) {
4837 				rval = 205;
4838 			} else {
4839 				/* Any other failure */
4840 				rval = -1;
4841 			}
4842 			goto out;
4843 		}
4844 		nd = nd->nd_next;
4845 	}
4846 	cl_set_setkey(NULL);
4847 	set_locked = 0;
4848 
4849 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4850 	    "Nodelist syncronization complete for set %s: %s"),
4851 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4852 
4853 	metaflushsetname(sp);
4854 
4855 	/*
4856 	 * If all alive nodes have been deleted from set, just
4857 	 * return since nothing else can be done until non-alive
4858 	 * nodes (if there are any) rejoin the cluster.
4859 	 */
4860 	if (num_alive_nodes == num_alive_nodes_del) {
4861 		rval = 0;
4862 		goto out;
4863 	}
4864 
4865 	/*
4866 	 * Sync up drive records.
4867 	 *
4868 	 * If a node panic'd (or metaset command was killed) during the
4869 	 * addition or deletion of a drive to the diskset, the nodes
4870 	 * may have a different view of the drive list.  During cleanup
4871 	 * of the drive list during reconfig, a drive will be deleted
4872 	 * from the list if the master node sees that the drive has been
4873 	 * marked in the ADD state on any node or is marked in the DEL state
4874 	 * on all nodes.
4875 	 * This cleanup must occur even if all nodes in the cluster are
4876 	 * not part of the cluster so that all nodes have the same view
4877 	 * of the drivelist.
4878 	 * Then if the entire cluster goes down and comes back up, the
4879 	 * new master node could be a node that wasn't in the cluster when
4880 	 * the node was deleted.  This could lead to a situation where the
4881 	 * master node thinks that a drive is OK, but this drive isn't
4882 	 * known to the other nodes.
4883 	 * This situation can also occur during the addition of a drive
4884 	 * where a node has the drive marked OK, but the node executing the
4885 	 * metaset command encountered a failure before marking that drive OK
4886 	 * on the rest of the nodes.  If the node with the OK drive then
4887 	 * panics, then rest of the nodes will remove that drive marked ADD
4888 	 * and when the node with the OK drive rejoins the cluster, it will
4889 	 * have a drive marked OK that is unknown by the other nodes.
4890 	 *
4891 	 * There are 2 situations to consider:
4892 	 * A) Master knows about a drive that other nodes don't know about.
4893 	 * B) At least one slave node knows about a drive that the master
4894 	 *    node doesn't know about.
4895 	 *
4896 	 * To handle these situations the following steps are followed:
4897 	 * 1) Count number of drives known by this master node and the
4898 	 *    other slave nodes.
4899 	 *    If all nodes have the same number of drives and the master has
4900 	 *    all drives marked OK, then skip to step4.
4901 	 *
4902 	 * 2) If a node has less drives listed than the master, the master
4903 	 *    must get the drive descriptor list from that node so that
4904 	 *    master can determine which drive it needs to delete from that
4905 	 *    node.  Master must get the drive descriptor list since the
4906 	 *    drive record list does not contain the name of the drive, but
4907 	 *    only a key and the key can only be interpreted on that other
4908 	 *    node.
4909 	 *
4910 	 * 3) The master will then create the master drive list by doing:
4911 	 *	- Master starts with drive list known by master.
4912 	 *	- Any drive marked ADD will be removed from the list.
4913 	 *	- Any drive not known by another node (from step2) will be
4914 	 *	removed from the drive list.
4915 	 *	- If a drive is marked DEL on the master, the master must
4916 	 *	verify that the drive record is marked DEL on all nodes.
4917 	 *	If any node has the drive record marked OK, mark it OK
4918 	 *	on the master.  (The reason why is described below).
4919 	 *
4920 	 * 4) The master sends out the master drive list and the slave
4921 	 *    nodes will force their drive lists to match the master
4922 	 *    drive list by deleting drives, if necessary and by changing
4923 	 *    the drive record states from ADD->OK if master has drive
4924 	 *    marked OK and slave has drive marked ADD.
4925 	 *
4926 	 * Interesting scenarios:
4927 	 *
4928 	 * 1) System has 4 nodes with node 1 as the master.  Node 3 starts
4929 	 *    to delete a drive record (drive record on node 1 is marked DEL),
4930 	 *    but is stopped when node 3 panics.  Node 1 also panics.
4931 	 *    During reconfig cycle, node 2 is picked as master and the drive
4932 	 *    record is left alone since all nodes in the cluster have it
4933 	 *    marked OK.  User now sees drive as part of diskset.
4934 	 *    Now, entire cluster is rebooted and node 1 rejoins the cluster.
4935 	 *    Node 1 is picked as the master and node 1 has drive record
4936 	 *    marked DEL.  Node 1 contacts all other nodes in the cluster
4937 	 *    and since at least one node has the drive record marked OK,
4938 	 *    the master marks the drive record OK.
4939 	 *    User continues to see the drive as part of the diskset.
4940 	 */
4941 
4942 	/* Reget set descriptor since flushed above */
4943 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4944 		rval = -1;
4945 		goto out;
4946 	}
4947 
4948 	/* Has side effect of setting sd->sd_drvs to same as master_dd */
4949 	if ((master_dd = metaget_drivedesc_sideno(sp,
4950 	    sd->sd_mn_mynode->nd_nodeid,
4951 	    (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
4952 		/* No drives in list */
4953 		if (!mdisok(ep)) {
4954 			/*
4955 			 * Can't get drive list for this node, so
4956 			 * return -1 causing this node to be removed
4957 			 * from cluster config and fixed.
4958 			 */
4959 			rval = -1;
4960 			goto out;
4961 		}
4962 	}
4963 
4964 	/* Count the number of drives for all nodes */
4965 	mnsr_node = master_mnsr_node;
4966 	while (mnsr_node) {
4967 		dr_cnt = 0;
4968 		dr = mnsr_node->mmn_mnsr->sr_drivechain;
4969 		while (dr) {
4970 			dr_cnt++;
4971 			dr = dr->dr_next;
4972 		}
4973 		mnsr_node->mmn_numdrives = dr_cnt;
4974 		mnsr_node = mnsr_node->mmn_next;
4975 	}
4976 
4977 	/* Count the number of drives for the master; also check flags */
4978 	all_drives_ok = 1;
4979 	dd_cnt = 0;
4980 	dd = master_dd;
4981 	while (dd) {
4982 		dd_cnt++;
4983 		if (!(dd->dd_flags & MD_DR_OK))
4984 			all_drives_ok = 0;
4985 		dd = dd->dd_next;
4986 	}
4987 
4988 	/* If all drives are ok, do quick check against number of drives */
4989 	if (all_drives_ok) {
4990 		/* If all nodes have same number of drives, almost done */
4991 		mnsr_node = master_mnsr_node;
4992 		while (mnsr_node) {
4993 			if (mnsr_node->mmn_numdrives != dd_cnt)
4994 				break;
4995 			mnsr_node = mnsr_node->mmn_next;
4996 		}
4997 		/* All nodes have same number of drives, just send flags */
4998 		if (mnsr_node == NULL) {
4999 			goto send_drive_list;
5000 		}
5001 	}
5002 
5003 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5004 	    "Begin detailed drive synchronization for set %s: %s"),
5005 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5006 
5007 	/* Detailed check required  */
5008 	mnsr_node = master_mnsr_node;
5009 	while (mnsr_node) {
5010 		/* Does slave node have less drives than master? */
5011 		if (mnsr_node->mmn_numdrives < dd_cnt) {
5012 			/* Yes - must determine which drive is missing */
5013 			if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
5014 			    &other_dd, ep)) {
5015 				/* RPC failure to !my node */
5016 				if ((mdanyrpcerror(ep)) &&
5017 				    (strcmp(mynode(), mnsr_node->mmn_nodename)
5018 				    != 0)) {
5019 					rval = 205;
5020 				} else {
5021 					/* Any other failure */
5022 					rval = -1;
5023 				}
5024 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5025 				    "Master node %s unable to "
5026 				    "retrieve drive list from node %s"),
5027 				    mynode(), mnsr_node->mmn_nodename);
5028 				goto out;
5029 			}
5030 			mnsr_node->mmn_dd = other_dd;
5031 			dd = master_dd;
5032 			while (dd) {
5033 				if (!(dd->dd_flags & MD_DR_OK)) {
5034 					dd = dd->dd_next;
5035 					continue;
5036 				}
5037 				other_dd = mnsr_node->mmn_dd;
5038 				while (other_dd) {
5039 					/* Convert to devids, when available */
5040 					if (strcmp(other_dd->dd_dnp->cname,
5041 					    dd->dd_dnp->cname) == 0) {
5042 						break;
5043 					}
5044 					other_dd = other_dd->dd_next;
5045 				}
5046 				/*
5047 				 * dd not found on slave so mark it
5048 				 * ADD for later deletion (drives in ADD
5049 				 * state are deleted later in this routine).
5050 				 */
5051 				if (other_dd == NULL) {
5052 					dd->dd_flags = MD_DR_ADD;
5053 				}
5054 				dd = dd->dd_next;
5055 			}
5056 
5057 		}
5058 		mnsr_node = mnsr_node->mmn_next;
5059 	}
5060 
5061 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5062 	    "Drive check completed for set %s: %s"),
5063 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5064 
5065 	dd = master_dd;
5066 	dd_prev = 0;
5067 	while (dd) {
5068 		/* Remove any ADD drives from list */
5069 		if (dd->dd_flags & MD_DR_ADD) {
5070 			if (dd_prev) {
5071 				dd_prev->dd_next = dd->dd_next;
5072 				dd->dd_next = NULL;
5073 				metafreedrivedesc(&dd);
5074 				dd = dd_prev->dd_next;
5075 			} else {
5076 				/*
5077 				 * If removing drive descriptor from head
5078 				 * of linked list, also change sd->sd_drvs.
5079 				 */
5080 				master_dd = sd->sd_drvs = dd->dd_next;
5081 				dd->dd_next = NULL;
5082 				metafreedrivedesc(&dd);
5083 				dd = master_dd;
5084 			}
5085 			/* dd setup in if/else above */
5086 			continue;
5087 		}
5088 		/*
5089 		 * If drive is marked DEL, check all other nodes.
5090 		 * If drive on another node is marked OK, mark drive OK
5091 		 * in master list.  If drive is marked DEL or doesn't exist
5092 		 * on all nodes, remove drive from list.
5093 		 */
5094 		if (dd->dd_flags & MD_DR_DEL) {
5095 			mnsr_node = master_mnsr_node;
5096 			while (mnsr_node) {
5097 				if (mnsr_node->mmn_dd == NULL) {
5098 				    if (clnt_getdrivedesc(
5099 					mnsr_node->mmn_nodename, sp,
5100 					&other_dd, ep)) {
5101 					    /* RPC failure to !my node */
5102 					    if ((mdanyrpcerror(ep)) &&
5103 						(strcmp(mynode(),
5104 						mnsr_node->mmn_nodename)
5105 						!= 0)) {
5106 						    rval = 205;
5107 					    } else {
5108 						    /* Any other failure */
5109 						    rval = -1;
5110 					    }
5111 					    mde_perror(ep, dgettext(TEXT_DOMAIN,
5112 						"Master node %s unable "
5113 						"to retrieve drive list from "
5114 						"node %s"), mynode(),
5115 						mnsr_node->mmn_nodename);
5116 					    goto out;
5117 				    }
5118 				    mnsr_node->mmn_dd = other_dd;
5119 				}
5120 				other_dd = mnsr_node->mmn_dd;
5121 				while (other_dd) {
5122 					/* Found drive (OK) from other node */
5123 					if (strcmp(dd->dd_dnp->cname,
5124 					    other_dd->dd_dnp->cname)
5125 					    == 0) {
5126 						/* Drive marked OK */
5127 						if (other_dd->dd_flags &
5128 						    MD_DR_OK) {
5129 						    dd->dd_flags = MD_DR_OK;
5130 						}
5131 						break;
5132 					}
5133 					other_dd = other_dd->dd_next;
5134 				}
5135 				if (dd->dd_flags == MD_DR_OK)
5136 					break;
5137 
5138 				mnsr_node = mnsr_node->mmn_next;
5139 			}
5140 			/*
5141 			 * If no node had this drive marked OK, delete it.
5142 			 */
5143 			if (dd->dd_flags & MD_DR_DEL) {
5144 				if (dd_prev) {
5145 					dd_prev->dd_next = dd->dd_next;
5146 					dd->dd_next = NULL;
5147 					metafreedrivedesc(&dd);
5148 					dd = dd_prev->dd_next;
5149 				} else {
5150 					/*
5151 					 * If removing drive descriptor from
5152 					 * head of linked list, also change
5153 					 * sd->sd_drvs.
5154 					 */
5155 					master_dd = sd->sd_drvs = dd->dd_next;
5156 					dd->dd_next = NULL;
5157 					metafreedrivedesc(&dd);
5158 					dd = master_dd;
5159 				}
5160 				/* dd setup in if/else above */
5161 				continue;
5162 			}
5163 		}
5164 		dd_prev = dd;
5165 		dd = dd->dd_next;
5166 	}
5167 
5168 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5169 	    "Setting drive states completed for set %s: %s"),
5170 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5171 
5172 send_drive_list:
5173 	/*
5174 	 * Set genid on all drives to be the highest value seen.
5175 	 */
5176 	dd = master_dd;
5177 	while (dd) {
5178 		dd->dd_genid = max_genid;
5179 		dd = dd->dd_next;
5180 	}
5181 	/*
5182 	 * Send updated drive list to all alive nodes.
5183 	 * Will also set genid on set and node records to have same
5184 	 * as the drive records.
5185 	 */
5186 	nd = sd->sd_nodelist;
5187 	while (nd) {
5188 		/* Skip non-alive nodes */
5189 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5190 			nd = nd->nd_next;
5191 			continue;
5192 		}
5193 		if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
5194 			/* RPC failure to another node */
5195 			if ((mdanyrpcerror(ep)) &&
5196 			    (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
5197 				rval = 205;
5198 			} else {
5199 				/* Any other failure */
5200 				rval = -1;
5201 			}
5202 			goto out;
5203 		}
5204 		nd = nd->nd_next;
5205 	}
5206 
5207 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5208 	    "Sent drive list to all nodes for set %s: %s"),
5209 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5210 
5211 	/*
5212 	 * If no drive records left in set and nodes had been joined,
5213 	 * withdraw the nodes.  Always reset the master and mark
5214 	 * all nodes as withdrawn on all nodes.
5215 	 */
5216 	if (master_dd == NULL) {
5217 		/* Reset new master flag since no longer master */
5218 		(void) memset(&sf, 0, sizeof (sf));
5219 		sf.sf_setno = sp->setno;
5220 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5221 		sf.sf_flags = MDDB_NM_RESET;
5222 		/* Use magic to help protect ioctl against attack. */
5223 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5224 		/* Ignore failure, failure to reset flag isn't catastrophic */
5225 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5226 		    &sf.sf_mde, NULL);
5227 
5228 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5229 		    "Reset new master flag for " "set %s: %s"),
5230 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5231 
5232 		nd = sd->sd_nodelist;
5233 		while (nd) {
5234 			/* Skip non-alive nodes  */
5235 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5236 				nd = nd->nd_next;
5237 				continue;
5238 			}
5239 
5240 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
5241 				/* RPC failure to another node */
5242 				if ((mdanyrpcerror(ep)) &&
5243 				    (sd->sd_mn_mynode->nd_nodeid !=
5244 				    nd->nd_nodeid)) {
5245 					rval = 205;
5246 				} else {
5247 					/* Any other failure */
5248 					rval = -1;
5249 				}
5250 				goto out;
5251 			}
5252 			set_locked = 1;
5253 
5254 			/* Withdraw node from set if owner */
5255 			if ((nd->nd_flags & MD_MN_NODE_OWN) &&
5256 			    (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
5257 				/* RPC failure to another node */
5258 				if ((mdanyrpcerror(ep)) &&
5259 				    (sd->sd_mn_mynode->nd_nodeid !=
5260 				    nd->nd_nodeid)) {
5261 					rval = 205;
5262 				} else {
5263 					/* Any other failure */
5264 					rval = -1;
5265 				}
5266 				goto out;
5267 			}
5268 
5269 			/* Mark all nodes as withdrawn on this node */
5270 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5271 			    sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
5272 				/* RPC failure to another node */
5273 				if ((mdanyrpcerror(ep)) &&
5274 				    (sd->sd_mn_mynode->nd_nodeid !=
5275 				    nd->nd_nodeid)) {
5276 					rval = 205;
5277 				} else {
5278 					/* Any other failure */
5279 					rval = -1;
5280 				}
5281 				goto out;
5282 			}
5283 
5284 			/* Resets master to no-master on this node */
5285 			if (clnt_mnsetmaster(nd->nd_nodename, sp,
5286 			    "", MD_MN_INVALID_NID, ep)) {
5287 				/* RPC failure to another node */
5288 				if ((mdanyrpcerror(ep)) &&
5289 				    (sd->sd_mn_mynode->nd_nodeid !=
5290 				    nd->nd_nodeid)) {
5291 					rval = 205;
5292 				} else {
5293 					/* Any other failure */
5294 					rval = -1;
5295 				}
5296 				goto out;
5297 			}
5298 
5299 			cl_sk = cl_get_setkey(sp->setno, sp->setname);
5300 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
5301 				/* RPC failure to another node */
5302 				if ((mdanyrpcerror(ep)) &&
5303 				    (sd->sd_mn_mynode->nd_nodeid !=
5304 				    nd->nd_nodeid)) {
5305 					rval = 205;
5306 				} else {
5307 					/* Any other failure */
5308 					rval = -1;
5309 				}
5310 				goto out;
5311 			}
5312 			set_locked = 0;
5313 			nd = nd->nd_next;
5314 		}
5315 	}
5316 
5317 out:
5318 	/*
5319 	 * If got here and set is still locked, then an error has
5320 	 * occurred and master_nodelist is still valid.
5321 	 * If error is not an RPC error, then unlock.
5322 	 * If error is an RPC error, skip unlocks since this could cause
5323 	 * yet another RPC timeout if a node has failed.
5324 	 * Ignore failures in unlock since unlock is just trying to
5325 	 * clean things up.
5326 	 */
5327 	if ((set_locked) && !(mdanyrpcerror(ep))) {
5328 		nd = master_nodelist;
5329 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
5330 		while (nd) {
5331 			/* Skip non-alive nodes */
5332 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5333 				nd = nd->nd_next;
5334 				continue;
5335 			}
5336 			/*
5337 			 * If clnt_unlock fails, just break out since next
5338 			 * reconfig cycle will reset the locks anyway.
5339 			 */
5340 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
5341 				break;
5342 			}
5343 			nd = nd->nd_next;
5344 		}
5345 		cl_set_setkey(NULL);
5346 	}
5347 	/* Free master_mnsr and drive descs */
5348 	mnsr_node = master_mnsr_node;
5349 	while (mnsr_node) {
5350 		master_mnsr_node = mnsr_node->mmn_next;
5351 		free_sr((md_set_record *)mnsr_node->mmn_mnsr);
5352 		free_rem_dd(mnsr_node->mmn_dd);
5353 		Free(mnsr_node);
5354 		mnsr_node = master_mnsr_node;
5355 	}
5356 
5357 	/* Frees sd->sd_drvs (which is also master_dd) */
5358 	metaflushsetname(sp);
5359 	return (rval);
5360 }
5361 
5362 /*
5363  * meta_mnsync_diskset_mddbs
5364  * Calling node is guaranteed to be an owner node.
5365  * Calling node is the master node.
5366  *
5367  * Master node verifies that ondisk mddb format matches its incore format.
5368  * If no nodes are joined to set, remove the change log entries.
5369  * If a node is joined to set, play the change log.
5370  *
5371  * Returns	 0 - Success
5372  *		 1 - Master unable to join to set.
5373  *		205 - Failure during RPC to another node
5374  *		-1 - Any other failure and ep is filled in.
5375  *			-1 return will eventually cause node to panic
5376  *			in a SunCluster environment.
5377  */
5378 int
5379 meta_mnsync_diskset_mddbs(
5380 	mdsetname_t	*sp,
5381 	md_error_t	*ep
5382 )
5383 {
5384 	md_set_desc		*sd;
5385 	mddb_config_t		c;
5386 	md_mn_msgclass_t	class;
5387 	mddb_setflags_config_t	sf;
5388 	md_mnnode_desc		*nd, *nd2;
5389 	md_error_t		xep = mdnullerror;
5390 	int			stale_set = 0;
5391 
5392 	/* If setname is there, set desc should exist. */
5393 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5394 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5395 		    "Unable to get set %s desc information"), sp->setname);
5396 		return (-1);
5397 	}
5398 
5399 	/* Are there drives in the set? */
5400 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5401 	    ep) == NULL) {
5402 		if (! mdisok(ep)) {
5403 			return (-1);
5404 		}
5405 		/* No drives in set -- nothing to sync up */
5406 		return (0);
5407 	}
5408 
5409 	/*
5410 	 * Is master node (which is this node) joined to set?
5411 	 * If master node isn't joined (which means that no nodes
5412 	 * are joined to diskset), remove the change log entries
5413 	 * since no need to replay them - all nodes will have same
5414 	 * view of mddbs since all nodes are reading in the mddbs
5415 	 * from disk.
5416 	 * There is also no need to sync up the master and ondisk mddbs
5417 	 * since master has no incore knowledge.
5418 	 * Need to join master to set in order to flush the change
5419 	 * log entries. Don't need to block I/O during join of master
5420 	 * to set since no other nodes are joined to set and so no I/O
5421 	 * can be occurring.
5422 	 */
5423 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
5424 		/* Join master to set */
5425 		if (clnt_joinset(mynode(), sp,
5426 		    MNSET_IN_RECONFIG, ep)) {
5427 			if (mdismddberror(ep, MDE_DB_STALE)) {
5428 				/*
5429 				 * If STALE, print message and continue on.
5430 				 * Don't do any writes or reads to mddbs
5431 				 * so don't clear change log.
5432 				 */
5433 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5434 				    "Join of master node to STALE set %s"),
5435 				    sp->setname);
5436 				stale_set = 1;
5437 				mdclrerror(ep);
5438 			} else if (mdismddberror(ep, MDE_DB_ACCOK)) {
5439 				/* ACCOK means mediator provided extra vote */
5440 				mdclrerror(ep);
5441 			} else {
5442 				/*
5443 				 * If master is unable to join set, print an
5444 				 * error message.  Don't return failure or node
5445 				 * will panic during cluster reconfig cycle.
5446 				 * Also, withdraw node from set in order to
5447 				 * cleanup from failed join attempt.
5448 				 */
5449 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5450 				    "Join of master node in set %s failed"),
5451 				    sp->setname);
5452 				if (clnt_withdrawset(mynode(), sp, &xep))
5453 					mdclrerror(&xep);
5454 				return (1);
5455 			}
5456 		}
5457 		/*
5458 		 * Master node successfully joined.
5459 		 * Set local copy of flags to OWN and
5460 		 * send owner flag to rpc.metad. If not stale,
5461 		 * flush the change log.
5462 		 */
5463 		sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
5464 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
5465 		    MNSET_IN_RECONFIG, ep)) {
5466 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5467 			    "Flag update of master node join in set %s failed"),
5468 			    sp->setname);
5469 			return (-1);
5470 		}
5471 
5472 		if (!stale_set) {
5473 			if (mdmn_reset_changelog(sp, ep,
5474 			    MDMN_CLF_RESETLOG) != 0) {
5475 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5476 				    "Unable to reset changelog."));
5477 				return (-1);
5478 			}
5479 			meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5480 			    "Removed changelog entries for set %s: %s"),
5481 			    sp->setname,
5482 			    meta_print_hrtime(gethrtime() - start_time));
5483 		}
5484 		/* Reset new master flag before return */
5485 		(void) memset(&sf, 0, sizeof (sf));
5486 		sf.sf_setno = sp->setno;
5487 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5488 		sf.sf_flags = MDDB_NM_RESET;
5489 		/* Use magic to help protect ioctl against attack. */
5490 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5491 		/* Ignore failure, failure to reset flag isn't catastrophic */
5492 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5493 		    &sf.sf_mde, NULL);
5494 
5495 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5496 		    "Reset new master flag for set %s: %s"),
5497 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5498 
5499 		return (0);
5500 	}
5501 
5502 	/*
5503 	 * Is master already joined to STALE set (< 50% mddbs avail)?
5504 	 * If so, can make no config changes to mddbs so don't check or play
5505 	 * changelog and don't sync master node to ondisk mddbs.
5506 	 * To get out of the stale state all nodes must be withdrawn
5507 	 * from set.  Then as nodes are re-joined, all nodes will
5508 	 * have same view of mddbs since all nodes are reading the
5509 	 * mddbs from disk.
5510 	 */
5511 	(void) memset(&c, 0, sizeof (c));
5512 	c.c_id = 0;
5513 	c.c_setno = sp->setno;
5514 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
5515 		(void) mdstealerror(ep, &c.c_mde);
5516 		return (-1);
5517 	}
5518 	if (c.c_flags & MDDB_C_STALE) {
5519 		return (0);
5520 	}
5521 
5522 	/*
5523 	 * If this node is NOT a newly chosen master, then there's
5524 	 * nothing else to do since the change log should be empty and
5525 	 * the ondisk and incore mddbs are already consistent.
5526 	 *
5527 	 * A newly chosen master is a node that was not the master
5528 	 * at the beginning of the reconfig cycle.  If a node is a new
5529 	 * master, then the new master state is reset after the ondisk
5530 	 * and incore mddbs are consistent and the change log has
5531 	 * been replayed.
5532 	 */
5533 	(void) memset(&sf, 0, sizeof (sf));
5534 	sf.sf_setno = sp->setno;
5535 	sf.sf_flags = MDDB_NM_GET;
5536 	/* Use magic to help protect ioctl against attack. */
5537 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5538 	if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
5539 	    ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
5540 		return (0);
5541 	}
5542 
5543 	/*
5544 	 * Now, sync up incore master view to ondisk mddbs.
5545 	 * This is needed in the case where a master node
5546 	 * had made a change to the mddb, but this change
5547 	 * may not have been relayed to the slaves yet.
5548 	 * So, the new master needs to verify that the ondisk
5549 	 * mddbs match what the new master has incore -
5550 	 * if different, new master rewrites all of the mddbs.
5551 	 * Then the new master will replay the changelog and the
5552 	 * new master will then execute what the old master had
5553 	 * done.
5554 	 *
5555 	 * Block all I/Os to disks in this diskset on all nodes in
5556 	 * the diskset.  This will allow the rewriting of the mddbs
5557 	 * (if needed), to proceed in a timely manner.
5558 	 *
5559 	 * If block of I/Os fail, return a -1.
5560 	 */
5561 
5562 	nd = sd->sd_nodelist;
5563 	while (nd) {
5564 		/* Skip non-alive and non-owner nodes  */
5565 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5566 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5567 			nd = nd->nd_next;
5568 			continue;
5569 		}
5570 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5571 		    MN_SUSP_IO, ep)) {
5572 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5573 			    "Unable to suspend I/O on node %s in set %s"),
5574 			    nd->nd_nodename, sp->setname);
5575 
5576 			/*
5577 			 * Resume all other nodes that had been suspended.
5578 			 * (Reconfig return step also resumes I/Os
5579 			 * for all sets.)
5580 			 */
5581 			nd2 = sd->sd_nodelist;
5582 			while (nd2) {
5583 				/* Stop when reaching failed node */
5584 				if (nd2->nd_nodeid == nd->nd_nodeid)
5585 					break;
5586 				/* Skip non-alive and non-owner nodes  */
5587 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5588 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5589 					nd2 = nd2->nd_next;
5590 					continue;
5591 				}
5592 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5593 					sp->setno, MN_RES_IO, &xep));
5594 				nd2 = nd2->nd_next;
5595 			}
5596 
5597 			/*
5598 			 * If an RPC failure on another node, return a 205.
5599 			 * Otherwise, exit with failure.
5600 			 */
5601 			if ((mdanyrpcerror(ep)) &&
5602 			    (sd->sd_mn_mynode->nd_nodeid !=
5603 			    nd->nd_nodeid)) {
5604 				return (205);
5605 			} else {
5606 				return (-1);
5607 			}
5608 
5609 		}
5610 		nd = nd->nd_next;
5611 	}
5612 
5613 	(void) memset(&c, 0, sizeof (c));
5614 	c.c_id = 0;
5615 	c.c_setno = sp->setno;
5616 	/* Master can't sync up to ondisk mddbs?  Kick it out of cluster */
5617 	if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
5618 		return (-1);
5619 
5620 	/*
5621 	 * Resume I/Os that were suspended above.
5622 	 */
5623 	nd = sd->sd_nodelist;
5624 	while (nd) {
5625 		/* Skip non-alive and non-owner nodes  */
5626 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5627 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5628 			nd = nd->nd_next;
5629 			continue;
5630 		}
5631 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5632 		    MN_RES_IO, ep)) {
5633 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5634 			    "Unable to resume I/O on node %s in set %s"),
5635 			    nd->nd_nodename, sp->setname);
5636 
5637 			/*
5638 			 * If an RPC failure then don't do any
5639 			 * more RPC calls, since one timeout is enough
5640 			 * to endure.  If RPC failure to another node, return
5641 			 * 205.  If RPC failure to my node, return -1.
5642 			 * If not an RPC failure, continue resuming the
5643 			 * rest of the nodes and then return -1.
5644 			 */
5645 			if (mdanyrpcerror(ep)) {
5646 				if (sd->sd_mn_mynode->nd_nodeid ==
5647 				    nd->nd_nodeid) {
5648 					return (-1);
5649 				} else {
5650 					return (205);
5651 				}
5652 			}
5653 
5654 			/*
5655 			 * If not an RPC error, continue resuming rest of
5656 			 * nodes, ignoring any failures except for an
5657 			 * RPC failure which constitutes an immediate exit.
5658 			 * Start in middle of list with failing node.
5659 			 */
5660 			nd2 = nd->nd_next;
5661 			while (nd2) {
5662 				/* Skip non-alive and non-owner nodes  */
5663 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5664 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5665 					nd2 = nd2->nd_next;
5666 					continue;
5667 				}
5668 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5669 					sp->setno, MN_RES_IO, &xep));
5670 				if (mdanyrpcerror(&xep)) {
5671 					return (-1);
5672 				}
5673 				nd2 = nd2->nd_next;
5674 			}
5675 		}
5676 		nd = nd->nd_next;
5677 	}
5678 
5679 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
5680 	    "checking/writing the mddb for set %s: %s"), sp->setname,
5681 	    meta_print_hrtime(gethrtime() - start_time));
5682 
5683 	/*
5684 	 * Send (aka replay) all messages we find in the changelog.
5685 	 * Flag the messages with
5686 	 *   MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
5687 	 *   MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
5688 	 */
5689 	for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
5690 		mdmn_changelog_record_t	*lr;
5691 		md_error_t	xep = mdnullerror;
5692 		md_mn_result_t	*resultp = NULL;
5693 		int		ret;
5694 
5695 		lr = mdmn_get_changelogrec(sp->setno, class);
5696 		if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
5697 			/* no entry for this class */
5698 			continue;
5699 		}
5700 
5701 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5702 		    "replaying message ID=(%d, 0x%llx-%d)\n"),
5703 		    MSGID_ELEMS(lr->lr_msg.msg_msgid));
5704 
5705 		ret = mdmn_send_message_with_msgid(
5706 			lr->lr_msg.msg_setno,
5707 			lr->lr_msg.msg_type,
5708 			lr->lr_msg.msg_flags |  MD_MSGF_REPLAY_MSG |
5709 						MD_MSGF_OVERRIDE_SUSPEND,
5710 			lr->lr_msg.msg_event_data,
5711 			lr->lr_msg.msg_event_size,
5712 			&resultp,
5713 			&lr->lr_msg.msg_msgid,
5714 			&xep);
5715 
5716 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5717 		    "mdmn_send_message returned %d\n"), ret);
5718 
5719 		if (resultp)
5720 			free_result(resultp);
5721 	}
5722 
5723 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5724 	    "Playing changelog completed for set %s: %s"),
5725 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5726 
5727 	/*
5728 	 * Now that new master has ondisk and incore mddbs in sync, reset
5729 	 * this node's new master kernel flag (for this set).  If this node
5730 	 * re-enters another reconfig cycle before the completion of this
5731 	 * reconfig cycle, this master node won't need to check if the ondisk
5732 	 * and incore mddbs are in sync since this node won't be considered
5733 	 * a new master (since this flag is being reset here in the middle of
5734 	 * step2).  This will save time during any subsequent reconfig
5735 	 * cycles as long as this node continues to be master.
5736 	 */
5737 	(void) memset(&sf, 0, sizeof (sf));
5738 	sf.sf_setno = sp->setno;
5739 	sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5740 	sf.sf_flags = MDDB_NM_RESET;
5741 	/* Use magic to help protect ioctl against attack. */
5742 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5743 	/* Ignore failure, since failure to reset flag isn't catastrophic */
5744 	(void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
5745 
5746 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5747 	    "Reset new master flag for set %s: %s"),
5748 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5749 
5750 	return (0);
5751 }
5752 
5753 /*
5754  * meta_mnjoin_all will join all starting nodes in the diskset.
5755  * A starting node is considered to be any node that is not
5756  * an owner of the set but is a member of the cluster.
5757  * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
5758  *
5759  * Caller is the Master node.
5760  *
5761  * Returns	 0 - Success
5762  *		205 - Failure during RPC to another node
5763  *		-1 - Any other failure and ep is filled in.
5764  */
5765 int
5766 meta_mnjoin_all(
5767 	mdsetname_t	*sp,
5768 	md_error_t	*ep
5769 )
5770 {
5771 	md_set_desc		*sd;
5772 	md_mnnode_desc		*nd, *nd2;
5773 	int			rval = 0;
5774 	int			stale_flag = 0;
5775 	mddb_config_t		c;
5776 	int			susp_res_flag = 0;
5777 	md_error_t		xep = mdnullerror;
5778 
5779 	/* If setname is there, set desc should exist. */
5780 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5781 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5782 		    "Unable to get set %s desc information"), sp->setname);
5783 		return (-1);
5784 	}
5785 
5786 	/* Are there drives in the set? */
5787 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5788 	    ep) == NULL) {
5789 		if (! mdisok(ep)) {
5790 			return (-1);
5791 		}
5792 		/* No drives in set -- nothing to join */
5793 		return (0);
5794 	}
5795 
5796 	/*
5797 	 * Is set currently stale?
5798 	 */
5799 	(void) memset(&c, 0, sizeof (c));
5800 	c.c_id = 0;
5801 	c.c_setno = sp->setno;
5802 	/* Ignore failure since master node may not be joined yet */
5803 	(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
5804 	if (c.c_flags & MDDB_C_STALE) {
5805 		stale_flag = MNSET_IS_STALE;
5806 	}
5807 
5808 	/*
5809 	 * If any nodes are going to be joined to diskset, then
5810 	 * suspend I/O to all disks in diskset so that nodes can join
5811 	 * (read in mddbs) in a reasonable amount of time even under
5812 	 * high I/O load.  Don't need to do this if set is STALE since
5813 	 * no I/O can be occurring to a STALE set.
5814 	 */
5815 	if (stale_flag != MNSET_IS_STALE) {
5816 		nd = sd->sd_nodelist;
5817 		while (nd) {
5818 			/* Found a node that will be joined to diskset */
5819 			if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5820 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5821 				/* Set flag that diskset should be suspended */
5822 				susp_res_flag = 1;
5823 				break;
5824 			}
5825 			nd = nd->nd_next;
5826 		}
5827 	}
5828 
5829 	if (susp_res_flag) {
5830 		/*
5831 		 * Block all I/Os to disks in this diskset on all joined
5832 		 * nodes in the diskset.
5833 		 * If block of I/Os fails due to an RPC failure on another
5834 		 * node, return 205; otherwise, return -1.
5835 		 */
5836 		nd = sd->sd_nodelist;
5837 		while (nd) {
5838 			/* Skip non-alive and non-owner nodes  */
5839 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5840 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5841 				nd = nd->nd_next;
5842 				continue;
5843 			}
5844 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5845 			    MN_SUSP_IO, ep)) {
5846 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5847 				    "Unable to suspend I/O on node %s"
5848 				    " in set %s"), nd->nd_nodename,
5849 				    sp->setname);
5850 				/*
5851 				 * Resume other nodes that had been suspended.
5852 				 * (Reconfig return step also resumes I/Os
5853 				 * for all sets.)
5854 				 */
5855 				nd2 = sd->sd_nodelist;
5856 				while (nd2) {
5857 					/* Stop when reaching failed node */
5858 					if (nd2->nd_nodeid == nd->nd_nodeid)
5859 						break;
5860 					/* Skip non-alive/non-owner nodes  */
5861 					if ((!(nd2->nd_flags &
5862 					    MD_MN_NODE_ALIVE)) ||
5863 					    (!(nd2->nd_flags &
5864 					    MD_MN_NODE_OWN))) {
5865 						nd2 = nd2->nd_next;
5866 						continue;
5867 					}
5868 					(void) (clnt_mn_susp_res_io(
5869 					    nd2->nd_nodename, sp->setno,
5870 					    MN_RES_IO, &xep));
5871 					nd2 = nd2->nd_next;
5872 				}
5873 
5874 				/*
5875 				 * If the suspend failed due to an
5876 				 * RPC failure on another node, return
5877 				 * a 205.
5878 				 * Otherwise, exit with failure.
5879 				 * The return reconfig step will resume
5880 				 * I/Os for all disksets.
5881 				 */
5882 				if ((mdanyrpcerror(ep)) &&
5883 				    (sd->sd_mn_mynode->nd_nodeid !=
5884 				    nd->nd_nodeid)) {
5885 					return (205);
5886 				} else {
5887 					return (-1);
5888 				}
5889 			}
5890 			nd = nd->nd_next;
5891 		}
5892 	}
5893 
5894 	nd = sd->sd_nodelist;
5895 	while (nd) {
5896 		/*
5897 		 * If a node is in the membership list but isn't joined
5898 		 * to the set, try to join the node.
5899 		 */
5900 		if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5901 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5902 			if (clnt_joinset(nd->nd_nodename, sp,
5903 			    (MNSET_IN_RECONFIG | stale_flag), ep)) {
5904 				/*
5905 				 * If RPC failure to another node
5906 				 * then exit without attempting anything else.
5907 				 * (Reconfig return step will resume I/Os
5908 				 * for all sets.)
5909 				 */
5910 				if (mdanyrpcerror(ep)) {
5911 					mde_perror(ep, "");
5912 					return (205);
5913 				}
5914 				/*
5915 				 * STALE and ACCOK failures aren't true
5916 				 * failures.  STALE means that <50% mddbs
5917 				 * are available. ACCOK means that the
5918 				 * mediator provided the extra vote.
5919 				 * If a true failure, then print messasge
5920 				 * and withdraw node from set in order to
5921 				 * cleanup from failed join attempt.
5922 				 */
5923 				if ((!mdismddberror(ep, MDE_DB_STALE)) &&
5924 				    (!mdismddberror(ep, MDE_DB_ACCOK))) {
5925 					mde_perror(ep,
5926 					    "WARNING: Unable to join node %s "
5927 					    "to set %s", nd->nd_nodename,
5928 					    sp->setname);
5929 					mdclrerror(ep);
5930 					if (clnt_withdrawset(nd->nd_nodename,
5931 					    sp, &xep))
5932 						mdclrerror(&xep);
5933 					nd = nd->nd_next;
5934 					continue;
5935 				}
5936 			}
5937 			/* Set owner flag even if STALE or ACCOK */
5938 			nd->nd_flags |= MD_MN_NODE_OWN;
5939 		}
5940 		nd = nd->nd_next;
5941 	}
5942 	/*
5943 	 * Resume I/Os if suspended above.
5944 	 */
5945 	if (susp_res_flag) {
5946 		nd = sd->sd_nodelist;
5947 		while (nd) {
5948 			/*
5949 			 * Skip non-alive and non-owner nodes
5950 			 * (this list doesn't include any of
5951 			 * the nodes that were joined).
5952 			 */
5953 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5954 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5955 				nd = nd->nd_next;
5956 				continue;
5957 			}
5958 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5959 			    MN_RES_IO, ep)) {
5960 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5961 				    "Unable to resume I/O on node %s"
5962 				    " in set %s"), nd->nd_nodename,
5963 				    sp->setname);
5964 
5965 				/*
5966 				 * If an RPC failure then don't do any
5967 				 * more RPC calls, since one timeout is enough
5968 				 * to endure.  If RPC failure to another node,
5969 				 * return 205.  If RPC failure to my node,
5970 				 * return -1.
5971 				 * (Reconfig return step will resume I/Os
5972 				 * for all sets.)
5973 				 * If not an RPC failure, continue resuming the
5974 				 * rest of the nodes and then return -1.
5975 				 */
5976 				if (mdanyrpcerror(ep)) {
5977 					if (sd->sd_mn_mynode->nd_nodeid ==
5978 					    nd->nd_nodeid) {
5979 						return (-1);
5980 					} else {
5981 						return (205);
5982 					}
5983 				}
5984 
5985 				/*
5986 				 * If not an RPC error, continue resuming rest
5987 				 * of nodes, ignoring any failures except for
5988 				 * an RPC failure which constitutes an
5989 				 * immediate exit.
5990 				 * Start in middle of list with failing node.
5991 				 */
5992 				nd2 = nd->nd_next;
5993 				while (nd2) {
5994 					/* Skip non-owner nodes  */
5995 					if ((!(nd2->nd_flags &
5996 					    MD_MN_NODE_ALIVE)) ||
5997 					    (!(nd2->nd_flags &
5998 					    MD_MN_NODE_OWN))) {
5999 						nd2 = nd2->nd_next;
6000 						continue;
6001 					}
6002 					(void) (clnt_mn_susp_res_io(
6003 					    nd2->nd_nodename, sp->setno,
6004 					    MN_RES_IO, &xep));
6005 					if (mdanyrpcerror(&xep)) {
6006 						return (-1);
6007 					}
6008 					nd2 = nd2->nd_next;
6009 				}
6010 			}
6011 			nd = nd->nd_next;
6012 		}
6013 	}
6014 
6015 	nd = sd->sd_nodelist;
6016 	while (nd) {
6017 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
6018 			nd = nd->nd_next;
6019 			continue;
6020 		}
6021 		/*
6022 		 * If 1 node fails - go ahead and update the rest except
6023 		 * in the case of an RPC failure, fail immediately.
6024 		 */
6025 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
6026 		    sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
6027 			/* RPC failure to another node */
6028 			if (mdanyrpcerror(ep)) {
6029 				return (205);
6030 			}
6031 			nd = nd->nd_next;
6032 			rval = -1;
6033 			continue;
6034 		}
6035 		nd = nd->nd_next;
6036 	}
6037 
6038 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
6039 	    "Join of all nodes completed for set %s: %s"),
6040 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
6041 
6042 	return (rval);
6043 }
6044