xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_set.c (revision 16ade92d9ce9c9ab33a25f7a2fdd00b581b6efda)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Just in case we're not in a build environment, make sure that
31  * TEXT_DOMAIN gets set to something.
32  */
33 #if !defined(TEXT_DOMAIN)
34 #define	TEXT_DOMAIN "SYS_TEST"
35 #endif
36 
37 /*
38  * Metadevice diskset interfaces
39  */
40 
41 #include "meta_set_prv.h"
42 #include <meta.h>
43 #include <metad.h>
44 #include <mdmn_changelog.h>
45 #include <sys/lvm/md_crc.h>
46 #include <sys/utsname.h>
47 #include <sdssc.h>
48 
49 #include <sys/sysevent/eventdefs.h>
50 #include <sys/sysevent/svm.h>
51 extern	char	*blkname(char *);
52 
53 static md_drive_desc *
54 dr2drivedesc(
55 	mdsetname_t	*sp,
56 	side_t		sideno,
57 	int		flags,
58 	md_error_t	*ep
59 )
60 {
61 	md_set_record	*sr;
62 	md_drive_record	*dr;
63 	mddrivename_t	*dnp;
64 	md_drive_desc	*dd_head = NULL;
65 	md_set_desc	*sd;
66 
67 	if (flags & MD_BYPASS_DAEMON) {
68 		if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
69 			return (NULL);
70 		sd = metaget_setdesc(sp, ep);
71 		sideno = getnodeside(mynode(), sd);
72 		sp = metafakesetname(sp->setno, sr->sr_setname);
73 	} else {
74 		if ((sr = getsetbyname(sp->setname, ep)) == NULL)
75 			return (NULL);
76 	}
77 
78 	assert(sideno != MD_SIDEWILD);
79 
80 	/*
81 	 * WARNING:
82 	 * The act of getting the dnp from the namespace means that we
83 	 * will get the devid of the disk as recorded in the namespace.
84 	 * This devid has the potential to be stale if the disk is being
85 	 * replaced via a rebind, this means that any code that relies
86 	 * on any of the dnp information should take the appropriate action
87 	 * to preserve that information. For example in the rebind code the
88 	 * devid of the new disk is saved off and then copied back in once
89 	 * the code that has called this function has completed.
90 	 */
91 	for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
92 		if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
93 		    flags, ep)) == NULL) {
94 			if (!(flags & MD_BYPASS_DAEMON))
95 				free_sr(sr);
96 			metafreedrivedesc(&dd_head);
97 			return (NULL);
98 		}
99 
100 		(void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
101 		    dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
102 	}
103 
104 	if (!(flags & MD_BYPASS_DAEMON)) {
105 		free_sr(sr);
106 	}
107 	return (dd_head);
108 }
109 
110 static int
111 get_sidenmlist(
112 	mdsetname_t	*sp,
113 	mddrivename_t	*dnp,
114 	md_error_t	*ep
115 )
116 {
117 	md_set_desc	*sd;
118 	mdsidenames_t	*sn, **sn_next;
119 	int		i;
120 
121 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
122 		return (-1);
123 
124 	metaflushsidenames(dnp);
125 	sn_next = &dnp->side_names;
126 	if (MD_MNSET_DESC(sd)) {
127 		/*
128 		 * Only get sidenames for this node since
129 		 * that is the only side information stored in
130 		 * the local mddb for a multi-node diskset.
131 		 */
132 		if (sd->sd_mn_mynode) {
133 			sn = Zalloc(sizeof (*sn));
134 			sn->sideno = sd->sd_mn_mynode->nd_nodeid;
135 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
136 			    sn->sideno, dnp->side_names_key, &sn->dname,
137 			    &sn->mnum, NULL, ep)) == NULL) {
138 				if (sn->dname != NULL)
139 					Free(sn->dname);
140 				Free(sn);
141 				return (-1);
142 			}
143 
144 			/* Add to the end of the linked list */
145 			assert(*sn_next == NULL);
146 			*sn_next = sn;
147 			sn_next = &sn->next;
148 		}
149 	} else {
150 		for (i = 0; i < MD_MAXSIDES; i++) {
151 			/* Skip empty slots */
152 			if (sd->sd_nodes[i][0] == '\0')
153 				continue;
154 
155 			sn = Zalloc(sizeof (*sn));
156 			sn->sideno = i;
157 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
158 			    i+SKEW, dnp->side_names_key, &sn->dname,
159 			    &sn->mnum, NULL, ep)) == NULL) {
160 				/*
161 				 * It is possible that during the add of a
162 				 * host to have a 'missing' side as the side
163 				 * for this disk will be added later. So ignore
164 				 * the error. The 'missing' side will be added
165 				 * once the addhosts process has completed.
166 				 */
167 				if (mdissyserror(ep, ENOENT)) {
168 					mdclrerror(ep);
169 					Free(sn);
170 					continue;
171 				}
172 
173 				if (sn->dname != NULL)
174 					Free(sn->dname);
175 				Free(sn);
176 				return (-1);
177 			}
178 
179 			/* Add to the end of the linked list */
180 			assert(*sn_next == NULL);
181 			*sn_next = sn;
182 			sn_next = &sn->next;
183 		}
184 	}
185 
186 	return (0);
187 }
188 
189 static md_drive_desc *
190 rl_to_dd(
191 	mdsetname_t		*sp,
192 	md_replicalist_t	*rlp,
193 	md_error_t		*ep
194 )
195 {
196 	md_replicalist_t	*rl;
197 	md_replica_t		*r;
198 	md_drive_desc		*dd = NULL;
199 	md_drive_desc		*d;
200 	int			found;
201 	md_set_desc		*sd;
202 	daddr_t			nblks = 0;
203 
204 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
205 		return (NULL);
206 
207 	/* find the smallest existing replica */
208 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
209 		r = rl->rl_repp;
210 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
211 	}
212 
213 	if (nblks <= 0)
214 		nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
215 
216 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
217 		r = rl->rl_repp;
218 
219 		found = 0;
220 		for (d = dd; d != NULL; d = d->dd_next) {
221 			if (strcmp(r->r_namep->drivenamep->cname,
222 			    d->dd_dnp->cname) == 0) {
223 				found = 1;
224 				dd->dd_dbcnt++;
225 				break;
226 			}
227 		}
228 
229 		if (! found)
230 			(void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
231 			    1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
232 	}
233 
234 	return (dd);
235 }
236 
237 /*
238  * Exported Entry Points
239  */
240 
241 set_t
242 get_max_sets(md_error_t *ep)
243 {
244 
245 	static set_t		max_sets = 0;
246 
247 	if (max_sets == 0)
248 		if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
249 			return (0);
250 
251 	return (max_sets);
252 }
253 
254 int
255 get_max_meds(md_error_t *ep)
256 {
257 	static int		max_meds = 0;
258 
259 	if (max_meds == 0)
260 		if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
261 			return (0);
262 
263 	return (max_meds);
264 }
265 
266 side_t
267 getmyside(mdsetname_t *sp, md_error_t *ep)
268 {
269 	md_set_desc		*sd;
270 	char 			*node = NULL;
271 	side_t			sideno;
272 
273 	if (sp->setno == 0)
274 		return (0);
275 
276 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
277 		return (MD_SIDEWILD);
278 
279 	node = mynode();
280 
281 	assert(node != NULL);
282 
283 	sideno = getnodeside(node, sd);
284 
285 	if (sideno != MD_SIDEWILD)
286 		return (sideno);
287 
288 	return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
289 }
290 
291 /*
292  * get set info from name
293  */
294 md_set_record *
295 getsetbyname(char *setname, md_error_t *ep)
296 {
297 	md_set_record		*sr = NULL;
298 	md_mnset_record		*mnsr = NULL;
299 	char			*p;
300 	size_t			len;
301 
302 	/* get set info from daemon */
303 	if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
304 		return (NULL);
305 	if (sr != NULL) {
306 		/*
307 		 * Returned record could be for a multi-node set or a
308 		 * non-multi-node set.
309 		 */
310 		if (MD_MNSET_REC(sr)) {
311 			/*
312 			 * Record is for a multi-node set.  Reissue call
313 			 * to get mnset information.  Need to free
314 			 * record as if a non-multi-node set record since
315 			 * that is what clnt_getset gave us.  If in
316 			 * the daemon, don't free since this is a pointer
317 			 * into the setrecords array.
318 			 */
319 			if (! md_in_daemon) {
320 				sr->sr_flags &= ~MD_SR_MN;
321 				free_sr(sr);
322 			}
323 			if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
324 			    ep) == -1)
325 				return (NULL);
326 			if (mnsr != NULL)
327 				return ((struct md_set_record *)mnsr);
328 		} else {
329 			return (sr);
330 		}
331 	}
332 
333 	/* no such set */
334 	len = strlen(setname) + 30;
335 	p = Malloc(len);
336 	(void) snprintf(p, len, "setname \"%s\"", setname);
337 	(void) mderror(ep, MDE_NO_SET, p);
338 	Free(p);
339 	return (NULL);
340 }
341 
342 /*
343  * get set info from number
344  */
345 md_set_record *
346 getsetbynum(set_t setno, md_error_t *ep)
347 {
348 	md_set_record		*sr;
349 	md_mnset_record		*mnsr = NULL;
350 	char			buf[100];
351 
352 	if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
353 		return (NULL);
354 
355 	if (sr != NULL) {
356 		/*
357 		 * Record is for a multi-node set.  Reissue call
358 		 * to get mnset information.  Need to free
359 		 * record as if a non-multi-node set record since
360 		 * that is what clnt_getset gave us.  If in
361 		 * the daemon, don't free since this is a pointer
362 		 * into the setrecords array.
363 		 */
364 		if (MD_MNSET_REC(sr)) {
365 			/*
366 			 * Record is for a multi-node set.  Reissue call
367 			 * to get mnset information.
368 			 */
369 			if (! md_in_daemon) {
370 				sr->sr_flags &= ~MD_SR_MN;
371 				free_sr(sr);
372 			}
373 			if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
374 			    ep) == -1)
375 				return (NULL);
376 			if (mnsr != NULL)
377 				return ((struct md_set_record *)mnsr);
378 		} else {
379 			return (sr);
380 		}
381 	}
382 
383 	(void) sprintf(buf, "setno %u", setno);
384 	(void) mderror(ep, MDE_NO_SET, buf);
385 	return (NULL);
386 }
387 
388 int
389 meta_check_drive_inuse(
390 	mdsetname_t	*sp,
391 	mddrivename_t	*dnp,
392 	int		check_db,
393 	md_error_t	*ep
394 )
395 {
396 	mdnamelist_t	*nlp = NULL;
397 	mdnamelist_t	*p;
398 	int		rval = 0;
399 
400 	/* get all underlying partitions */
401 	if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
402 		return (-1);
403 
404 	/* search for drive */
405 	for (p = nlp; (p != NULL); p = p->next) {
406 		mdname_t	*np = p->namep;
407 
408 		if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
409 			rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
410 			    NULL, dnp->cname, sp->setname));
411 			break;
412 		}
413 	}
414 
415 	/* cleanup, return success */
416 	metafreenamelist(nlp);
417 	return (rval);
418 }
419 
420 /*
421  * simple check for ownership
422  */
423 int
424 meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
425 {
426 	int			ownset;
427 	md_set_desc		*sd;
428 	md_drive_desc		*dd;
429 	md_replicalist_t	*rlp = NULL;
430 	md_error_t		xep = mdnullerror;
431 
432 	if (metaislocalset(sp))
433 		return (0);
434 
435 	ownset = own_set(sp, NULL, TRUE, ep);
436 	if (! mdisok(ep))
437 		return (-1);
438 
439 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
440 		return (-1);
441 
442 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
443 	if (! mdisok(ep))
444 		return (-1);
445 
446 	/* If we have no drive descriptors, check for no ownership */
447 	if (dd == NULL) {
448 		if (ownset == MD_SETOWNER_NONE)
449 			return (0);
450 
451 		/* If ownership somehow has come to exist, we must clean up */
452 
453 		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
454 		    &xep) < 0)
455 			mdclrerror(&xep);
456 
457 		if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
458 			if (! mdisok(&xep))
459 				mdclrerror(&xep);
460 
461 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
462 			if (rel_own_bydd(sp, dd, TRUE, &xep))
463 				mdclrerror(&xep);
464 		}
465 
466 		if (halt_set(sp, &xep))
467 			mdclrerror(&xep);
468 
469 		metafreereplicalist(rlp);
470 
471 		metafreedrivedesc(&dd);
472 
473 		return (0);
474 	}
475 
476 	metafreedrivedesc(&sd->sd_drvs);
477 
478 	if (ownset == MD_SETOWNER_YES)
479 		return (0);
480 
481 	return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
482 	    sp->setname));
483 }
484 
485 /*
486  * simple check for ownership
487  */
488 int
489 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
490 {
491 	md_set_desc	*sd;
492 	md_drive_desc	*dd;
493 	int		bool;
494 
495 	if (metaislocalset(sp))
496 		return (0);
497 
498 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
499 		return (-1);
500 
501 	if (getnodeside(hostname, sd) == MD_SIDEWILD)
502 		return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
503 		    hostname, NULL, sp->setname));
504 
505 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
506 	if (! mdisok(ep))
507 		return (-1);
508 
509 	if (clnt_ownset(hostname, sp, &bool, ep) == -1)
510 		return (-1);
511 
512 	if (dd == NULL)
513 		return (0);
514 
515 	metafreedrivedesc(&sd->sd_drvs);
516 
517 	if (bool == TRUE)
518 		return (0);
519 
520 	return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
521 	    sp->setname));
522 }
523 
524 /*
525  * Function that determines if a node is in the multinode diskset
526  * membership list.  Calling node passes in node to be checked and
527  * the nodelist as returned from meta_read_nodelist.  This routine
528  * anticipates being called many times using the same diskset membership
529  * list which is why the alloc and free of the diskset membership list
530  * is left to the calling routine.
531  * Returns:
532  *	1 - if a member
533  *	0 - not a member
534  */
535 int
536 meta_is_member(
537 	char				*node_name,
538 	md_mn_nodeid_t			node_id,
539 	mndiskset_membershiplist_t	*nl
540 )
541 {
542 	mndiskset_membershiplist_t	*nl2;
543 	int				flag_check_name;
544 
545 	if (node_id != 0)
546 		flag_check_name = 0;
547 	else if (node_name != NULL)
548 		flag_check_name = 1;
549 	else
550 		return (0);
551 
552 	nl2 = nl;
553 	while (nl2) {
554 		if (flag_check_name) {
555 			/* Compare given name against name in member list */
556 			if (strcmp(nl2->msl_node_name, node_name) == 0)
557 				break;
558 		} else {
559 			/* Compare given nodeid against nodeid in member list */
560 			if (nl2->msl_node_id == node_id)
561 				break;
562 		}
563 		nl2 = nl2->next;
564 	}
565 	/* No match found in member list */
566 	if (nl2 == NULL) {
567 		return (0);
568 	}
569 	/* Return 1 if node is in member list */
570 	return (1);
571 }
572 
573 /*
574  * meta_getnext_devinfo should go to the host that
575  * has the device, to return the device name, driver name, minor num.
576  * We can take the big cheat for now, since it is a requirement
577  * that the device names and device numbers are the same, and
578  * just get the info locally.
579  *
580  * This routine is very similar to meta_getnextside_devinfo except
581  * that the specific side to be used is being passed in.
582  *
583  * Exit status:
584  *	 0 - No more side info to return
585  *	 1 - More side info's to return
586  *	-1 - An error has been detected
587  */
588 /*ARGSUSED*/
589 int
590 meta_getside_devinfo(
591 	mdsetname_t	*sp,		/* for this set */
592 	char		*bname,		/* local block name (myside) */
593 	side_t		sideno,		/* sideno */
594 	char		**ret_bname,	/* block device name of returned side */
595 	char		**ret_dname,	/* driver name of returned side */
596 	minor_t		*ret_mnum,	/* minor number of returned side */
597 	md_error_t	*ep
598 )
599 {
600 	mdname_t	*np;
601 
602 	if (ret_bname != NULL)
603 		*ret_bname = NULL;
604 	if (ret_dname != NULL)
605 		*ret_dname = NULL;
606 	if (ret_mnum != NULL)
607 		*ret_mnum = NODEV32;
608 
609 
610 	if ((np = metaname(&sp, bname, ep)) == NULL)
611 		return (-1);
612 
613 /*
614  * NOTE (future) - There will be more work here once devids are integrated
615  * into disksets.  Then the side should be used to find the correct
616  * host and the b/d names should be gotten from that host.
617  */
618 
619 	/*
620 	 * Return the side info.
621 	 */
622 	if (ret_bname != NULL)
623 		*ret_bname = Strdup(np->bname);
624 
625 	if (ret_dname != NULL) {
626 		mdcinfo_t	*cinfo;
627 
628 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
629 			return (-1);
630 
631 		*ret_dname = Strdup(cinfo->dname);
632 	}
633 
634 	if (ret_mnum != NULL)
635 		*ret_mnum = meta_getminor(np->dev);
636 
637 	return (1);
638 }
639 
640 /*
641  * Get the information on the device from the remote node using the devid
642  * of the disk.
643  *
644  * Exit status:
645  *	 0 - No more side info to return
646  *	 1 - More side info's to return
647  *	-1 - An error has been detected
648  */
649 int
650 meta_getnextside_devinfo(
651 	mdsetname_t	*sp,		/* for this set */
652 	char		*bname,		/* local block name (myside) */
653 	side_t		*sideno,	/* previous sideno & returned sideno */
654 	char		**ret_bname,	/* block device name of returned side */
655 	char		**ret_dname,	/* driver name of returned side */
656 	minor_t		*ret_mnum,	/* minor number of returned side */
657 	md_error_t	*ep
658 )
659 {
660 	md_set_desc	*sd;
661 	int		i;
662 	mdname_t	*np;
663 	mddrivename_t	*dnp;
664 	char		*devidstr = NULL;
665 	int		devidstrlen;
666 	md_dev64_t	retdev = NODEV64;
667 	char		*ret_devname = NULL;
668 	char		*ret_blkdevname = NULL;
669 	char		*ret_driver = NULL;
670 	char		*nodename;
671 	int		fd;
672 	int		ret = -1;
673 	char		*minor_name = NULL;
674 	md_mnnode_desc	*nd;
675 
676 
677 	if (ret_bname != NULL)
678 		*ret_bname = NULL;
679 	if (ret_dname != NULL)
680 		*ret_dname = NULL;
681 	if (ret_mnum != NULL)
682 		*ret_mnum = NODEV32;
683 
684 	if (metaislocalset(sp)) {
685 		/* no more sides - we are done */
686 		if (*sideno != MD_SIDEWILD)
687 			return (0);
688 
689 		/* First time through -  set up return sideno */
690 		*sideno = 0;
691 	} else {
692 
693 		/*
694 		 * Find the next sideno, starting after the one given.
695 		 */
696 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
697 			return (-1);
698 
699 		if (MD_MNSET_DESC(sd)) {
700 			nd = sd->sd_nodelist;
701 			if ((*sideno == MD_SIDEWILD) &&
702 			    (nd != (struct md_mnnode_desc *)NULL)) {
703 				*sideno = nd->nd_nodeid;
704 			} else {
705 				while (nd) {
706 					/*
707 					 * Found given sideno, now find
708 					 * next sideno, if there is one.
709 					 */
710 					if ((*sideno == nd->nd_nodeid) &&
711 					    (nd->nd_next !=
712 					    (struct md_mnnode_desc *)NULL)) {
713 						*sideno =
714 						    nd->nd_next->nd_nodeid;
715 						break;
716 					}
717 					nd = nd->nd_next;
718 				}
719 				if (nd == NULL) {
720 					return (0);
721 				}
722 			}
723 			if (*sideno == MD_SIDEWILD)
724 				return (0);
725 		} else {
726 			for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
727 				/* Find next full slot */
728 				if (sd->sd_nodes[i][0] != '\0')
729 					break;
730 
731 			/* No more sides - we are done */
732 			if (i == MD_MAXSIDES)
733 				return (0);
734 
735 			/* Set up the return sideno */
736 			*sideno = i;
737 			nodename = (char *)sd->sd_nodes[i];
738 		}
739 	}
740 
741 	/*
742 	 * Need to pass the node the devid of the disk and get it to
743 	 * send back the details of the disk from that side.
744 	 */
745 	if ((np = metaname(&sp, bname, ep)) == NULL)
746 		return (-1);
747 
748 	dnp = np->drivenamep;
749 
750 	/*
751 	 * By default, set up the parameters so that they are copied out.
752 	 */
753 	if (ret_bname != NULL)
754 		*ret_bname = Strdup(np->bname);
755 
756 	if (ret_dname != NULL) {
757 		mdcinfo_t	*cinfo;
758 
759 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
760 			return (-1);
761 
762 		*ret_dname = Strdup(cinfo->dname);
763 	}
764 
765 	if (ret_mnum != NULL)
766 		*ret_mnum = meta_getminor(np->dev);
767 
768 	/*
769 	 * Try some optimization. If this is the local set or the device
770 	 * is a metadevice then just copy the information. If the device
771 	 * does not have a devid (due to not having a minor name) then
772 	 * fall back to the pre-devid behaviour of copying the information
773 	 * on the device: this is okay because the sanity checks before this
774 	 * call would have found any issues with the device. If it's a
775 	 * multi-node diskset also just return ie. copy.
776 	 */
777 	if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
778 	    (MD_MNSET_DESC(sd)))
779 		return (1);
780 
781 	if (np->minor_name == (char *)NULL) {
782 		/*
783 		 * Have to get the minor name then. The slice should exist
784 		 * on the disk because it will have already been repartitioned
785 		 * up prior to getting to this point.
786 		 */
787 		if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
788 			(void) mdsyserror(ep, errno, np->bname);
789 			return (-1);
790 		}
791 		(void) devid_get_minor_name(fd, &minor_name);
792 		np->minor_name = Strdup(minor_name);
793 		devid_str_free(minor_name);
794 		(void) close(fd);
795 	}
796 
797 	/* allocate extra space for "/" and NULL hence +2 */
798 	devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
799 	devidstr = (char *)Malloc(devidstrlen);
800 
801 	/*
802 	 * As a minor name is supplied then the ret_devname will be
803 	 * appropriate to that minor_name and in this case it will be
804 	 * a block device ie /dev/dsk.
805 	 */
806 	(void) snprintf(devidstr, devidstrlen,
807 		"%s/%s", dnp->devid, np->minor_name);
808 
809 	ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
810 	    np->bname, &ret_devname, &ret_driver, ep);
811 
812 	Free(devidstr);
813 
814 	/*
815 	 * If the other side is not running device id in disksets,
816 	 * 'ret' is set to ENOTSUP in which case we fallback to
817 	 * the existing behaviour
818 	 */
819 	if (ret == ENOTSUP)
820 		return (1);
821 	else if (ret == -1)
822 		return (-1);
823 
824 	/*
825 	 * ret_devname comes from the rpc call and is a
826 	 * raw device name. We need to make this into a
827 	 * block device via blkname for further processing.
828 	 * Unfortunately, when our device id isn't found in
829 	 * the system, the rpc call will return a " " in
830 	 * ret_devname in which case we need to fill that in
831 	 * as ret_blkname because blkname of " " returns NULL.
832 	 */
833 	if (ret_bname != NULL && ret_devname != NULL) {
834 		ret_blkdevname = blkname(ret_devname);
835 		if (ret_blkdevname == NULL)
836 			*ret_bname = Strdup(ret_devname);
837 		else
838 			*ret_bname = Strdup(ret_blkdevname);
839 	}
840 
841 	if (ret_dname != NULL && ret_driver != NULL)
842 		*ret_dname = Strdup(ret_driver);
843 
844 	if (ret_mnum != NULL)
845 		*ret_mnum = meta_getminor(retdev);
846 
847 	return (1);
848 }
849 
850 int
851 meta_is_drive_in_anyset(
852 	mddrivename_t	*dnp,
853 	mdsetname_t	**spp,
854 	int		bypass_daemon,
855 	md_error_t 	*ep
856 )
857 {
858 	set_t		setno;
859 	mdsetname_t	*this_sp;
860 	int		is_it;
861 	set_t		max_sets;
862 
863 	if ((max_sets = get_max_sets(ep)) == 0)
864 		return (-1);
865 
866 	assert(spp != NULL);
867 	*spp = NULL;
868 
869 	for (setno = 1; setno < max_sets; setno++) {
870 		if (!bypass_daemon) {
871 			if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
872 				if (mdismddberror(ep, MDE_DB_NODB)) {
873 					mdclrerror(ep);
874 					return (0);
875 				}
876 				if (mdiserror(ep, MDE_NO_SET)) {
877 					mdclrerror(ep);
878 					continue;
879 				}
880 				return (-1);
881 			}
882 		} else
883 			this_sp = metafakesetname(setno, NULL);
884 
885 		if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
886 		    bypass_daemon, ep)) == -1) {
887 			if (mdiserror(ep, MDE_NO_SET)) {
888 				mdclrerror(ep);
889 				continue;
890 			}
891 			return (-1);
892 		}
893 		if (is_it) {
894 			*spp = this_sp;
895 			return (0);
896 		}
897 	}
898 	return (0);
899 }
900 
901 int
902 meta_is_drive_in_thisset(
903 	mdsetname_t	*sp,
904 	mddrivename_t	*dnp,
905 	int		bypass_daemon,
906 	md_error_t	*ep
907 )
908 {
909 	md_drive_desc	*dd, *p;
910 
911 	if (bypass_daemon)
912 		dd = dr2drivedesc(sp, MD_SIDEWILD,
913 		    (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
914 	else
915 		dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
916 
917 	if (dd == NULL) {
918 		if (! mdisok(ep))
919 			return (-1);
920 		return (0);
921 	}
922 
923 
924 	for (p = dd; p != NULL; p = p->dd_next)
925 		if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
926 			return (1);
927 	return (0);
928 }
929 
930 int
931 meta_set_balance(
932 	mdsetname_t		*sp,
933 	md_error_t		*ep
934 )
935 {
936 	md_set_desc		*sd;
937 	md_drive_desc		*dd, *curdd;
938 	daddr_t			dbsize;
939 	daddr_t			nblks;
940 	int			i;
941 	int			rval = 0;
942 	sigset_t		oldsigs;
943 	md_setkey_t		*cl_sk;
944 	md_error_t		xep = mdnullerror;
945 	md_mnnode_desc		*nd;
946 	int			suspend1_flag = 0;
947 
948 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
949 		return (-1);
950 
951 	dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
952 
953 	/* Make sure we own the set */
954 	if (meta_check_ownership(sp, ep) != 0)
955 		return (-1);
956 
957 	/* END CHECK CODE */
958 
959 	/*
960 	 * Get drive descriptors for the drives that are currently in the set.
961 	 */
962 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
963 
964 	if (! mdisok(ep))
965 		return (-1);
966 
967 	/* Find the minimum replica size in use is or use the default */
968 	if ((nblks = meta_db_minreplica(sp, ep)) < 0)
969 		mdclrerror(ep);
970 	else
971 		dbsize = nblks;	/* adjust replica size */
972 
973 	/* Make sure we are blocking all signals */
974 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
975 		mdclrerror(&xep);
976 
977 	/*
978 	 * Lock the set on current set members.
979 	 * For MN diskset lock_set and SUSPEND are used to protect against
980 	 * other meta* commands running on the other nodes.
981 	 */
982 	if (MD_MNSET_DESC(sd)) {
983 		nd = sd->sd_nodelist;
984 		while (nd) {
985 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
986 				nd = nd->nd_next;
987 				continue;
988 			}
989 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
990 				rval = -1;
991 				goto out;
992 			}
993 			nd = nd->nd_next;
994 		}
995 		/*
996 		 * Lock out other meta* commands by suspending
997 		 * class 1 messages across the diskset.
998 		 */
999 		nd = sd->sd_nodelist;
1000 		while (nd) {
1001 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1002 				nd = nd->nd_next;
1003 				continue;
1004 			}
1005 			if (clnt_mdcommdctl(nd->nd_nodename,
1006 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1007 			    MD_MSCF_NO_FLAGS, ep)) {
1008 				rval = -1;
1009 				goto out;
1010 			}
1011 			suspend1_flag = 1;
1012 			nd = nd->nd_next;
1013 		}
1014 	} else {
1015 		for (i = 0; i < MD_MAXSIDES; i++) {
1016 			/* Skip empty slots */
1017 			if (sd->sd_nodes[i][0] == '\0') continue;
1018 
1019 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1020 				rval = -1;
1021 				goto out;
1022 			}
1023 		}
1024 	}
1025 
1026 	/* We are not adding or deleting any drives, just balancing */
1027 	dd = NULL;
1028 
1029 	/*
1030 	 * Balance the DB's according to the list of existing drives and the
1031 	 * list of added drives.
1032 	 */
1033 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
1034 		goto out;
1035 
1036 out:
1037 	/*
1038 	 * Unlock diskset by resuming class 1 messages across the diskset.
1039 	 * Just resume all classes so that resume is the same whether
1040 	 * just one class was locked or all classes were locked.
1041 	 */
1042 	if (suspend1_flag) {
1043 		nd = sd->sd_nodelist;
1044 		while (nd) {
1045 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1046 				nd = nd->nd_next;
1047 				continue;
1048 			}
1049 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1050 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1051 				/*
1052 				 * We are here because we failed to resume
1053 				 * rpc.mdcommd.  However we potentially have
1054 				 * an error from the previous call
1055 				 * (meta_db_balance). If the previous call
1056 				 * did fail,  we capture that error and
1057 				 * generate a perror withthe string,
1058 				 * "Unable to resume...".
1059 				 * Setting rval to -1 ensures that in the
1060 				 * next iteration of the loop, ep is not
1061 				 * clobbered.
1062 				 */
1063 				if (rval == 0)
1064 					(void) mdstealerror(ep, &xep);
1065 				else
1066 					mdclrerror(&xep);
1067 				rval = -1;
1068 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1069 				    "Unable to resume rpc.mdcommd."));
1070 			}
1071 			nd = nd->nd_next;
1072 		}
1073 	}
1074 
1075 	/* Unlock the set */
1076 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1077 	if (MD_MNSET_DESC(sd)) {
1078 		nd = sd->sd_nodelist;
1079 		while (nd) {
1080 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1081 				nd = nd->nd_next;
1082 				continue;
1083 			}
1084 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1085 				if (rval == 0)
1086 					(void) mdstealerror(ep, &xep);
1087 				else
1088 					mdclrerror(&xep);
1089 				rval = -1;
1090 			}
1091 			nd = nd->nd_next;
1092 		}
1093 	} else {
1094 		for (i = 0; i < MD_MAXSIDES; i++) {
1095 			/* Skip empty slots */
1096 			if (sd->sd_nodes[i][0] == '\0')
1097 				continue;
1098 
1099 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1100 				if (rval == 0)
1101 					(void) mdstealerror(ep, &xep);
1102 				rval = -1;
1103 			}
1104 		}
1105 	}
1106 
1107 	/* release signals back to what they were on entry */
1108 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1109 		mdclrerror(&xep);
1110 
1111 	cl_set_setkey(NULL);
1112 
1113 	metaflushsetname(sp);
1114 
1115 	return (rval);
1116 }
1117 
1118 int
1119 meta_set_destroy(
1120 	mdsetname_t	*sp,
1121 	int		lock_set,
1122 	md_error_t	*ep
1123 )
1124 {
1125 	int		i;
1126 	med_rec_t	medr;
1127 	md_set_desc	*sd;
1128 	md_drive_desc	*dd, *p, *p1;
1129 	mddrivename_t	*dnp;
1130 	mdname_t	*np;
1131 	mdnamelist_t	*nlp = NULL;
1132 	int		num_users = 0;
1133 	int		has_set;
1134 	side_t		mysideno;
1135 	sigset_t	oldsigs;
1136 	md_error_t	xep = mdnullerror;
1137 	md_setkey_t	*cl_sk;
1138 	int		rval = 0;
1139 	int		delete_end = 1;
1140 
1141 	/* Make sure we are blocking all signals */
1142 	if (procsigs(TRUE, &oldsigs, ep) < 0)
1143 		return (-1);
1144 
1145 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1146 		if (! mdisok(ep))
1147 			rval = -1;
1148 		goto out;
1149 	}
1150 
1151 	/*
1152 	 * meta_set_destroy should not be called for a MN diskset.
1153 	 * This routine destroys a set without communicating this information
1154 	 * to the other nodes which would lead to an inconsistency in
1155 	 * the MN diskset.
1156 	 */
1157 	if (MD_MNSET_DESC(sd)) {
1158 		rval = -1;
1159 		goto out;
1160 	}
1161 
1162 	/* Continue if a traditional diskset */
1163 
1164 	/*
1165 	 * Check to see who has the set.  If we are not the last user of the
1166 	 * set, we will not touch the replicas.
1167 	 */
1168 	for (i = 0; i < MD_MAXSIDES; i++) {
1169 		/* Skip empty slots */
1170 		if (sd->sd_nodes[i][0] == '\0')
1171 			continue;
1172 
1173 		has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
1174 		    ep);
1175 
1176 		if (has_set < 0) {
1177 			mdclrerror(ep);
1178 		} else
1179 			num_users++;
1180 	}
1181 
1182 	if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
1183 		if (! mdisok(ep)) {
1184 			rval = -1;
1185 			goto out;
1186 		}
1187 	}
1188 
1189 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
1190 		rval = -1;
1191 		goto out;
1192 	}
1193 
1194 	if (lock_set == TRUE) {
1195 		/* Lock the set on our side */
1196 		if (clnt_lock_set(mynode(), sp, ep)) {
1197 			rval = -1;
1198 			goto out;
1199 		}
1200 	}
1201 
1202 	/*
1203 	 * A traditional diskset has no diskset stale information to send
1204 	 * since there can only be one owner node at a time.
1205 	 */
1206 	if (snarf_set(sp, FALSE, ep))
1207 		mdclrerror(ep);
1208 
1209 	if (dd != NULL) {
1210 		/*
1211 		 * Make sure that no drives are in use as parts of metadrives
1212 		 * or hot spare pools, this is one of the few error conditions
1213 		 * that will stop this routine, unless the environment has
1214 		 * META_DESTROY_SET_OK set, in which case, the operation will
1215 		 * proceed.
1216 		 */
1217 		if (getenv("META_DESTROY_SET_OK") == NULL) {
1218 			for (p = dd; p != NULL; p = p->dd_next) {
1219 				dnp = p->dd_dnp;
1220 
1221 				i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
1222 				if (i == -1) {
1223 					/* need xep - wire calls clear error */
1224 					i = metaget_setownership(sp, &xep);
1225 					if (i == -1) {
1226 						rval = -1;
1227 						goto out;
1228 					}
1229 
1230 					mysideno = getmyside(sp, &xep);
1231 
1232 					if (mysideno == MD_SIDEWILD) {
1233 						rval = -1;
1234 						goto out;
1235 					}
1236 
1237 					if (sd->sd_isown[mysideno] == FALSE)
1238 						if (halt_set(sp, &xep)) {
1239 							rval = -1;
1240 							goto out;
1241 						}
1242 
1243 					rval = -1;
1244 					goto out;
1245 				}
1246 			}
1247 		}
1248 
1249 		for (i = 0; i < MD_MAXSIDES; i++) {
1250 			/* Skip empty slots */
1251 			if (sd->sd_nodes[i][0] == '\0')
1252 				continue;
1253 
1254 			/* Skip non local nodes */
1255 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1256 				continue;
1257 
1258 			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
1259 				mdclrerror(ep);
1260 		}
1261 
1262 		/*
1263 		 * Go thru each drive and individually delete the replicas.
1264 		 * This way we can ignore individual errors.
1265 		 */
1266 		for (p = dd; p != NULL; p = p->dd_next) {
1267 			uint_t	rep_slice;
1268 
1269 			dnp = p->dd_dnp;
1270 			if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
1271 			    (((np = metaslicename(dnp, rep_slice, ep))
1272 				== NULL) &&
1273 				((np = metaslicename(dnp, MD_SLICE0, ep))
1274 				    == NULL))) {
1275 				rval = -1;
1276 				goto out;
1277 			}
1278 
1279 			if ((np = metaslicename(dnp,
1280 			    rep_slice, ep)) == NULL) {
1281 				if ((np = metaslicename(dnp,
1282 				    MD_SLICE0, ep)) == NULL) {
1283 					rval = -1;
1284 					goto out;
1285 				}
1286 				mdclrerror(ep);
1287 			}
1288 
1289 			/* Yes this is UGLY!!! */
1290 			p1 = p->dd_next;
1291 			p->dd_next = NULL;
1292 			if (rel_own_bydd(sp, p, FALSE, ep))
1293 				mdclrerror(ep);
1294 			p->dd_next = p1;
1295 
1296 			if (p->dd_dbcnt == 0)
1297 				continue;
1298 
1299 			/*
1300 			 * Skip the replica removal if we are not the last user
1301 			 */
1302 			if (num_users != 1)
1303 				continue;
1304 
1305 			nlp = NULL;
1306 			(void) metanamelist_append(&nlp, np);
1307 			if (meta_db_detach(sp, nlp,
1308 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
1309 				mdclrerror(ep);
1310 			metafreenamelist(nlp);
1311 		}
1312 	}
1313 
1314 	if (halt_set(sp, ep)) {
1315 		rval = -1;
1316 		goto out;
1317 	}
1318 
1319 	/* Setup the mediator record */
1320 	(void) memset(&medr, '\0', sizeof (med_rec_t));
1321 	medr.med_rec_mag = MED_REC_MAGIC;
1322 	medr.med_rec_rev = MED_REC_REV;
1323 	medr.med_rec_fl  = 0;
1324 	medr.med_rec_sn  = sp->setno;
1325 	(void) strcpy(medr.med_rec_snm, sp->setname);
1326 	medr.med_rec_meds = sd->sd_med;	/* structure assigment */
1327 	(void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
1328 	medr.med_rec_foff = 0;
1329 
1330 	/*
1331 	 * If we are the last remaining user, then remove the mediator hosts
1332 	 */
1333 	if (num_users == 1) {
1334 		for (i = 0; i < MED_MAX_HOSTS; i++) {
1335 			if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
1336 				SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
1337 				    SVM_TAG_MEDIATOR, sp->setno, i);
1338 			(void) memset(&medr.med_rec_meds.n_lst[i], '\0',
1339 			    sizeof (md_h_t));
1340 		}
1341 		medr.med_rec_meds.n_cnt = 0;
1342 	} else { 	/* Remove this host from the mediator node list. */
1343 		for (i = 0; i < MD_MAXSIDES; i++) {
1344 			/* Skip empty slots */
1345 			if (sd->sd_nodes[i][0] == '\0')
1346 				continue;
1347 
1348 			/* Copy non local node */
1349 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
1350 				(void) strcpy(medr.med_rec_nodes[i],
1351 				    sd->sd_nodes[i]);
1352 				continue;
1353 			}
1354 
1355 			/* Clear local node */
1356 			(void) memset(&medr.med_rec_nodes[i], '\0',
1357 			    sizeof (md_node_nm_t));
1358 		}
1359 	}
1360 
1361 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
1362 
1363 	/*
1364 	 * If the client is part of a cluster put the DCS service
1365 	 * into a deleteing state.
1366 	 */
1367 	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1368 		if (metad_isautotakebyname(sp->setname)) {
1369 			delete_end = 0;
1370 		} else {
1371 			mdclrerror(ep);
1372 			goto out;
1373 		}
1374 	}
1375 
1376 	/* Inform the mediator hosts of the new information */
1377 	for (i = 0; i < MED_MAX_HOSTS; i++) {
1378 		if (sd->sd_med.n_lst[i].a_cnt == 0)
1379 			continue;
1380 
1381 		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
1382 			mdclrerror(ep);
1383 	}
1384 
1385 	/* Delete the set locally */
1386 	for (i = 0; i < MD_MAXSIDES; i++) {
1387 		/* Skip empty slots */
1388 		if (sd->sd_nodes[i][0] == '\0')
1389 			continue;
1390 
1391 		/* Skip non local nodes */
1392 		if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1393 			continue;
1394 
1395 		if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
1396 			mdclrerror(ep);
1397 	}
1398 	if (delete_end &&
1399 	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1400 		rval = -1;
1401 
1402 out:
1403 	/* release signals back to what they were on entry */
1404 	if (procsigs(FALSE, &oldsigs, &xep) < 0) {
1405 		if (rval == 0)
1406 			(void) mdstealerror(ep, &xep);
1407 		rval = -1;
1408 	}
1409 
1410 	if (lock_set == TRUE) {
1411 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1412 		if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1413 			if (rval == 0)
1414 				(void) mdstealerror(ep, &xep);
1415 			rval = -1;
1416 		}
1417 		cl_set_setkey(NULL);
1418 	}
1419 
1420 	metaflushsetname(sp);
1421 	return (rval);
1422 }
1423 
1424 int
1425 meta_set_purge(
1426 	mdsetname_t	*sp,
1427 	int		bypass_cluster,
1428 	int		forceflg,
1429 	md_error_t	*ep
1430 )
1431 {
1432 	char		*thishost = mynode();
1433 	md_set_desc	*sd;
1434 	md_setkey_t	*cl_sk;
1435 	md_error_t	xep = mdnullerror;
1436 	int		rval = 0;
1437 	int		i, num_hosts = 0;
1438 	int		has_set = 0;
1439 	int		max_node = 0;
1440 	int		delete_end = 1;
1441 	md_mnnode_desc	*nd;
1442 
1443 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1444 		/* unable to find set description */
1445 		rval = 1;
1446 		return (rval);
1447 	}
1448 
1449 	if (MD_MNSET_DESC(sd)) {
1450 		/*
1451 		 * Get a count of the hosts in the set and also lock the set
1452 		 * on those hosts that know about it.
1453 		 */
1454 		nd = sd->sd_nodelist;
1455 		while (nd) {
1456 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1457 				nd = nd->nd_next;
1458 				continue;
1459 			}
1460 			has_set = nodehasset(sp, nd->nd_nodename,
1461 				NHS_NST_EQ, ep);
1462 
1463 			/*
1464 			 * The host is not aware of this set (has_set < 0) or
1465 			 * the set does not match (has_set == 0). This check
1466 			 * prevents the code getting confused by an apparent
1467 			 * inconsistancy in the set's state, this is in the
1468 			 * purge code so something is broken in any case and
1469 			 * this is just trying to fix the brokeness.
1470 			 */
1471 			if (has_set <= 0) {
1472 				mdclrerror(ep);
1473 				nd->nd_flags |= MD_MN_NODE_NOSET;
1474 			} else {
1475 				num_hosts++;
1476 				if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1477 					/*
1478 					 * If the force flag is set then
1479 					 * ignore any RPC failures because we
1480 					 * are only really interested with
1481 					 * the set on local node.
1482 					 */
1483 					if (forceflg && mdanyrpcerror(ep)) {
1484 						mdclrerror(ep);
1485 					} else {
1486 						/*
1487 						 * set max_node so that in the
1488 						 * unlock code nodes in the
1489 						 * set that have not been
1490 						 * locked are not unlocked.
1491 						 */
1492 						max_node = nd->nd_nodeid;
1493 						rval = 2;
1494 						goto out1;
1495 					}
1496 				}
1497 
1498 			}
1499 			nd = nd->nd_next;
1500 		}
1501 		max_node = 0;
1502 	} else {
1503 		/*
1504 		 * Get a count of the hosts in the set and also lock the set
1505 		 * on those hosts that know about it.
1506 		 */
1507 		for (i = 0; i < MD_MAXSIDES; i++) {
1508 			/* Skip empty slots */
1509 			if (sd->sd_nodes[i][0] == '\0')
1510 				continue;
1511 
1512 			has_set = nodehasset(sp, sd->sd_nodes[i],
1513 				NHS_NST_EQ, ep);
1514 
1515 			/*
1516 			 * The host is not aware of this set (has_set < 0) or
1517 			 * the set does not match (has_set == 0). This check
1518 			 * prevents the code getting confused by an apparent
1519 			 * inconsistancy in the set's state, this is in the
1520 			 * purge code so something is broken in any case and
1521 			 * this is just trying to fix the brokeness.
1522 			 */
1523 			if (has_set <= 0) {
1524 				mdclrerror(ep);
1525 				/*
1526 				 * set the node to NULL to prevent further
1527 				 * requests to this unresponsive node.
1528 				 */
1529 				sd->sd_nodes[i][0] = '\0';
1530 			} else {
1531 				num_hosts++;
1532 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1533 					/*
1534 					 * If the force flag is set then
1535 					 * ignore any RPC failures because we
1536 					 * are only really interested with
1537 					 * the set on local node.
1538 					 */
1539 					if (forceflg && mdanyrpcerror(ep)) {
1540 						mdclrerror(ep);
1541 					} else {
1542 						rval = 2;
1543 						/*
1544 						 * set max_node so that in the
1545 						 * unlock code nodes in the
1546 						 * set that have not been
1547 						 * locked are not unlocked.
1548 						 */
1549 						max_node = i;
1550 						goto out1;
1551 					}
1552 				}
1553 			}
1554 		}
1555 		max_node = i;	/* now MD_MAXSIDES */
1556 	}
1557 	if (!bypass_cluster) {
1558 		/*
1559 		 * If there is only one host associated with the
1560 		 * set then remove the set from the cluster.
1561 		 */
1562 		if (num_hosts == 1) {
1563 			if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1564 				if (metad_isautotakebyname(sp->setname)) {
1565 					delete_end = 0;
1566 				} else {
1567 					mdclrerror(ep);
1568 					rval = 3;
1569 					goto out1;
1570 				}
1571 			}
1572 		}
1573 	}
1574 
1575 	if (MD_MNSET_DESC(sd)) {
1576 		/*
1577 		 * Get a count of the hosts in the set and also lock the set
1578 		 * on those hosts that know about it.
1579 		 */
1580 		nd = sd->sd_nodelist;
1581 		while (nd) {
1582 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1583 				nd = nd->nd_next;
1584 				continue;
1585 			}
1586 			if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
1587 				/*
1588 				 * Tell the remote node to remove this node
1589 				 */
1590 				if (clnt_delhosts(nd->nd_nodename, sp, 1,
1591 					&thishost, ep) == -1) {
1592 					/*
1593 					 * If we fail to delete ourselves
1594 					 * from the remote host it does not
1595 					 * really matter because the set is
1596 					 * being "purged" from this node. The
1597 					 * set can be purged from the other
1598 					 * node at a later time.
1599 					 */
1600 					mdclrerror(ep);
1601 				}
1602 				nd = nd->nd_next;
1603 				continue;
1604 			}
1605 			/* remove the set from this host */
1606 			if (clnt_delset(nd->nd_nodename, sp, ep) == -1) {
1607 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1608 				if (!bypass_cluster && num_hosts == 1)
1609 					(void) sdssc_delete_end(sp->setname,
1610 					    SDSSC_CLEANUP);
1611 				mdclrerror(ep);
1612 				goto out1;
1613 			}
1614 			nd = nd->nd_next;
1615 		}
1616 	} else {
1617 		for (i = 0; i < MD_MAXSIDES; i++) {
1618 			/* Skip empty slots */
1619 			if (sd->sd_nodes[i][0] == '\0')
1620 				continue;
1621 			if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
1622 				/*
1623 				 * Tell the remote node to remove this node
1624 				 */
1625 				if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
1626 				    &thishost, ep) == -1) {
1627 					/*
1628 					 * If we fail to delete ourselves
1629 					 * from the remote host it does not
1630 					 * really matter because the set is
1631 					 * being "purged" from this node. The
1632 					 * set can be purged from the other
1633 					 * node at a later time.
1634 					 */
1635 					mdclrerror(ep);
1636 				}
1637 				continue;
1638 			}
1639 
1640 			/* remove the set from this host */
1641 			if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
1642 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1643 				if (!bypass_cluster && num_hosts == 1)
1644 					(void) sdssc_delete_end(sp->setname,
1645 					    SDSSC_CLEANUP);
1646 				mdclrerror(ep);
1647 				goto out1;
1648 			}
1649 		}
1650 	}
1651 
1652 	if (!bypass_cluster && num_hosts == 1) {
1653 		if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
1654 		    SDSSC_ERROR) {
1655 			rval = 4;
1656 		}
1657 	}
1658 
1659 out1:
1660 
1661 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1662 
1663 	/*
1664 	 * Remove the set lock on those nodes that had the set locked
1665 	 * max_node will either be MD_MAXSIDES or array index of the last
1666 	 * node contacted (or rather failed to contact) for traditional
1667 	 * diskset.  For a MN diskset, max_node is the node_id of the node
1668 	 * that failed the lock.
1669 	 */
1670 	if (MD_MNSET_DESC(sd)) {
1671 		nd = sd->sd_nodelist;
1672 		while (nd) {
1673 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1674 				nd = nd->nd_next;
1675 				continue;
1676 			}
1677 			if (nd->nd_nodeid == max_node)
1678 				break;
1679 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1680 				if (forceflg && mdanyrpcerror(&xep)) {
1681 					mdclrerror(&xep);
1682 					nd = nd->nd_next;
1683 					continue;
1684 				}
1685 				if (rval == 0)
1686 					(void) mdstealerror(ep, &xep);
1687 				rval = 5;
1688 			}
1689 			nd = nd->nd_next;
1690 		}
1691 	} else {
1692 		for (i = 0; i < max_node; i++) {
1693 			/* Skip empty slots */
1694 			if (sd->sd_nodes[i][0] == '\0')
1695 				continue;
1696 
1697 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1698 				if (forceflg && mdanyrpcerror(&xep)) {
1699 					mdclrerror(&xep);
1700 					continue;
1701 				}
1702 				if (rval == 0)
1703 					(void) mdstealerror(ep, &xep);
1704 				rval = 5;
1705 			}
1706 		}
1707 	}
1708 
1709 	cl_set_setkey(NULL);
1710 
1711 	return (rval);
1712 }
1713 
1714 int
1715 meta_set_query(
1716 	mdsetname_t		*sp,
1717 	mddb_dtag_lst_t		**dtlpp,
1718 	md_error_t		*ep
1719 )
1720 {
1721 	mddb_dtag_get_parm_t	dtgp;
1722 
1723 	(void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
1724 	dtgp.dtgp_setno = sp->setno;
1725 
1726 	/*CONSTCOND*/
1727 	while (1) {
1728 		if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
1729 			if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
1730 			    *dtlpp == NULL)
1731 				return (mdstealerror(ep, &dtgp.dtgp_mde));
1732 			else
1733 				break;
1734 
1735 		/*
1736 		 * Run to the end of the list
1737 		 */
1738 		for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
1739 			/* void */;
1740 
1741 		*dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
1742 
1743 		(void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
1744 		    sizeof (mddb_dtag_t));
1745 
1746 		dtgp.dtgp_dt.dt_id++;
1747 	}
1748 	return (0);
1749 }
1750 
1751 /*
1752  * return drivename get by key
1753  */
1754 mddrivename_t *
1755 metadrivename_withdrkey(
1756 	mdsetname_t	*sp,
1757 	side_t		sideno,
1758 	mdkey_t		key,
1759 	int		flags,
1760 	md_error_t	*ep
1761 )
1762 {
1763 	char		*nm;
1764 	mdname_t	*np;
1765 	mddrivename_t	*dnp;
1766 	ddi_devid_t	devidp;
1767 	md_set_desc	*sd;
1768 
1769 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1770 		return (NULL);
1771 	}
1772 
1773 	/* get namespace info */
1774 	if (MD_MNSET_DESC(sd)) {
1775 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno,
1776 		    key, ep)) == NULL)
1777 			return (NULL);
1778 	} else {
1779 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno+SKEW,
1780 		    key, ep)) == NULL)
1781 			return (NULL);
1782 	}
1783 
1784 	/* get device name */
1785 	if (flags & PRINT_FAST) {
1786 		if ((np = metaname_fast(&sp, nm, ep)) == NULL) {
1787 			Free(nm);
1788 			return (NULL);
1789 		}
1790 	} else {
1791 		if ((np = metaname(&sp, nm, ep)) == NULL) {
1792 			Free(nm);
1793 			return (NULL);
1794 		}
1795 	}
1796 	Free(nm);
1797 
1798 	/* make sure it's OK */
1799 	if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, ep) != 0))
1800 		return (NULL);
1801 
1802 	/* get drivename */
1803 	dnp = np->drivenamep;
1804 	dnp->side_names_key = key;
1805 
1806 	/*
1807 	 * Skip the following devid check if dnp is did device
1808 	 * The device id is disabled for did device due to the
1809 	 * lack of minor name support in the did driver. The following
1810 	 * devid code path can set and propagate the error and
1811 	 * eventually prevent did disks from being added to the
1812 	 * diskset under SunCluster systems
1813 	 */
1814 	if (strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) == 0) {
1815 		goto out;
1816 	}
1817 
1818 	/* Also, Skip the check if MN diskset, no devid's */
1819 	if (MD_MNSET_DESC(sd)) {
1820 		goto out;
1821 	}
1822 
1823 	/*
1824 	 * Get the devid associated with the key.
1825 	 *
1826 	 * If a devid was returned, it MUST be valid even in
1827 	 * the case where a device id has been "updated". The
1828 	 * "update" of the device id may have occured due to
1829 	 * a firmware upgrade.
1830 	 */
1831 	if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
1832 	    != NULL) {
1833 		dnp->devid = devid_str_encode(devidp, NULL);
1834 		free(devidp);
1835 	} else {
1836 		/*
1837 		 * It is okay if replica is not in devid mode
1838 		 */
1839 		if (mdissyserror(ep, MDDB_F_NODEVID)) {
1840 			mdclrerror(ep);
1841 			goto out;
1842 		}
1843 
1844 		/*
1845 		 * devid is missing so this means that we have
1846 		 * just upgraded from a configuration where
1847 		 * devid's were not used so try to add in
1848 		 * the devid and requery.
1849 		 */
1850 		if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key,
1851 		    ep) < 0)
1852 			return (NULL);
1853 		if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
1854 		    sideno+SKEW, key, ep)) == NULL)
1855 			return (NULL);
1856 		dnp->devid = devid_str_encode(devidp, NULL);
1857 		devid_free(devidp);
1858 	}
1859 
1860 out:
1861 	if (flags & MD_BYPASS_DAEMON)
1862 		return (dnp);
1863 
1864 	if (get_sidenmlist(sp, dnp, ep))
1865 		return (NULL);
1866 
1867 	/* return success */
1868 	return (dnp);
1869 }
1870 
1871 void
1872 metafreedrivedesc(md_drive_desc **dd)
1873 {
1874 	md_drive_desc	*p, *next = NULL;
1875 
1876 	for (p = *dd; p != NULL; p = next) {
1877 		next = p->dd_next;
1878 		Free(p);
1879 	}
1880 	*dd = NULL;
1881 }
1882 
1883 md_drive_desc *
1884 metaget_drivedesc(
1885 	mdsetname_t	*sp,
1886 	int		flags,
1887 	md_error_t	*ep
1888 )
1889 {
1890 	side_t		sideno = MD_SIDEWILD;
1891 
1892 	assert(! (flags & MD_BYPASS_DAEMON));
1893 
1894 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
1895 		return (NULL);
1896 
1897 	return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
1898 }
1899 
1900 md_drive_desc *
1901 metaget_drivedesc_fromnamelist(
1902 	mdsetname_t	*sp,
1903 	mdnamelist_t	*nlp,
1904 	md_error_t	*ep
1905 )
1906 {
1907 	md_set_desc		*sd;
1908 	mdnamelist_t		*p;
1909 	md_drive_desc		*dd = NULL;
1910 
1911 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1912 		return (NULL);
1913 
1914 	for (p = nlp; p != NULL; p = p->next)
1915 		(void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
1916 		    sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
1917 
1918 	return (dd);
1919 }
1920 
1921 md_drive_desc *
1922 metaget_drivedesc_sideno(
1923 	mdsetname_t *sp,
1924 	side_t sideno,
1925 	int flags,
1926 	md_error_t *ep
1927 )
1928 {
1929 	md_set_desc	*sd = NULL;
1930 
1931 	assert(! (flags & MD_BYPASS_DAEMON));
1932 
1933 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1934 		return (NULL);
1935 
1936 	if (sd->sd_drvs)
1937 		return (sd->sd_drvs);
1938 
1939 	if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
1940 		return (NULL);
1941 
1942 	return (sd->sd_drvs);
1943 }
1944 
1945 int
1946 metaget_setownership(
1947 	mdsetname_t	*sp,
1948 	md_error_t	*ep
1949 )
1950 {
1951 	md_set_desc	*sd;
1952 	int		bool;
1953 	int		i;
1954 	md_mnnode_desc	*nd;
1955 
1956 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1957 		return (-1);
1958 
1959 	if (MD_MNSET_DESC(sd)) {
1960 		nd = sd->sd_nodelist;
1961 		while (nd) {
1962 			/* If node isn't alive, can't own diskset */
1963 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1964 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1965 				nd = nd->nd_next;
1966 				continue;
1967 			}
1968 			/*
1969 			 * If can't communicate with rpc.metad, then mark
1970 			 * this node as not an owner.  That node may
1971 			 * in fact, be an owner, but without rpc.metad running
1972 			 * that node can't do much.
1973 			 */
1974 			if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
1975 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1976 			} else if (bool == TRUE) {
1977 				nd->nd_flags |= MD_MN_NODE_OWN;
1978 			} else {
1979 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1980 			}
1981 			nd = nd->nd_next;
1982 		}
1983 		return (0);
1984 	}
1985 
1986 	/* Rest of code handles traditional disksets */
1987 
1988 	for (i = 0; i < MD_MAXSIDES; i++)
1989 		sd->sd_isown[i] = 0;
1990 
1991 	if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
1992 		return (-1);
1993 
1994 	if (bool == TRUE)
1995 		sd->sd_isown[getmyside(sp, ep)] = 1;
1996 
1997 	return (0);
1998 }
1999 
2000 char *
2001 mynode(void)
2002 {
2003 	static struct utsname	myuname;
2004 	static int		done = 0;
2005 
2006 	if (! done) {
2007 		if (uname(&myuname) == -1) {
2008 			md_perror(dgettext(TEXT_DOMAIN, "uname"));
2009 			assert(0);
2010 		}
2011 		done = 1;
2012 	}
2013 	return (myuname.nodename);
2014 }
2015 
/*
 * Report whether the string str is one of the first cnt entries of
 * the string array lst.
 *
 * Returns TRUE if an entry compares equal to str, FALSE otherwise
 * (including when cnt is zero or negative).
 */
int
strinlst(char *str, int cnt, char **lst)
{
	int	pos = 0;

	while (pos < cnt) {
		if (strcmp(lst[pos], str) == 0)
			return (TRUE);
		pos++;
	}

	return (FALSE);
}
2027 
2028 /*
2029  * meta_get_reserved_names
2030  *  returns an mdnamelist_t of reserved slices
2031  *  reserved slices are those that are used but don't necessarily
2032  *  show up as metadevices (ex. reserved slice for db in sets, logs)
2033  */
2034 
2035 /*ARGSUSED*/
2036 int
2037 meta_get_reserved_names(
2038 	mdsetname_t	*sp,
2039 	mdnamelist_t	**nlpp,
2040 	int		options,
2041 	md_error_t	*ep)
2042 {
2043 	int		 count		= 0;
2044 	mdname_t	*np		= NULL;
2045 	mdnamelist_t	*transnlp	= NULL;
2046 	mdnamelist_t	**tailpp 	= nlpp;
2047 	mdnamelist_t	*nlp;
2048 	md_drive_desc	*dd, *di;
2049 
2050 	if (metaislocalset(sp))
2051 		goto out;
2052 
2053 	if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
2054 		count = -1;
2055 		goto out;
2056 	}
2057 
2058 	/* db in for sets on reserved slice */
2059 	for (di = dd; di && count >= 0; di = di->dd_next) {
2060 		uint_t	rep_slice;
2061 
2062 		/*
2063 		 * Add the name struct to the end of the
2064 		 * namelist but keep a pointer to the last
2065 		 * element so that we don't incur the overhead
2066 		 * of traversing the list each time
2067 		 */
2068 		if (di->dd_dnp &&
2069 		    (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
2070 		    (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
2071 		    (tailpp = meta_namelist_append_wrapper(tailpp, np)))
2072 			count++;
2073 		else
2074 			count = -1;
2075 	}
2076 
2077 	/* now find logs */
2078 	if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
2079 		count = -1;
2080 		goto out;
2081 	}
2082 
2083 	for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
2084 		mdname_t	*transnp = nlp->namep;
2085 		md_trans_t	*transp;
2086 
2087 		if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
2088 			count = -1;
2089 			goto out;
2090 		}
2091 		if (transp->lognamep) {
2092 			/*
2093 			 * Add the name struct to the end of the
2094 			 * namelist but keep a pointer to the last
2095 			 * element so that we don't incur the overhead
2096 			 * of traversing the list each time
2097 			 */
2098 			tailpp = meta_namelist_append_wrapper(
2099 			    tailpp, transp->lognamep);
2100 		}
2101 	}
2102 out:
2103 	metafreenamelist(transnlp);
2104 	return (count);
2105 }
2106 
2107 /*
2108  * Entry point to join a node to MultiNode diskset.
2109  *
2110  * Validate host in diskset.
2111  *	- Should be in membership list from API
2112  *	- Should not already be joined into diskset.
2113  *	- Set must have drives
2114  * Assume valid configuration is stored in the set/drive/node records
2115  * in the local mddb since no node or drive can be added to the MNset
2116  * unless all drives and nodes are available.  Reconfig steps will
2117  * resync all ALIVE nodes in case of panic in critical areas.
2118  *
2119  * Lock down the set.
2120  * Verify host is a member of this diskset.
2121  * If drives exist in the configuration, load the mddbs.
2122  * Set this node to active by notifying master if one exists.
2123  * If this is the first node active in the diskset, this node
2124  * 	becomes the master.
2125  * Unlock the set.
2126  *
2127  * Mirror Resync:
2128  * If this node is the last node to join the set and clustering
2129  * isn't running, then start the 'metasync -r' type resync
2130  * on all mirrors in this diskset.
2131  * If clustering is running, this resync operation will
2132  * be handled by the reconfig steps and should NOT
2133  * be handled during a join operation.
2134  *
2135  * There are multiple return values in order to assist
2136  * the join operation of all sets in the metaset command.
2137  *
2138  * Return values:
2139  *	0  - Node successfully joined to set.
2140  *	-1 - Join attempted but failed
2141  *		- any failure from libmeta calls
2142  *		- node not in the member list
2143  *	-2 - Join not attempted since
2144  *		- this set had no drives in set
2145  *		- this node already joined to set
2146  *		- set is not a multinode set
2147  *	-3 - Node joined to STALE set.
2148  */
2149 extern int
2150 meta_set_join(
2151 	mdsetname_t	*sp,
2152 	md_error_t	*ep
2153 )
2154 {
2155 	md_set_desc		*sd;
2156 	md_drive_desc		*dd;
2157 	md_mnnode_desc		*nd, *nd2, my_nd;
2158 	int			rval = 0;
2159 	md_setkey_t		*cl_sk;
2160 	md_error_t		xep = mdnullerror;
2161 	md_error_t		ep_snarf = mdnullerror;
2162 	int			master_flag = 0;
2163 	md_mnset_record		*mas_mnsr = NULL;
2164 	int			clear_nr_flags = 0;
2165 	md_mnnode_record	*nr;
2166 	int			stale_set = 0;
2167 	int			rb_flags = 0;
2168 	int			stale_bool = FALSE;
2169 	int			suspendall_flag = 0;
2170 	int			suspend1_flag = 0;
2171 	sigset_t		oldsigs;
2172 	int			send_reinit = 0;
2173 
2174 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2175 		return (-1);
2176 	}
2177 
2178 	/* Must be a multinode diskset */
2179 	if (!MD_MNSET_DESC(sd)) {
2180 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2181 		return (-2);
2182 	}
2183 
2184 	/* Verify that the node is ALIVE (i.e. is in the API membership list) */
2185 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
2186 		(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
2187 			sd->sd_mn_mynode->nd_nodename, NULL,
2188 			sp->setname);
2189 		return (-1);
2190 	}
2191 
2192 	/* Make sure we are blocking all signals */
2193 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2194 		mdclrerror(&xep);
2195 
2196 	/*
2197 	 * Lock the set on current set members.
2198 	 * For MN diskset lock_set and SUSPEND are used to protect against
2199 	 * other meta* commands running on the other nodes.
2200 	 */
2201 	nd = sd->sd_nodelist;
2202 	while (nd) {
2203 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2204 			nd = nd->nd_next;
2205 			continue;
2206 		}
2207 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2208 			rval = -1;
2209 			goto out;
2210 		}
2211 		nd = nd->nd_next;
2212 	}
2213 
2214 	/*
2215 	 * Lock out other meta* commands by suspending
2216 	 * class 1 messages across the diskset.
2217 	 */
2218 	nd = sd->sd_nodelist;
2219 	while (nd) {
2220 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2221 			nd = nd->nd_next;
2222 			continue;
2223 		}
2224 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2225 			    sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2226 			rval = -1;
2227 			goto out;
2228 		}
2229 		suspend1_flag = 1;
2230 		nd = nd->nd_next;
2231 	}
2232 
2233 	/*
2234 	 * Verify that this host is a member (in the host list) of the set.
2235 	 */
2236 	nd = sd->sd_nodelist;
2237 	while (nd) {
2238 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2239 			break;
2240 		}
2241 		nd = nd->nd_next;
2242 	}
2243 	if (!nd) {
2244 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2245 			sd->sd_mn_mynode->nd_nodename, NULL,
2246 			sp->setname);
2247 		rval = -1;
2248 		goto out;
2249 	}
2250 
2251 	/*
2252 	 * Need to return failure if host is already 'joined'
2253 	 * into the set.  This is done so that if later the user
2254 	 * issues a command to join all sets and a failure is
2255 	 * encountered - that the resulting cleanup effort
2256 	 * (withdrawing from all sets that were joined
2257 	 * during that command) won't withdraw from this set.
2258 	 */
2259 	if (nd->nd_flags & MD_MN_NODE_OWN) {
2260 		rval = -2;
2261 		goto out2;
2262 	}
2263 
2264 	/*
2265 	 * Call metaget_setownership that calls each node in diskset and
2266 	 * marks in set descriptor if node is an owner of the set or not.
2267 	 * metaget_setownership checks to see if a node is an owner by
2268 	 * checking to see if that node's kernel has the mddb loaded.
2269 	 * If a node had panic'd during a reconfig or an
2270 	 * add/delete/join/withdraw operation, the other nodes' node
2271 	 * records may not reflect the current state of the diskset,
2272 	 * so calling metaget_setownership is the safest thing to do.
2273 	 */
2274 	if (metaget_setownership(sp, ep) == -1) {
2275 		rval = -1;
2276 		goto out;
2277 	}
2278 
2279 	/* If first active member of diskset, become the master. */
2280 	nd = sd->sd_nodelist;
2281 	while (nd) {
2282 		if (nd->nd_flags & MD_MN_NODE_OWN)
2283 			break;
2284 		nd = nd->nd_next;
2285 	}
2286 	if (nd == NULL)
2287 		master_flag = 1;
2288 
2289 	/*
2290 	 * If not first active member of diskset, then get the
2291 	 * master information from a node that is already joined
2292 	 * and set the master information for this node.  Be sure
2293 	 * that this node (the already joined node) has its own
2294 	 * join flag set.  If not, then this diskset isn't currently
2295 	 * consistent and shouldn't allow a node to join.  This diskset
2296 	 * inconsistency should only occur when a node has panic'd in
2297 	 * the set while doing a metaset operation and the sysadmin is
2298 	 * attempting to join a node into the set.  This inconsistency
2299 	 * will be fixed during a reconfig cycle which should be occurring
2300 	 * soon since a node panic'd.
2301 	 *
2302 	 * If unable to get this information from an owning node, then
2303 	 * this diskset isn't currently consistent and shouldn't
2304 	 * allow a node to join.
2305 	 */
2306 	if (!master_flag) {
2307 		/* get master information from an owner (joined) node */
2308 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
2309 		    sp->setno, &mas_mnsr, ep) == -1) {
2310 			rval = -1;
2311 			goto out;
2312 		}
2313 
2314 		/* Verify that owner (joined) node has its own JOIN flag set */
2315 		nr = mas_mnsr->sr_nodechain;
2316 		while (nr) {
2317 			if ((nd->nd_nodeid == nr->nr_nodeid) &&
2318 			    ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
2319 				(void) mddserror(ep, MDE_DS_NODENOSET,
2320 				    sp->setno, nd->nd_nodename, NULL,
2321 				    nd->nd_nodename);
2322 				free_sr((md_set_record *)mas_mnsr);
2323 				rval = -1;
2324 				goto out;
2325 			}
2326 			nr = nr->nr_next;
2327 		}
2328 
2329 		/*
2330 		 * Does master have set marked as STALE?
2331 		 * If so, need to pass this down to kernel when
2332 		 * this node snarfs the set.
2333 		 */
2334 		if (clnt_mn_is_stale(nd->nd_nodename, sp,
2335 		    &stale_bool, ep) == -1) {
2336 			rval = -1;
2337 			goto out;
2338 		}
2339 
2340 		/* set master information in my rpc.metad's set record */
2341 		if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
2342 		    mas_mnsr->sr_master_nodeid, ep)) {
2343 			free_sr((md_set_record *)mas_mnsr);
2344 			rval = -1;
2345 			goto out;
2346 		}
2347 
2348 		/* set master information in my cached set desc */
2349 		(void) strcpy(sd->sd_mn_master_nodenm,
2350 		    mas_mnsr->sr_master_nodenm);
2351 		sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
2352 		nd2 = sd->sd_nodelist;
2353 		while (nd2) {
2354 		    if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
2355 			sd->sd_mn_masternode = nd2;
2356 			break;
2357 		    }
2358 		    nd2 = nd2->nd_next;
2359 		}
2360 		free_sr((md_set_record *)mas_mnsr);
2361 
2362 		/*
2363 		 * Set the node flags in mynode's rpc.metad node records for
2364 		 * the nodes that are in the diskset.  Can use my sd
2365 		 * since earlier call to metaget_setownership set the
2366 		 * owner flags based on whether that node had snarfed
2367 		 * the MN diskset mddb.  Reconfig steps guarantee that
2368 		 * return of metaget_setownership will match the owning
2369 		 * node's owner list except in the case where a node
2370 		 * has just panic'd and in this case, a reconfig will
2371 		 * be starting immediately and the owner lists will
2372 		 * be sync'd up by the reconfig.
2373 		 *
2374 		 * Flag of SET means to take no action except to
2375 		 * set the node flags as given in the nodelist linked list.
2376 		 */
2377 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
2378 		    MD_NR_SET, NULL, ep)) {
2379 			rval = -1;
2380 			goto out;
2381 		}
2382 	}
2383 
2384 	/*
2385 	 * Read in the mddb if there are drives in the set.
2386 	 */
2387 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2388 	    ep)) == NULL) {
2389 		/* No drives in list */
2390 		if (! mdisok(ep)) {
2391 			rval = -1;
2392 			goto out;
2393 		}
2394 		rval = -2;
2395 		goto out;
2396 	}
2397 
2398 	/*
2399 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2400 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2401 	 * then change the nodelist followed by a reinit and resume.
2402 	 */
2403 	nd = sd->sd_nodelist;
2404 	while (nd) {
2405 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2406 			nd = nd->nd_next;
2407 			continue;
2408 		}
2409 
2410 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
2411 		    MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2412 			rval = -1;
2413 			goto out;
2414 		}
2415 		suspendall_flag = 1;
2416 		nd = nd->nd_next;
2417 	}
2418 
2419 	/* Set master in my set record in rpc.metad */
2420 	if (master_flag) {
2421 		if (clnt_mnsetmaster(mynode(), sp,
2422 		    sd->sd_mn_mynode->nd_nodename,
2423 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
2424 			rval = -1;
2425 			goto out;
2426 		}
2427 	}
2428 	/* Causes mddbs to be loaded in kernel */
2429 	if (setup_db_bydd(sp, dd, 0, ep) == -1) {
2430 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2431 		    "Host not able to start diskset."));
2432 		rval = -1;
2433 		goto out;
2434 	}
2435 
2436 	if (! mdisok(ep)) {
2437 		rval = -1;
2438 		goto out;
2439 	}
2440 
2441 	/*
2442 	 * Set rollback flags to 1 so that halt_set is called if a failure
2443 	 * is seen after this point.  If snarf_set fails, still need to
2444 	 * call halt_set to cleanup the diskset.
2445 	 */
2446 	rb_flags = 1;
2447 
2448 	/* Starts the set */
2449 	if (snarf_set(sp, stale_bool, ep) != 0) {
2450 		if (mdismddberror(ep, MDE_DB_STALE)) {
2451 			/*
2452 			 * Don't fail join, STALE means that set has
2453 			 * < 50% mddbs.
2454 			 */
2455 			(void) mdstealerror(&ep_snarf, ep);
2456 			stale_set = 1;
2457 		} else if (mdisok(ep)) {
2458 			/* If snarf failed, but no error was set - set it */
2459 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2460 			    sp->setno, 0, NULL);
2461 				rval = -1;
2462 				goto out;
2463 		} else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
2464 			/*
2465 			 * Don't fail join if ACCOK; ACCOK means that mediator
2466 			 * provided extra vote.
2467 			 */
2468 			rval = -1;
2469 			goto out;
2470 		}
2471 	}
2472 
2473 	/* Did set really get snarfed? */
2474 	if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
2475 		if (mdisok(ep)) {
2476 			/* If snarf failed, but no error was set - set it */
2477 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2478 				sp->setno, 0, NULL);
2479 		}
2480 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2481 		    "Host not able to start diskset."));
2482 		rval = -1;
2483 		goto out;
2484 	}
2485 
2486 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2487 	send_reinit = 1;
2488 
2489 	/* If first node to enter set, setup master and clear change log */
2490 	if (master_flag) {
2491 		/* Set master in my locally cached set descriptor */
2492 		(void) strcpy(sd->sd_mn_master_nodenm,
2493 		    sd->sd_mn_mynode->nd_nodename);
2494 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
2495 		sd->sd_mn_am_i_master = 1;
2496 
2497 		/*
2498 		 * If first node to join set, then clear out change log
2499 		 * entries.  Change log entries are only needed when a
2500 		 * change of master is occurring in a diskset that has
2501 		 * multiple owners.   Since this node is the first owner
2502 		 * of the diskset, clear the entries.
2503 		 *
2504 		 * Only do this if we are in a single node non-SC3.x
2505 		 * situation.
2506 		 */
2507 		if (meta_mn_singlenode() &&
2508 			mdmn_reset_changelog(sp, ep,  MDMN_CLF_RESETLOG) != 0) {
2509 			mde_perror(ep, dgettext(TEXT_DOMAIN,
2510 			    "Unable to reset changelog."));
2511 			rval = -1;
2512 			goto out;
2513 		}
2514 	}
2515 
2516 	/* Set my locally cached flag */
2517 	sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
2518 
2519 	/*
2520 	 * Set this node's own flag on all joined nodes in the set
2521 	 * (including my node).
2522 	 */
2523 	clear_nr_flags = 1;
2524 
2525 	my_nd = *(sd->sd_mn_mynode);
2526 	my_nd.nd_next = NULL;
2527 	nd = sd->sd_nodelist;
2528 	while (nd) {
2529 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2530 			nd = nd->nd_next;
2531 			continue;
2532 		}
2533 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2534 		    MD_NR_JOIN, NULL, ep)) {
2535 			rval = -1;
2536 			goto out;
2537 		}
2538 		nd = nd->nd_next;
2539 	}
2540 
2541 out:
2542 	if (rval != NULL) {
2543 		/*
2544 		 * If rollback flag is 1, then node was joined to set.
2545 		 * Since an error occurred, withdraw node from set in
2546 		 * order to rollback to before command was run.
2547 		 * Need to preserve ep so that calling function can
2548 		 * get error information.
2549 		 */
2550 		if (rb_flags == 1) {
2551 			if (halt_set(sp, &xep)) {
2552 				mdclrerror(&xep);
2553 			}
2554 		}
2555 
2556 		/*
2557 		 * If error, reset master to INVALID.
2558 		 * Ignore error since (next) first node to successfully join
2559 		 * will set master on all nodes.
2560 		 */
2561 		(void) clnt_mnsetmaster(mynode(), sp, "",
2562 			MD_MN_INVALID_NID, &xep);
2563 		mdclrerror(&xep);
2564 		/* Reset master in my locally cached set descriptor */
2565 		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
2566 		sd->sd_mn_am_i_master = 0;
2567 
2568 		/*
2569 		 * If nr flags set on other nodes, reset them.
2570 		 */
2571 		if (clear_nr_flags) {
2572 			nd = sd->sd_nodelist;
2573 			while (nd) {
2574 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2575 					nd = nd->nd_next;
2576 					continue;
2577 				}
2578 				(void) clnt_upd_nr_flags(nd->nd_nodename, sp,
2579 					&my_nd, MD_NR_WITHDRAW, NULL, &xep);
2580 				mdclrerror(&xep);
2581 				nd = nd->nd_next;
2582 			}
2583 			/* Reset my locally cached flag */
2584 			sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
2585 		}
2586 	}
2587 
2588 	/*
2589 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2590 	 * Send reinit command to mdcommd which forces it to get
2591 	 * fresh set description.
2592 	 */
2593 	if (send_reinit) {
2594 		/* Send reinit */
2595 		nd = sd->sd_nodelist;
2596 		while (nd) {
2597 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2598 				nd = nd->nd_next;
2599 				continue;
2600 			}
2601 
2602 			/* Class is ignored for REINIT */
2603 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2604 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2605 				/*
2606 				 * We are here because we failed to resume
2607 				 * rpc.mdcommd.  However we potentially have
2608 				 * an error from the previous call
2609 				 * If the previous call did fail,  we capture
2610 				 * that error and generate a perror with
2611 				 * the string, "Unable to resume...".
2612 				 * Setting rval to -1 ensures that in the
2613 				 * next iteration of the loop, ep is not
2614 				 * clobbered.
2615 				 */
2616 				if (rval == 0)
2617 					(void) mdstealerror(ep, &xep);
2618 				else
2619 					mdclrerror(&xep);
2620 				rval = -1;
2621 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2622 				    "Unable to reinit rpc.mdcommd."));
2623 			}
2624 			nd = nd->nd_next;
2625 		}
2626 
2627 	}
2628 
2629 out2:
2630 	/*
2631 	 * Unlock diskset by resuming messages across the diskset.
2632 	 * Just resume all classes so that resume is the same whether
2633 	 * just one class was locked or all classes were locked.
2634 	 */
2635 	if ((suspend1_flag) || (suspendall_flag)) {
2636 		nd = sd->sd_nodelist;
2637 		while (nd) {
2638 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2639 				nd = nd->nd_next;
2640 				continue;
2641 			}
2642 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2643 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2644 				/*
2645 				 * We are here because we failed to resume
2646 				 * rpc.mdcommd.  However we potentially have
2647 				 * an error from the previous call
2648 				 * If the previous call did fail,  we capture
2649 				 * that error and generate a perror with
2650 				 * the string, "Unable to resume...".
2651 				 * Setting rval to -1 ensures that in the
2652 				 * next iteration of the loop, ep is not
2653 				 * clobbered.
2654 				 */
2655 				if (rval == 0)
2656 					(void) mdstealerror(ep, &xep);
2657 				else
2658 					mdclrerror(&xep);
2659 				rval = -1;
2660 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2661 				    "Unable to resume rpc.mdcommd."));
2662 			}
2663 			nd = nd->nd_next;
2664 		}
2665 		meta_ping_mnset(sp->setno);
2666 	}
2667 
2668 	/*
2669 	 * Unlock set.  This flushes the caches on the servers.
2670 	 */
2671 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2672 	nd = sd->sd_nodelist;
2673 	while (nd) {
2674 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2675 			nd = nd->nd_next;
2676 			continue;
2677 		}
2678 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2679 			if (rval == 0)
2680 				(void) mdstealerror(ep, &xep);
2681 			else
2682 				mdclrerror(&xep);
2683 			rval = -1;
2684 		}
2685 		nd = nd->nd_next;
2686 	}
2687 
2688 	/*
2689 	 * If this node is the last to join the diskset and clustering isn't
2690 	 * running, then resync the mirrors in the diskset. We have to wait
2691 	 * until all nodes are joined so that the status gets propagated to
2692 	 * all of the members of the set.
2693 	 * Ignore any error from the resync as the join function shouldn't fail
2694 	 * because the mirror resync had a problem.
2695 	 *
2696 	 * Don't start resync if set is stale.
2697 	 */
2698 	if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
2699 	    (stale_set != 1)) {
2700 		nd = sd->sd_nodelist;
2701 		while (nd) {
2702 			if (!(nd->nd_flags & MD_MN_NODE_OWN))
2703 				break;
2704 			nd = nd->nd_next;
2705 		}
2706 		/*
2707 		 * nd set to NULL means that we have no nodes in the set that
2708 		 * haven't joined. In this case we start the resync.
2709 		 */
2710 		if (nd == NULL) {
2711 			(void) meta_mirror_resync_all(sp, 0, &xep);
2712 			mdclrerror(&xep);
2713 		}
2714 	}
2715 
2716 	/* Update ABR state for all soft partitions */
2717 	(void) meta_sp_update_abr(sp, &xep);
2718 	mdclrerror(&xep);
2719 
2720 	/*
2721 	 * call metaflushsetnames to reset local cache for master and
2722 	 * node information.
2723 	 */
2724 	metaflushsetname(sp);
2725 
2726 	/* release signals back to what they were on entry */
2727 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2728 		mdclrerror(&xep);
2729 
2730 	/*
2731 	 * If no error and stale_set is set, then set ep back
2732 	 * to ep from snarf_set call and return -3.  If another error
2733 	 * occurred and rval is not 0, then that error would have
2734 	 * caused the node to be withdrawn from the set and would
2735 	 * have set ep to that error information.
2736 	 */
2737 	if ((rval == 0) && (stale_set)) {
2738 		(void) mdstealerror(ep, &ep_snarf);
2739 		return (-3);
2740 	}
2741 
2742 	return (rval);
2743 }
2744 
2745 /*
2746  * Entry point to withdraw a node from MultiNode diskset.
2747  *
2748  * Validate host in diskset.
2749  *	- Should be joined into diskset.
2750  * Assume valid configuration is stored in the set/drive/node records
2751  * in the local mddb since no node or drive can be added to the MNset
2752  * unless all drives and nodes are available.  Reconfig steps will
2753  * resync all ALIVE nodes in case of panic in critical areas.
2754  *
2755  * Lock down the set.
2756  * Verify that drives exist in configuration.
2757  * Verify host is a member of this diskset.
2758  * Verify host is an owner of the diskset (host is joined to diskset).
2759  * Only allow withdrawal of master node if master node is the only joined
2760  * in the diskset.
2761  * Halt the diskset on this node.
2762  * Reset Master on this node.
2763  * Updated node flags that this node with withdrawn.
2764  * Unlock the set.
2765  *
2766  * Return values:
2767  *	0  - Node successfully withdrew from set.
2768  *	-1 - Withdrawal attempted but failed
2769  *		- any failure from libmeta calls
2770  *		- node not in the member list
2771  *	-2 - Withdrawal not attempted since
2772  *		- this set had no drives in set
2773  *		- this node not joined to set
2774  *		- set is not a multinode set
2775  */
2776 extern int
2777 meta_set_withdraw(
2778 	mdsetname_t	*sp,
2779 	md_error_t	*ep
2780 )
2781 {
2782 	md_set_desc		*sd;
2783 	md_drive_desc		*dd = 0;
2784 	md_mnnode_desc		*nd, my_nd;
2785 	int			rval = 0;
2786 	md_setkey_t		*cl_sk;
2787 	md_error_t		xep = mdnullerror;
2788 	int			set_halted = 0;
2789 	int			suspendall_flag = 0;
2790 	int			suspend1_flag = 0;
2791 	bool_t			stale_bool = FALSE;
2792 	mddb_config_t		c;
2793 	int			node_id_list[1];
2794 	sigset_t		oldsigs;
2795 	int			send_reinit = 0;
2796 
2797 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2798 		return (-1);
2799 	}
2800 
2801 	/* Must be a multinode diskset */
2802 	if (!MD_MNSET_DESC(sd)) {
2803 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2804 		return (-1);
2805 	}
2806 
2807 	/* Make sure we are blocking all signals */
2808 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2809 		mdclrerror(&xep);
2810 
2811 	/*
2812 	 * Lock the set on current set members.
2813 	 * For MN diskset lock_set and SUSPEND are used to protect against
2814 	 * other meta* commands running on the other nodes.
2815 	 */
2816 	nd = sd->sd_nodelist;
2817 	while (nd) {
2818 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2819 			nd = nd->nd_next;
2820 			continue;
2821 		}
2822 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2823 			rval = -1;
2824 			goto out;
2825 		}
2826 		nd = nd->nd_next;
2827 	}
2828 	/*
2829 	 * Lock out other meta* commands by suspending
2830 	 * class 1 messages across the diskset.
2831 	 */
2832 	nd = sd->sd_nodelist;
2833 	while (nd) {
2834 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2835 			nd = nd->nd_next;
2836 			continue;
2837 		}
2838 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2839 			sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2840 			rval = -1;
2841 			goto out;
2842 		}
2843 		suspend1_flag = 1;
2844 		nd = nd->nd_next;
2845 	}
2846 
2847 	/* Get list of drives - needed in case of failure */
2848 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2849 	    ep)) == NULL) {
2850 		/* Error getting drives in list */
2851 		if (! mdisok(ep)) {
2852 			rval = -1;
2853 			goto out2;
2854 		}
2855 		/* no drives in list */
2856 		rval = -2;
2857 		goto out2;
2858 	}
2859 
2860 	/*
2861 	 * Verify that this host is a member (in the host list) of the set.
2862 	 */
2863 	nd = sd->sd_nodelist;
2864 	while (nd) {
2865 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2866 			break;
2867 		}
2868 		nd = nd->nd_next;
2869 	}
2870 	if (!nd) {
2871 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2872 			sd->sd_mn_mynode->nd_nodename, NULL,
2873 			sp->setname);
2874 		rval = -1;
2875 		goto out2;
2876 	}
2877 
2878 	/*
2879 	 * Call metaget_setownership that calls each node in diskset and
2880 	 * marks in set descriptor if node is an owner of the set or not.
2881 	 * metaget_setownership checks to see if a node is an owner by
2882 	 * checking to see if that node's kernel has the mddb loaded.
2883 	 * If a node had panic'd during a reconfig or an
2884 	 * add/delete/join/withdraw operation, the other nodes' node
2885 	 * records may not reflect the current state of the diskset,
2886 	 * so calling metaget_setownership is the safest thing to do.
2887 	 */
2888 	if (metaget_setownership(sp, ep) == -1) {
2889 		rval = -1;
2890 		goto out2;
2891 	}
2892 
2893 	/*
2894 	 * Verify that this node is joined
2895 	 * to diskset (i.e. is an owner of the diskset).
2896 	 */
2897 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
2898 		rval = -2;
2899 		goto out2;
2900 	}
2901 
2902 	/*
2903 	 * For a MN diskset, only withdraw master if it is
2904 	 * the only joined node.
2905 	 */
2906 	if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
2907 		nd = sd->sd_nodelist;
2908 		while (nd) {
2909 			/* Skip my node since checking for other owners */
2910 			if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
2911 				nd = nd->nd_next;
2912 				continue;
2913 			}
2914 			/* If another owner node if found, error */
2915 			if (nd->nd_flags & MD_MN_NODE_OWN) {
2916 				(void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
2917 					sp->setno,
2918 					sd->sd_mn_mynode->nd_nodename, NULL,
2919 					sp->setname);
2920 				rval = -1;
2921 				goto out2;
2922 			}
2923 			nd = nd->nd_next;
2924 		}
2925 	}
2926 
2927 	/*
2928 	 * Is current set STALE?
2929 	 */
2930 	(void) memset(&c, 0, sizeof (c));
2931 	c.c_id = 0;
2932 	c.c_setno = sp->setno;
2933 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2934 		(void) mdstealerror(ep, &c.c_mde);
2935 		rval = -1;
2936 		goto out;
2937 	}
2938 	if (c.c_flags & MDDB_C_STALE) {
2939 		stale_bool = TRUE;
2940 	}
2941 
2942 	/*
2943 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2944 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2945 	 * then change the nodelist followed by a reinit and resume.
2946 	 */
2947 	nd = sd->sd_nodelist;
2948 	while (nd) {
2949 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2950 			nd = nd->nd_next;
2951 			continue;
2952 		}
2953 
2954 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2955 		    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2956 			rval = -1;
2957 			goto out;
2958 		}
2959 		suspendall_flag = 1;
2960 		nd = nd->nd_next;
2961 	}
2962 
2963 	/*
2964 	 * Withdraw the set - halt set.
2965 	 * This will fail if any I/O is occuring to any metadevice which
2966 	 * includes a resync to a mirror metadevice.
2967 	 */
2968 	set_halted = 1;
2969 	if (halt_set(sp, ep)) {
2970 		/* Was set actually halted? */
2971 		if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
2972 			set_halted = 0;
2973 		}
2974 		rval = -1;
2975 		goto out;
2976 	}
2977 
2978 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2979 	send_reinit = 1;
2980 
2981 	/* Reset master on withdrawn node */
2982 	if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
2983 	    MD_MN_INVALID_NID, ep)) {
2984 		rval = -1;
2985 		goto out;
2986 	}
2987 
2988 	/* Mark my node as withdrawn and send to other nodes */
2989 	nd = sd->sd_nodelist;
2990 	my_nd = *(sd->sd_mn_mynode);	/* structure copy */
2991 	my_nd.nd_next = NULL;
2992 	while (nd) {
2993 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2994 			nd = nd->nd_next;
2995 			continue;
2996 		}
2997 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2998 		    MD_NR_WITHDRAW, NULL, ep)) {
2999 			rval = -1;
3000 			goto out;
3001 		}
3002 		nd = nd->nd_next;
3003 	}
3004 
3005 	/*
3006 	 * If withdrawn node is a mirror owner, reset mirror owner
3007 	 * to NULL.  If an error occurs, print a warning and continue.
3008 	 * Don't fail metaset because of mirror owner reset problem since
3009 	 * next node to grab mirror will resolve this issue.
3010 	 * Before next node grabs mirrors, metaset will show the withdrawn
3011 	 * node as owner which is why an attempt to reset the mirror owner
3012 	 * is made.
3013 	 */
3014 	node_id_list[0] = sd->sd_mn_mynode->nd_nodeid;	/* Setup my nodeid */
3015 	nd = sd->sd_nodelist;
3016 	while (nd) {
3017 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3018 			nd = nd->nd_next;
3019 			continue;
3020 		}
3021 		if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
3022 		    1, &node_id_list[0], &xep) == 01) {
3023 			mde_perror(&xep, dgettext(TEXT_DOMAIN,
3024 			    "Unable to reset mirror owner on node %s"),
3025 			    nd->nd_nodename);
3026 			mdclrerror(&xep);
3027 		}
3028 		nd = nd->nd_next;
3029 	}
3030 
3031 out:
3032 	if (rval == -1) {
3033 		/* Rejoin node - Mark node as joined and send to other nodes */
3034 		nd = sd->sd_nodelist;
3035 		my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3036 		my_nd.nd_next = NULL;
3037 		while (nd) {
3038 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3039 				nd = nd->nd_next;
3040 				continue;
3041 			}
3042 			if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3043 			    MD_NR_JOIN, NULL, &xep)) {
3044 				mdclrerror(&xep);
3045 			}
3046 			nd = nd->nd_next;
3047 		}
3048 
3049 		/* Set master on withdrawn node */
3050 		if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
3051 		    sd->sd_mn_master_nodenm,
3052 		    sd->sd_mn_master_nodeid, &xep)) {
3053 			mdclrerror(&xep);
3054 		}
3055 
3056 		/* Join set if halt_set had succeeded */
3057 		if (set_halted) {
3058 			if (setup_db_bydd(sp, dd, 0, &xep) == -1) {
3059 				mdclrerror(&xep);
3060 			}
3061 			/* If set previously stale - make it so at re-join */
3062 			if (snarf_set(sp, stale_bool, &xep) != 0) {
3063 				mdclrerror(&xep);
3064 				(void) halt_set(sp, &xep);
3065 				mdclrerror(&xep);
3066 			}
3067 		}
3068 	}
3069 
3070 	/*
3071 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3072 	 * Send reinit command to mdcommd which forces it to get
3073 	 * fresh set description.
3074 	 */
3075 	if (send_reinit) {
3076 		/* Send reinit */
3077 		nd = sd->sd_nodelist;
3078 		while (nd) {
3079 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3080 				nd = nd->nd_next;
3081 				continue;
3082 			}
3083 
3084 			/* Class is ignored for REINIT */
3085 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3086 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3087 				/*
3088 				 * We are here because we failed to resume
3089 				 * rpc.mdcommd.  However we potentially have
3090 				 * an error from the previous call.
3091 				 * If the previous call did fail,  we
3092 				 * capture that error and generate a perror
3093 				 * withthe string,  "Unable to resume...".
3094 				 * Setting rval to -1 ensures that in the
3095 				 * next iteration of the loop, ep is not
3096 				 * clobbered.
3097 				 */
3098 				if (rval == 0)
3099 					(void) mdstealerror(ep, &xep);
3100 				else
3101 					mdclrerror(&xep);
3102 				rval = -1;
3103 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3104 				    "Unable to reinit rpc.mdcommd."));
3105 			}
3106 			nd = nd->nd_next;
3107 		}
3108 	}
3109 
3110 out2:
3111 	/*
3112 	 * Unlock diskset by resuming messages across the diskset.
3113 	 * Just resume all classes so that resume is the same whether
3114 	 * just one class was locked or all classes were locked.
3115 	 */
3116 	if ((suspend1_flag) || (suspendall_flag)) {
3117 		nd = sd->sd_nodelist;
3118 		while (nd) {
3119 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3120 				nd = nd->nd_next;
3121 				continue;
3122 			}
3123 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3124 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
3125 				/*
3126 				 * We are here because we failed to resume
3127 				 * rpc.mdcommd.  However we potentially have
3128 				 * an error from the previous call
3129 				 * If the previous call did fail,  we capture
3130 				 * that error and generate a perror with
3131 				 * the string, "Unable to resume...".
3132 				 * Setting rval to -1 ensures that in the
3133 				 * next iteration of the loop, ep is not
3134 				 * clobbered.
3135 				 */
3136 				if (rval == 0)
3137 					(void) mdstealerror(ep, &xep);
3138 				else
3139 					mdclrerror(&xep);
3140 				rval = -1;
3141 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3142 				    "Unable to resume rpc.mdcommd."));
3143 			}
3144 			nd = nd->nd_next;
3145 		}
3146 		meta_ping_mnset(sp->setno);
3147 	}
3148 
3149 	/*
3150 	 * Unlock set.  This flushes the caches on the servers.
3151 	 */
3152 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3153 	nd = sd->sd_nodelist;
3154 	while (nd) {
3155 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3156 			nd = nd->nd_next;
3157 			continue;
3158 		}
3159 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
3160 			if (rval == 0)
3161 				(void) mdstealerror(ep, &xep);
3162 			else
3163 				mdclrerror(&xep);
3164 			rval = -1;
3165 		}
3166 		nd = nd->nd_next;
3167 	}
3168 
3169 	/*
3170 	 * call metaflushsetnames to reset local cache for master and
3171 	 * node information.
3172 	 */
3173 	metaflushsetname(sp);
3174 
3175 	/* release signals back to what they were on entry */
3176 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3177 		mdclrerror(&xep);
3178 
3179 	return (rval);
3180 
3181 }
3182 
3183 /*
3184  * Update nodelist with cluster member information.
3185  * A node not in the member list will be marked
3186  * as not ALIVE and not OWN.
3187  * A node in the member list will be marked ALIVE, but
3188  * the OWN bit will not be changed.
3189  *
3190  * If mynode isn't in the membership list, fail causing
3191  * another reconfig cycle to be started since a non-member
3192  * node shouldn't be taking part in the reconfig cycle.
3193  *
3194  * Return values:
3195  *	0 - No problem.
3196  *	1 - Any failure including RPC failure to my node.
3197  */
3198 int
3199 meta_reconfig_update_nodelist(
3200 	mdsetname_t			*sp,
3201 	mndiskset_membershiplist_t	*nl,
3202 	md_set_desc			*sd,
3203 	md_error_t			*ep
3204 )
3205 {
3206 	mndiskset_membershiplist_t	*nl2;
3207 	md_mnnode_desc			*nd;
3208 	md_error_t			xep = mdnullerror;
3209 	int				rval = 0;
3210 
3211 	/*
3212 	 * Walk through nodelist, checking to see if each
3213 	 * node is in the member list.
3214 	 * If node is not a member, reset ALIVE and OWN node flag.
3215 	 * If node is a member, set ALIVE.
3216 	 * If mynode's OWN flag gets reset, then halt the diskset on this node.
3217 	 */
3218 	nd = sd->sd_nodelist;
3219 	while (nd) {
3220 		nl2 = nl;
3221 		while (nl2) {
3222 			/* If node is in member list, set ALIVE */
3223 			if (nl2->msl_node_id == nd->nd_nodeid) {
3224 				nd->nd_flags |= MD_MN_NODE_ALIVE;
3225 				break;
3226 			} else {
3227 				nl2 = nl2->next;
3228 			}
3229 			/* node is not in member list, mark !ALIVE and !OWN */
3230 			if (nl2 == NULL) {
3231 				/* If node is mynode, then halt set if needed */
3232 				if (strcmp(mynode(), nd->nd_nodename) == 0) {
3233 					/*
3234 					 * This shouldn't happen, but just
3235 					 * in case...  Any node not in the
3236 					 * membership list should be dead and
3237 					 * not running reconfig step1.
3238 					 */
3239 					if (nd->nd_flags & MD_MN_NODE_OWN) {
3240 						if (halt_set(sp, &xep)) {
3241 							mde_perror(&xep, "");
3242 							mdclrerror(&xep);
3243 						}
3244 					}
3245 					/*
3246 					 * Return failure since this node
3247 					 * (mynode) is not in the membership
3248 					 * list, but process the rest of the
3249 					 * nodelist first so that rpc.metad
3250 					 * can be updated with the latest
3251 					 * membership information.
3252 					 */
3253 					(void) mddserror(ep,
3254 					    MDE_DS_NOTINMEMBERLIST,
3255 					    sp->setno, nd->nd_nodename, NULL,
3256 					    sp->setname);
3257 					rval = 1;
3258 				}
3259 				nd->nd_flags &= ~MD_MN_NODE_ALIVE;
3260 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3261 			}
3262 		}
3263 		nd = nd->nd_next;
3264 	}
3265 
3266 	/* Send this information to rpc.metad */
3267 	if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
3268 	    MD_NR_SET,  MNSET_IN_RECONFIG, &xep)) {
3269 		/* Return failure if can't send node flags to rpc.metad */
3270 		if (rval == 0) {
3271 			(void) mdstealerror(ep, &xep);
3272 			rval = 1;
3273 		}
3274 	}
3275 	return (rval);
3276 }
3277 
3278 /*
3279  * Choose master determines the master for a diskset.
3280  * Each node determines the master on its own and
3281  * adds this information to its local rpc.metad nodelist
3282  * and also sends it to the kernel.
3283  *
3284  * Nodelist in set descriptor (sd) is sorted in
3285  * monotonically increasing sequence of nodeid.
3286  *
3287  * Return values:
3288  *	0 - No problem.
3289  *	205 - There was an RPC problem to another node.
3290  *	-1 - There was an error.  This could be an RPC error to my node.
3291  *		This is a catastrophic failure causing node to panic.
3292  */
3293 int
3294 meta_reconfig_choose_master_for_set(
3295 	mdsetname_t	*sp,
3296 	md_set_desc	*sd,
3297 	md_error_t	*ep
3298 )
3299 {
3300 	int			is_owner;
3301 	md_mnset_record		*mnsr = NULL;
3302 	int			lowest_alive_nodeid = 0;
3303 	uint_t			master_nodeid;
3304 	md_mnnode_desc		*nd, *nd2;
3305 	md_mnnode_record	*nr;
3306 	md_drive_desc		*dd;
3307 	md_setkey_t		*cl_sk;
3308 	int			rval = 0;
3309 	md_error_t		xep = mdnullerror;
3310 	mddb_setflags_config_t	sf;
3311 
3312 	/*
3313 	 * Is current node joined to diskset?
3314 	 * Don't trust flags, really check to see if mddb is snarfed.
3315 	 */
3316 	if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
3317 		/*
3318 		 * If a node is joined to the diskset, this node checks
3319 		 * to see if the current master of the diskset is valid and
3320 		 * is still in the membership list (ALIVE) and is
3321 		 * still joined (OWN).  Need to verify if master is
3322 		 * really joined - don't trust the flags.  (Can trust
3323 		 * ALIVE since set during earlier part of reconfig cycle.)
3324 		 * If the current master is valid, still in the membership
3325 		 * list and joined, then master is not changed on this node.
3326 		 * Just return.
3327 		 *
3328 		 * Verify that nodeid is valid before accessing masternode.
3329 		 */
3330 		if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
3331 		    (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
3332 			if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
3333 			    &is_owner, ep) == -1) {
3334 				/* If RPC failure to another node return 205 */
3335 				if ((mdanyrpcerror(ep)) &&
3336 				    (sd->sd_mn_mynode->nd_nodeid !=
3337 				    sd->sd_mn_master_nodeid)) {
3338 					return (205);
3339 				} else {
3340 					/* Any other failure */
3341 					return (-1);
3342 				}
3343 			} else {
3344 				if (is_owner == TRUE) {
3345 
3346 					meta_mc_log(MC_LOG5, dgettext(
3347 					    TEXT_DOMAIN, "Set %s previous "
3348 					    "master chosen %s (%d): %s"),
3349 					    sp->setname,
3350 					    sd->sd_mn_master_nodenm,
3351 					    sd->sd_mn_master_nodeid,
3352 					    meta_print_hrtime(gethrtime() -
3353 					    start_time));
3354 
3355 					/* Previous master is ok - done */
3356 					return (0);
3357 				}
3358 			}
3359 		}
3360 
3361 		/*
3362 		 * If current master is no longer in the membership list or
3363 		 * is no longer joined, then this node uses the following
3364 		 * algorithm:
3365 		 * - node calls RPC routine clnt_ownset to get latest
3366 		 *	information on which nodes are owners of diskset.
3367 		 * 	clnt_ownset checks on each node to see if its kernel
3368 		 *	has that diskset snarfed.
3369 		 */
3370 		nd = sd->sd_nodelist;
3371 		while (nd) {
3372 			/* Don't consider node that isn't in member list */
3373 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3374 				nd = nd->nd_next;
3375 				continue;
3376 			}
3377 
3378 			if (clnt_ownset(nd->nd_nodename, sp,
3379 			    &is_owner, ep) == -1) {
3380 				/* If RPC failure to another node return 205 */
3381 				if ((mdanyrpcerror(ep)) &&
3382 				    (sd->sd_mn_mynode->nd_nodeid !=
3383 				    nd->nd_nodeid)) {
3384 					return (205);
3385 				} else {
3386 					/* Any other failure */
3387 					return (-1);
3388 				}
3389 			}
3390 
3391 			/*
3392 			 * Set owner flag for each node based on whether
3393 			 * that node really has a diskset mddb snarfed in
3394 			 * or not.
3395 			 */
3396 			if (is_owner == TRUE)
3397 				nd->nd_flags |= MD_MN_NODE_OWN;
3398 			else
3399 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3400 
3401 			nd = nd->nd_next;
3402 		}
3403 
3404 		/*
3405 		 * - node walks through nodelist looking for nodes that are
3406 		 *	owners of the diskset that are in the membership list.
3407 		 * - for each owner, node calls RPC routine clnt_getset to
3408 		 *	 see if that node has its node record set to OK.
3409 		 * - If so, master is chosen to be this owner node.
3410 		 */
3411 		nd = sd->sd_nodelist;
3412 		while (nd) {
3413 			/* Don't consider node that isn't in member list */
3414 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3415 				nd = nd->nd_next;
3416 				continue;
3417 			}
3418 
3419 			/* Don't consider a node that isn't an owner */
3420 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3421 				nd = nd->nd_next;
3422 				continue;
3423 			}
3424 
3425 			/* Does node has its own node record set to OK? */
3426 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3427 			    MD_SET_BAD, &mnsr, ep) == -1) {
3428 				/* If RPC failure to another node return 205 */
3429 				if ((mdanyrpcerror(ep)) &&
3430 				    (sd->sd_mn_mynode->nd_nodeid !=
3431 				    nd->nd_nodeid)) {
3432 					return (205);
3433 				} else {
3434 					/* Any other failure */
3435 					return (-1);
3436 				}
3437 			}
3438 			nr = mnsr->sr_nodechain;
3439 			while (nr) {
3440 				if (nd->nd_nodeid == nr->nr_nodeid) {
3441 					if (nr->nr_flags & MD_MN_NODE_OK) {
3442 						/* Found a master */
3443 						free_sr(
3444 						    (md_set_record *)mnsr);
3445 						goto found_master;
3446 					}
3447 				}
3448 				nr = nr->nr_next;
3449 			}
3450 			free_sr((md_set_record *)mnsr);
3451 			nd = nd->nd_next;
3452 		}
3453 
3454 		/*
3455 		 * - If no owner node has its own node record on its own node
3456 		 *	set to OK, then this node checks all of the non-owner
3457 		 * 	nodes that are in the membership list.
3458 		 * - for each non-owner, node calls RPC routine clnt_getset to
3459 		 *	 see if that node has its node record set to OK.
3460 		 * - If set doesn't exist, don't choose node for master.
3461 		 * - If so, master is chosen to be this non-owner node.
3462 		 *
3463 		 */
3464 		nd = sd->sd_nodelist;
3465 		while (nd) {
3466 			/* Don't consider node that isn't in member list */
3467 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3468 				nd = nd->nd_next;
3469 				continue;
3470 			}
3471 
3472 			/* Only checking non-owner nodes this time around */
3473 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3474 				nd = nd->nd_next;
3475 				continue;
3476 			}
3477 
3478 			/* Does node has its own node record set to OK? */
3479 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3480 			    MD_SET_BAD, &mnsr, ep) == -1) {
3481 				/*
3482 				 * If set doesn't exist on non-owner node,
3483 				 * don't consider this node for master.
3484 				 */
3485 				if (mdiserror(ep, MDE_NO_SET)) {
3486 					nd = nd->nd_next;
3487 					continue;
3488 				} else if ((mdanyrpcerror(ep)) &&
3489 				    (sd->sd_mn_mynode->nd_nodeid !=
3490 				    nd->nd_nodeid)) {
3491 					/* RPC failure to another node */
3492 					return (205);
3493 				} else {
3494 					/* Any other failure */
3495 					return (-1);
3496 				}
3497 			}
3498 			nr = mnsr->sr_nodechain;
3499 			while (nr) {
3500 				if (nd->nd_nodeid == nr->nr_nodeid) {
3501 					if (nr->nr_flags & MD_MN_NODE_OK) {
3502 						/* Found a master */
3503 						free_sr(
3504 						    (md_set_record *)mnsr);
3505 						goto found_master;
3506 					}
3507 				}
3508 				nr = nr->nr_next;
3509 			}
3510 			free_sr((md_set_record *)mnsr);
3511 			nd = nd->nd_next;
3512 		}
3513 
3514 		/*
3515 		 * - If no node can be found that has its own node record on
3516 		 *	its node to be set to OK, then all alive nodes
3517 		 * 	were in the process of being added to or deleted
3518 		 *	from set.  Each alive node will remove all
3519 		 *	information pertaining to this set from its node.
3520 		 *
3521 		 * If all nodes in set are ALIVE, then call sdssc end routines
3522 		 * since set was truly being initially created or destroyed.
3523 		 */
3524 		goto delete_set;
3525 	} else {
3526 
3527 		/*
3528 		 * If node is not joined to diskset, then this
3529 		 * node uses the following algorithm:
3530 		 * - If unjoined node doesn't have a node record for itself,
3531 		 *	just delete the diskset since diskset was in the
3532 		 *	process of being created.
3533 		 * - node needs to find master of diskset before
3534 		 *	reconfig cycle, if a master existed.
3535 		 * - node calls RPC routine clnt_ownset to get latest
3536 		 * 	information on which nodes are owners of diskset.
3537 		 *	clnt_ownset checks on each node to see if its
3538 		 *	kernel has that diskset snarfed.
3539 		 */
3540 
3541 		/*
3542 		 * Is my node in the set description?
3543 		 * If not, delete the set from this node.
3544 		 * sr2setdesc sets sd_mn_mynode pointer to the node
3545 		 * descriptor for this node if there was a node
3546 		 * record for this node.
3547 		 *
3548 		 */
3549 		if (sd->sd_mn_mynode == NULL) {
3550 			goto delete_set;
3551 		}
3552 
3553 		nd = sd->sd_nodelist;
3554 		while (nd) {
3555 			/* Don't consider node that isn't in member list */
3556 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3557 				nd = nd->nd_next;
3558 				continue;
3559 			}
3560 
3561 			if (clnt_ownset(nd->nd_nodename, sp,
3562 			    &is_owner, ep) == -1) {
3563 				/* If RPC failure to another node return 205 */
3564 				if ((mdanyrpcerror(ep)) &&
3565 				    (sd->sd_mn_mynode->nd_nodeid !=
3566 				    nd->nd_nodeid)) {
3567 					return (205);
3568 				} else {
3569 					/* Any other failure */
3570 					return (-1);
3571 				}
3572 			}
3573 
3574 			/*
3575 			 * Set owner flag for each node based on whether
3576 			 * that node really has a diskset mddb snarfed in
3577 			 * or not.
3578 			 */
3579 			if (is_owner == TRUE)
3580 				nd->nd_flags |= MD_MN_NODE_OWN;
3581 			else
3582 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3583 
3584 			nd = nd->nd_next;
3585 		}
3586 
3587 		/*
3588 		 * - node walks through nodelist looking for nodes that
3589 		 *	are owners of the diskset that are in
3590 		 *	the membership list.
3591 		 * - for each owner, node calls RPC routine clnt_getset to
3592 		 *	see if that node has a master set and to get the
3593 		 *	diskset description.
3594 		 * - If the owner node has a set description that doesn't
3595 		 *	include the non-joined node in the nodelist, this node
3596 		 *	removes its set description of that diskset
3597 		 *	(i.e. removes the set from its local mddbs).  This is
3598 		 *	handling the case of when a node was removed from a
3599 		 *	diskset while it was not in the cluster membership
3600 		 *	list.
3601 		 * - If that node has a master set and the master is in the
3602 		 *	membership list and is an owner, then either this was
3603 		 *	the master from before the reconfig cycle or this
3604 		 *	node has already chosen a new master - either way,
3605 		 *	the master value is valid as long as it is in the
3606 		 *	membership list and is an owner
3607 		 * - master is chosen to be owner node's master
3608 		 */
3609 		nd = sd->sd_nodelist;
3610 		while (nd) {
3611 			/* Don't consider node that isn't in member list */
3612 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3613 				nd = nd->nd_next;
3614 				continue;
3615 			}
3616 
3617 			/* Don't consider a node that isn't an owner */
3618 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3619 				nd = nd->nd_next;
3620 				continue;
3621 			}
3622 
3623 			/* Get owner node's set record */
3624 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3625 			    MD_SET_BAD, &mnsr, ep) == -1) {
3626 				/* If RPC failure to another node return 205 */
3627 				if ((mdanyrpcerror(ep)) &&
3628 				    (sd->sd_mn_mynode->nd_nodeid !=
3629 				    nd->nd_nodeid)) {
3630 					return (205);
3631 				} else {
3632 					/* Any other failure */
3633 					return (-1);
3634 				}
3635 			}
3636 
3637 			/* Is this node in the owner node's set record */
3638 			nr = mnsr->sr_nodechain;
3639 			while (nr) {
3640 				if (sd->sd_mn_mynode->nd_nodeid ==
3641 				    nr->nr_nodeid) {
3642 					break;
3643 				}
3644 				nr = nr->nr_next;
3645 			}
3646 			if (nr == NULL) {
3647 				/* my node not found - delete set */
3648 				free_sr((md_set_record *)mnsr);
3649 				goto delete_set;
3650 			}
3651 
3652 			/* Is owner's node's master valid? */
3653 			master_nodeid = mnsr->sr_master_nodeid;
3654 			free_sr((md_set_record *)mnsr);
3655 			if (master_nodeid == MD_MN_INVALID_NID) {
3656 				nd = nd->nd_next;
3657 				continue;
3658 			}
3659 
3660 			nd2 = sd->sd_nodelist;
3661 			while (nd2) {
3662 				if ((nd2->nd_nodeid == master_nodeid) &&
3663 				    (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
3664 				    (nd2->nd_flags & MD_MN_NODE_OWN)) {
3665 						nd = nd2;
3666 						goto found_master;
3667 				}
3668 				nd2 = nd2->nd_next;
3669 			}
3670 			nd = nd->nd_next;
3671 		}
3672 
3673 		/*
3674 		 * - If no owner node has a valid master, then follow
3675 		 * 	algorithm of when a node is joined to the diskset.
3676 		 * - node walks through nodelist looking for nodes that are
3677 		 *	owners of the diskset that are in the membership list.
3678 		 * - for each owner, node calls RPC routine clnt_getset to
3679 		 *	 see if that node has its node record set to OK.
3680 		 * - If so, master is chosen to be this owner node.
3681 		 */
3682 		nd = sd->sd_nodelist;
3683 		while (nd) {
3684 			/* Don't consider node that isn't in member list */
3685 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3686 				nd = nd->nd_next;
3687 				continue;
3688 			}
3689 
3690 			/* Don't consider a node that isn't an owner */
3691 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3692 				nd = nd->nd_next;
3693 				continue;
3694 			}
3695 
3696 			/* Does node has its own node record set to OK? */
3697 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3698 			    MD_SET_BAD, &mnsr, ep) == -1) {
3699 				/* If RPC failure to another node return 205 */
3700 				if ((mdanyrpcerror(ep)) &&
3701 				    (sd->sd_mn_mynode->nd_nodeid !=
3702 				    nd->nd_nodeid)) {
3703 					return (205);
3704 				} else {
3705 					/* Any other failure */
3706 					return (-1);
3707 				}
3708 			}
3709 			nr = mnsr->sr_nodechain;
3710 			while (nr) {
3711 				if (nd->nd_nodeid == nr->nr_nodeid) {
3712 					if (nr->nr_flags & MD_MN_NODE_OK) {
3713 						/* Found a master */
3714 						free_sr(
3715 						    (md_set_record *)mnsr);
3716 						goto found_master;
3717 					}
3718 				}
3719 				nr = nr->nr_next;
3720 			}
3721 			free_sr((md_set_record *)mnsr);
3722 			nd = nd->nd_next;
3723 		}
3724 
3725 		/*
3726 		 * - If no owner node has its own node record on its own node
3727 		 *	set to OK, then this node checks all of the non-owner
3728 		 *	nodes that are in the membership list.
3729 		 * - for each non-owner, node calls RPC routine clnt_getset to
3730 		 *	see if that node has its node record set to OK.
3731 		 * - If set doesn't exist, don't choose node for master.
3732 		 * - If this node doesn't exist in the nodelist on any of the
3733 		 *	non-owner nodes, this node removes its set description
3734 		 *	of that diskset (i.e. removes the set from its local
3735 		 *	mddbs). This is handling the case of when a node was
3736 		 *	removed from a diskset while it was not in the
3737 		 *	cluster membership list.
3738 		 * - If non-owner node has its node record set to OK and if
3739 		 *	this node hasn't removed this diskset (step directly
3740 		 *	before this one), then the master is chosen to be this
3741 		 *	non-owner node.
3742 		 */
3743 		nd = sd->sd_nodelist;
3744 		while (nd) {
3745 			/* Don't consider node that isn't in member list */
3746 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3747 				nd->nd_flags |= MD_MN_NODE_DEL;
3748 				nd = nd->nd_next;
3749 				continue;
3750 			}
3751 
3752 			/* Don't consider owner nodes since none are OK */
3753 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3754 				nd->nd_flags |= MD_MN_NODE_DEL;
3755 				nd = nd->nd_next;
3756 				continue;
3757 			}
3758 
3759 			/*
3760 			 * Don't need to get nodelist from my node since
3761 			 * this is where sd_nodelist was obtained.
3762 			 */
3763 			if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3764 				nd = nd->nd_next;
3765 				continue;
3766 			}
3767 
3768 			/*
3769 			 * If node has already been decided against for
3770 			 * master, then skip it.
3771 			 */
3772 			if (nd->nd_flags & MD_MN_NODE_DEL) {
3773 				nd = nd->nd_next;
3774 				continue;
3775 			}
3776 
3777 			/*
3778 			 * Does node in my nodelist have its own node
3779 			 * record marked OK on its node?  And does node
3780 			 * in my nodelist exist on all other nodes?
3781 			 * Don't want to choose a node for master unless
3782 			 * that node is marked OK on its own node and that
3783 			 * node exists on all other alive nodes.
3784 			 *
3785 			 * This is guarding against the case when several
3786 			 * nodes are down and one of the downed nodes is
3787 			 * deleted from the diskset.  When the down nodes
3788 			 * are rebooted into the cluster, you don't want
3789 			 * any node to pick the deleted node as the master.
3790 			 */
3791 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3792 			    MD_SET_BAD, &mnsr, ep) == -1) {
3793 				/*
3794 				 * If set doesn't exist on non-owner node,
3795 				 * don't consider this node for master.
3796 				 */
3797 				if (mdiserror(ep, MDE_NO_SET)) {
3798 					nd->nd_flags |= MD_MN_NODE_DEL;
3799 					nd = nd->nd_next;
3800 					continue;
3801 				} else if (mdanyrpcerror(ep)) {
3802 					/* RPC failure to another node */
3803 					return (205);
3804 				} else {
3805 					/* Any other failure */
3806 					return (-1);
3807 				}
3808 			}
3809 			/*
3810 			 * Is my node in the nodelist gotten from the other
3811 			 * node?  If not, then remove the set from my node
3812 			 * since set was deleted from my node while my node
3813 			 * was out of the cluster.
3814 			 */
3815 			nr = mnsr->sr_nodechain;
3816 			while (nr) {
3817 				if (sd->sd_mn_mynode->nd_nodeid ==
3818 				    nr->nr_nodeid) {
3819 					break;
3820 				}
3821 				nr = nr->nr_next;
3822 			}
3823 			if (nr == NULL) {
3824 				/* my node not found - delete set */
3825 				free_sr((md_set_record *)mnsr);
3826 				goto delete_set;
3827 			}
3828 
3829 			/* Is node being checked marked OK on its own node? */
3830 			nr = mnsr->sr_nodechain;
3831 			while (nr) {
3832 				if (nd->nd_nodeid == nr->nr_nodeid) {
3833 					if (!(nr->nr_flags & MD_MN_NODE_OK)) {
3834 						nd->nd_flags |= MD_MN_NODE_DEL;
3835 					}
3836 					break;
3837 				}
3838 				nr = nr->nr_next;
3839 			}
3840 			/*
3841 			 * If node being checked doesn't exist on its
3842 			 * own node - don't choose it as master.
3843 			 */
3844 			if (nr == NULL) {
3845 				nd->nd_flags |= MD_MN_NODE_DEL;
3846 			}
3847 
3848 			/*
3849 			 * Check every node in my node's nodelist against
3850 			 * the nodelist gotten from the other node.
3851 			 * If a node in my node's nodelist is not found in the
3852 			 * other node's nodelist, then set the DEL flag.
3853 			 */
3854 			nd2 = sd->sd_nodelist;
3855 			while (nd2) {
3856 				nr = mnsr->sr_nodechain;
3857 				while (nr) {
3858 					if (nd2->nd_nodeid == nr->nr_nodeid) {
3859 						break;
3860 					}
3861 					nr = nr->nr_next;
3862 				}
3863 				/* nd2 not found in other node's nodelist */
3864 				if (nr == NULL) {
3865 					nd2->nd_flags |= MD_MN_NODE_DEL;
3866 				}
3867 				nd2 = nd2->nd_next;
3868 			}
3869 
3870 			free_sr((md_set_record *)mnsr);
3871 			nd = nd->nd_next;
3872 		}
3873 
3874 		/*
3875 		 * Rescan list look for node that has not been marked DEL.
3876 		 * First node found is the master.
3877 		 */
3878 		nd = sd->sd_nodelist;
3879 		while (nd) {
3880 			if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
3881 				break;
3882 			}
3883 			nd = nd->nd_next;
3884 			continue;
3885 		}
3886 		if (nd) {
3887 			/* Found a master */
3888 			goto found_master;
3889 		}
3890 
3891 		/*
3892 		 * - If no node can be found that has its own node record on
3893 		 *	its node to be set to OK, then all alive nodes
3894 		 * 	were in the process of being added to or deleted
3895 		 *	from set.  Each alive node will remove all
3896 		 *	information pertaining to this set from its node.
3897 		 *
3898 		 * If all nodes in set are ALIVE, then call sdssc end routines
3899 		 * since set was truly being initially created or destroyed.
3900 		 */
3901 		goto delete_set;
3902 	}
3903 
3904 found_master:
3905 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3906 	    "Set %s master chosen %s (%d): %s"),
3907 	    sp->setname, nd->nd_nodename, nd->nd_nodeid,
3908 	    meta_print_hrtime(gethrtime() - start_time));
3909 
3910 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
3911 		return (-1);
3912 	}
3913 
3914 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3915 
3916 	if (clnt_mnsetmaster(mynode(), sp,
3917 	    nd->nd_nodename, nd->nd_nodeid, ep)) {
3918 		rval = -1;
3919 	} else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3920 		/* If this node is new master, set flag in this node's kernel */
3921 		(void) memset(&sf, 0, sizeof (sf));
3922 		sf.sf_setno = sp->setno;
3923 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
3924 		/* Use magic to help protect ioctl against attack. */
3925 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
3926 		sf.sf_flags = MDDB_NM_SET;
3927 
3928 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3929 		    "Setting new master flag for set %s: %s"),
3930 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
3931 
3932 		/*
3933 		 * Fail reconfig cycle if ioctl fails since it is critical
3934 		 * to set new master flag.
3935 		 */
3936 		if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
3937 		    NULL) != NULL) {
3938 			(void) mdstealerror(ep, &sf.sf_mde);
3939 			rval = -1;
3940 		}
3941 	}
3942 
3943 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
3944 		if (rval == 0) {
3945 			(void) mdstealerror(ep, &xep);
3946 			rval = -1;
3947 		}
3948 	}
3949 
3950 	cl_set_setkey(NULL);
3951 
3952 	metaflushsetname(sp);
3953 
3954 	return (rval);
3955 
3956 delete_set:
3957 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3958 	    "Master not chosen, deleting set %s: %s"),
3959 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
3960 
3961 	/*
3962 	 * Remove all set information from this node:
3963 	 *	- node records for this set
3964 	 *	- drive records for this set
3965 	 *	- set record for this set
3966 	 * (Only do this on this node since each node
3967 	 * will do it for its own local mddb.)
3968 	 *
3969 	 * If all nodes in set are ALIVE, then
3970 	 * the lowest numbered ALIVE nodeid in set
3971 	 * (irregardless of whether an owner node or not) will
3972 	 * call the DCS service to cleanup for create/delete of set.
3973 	 *   sdssc_create_end(cleanup) if set was being created or
3974 	 *   sdssc_delete_end(cleanup) if set was being deleted.
3975 	 * A node record with flag ADD denotes a set being
3976 	 * created.  A node record with flag DEL denotes a
3977 	 * set being deleted.
3978 	 */
3979 	nd = sd->sd_nodelist;
3980 	while (nd) {
3981 		/* Found a node that isn't alive */
3982 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
3983 			break;
3984 
3985 		/* Is my node the lowest numbered ALIVE node? */
3986 		if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
3987 			break;
3988 		}
3989 		nd = nd->nd_next;
3990 	}
3991 	if (nd == NULL) {
3992 		/* All nodes ALIVE and this is the lowest nodeid */
3993 		lowest_alive_nodeid = 1;
3994 	}
3995 
3996 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
3997 		return (-1);
3998 	}
3999 
4000 
4001 	/*
4002 	 * If this node had been joined, withdraw and reset master.
4003 	 *
4004 	 * This could happen if a node was being added to or removed
4005 	 * from a diskset and the node doing the add/delete operation and
4006 	 * all other nodes in the diskset have left the cluster.
4007 	 */
4008 	if (sd->sd_mn_mynode) {
4009 		nd = sd->sd_mn_mynode;
4010 		if (nd->nd_flags & MD_MN_NODE_OWN) {
4011 			if (clnt_withdrawset(mynode(), sp, ep)) {
4012 				rval = -1;
4013 				goto out;
4014 			}
4015 			if (clnt_mnsetmaster(mynode(), sp, "",
4016 			    MD_MN_INVALID_NID, ep)) {
4017 				rval = -1;
4018 				goto out;
4019 			}
4020 		}
4021 	}
4022 
4023 	/*
4024 	 * Remove side records for this node (side) from local mddb
4025 	 * (clnt_deldrvs does this) if there are drives in the set.
4026 	 *
4027 	 * Don't need to mark this node as DEL since already marked as
4028 	 * ADD or DEL (or this node would have been chosen as master).
4029 	 * Don't need to mark other node records, drive records or
4030 	 * set records as DEL.  If a panic occurs during clnt_delset,
4031 	 * these records will be deleted the next time this node
4032 	 * becomes a member and goes through the reconfig cycle.
4033 	 */
4034 	/* Get the drive descriptors for this set */
4035 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4036 	    ep)) == NULL) {
4037 		if (! mdisok(ep)) {
4038 			/*
4039 			 * Ignore and clear out any failures from
4040 			 * metaget_drivedesc since a panic could have
4041 			 * occurred when a node was partially added to a set.
4042 			 */
4043 			mdclrerror(ep);
4044 		}
4045 	} else {
4046 		if (clnt_deldrvs(mynode(), sp, dd, ep)) {
4047 			rval = -1;
4048 			goto out;
4049 		}
4050 	}
4051 
4052 	/*
4053 	 * Now, delete the set - this removes the node, drive
4054 	 * and set records from the local mddb.
4055 	 */
4056 	if (clnt_delset(mynode(), sp, ep)) {
4057 		rval = -1;
4058 		goto out;
4059 	}
4060 
4061 out:
4062 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4063 
4064 	/*
4065 	 * Ignore errors from unlock of set since set is no longer
4066 	 * known (if clnt_delset worked).
4067 	 */
4068 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4069 		mdclrerror(&xep);
4070 	}
4071 
4072 	cl_set_setkey(NULL);
4073 
4074 	metaflushsetname(sp);
4075 
4076 	/*
4077 	 * If this node is the lowest numbered nodeid then
4078 	 * call sdssc_create/delete_end depending on whether
4079 	 * this node is marked as ADD or DEL in the node record.
4080 	 */
4081 	if (lowest_alive_nodeid) {
4082 		if (nd->nd_flags & MD_MN_NODE_ADD)
4083 			sdssc_create_end(sp->setname, SDSSC_CLEANUP);
4084 		else if (nd->nd_flags & MD_MN_NODE_DEL)
4085 			sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
4086 	}
4087 
4088 	/* Finished with this set -- return */
4089 	return (rval);
4090 }
4091 
4092 /*
4093  * Reconfig step to choose a new master for all MN disksets.
4094  * Return values:
4095  *	0 - Everything is great.
4096  *	1 - This node failed to reconfig.
4097  *	205 - Cause another reconfig due to a nodelist problem
4098  *		or RPC failure to another node
4099  */
4100 int
4101 meta_reconfig_choose_master(
4102 	md_error_t	*ep
4103 )
4104 {
4105 	set_t				max_sets, setno;
4106 	int				nodecnt;
4107 	mndiskset_membershiplist_t	*nl;
4108 	md_set_desc			*sd;
4109 	mdsetname_t			*sp;
4110 	int				rval = 0;
4111 	mddb_setflags_config_t		sf;
4112 	int				start_node_delayed = 0;
4113 
4114 	if ((max_sets = get_max_sets(ep)) == 0) {
4115 		mde_perror(ep, dgettext(TEXT_DOMAIN,
4116 		    "Unable to get number of sets"));
4117 		return (1);
4118 	}
4119 
4120 	/*
4121 	 * Get membershiplist from API routine.  If there's
4122 	 * an error, return a 205 to cause another reconfig.
4123 	 */
4124 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
4125 		mde_perror(ep, "");
4126 		return (205);
4127 	}
4128 
4129 	for (setno = 1; setno < max_sets; setno++) {
4130 		if ((sp = metasetnosetname(setno, ep)) == NULL) {
4131 			if (mdiserror(ep, MDE_NO_SET)) {
4132 				/* No set for this setno - continue */
4133 				mdclrerror(ep);
4134 				continue;
4135 			} else {
4136 				/*
4137 				 * If encountered an RPC error from my node,
4138 				 * then immediately fail.
4139 				 */
4140 				if (mdanyrpcerror(ep)) {
4141 					mde_perror(ep, "");
4142 					return (1);
4143 				}
4144 				/* Can't get set information */
4145 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4146 					"Unable to get information for "
4147 					"set number %d"), setno);
4148 				mdclrerror(ep);
4149 				continue;
4150 			}
4151 		}
4152 
4153 		/* If setname is there, set desc should exist. */
4154 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4155 			/*
4156 			 * If encountered an RPC error from my node,
4157 			 * then immediately fail.
4158 			 */
4159 			if (mdanyrpcerror(ep)) {
4160 				mde_perror(ep, "");
4161 				return (1);
4162 			}
4163 			mde_perror(ep, dgettext(TEXT_DOMAIN,
4164 				"Unable to get set %s desc information"),
4165 				sp->setname);
4166 			mdclrerror(ep);
4167 			continue;
4168 		}
4169 
4170 		/* Only reconfig MN disksets */
4171 		if (!MD_MNSET_DESC(sd)) {
4172 			continue;
4173 		}
4174 
4175 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4176 		    "Begin choose master for set %s: %s"),
4177 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4178 
4179 		/* Update nodelist with member information. */
4180 		if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
4181 			/*
4182 			 * If encountered an RPC error from my node,
4183 			 * then immediately fail.
4184 			 */
4185 			if (mdanyrpcerror(ep)) {
4186 				mde_perror(ep, "");
4187 				return (1);
4188 			}
4189 			mde_perror(ep, "");
4190 			mdclrerror(ep);
4191 			continue;
4192 		}
4193 
4194 		/*
4195 		 * If all nodes in a cluster are starting, then
4196 		 * all nodes will attempt to contact all other nodes
4197 		 * to determine a master node.  This can lead to a
4198 		 * problem where node 1 is trying to contact the rpc.metad
4199 		 * node 2 and node 2 is trying to contact the rpc.metad
4200 		 * on node 1 -- and this causes the rpc call to fail
4201 		 * on both nodes and causes a new reconfig cycle.
4202 		 *
4203 		 * In order to break this problem, a newly starting node
4204 		 * will delay a small amount of time (nodeid mod 4 seconds)
4205 		 * and will then run the code to choose a master for the
4206 		 * first set.  Delay will only be done once regardless of the
4207 		 * number of sets.
4208 		 */
4209 		if (start_node_delayed == 0) {
4210 			(void) memset(&sf, 0, sizeof (sf));
4211 			sf.sf_setno = sp->setno;
4212 			sf.sf_flags = MDDB_NM_GET;
4213 			/* Use magic to help protect ioctl against attack. */
4214 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4215 			if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
4216 			    &sf.sf_mde, NULL) == 0) &&
4217 			    ((sf.sf_setflags & MD_SET_MN_START_RC) ==
4218 			    MD_SET_MN_START_RC)) {
4219 				(void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
4220 			}
4221 			start_node_delayed = 1;
4222 		}
4223 
4224 		/* Choose master for this set */
4225 		rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
4226 		if (rval == -1) {
4227 			mde_perror(ep, "");
4228 			return (1);
4229 		} else if (rval == 205) {
4230 			mde_perror(ep, "");
4231 			return (205);
4232 		}
4233 
4234 		/* Send new nodelist to rpc.mdcommd */
4235 		(void) mdmn_reinit_set(sp->setno);
4236 
4237 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4238 		    "Choose master for set %s completed: %s"),
4239 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4240 	}
4241 
4242 	/*
4243 	 * Each node turns on I/Os for all MN disksets.
4244 	 * This is to recover from the situation where the master died
4245 	 * during a previous reconfig cycle when I/Os were suspended
4246 	 * for a MN diskset.
4247 	 * If a failure occurs return a 1 which will force this node to
4248 	 * panic.  Cannot leave node in the situation where I/Os are
4249 	 * not resumed.
4250 	 */
4251 	setno = 0; /* 0 means all MN sets */
4252 	if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
4253 		mde_perror(ep, "");
4254 		return (1);
4255 	}
4256 
4257 	/* Free the nodelist */
4258 	if (nodecnt)
4259 		meta_free_nodelist(nl);
4260 
4261 	return (0);
4262 }
4263 
4264 /*
4265  * meta_mnsync_user_records will synchronize the diskset user records across
4266  * all nodes in the diskset.  The diskset user records are stored in
4267  * each node's local set mddb.
4268  *
4269  * This needs to be done even if there is no master change during the
4270  * reconfig cycle since this routine should clean up any mess left by
4271  * the untimely termination of a metaset or metadb command (due to a
4272  * node panic or to user intervention).
4273  *
4274  * Caller is the Master node.
4275  *
4276  * Returns	 0 - Success
4277  *		205 - Failure during RPC to another node
4278  *		-1 - Any other failure and ep is filled in.
4279  */
4280 int
4281 meta_mnsync_user_records(
4282 	mdsetname_t	*sp,
4283 	md_error_t	*ep
4284 )
4285 {
4286 	md_set_desc		*sd;
4287 	md_mnnode_desc		*master_nodelist, *nd, *nd2, *ndtail;
4288 	md_mnset_record		*mnsr;
4289 	md_mnsr_node_t		*master_mnsr_node = NULL, *mnsr_node = NULL;
4290 	md_mnnode_record	*nr;
4291 	md_drive_record		*dr;
4292 	int			dr_cnt, dd_cnt;
4293 	int			found_my_nr;
4294 	md_drive_desc		*dd, *dd_prev, *master_dd, *other_dd;
4295 	int			all_drives_ok;
4296 	int			rval = 0;
4297 	int			max_genid = 0;
4298 	int			num_alive_nodes, num_alive_nodes_del = 0;
4299 	int			set_locked = 0;
4300 	md_setkey_t		*cl_sk;
4301 	md_error_t		xep = mdnullerror;
4302 	char			*anode[1];
4303 	mddb_setflags_config_t	sf;
4304 
4305 	/*
4306 	 * Sync up node records first.
4307 	 * Construct a master nodelist using the nodelist from this
4308 	 * node's rpc.metad node records and then setting the state of each
4309 	 * node following these rules:
4310 	 *	- If a node record is marked OK on its node, mark it OK
4311 	 *		in the master nodelist (and later OK on all nodes)
4312 	 *		If a node record is also marked OWN on its node,
4313 	 *		mark it OWN in the master nodelist.
4314 	 *	- If a node record is not marked OK on its node, then mark
4315 	 *		it as DEL in the master list (later deleting it)
4316 	 *	- If node record doesn't exist on that node, then mark it DEL
4317 	 *		(later deleting it)
4318 	 *	- If set record doesn't exist on that node, mark node as DEL
4319 	 *	- If a node record doesn't exist on all nodes, then mark it DEL
4320 	 *	- If a node is not ALIVE, then
4321 	 *		- If that node marked DEL on any node - mark it DEL
4322 	 *			in master list but leave in nodelist
4323 	 *		- If that node is marked as ADD on any node, mark it
4324 	 *			ADD in the master list but leave in nodelist
4325 	 *		- When that node returns to the living, the DEL
4326 	 *			node record will be removed and the ADD node
4327 	 *			record may be removed if marked ADD on that
4328 	 *			node.
4329 	 * The key rule is to not remove a node from the nodelist until
4330 	 * that node record is removed from its own node.  Do not want to
4331 	 * remove a node's record from all other nodes and then have
4332 	 * that node have its own record marked OK so that a node will pick
4333 	 * a different master than the other nodes.
4334 	 *
4335 	 * Next,
4336 	 * If node is ALIVE and node record is marked DEL in master nodelist,
4337 	 * remove node from set.
4338 	 * If node is ALIVE and node record is marked OK in master nodelist,
4339 	 * mark it OK on all other nodes.
4340 	 * If node is not ALIVE and node record is marked DEL in master
4341 	 * nodelist, mark it DEL on all other nodes.
4342 	 * If node is not ALIVE and node record is marked ADD in master
4343 	 * nodelist, mark it ADD on all other nodes.
4344 	 */
4345 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4346 		return (-1);
4347 	}
4348 	master_nodelist = sd->sd_nodelist;
4349 
4350 	/*
4351 	 * Walk through nodelist creating a master nodelist.
4352 	 */
4353 	num_alive_nodes = 0;
4354 	nd = master_nodelist;
4355 	while (nd) {
4356 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4357 			nd = nd->nd_next;
4358 			continue;
4359 		}
4360 		num_alive_nodes++;
4361 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
4362 		    MD_SET_BAD, &mnsr, ep) == -1) {
4363 			if (mdiserror(ep, MDE_NO_SET)) {
4364 				/* set doesn't exist, mark node as DEL */
4365 				nd->nd_flags &= ~MD_MN_NODE_OK;
4366 				nd->nd_flags &= ~MD_MN_NODE_ADD;
4367 				nd->nd_flags |= MD_MN_NODE_DEL;
4368 				nd->nd_flags |= MD_MN_NODE_NOSET;
4369 				nd = nd->nd_next;
4370 				continue;
4371 			} else {
4372 				/* If RPC failure to another node return 205 */
4373 				if ((mdanyrpcerror(ep)) &&
4374 				    (sd->sd_mn_mynode->nd_nodeid !=
4375 				    nd->nd_nodeid)) {
4376 					rval = 205;
4377 				} else {
4378 					/* Any other failure */
4379 					rval = -1;
4380 				}
4381 				goto out;
4382 			}
4383 		}
4384 		/* Find biggest genid in records for this diskset */
4385 		if (mnsr->sr_genid > max_genid)
4386 			max_genid = mnsr->sr_genid;
4387 
4388 		dr = mnsr->sr_drivechain;
4389 		while (dr) {
4390 			/* Find biggest genid in records for this diskset */
4391 			if (dr->dr_genid > max_genid) {
4392 				max_genid = dr->dr_genid;
4393 			}
4394 			dr = dr->dr_next;
4395 		}
4396 
4397 		found_my_nr = 0;
4398 		nr = mnsr->sr_nodechain;
4399 		/* nr is the list of node recs from nd_nodename node */
4400 		while (nr) {
4401 			/* Find biggest genid in records for this diskset */
4402 			if (nr->nr_genid > max_genid)
4403 				max_genid = nr->nr_genid;
4404 			nd2 = master_nodelist;
4405 			ndtail = NULL;
4406 			/* For each node record, is it in master list? */
4407 			while (nd2) {
4408 				if (nd2->nd_nodeid == nr->nr_nodeid)
4409 					break;
4410 				if (nd2->nd_next == NULL)
4411 					ndtail = nd2;
4412 				nd2 = nd2->nd_next;
4413 			}
4414 			/*
4415 			 * Found node record not in master list -- add it
4416 			 * to list marking it as DEL since node record
4417 			 * should exist on all nodes unless a panic occurred
4418 			 * during addition or deletion of host to diskset.
4419 			 */
4420 			if (nd2 == NULL) {
4421 				nd2 = Zalloc(sizeof (*nd2));
4422 				(void) strcpy(nd2->nd_nodename,
4423 				    nr->nr_nodename);
4424 				nd2->nd_flags = nr->nr_flags;
4425 				nd2->nd_flags |= MD_MN_NODE_DEL;
4426 				nd2->nd_nodeid = nr->nr_nodeid;
4427 				nd2->nd_next = NULL;
4428 				ndtail->nd_next = nd2;
4429 				nd2 = NULL;
4430 				nr = nr->nr_next;
4431 				continue;
4432 			}
4433 			/*
4434 			 * Is this the node record for the node that
4435 			 * we requested the set desc from?
4436 			 * If so, check if node has its own node record
4437 			 * marked OK. If marked OK, check for the OWN bit.
4438 			 */
4439 			if (nr->nr_nodeid == nd->nd_nodeid) {
4440 				found_my_nr = 1;
4441 				if (nr->nr_flags & MD_MN_NODE_OK) {
4442 					/*
4443 					 * If node record is marked OK
4444 					 * on its own node, then mark it OK
4445 					 * in the master list.  Node record
4446 					 * would have to exist on all nodes
4447 					 * in the ADD state before it could
4448 					 * be put into the OK state.
4449 					 */
4450 					nd->nd_flags |= MD_MN_NODE_OK;
4451 					nd->nd_flags &=
4452 					    ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
4453 					/*
4454 					 * Mark own in master list as marked
4455 					 * on own node.
4456 					 */
4457 					if (nr->nr_flags & MD_MN_NODE_OWN)
4458 						nd->nd_flags |= MD_MN_NODE_OWN;
4459 					else
4460 						nd->nd_flags &= ~MD_MN_NODE_OWN;
4461 				} else {
4462 					/* Otherwise, mark node as DEL */
4463 					nd->nd_flags &= ~MD_MN_NODE_OK;
4464 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4465 					nd->nd_flags |= MD_MN_NODE_DEL;
4466 				}
4467 			}
4468 			/*
4469 			 * If node is not ALIVE and marked DEL
4470 			 * on any node, make it DEL in master list.
4471 			 * If node is not ALIVE and marked ADD
4472 			 * on any node, make it ADD in master list
4473 			 * unless node record has already been marked DEL.
4474 			 */
4475 			if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
4476 				if (nr->nr_flags & MD_MN_NODE_ADD) {
4477 					if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4478 						/* If not DEL - mark it ADD */
4479 						nd->nd_flags |= MD_MN_NODE_ADD;
4480 						nd->nd_flags &= ~MD_MN_NODE_OK;
4481 					}
4482 				}
4483 				if (nr->nr_flags & MD_MN_NODE_DEL) {
4484 					nd->nd_flags |= MD_MN_NODE_DEL;
4485 					nd->nd_flags &= ~MD_MN_NODE_OK;
4486 					/* Could already be ADD - make it DEL */
4487 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4488 				}
4489 			}
4490 			nr = nr->nr_next;
4491 		}
4492 		/*
4493 		 * If a node record doesn't exist on its own node,
4494 		 * then mark node as DEL.
4495 		 */
4496 		if (found_my_nr == 0) {
4497 			nd->nd_flags &= ~MD_MN_NODE_OK;
4498 			nd->nd_flags |= MD_MN_NODE_DEL;
4499 		}
4500 
4501 		/*
4502 		 * If node is OK - put mnsr onto master_mnsr_node list for
4503 		 * later use when syncing up the drive records in the set.
4504 		 */
4505 		if (nd->nd_flags & MD_MN_NODE_OK) {
4506 			mnsr_node = Zalloc(sizeof (*mnsr_node));
4507 			mnsr_node->mmn_mnsr = mnsr;
4508 			(void) strncpy(mnsr_node->mmn_nodename,
4509 				nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
4510 			mnsr_node->mmn_next = master_mnsr_node;
4511 			master_mnsr_node = mnsr_node;
4512 		} else {
4513 			free_sr((struct md_set_record *)mnsr);
4514 		}
4515 
4516 		nd = nd->nd_next;
4517 	}
4518 
4519 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4520 	    "Master nodelist created for set %s: %s"),
4521 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4522 
4523 	/*
4524 	 * Send master nodelist to the rpc.metad on all nodes (including
4525 	 * myself) and each node will update itself.  This will set the
4526 	 * ADD and DEL flags on each node as setup in the master nodelist.
4527 	 * Don't send nodelist to node where set doesn't exist.
4528 	 */
4529 	nd = master_nodelist;
4530 	while (nd) {
4531 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4532 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4533 			nd = nd->nd_next;
4534 			continue;
4535 		}
4536 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4537 		    master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
4538 			/* If RPC failure to another node return 205 */
4539 			if ((mdanyrpcerror(ep)) &&
4540 			    (sd->sd_mn_mynode->nd_nodeid !=
4541 			    nd->nd_nodeid)) {
4542 				rval = 205;
4543 			} else {
4544 				/* Any other failure */
4545 				rval = -1;
4546 			}
4547 			goto out;
4548 		}
4549 		nd = nd->nd_next;
4550 	}
4551 
4552 	/*
4553 	 * Now, delete nodes that need to be deleted.
4554 	 */
4555 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4556 	    ep))  == NULL) {
4557 		if (! mdisok(ep)) {
4558 			rval = -1;
4559 			goto out;
4560 		}
4561 	}
4562 
4563 	/*
4564 	 * May be doing lots of RPC commands to the nodes, so lock the
4565 	 * ALIVE members of the set since most of the rpc.metad routines
4566 	 * require this for security reasons.
4567 	 */
4568 	nd = master_nodelist;
4569 	while (nd) {
4570 		/* Skip non-alive nodes and node without set */
4571 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4572 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4573 			nd = nd->nd_next;
4574 			continue;
4575 		}
4576 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
4577 			/* If RPC failure to another node return 205 */
4578 			if ((mdanyrpcerror(ep)) &&
4579 			    (sd->sd_mn_mynode->nd_nodeid !=
4580 			    nd->nd_nodeid)) {
4581 				rval = 205;
4582 			} else {
4583 				/* Any other failure */
4584 				rval = -1;
4585 			}
4586 			goto out;
4587 		}
4588 		set_locked = 1;
4589 		nd = nd->nd_next;
4590 	}
4591 
4592 	nd = master_nodelist;
4593 	while (nd) {
4594 		/* Skip non-alive nodes */
4595 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4596 			nd = nd->nd_next;
4597 			continue;
4598 		}
4599 		if (nd->nd_flags & MD_MN_NODE_DEL) {
4600 			num_alive_nodes_del++;
4601 			/*
4602 			 * Delete this node rec from all ALIVE nodes in diskset.
4603 			 */
4604 			nd2 = master_nodelist;
4605 			while (nd2) {
4606 				/* Skip non-alive nodes and node without set */
4607 				if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
4608 				    (nd2->nd_flags & MD_MN_NODE_NOSET)) {
4609 					nd2 = nd2->nd_next;
4610 					continue;
4611 				}
4612 
4613 				/* This is a node being deleted from set */
4614 				if (nd2->nd_nodeid == nd->nd_nodeid) {
4615 					/* Mark set record as DEL */
4616 					if (clnt_upd_sr_flags(nd->nd_nodename,
4617 					    sp, MD_SR_DEL, ep)) {
4618 						/* RPC failure to !my node */
4619 						if ((mdanyrpcerror(ep)) &&
4620 						    (sd->sd_mn_mynode->
4621 						    nd_nodeid
4622 						    != nd->nd_nodeid)) {
4623 							rval = 205;
4624 						} else {
4625 							/* Any other failure */
4626 							rval = -1;
4627 						}
4628 						goto out;
4629 					}
4630 					if (clnt_deldrvs(nd->nd_nodename, sp,
4631 					    dd, ep)) {
4632 						/* RPC failure to !my node */
4633 						if ((mdanyrpcerror(ep)) &&
4634 						    (sd->sd_mn_mynode->
4635 						    nd_nodeid
4636 						    != nd->nd_nodeid)) {
4637 							rval = 205;
4638 						} else {
4639 							/* Any other failure */
4640 							rval = -1;
4641 						}
4642 						goto out;
4643 					}
4644 					if (clnt_delset(nd->nd_nodename, sp,
4645 					    ep) == -1) {
4646 						/* RPC failure to !my node */
4647 						if ((mdanyrpcerror(ep)) &&
4648 						    (sd->sd_mn_mynode->
4649 						    nd_nodeid
4650 						    != nd->nd_nodeid)) {
4651 							rval = 205;
4652 						} else {
4653 							/* Any other failure */
4654 							rval = -1;
4655 						}
4656 						goto out;
4657 					}
4658 				} else {
4659 					/*
4660 					 * Delete host from sets on hosts
4661 					 * not being deleted.
4662 					 */
4663 					anode[0] = Strdup(nd->nd_nodename);
4664 					if (clnt_delhosts(nd2->nd_nodename, sp,
4665 					    1, anode, ep) == -1) {
4666 						Free(anode[0]);
4667 						/* RPC failure to !my node */
4668 						if ((mdanyrpcerror(ep)) &&
4669 						    (sd->sd_mn_mynode->
4670 						    nd_nodeid
4671 						    != nd2->nd_nodeid)) {
4672 							rval = 205;
4673 						} else {
4674 							/* Any other failure */
4675 							rval = -1;
4676 						}
4677 						goto out;
4678 					}
4679 
4680 					meta_mc_log(MC_LOG5,
4681 					    dgettext(TEXT_DOMAIN,
4682 					    "Deleted node %s (%d) on node %s "
4683 					    "from set %s: %s"),
4684 					    nd->nd_nodename, nd->nd_nodeid,
4685 					    nd2->nd_nodename,
4686 					    sp->setname,
4687 					    meta_print_hrtime(
4688 					    gethrtime() - start_time));
4689 
4690 					Free(anode[0]);
4691 				}
4692 				nd2 = nd2->nd_next;
4693 			}
4694 		}
4695 		nd = nd->nd_next;
4696 	}
4697 
4698 	nd = master_nodelist;
4699 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4700 	while (nd) {
4701 		/* Skip non-alive nodes and node without set */
4702 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4703 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4704 			nd = nd->nd_next;
4705 			continue;
4706 		}
4707 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
4708 			/* If RPC failure to another node return 205 */
4709 			if ((mdanyrpcerror(ep)) &&
4710 			    (sd->sd_mn_mynode->nd_nodeid !=
4711 			    nd->nd_nodeid)) {
4712 				rval = 205;
4713 			} else {
4714 				/* Any other failure */
4715 				rval = -1;
4716 			}
4717 			goto out;
4718 		}
4719 		nd = nd->nd_next;
4720 	}
4721 	cl_set_setkey(NULL);
4722 	set_locked = 0;
4723 
4724 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4725 	    "Nodelist syncronization complete for set %s: %s"),
4726 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4727 
4728 	metaflushsetname(sp);
4729 
4730 	/*
4731 	 * If all alive nodes have been deleted from set, just
4732 	 * return since nothing else can be done until non-alive
4733 	 * nodes (if there are any) rejoin the cluster.
4734 	 */
4735 	if (num_alive_nodes == num_alive_nodes_del) {
4736 		rval = 0;
4737 		goto out;
4738 	}
4739 
4740 	/*
4741 	 * Sync up drive records.
4742 	 *
4743 	 * If a node panic'd (or metaset command was killed) during the
4744 	 * addition or deletion of a drive to the diskset, the nodes
4745 	 * may have a different view of the drive list.  During cleanup
4746 	 * of the drive list during reconfig, a drive will be deleted
4747 	 * from the list if the master node sees that the drive has been
4748 	 * marked in the ADD state on any node or is marked in the DEL state
4749 	 * on all nodes.
4750 	 * This cleanup must occur even if not all nodes are currently
4751 	 * part of the cluster so that all nodes have the same view
4752 	 * of the drivelist.
4753 	 * Then if the entire cluster goes down and comes back up, the
4754 	 * new master node could be a node that wasn't in the cluster when
4755 	 * the node was deleted.  This could lead to a situation where the
4756 	 * master node thinks that a drive is OK, but this drive isn't
4757 	 * known to the other nodes.
4758 	 * This situation can also occur during the addition of a drive
4759 	 * where a node has the drive marked OK, but the node executing the
4760 	 * metaset command encountered a failure before marking that drive OK
4761 	 * on the rest of the nodes.  If the node with the OK drive then
4762 	 * panics, then rest of the nodes will remove that drive marked ADD
4763 	 * and when the node with the OK drive rejoins the cluster, it will
4764 	 * have a drive marked OK that is unknown by the other nodes.
4765 	 *
4766 	 * There are 2 situations to consider:
4767 	 * A) Master knows about a drive that other nodes don't know about.
4768 	 * B) At least one slave node knows about a drive that the master
4769 	 *    node doesn't know about.
4770 	 *
4771 	 * To handle these situations the following steps are followed:
4772 	 * 1) Count number of drives known by this master node and the
4773 	 *    other slave nodes.
4774 	 *    If all nodes have the same number of drives and the master has
4775 	 *    all drives marked OK, then skip to step4.
4776 	 *
4777 	 * 2) If a node has less drives listed than the master, the master
4778 	 *    must get the drive descriptor list from that node so that
4779 	 *    master can determine which drive it needs to delete from that
4780 	 *    node.  Master must get the drive descriptor list since the
4781 	 *    drive record list does not contain the name of the drive, but
4782 	 *    only a key and the key can only be interpreted on that other
4783 	 *    node.
4784 	 *
4785 	 * 3) The master will then create the master drive list by doing:
4786 	 *	- Master starts with drive list known by master.
4787 	 *	- Any drive marked ADD will be removed from the list.
4788 	 *	- Any drive not known by another node (from step2) will be
4789 	 *	removed from the drive list.
4790 	 *	- If a drive is marked DEL on the master, the master must
4791 	 *	verify that the drive record is marked DEL on all nodes.
4792 	 *	If any node has the drive record marked OK, mark it OK
4793 	 *	on the master.  (The reason why is described below).
4794 	 *
4795 	 * 4) The master sends out the master drive list and the slave
4796 	 *    nodes will force their drive lists to match the master
4797 	 *    drive list by deleting drives, if necessary and by changing
4798 	 *    the drive record states from ADD->OK if master has drive
4799 	 *    marked OK and slave has drive marked ADD.
4800 	 *
4801 	 * Interesting scenarios:
4802 	 *
4803 	 * 1) System has 4 nodes with node 1 as the master.  Node 3 starts
4804 	 *    to delete a drive record (drive record on node 1 is marked DEL),
4805 	 *    but is stopped when node 3 panics.  Node 1 also panics.
4806 	 *    During reconfig cycle, node 2 is picked as master and the drive
4807 	 *    record is left alone since all nodes in the cluster have it
4808 	 *    marked OK.  User now sees drive as part of diskset.
4809 	 *    Now, entire cluster is rebooted and node 1 rejoins the cluster.
4810 	 *    Node 1 is picked as the master and node 1 has drive record
4811 	 *    marked DEL.  Node 1 contacts all other nodes in the cluster
4812 	 *    and since at least one node has the drive record marked OK,
4813 	 *    the master marks the drive record OK.
4814 	 *    User continues to see the drive as part of the diskset.
4815 	 */
4816 
4817 	/* Reget set descriptor since flushed above */
4818 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4819 		rval = -1;
4820 		goto out;
4821 	}
4822 
4823 	/* Has side effect of setting sd->sd_drvs to same as master_dd */
4824 	if ((master_dd = metaget_drivedesc_sideno(sp,
4825 	    sd->sd_mn_mynode->nd_nodeid,
4826 	    (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
4827 		/* No drives in list */
4828 		if (!mdisok(ep)) {
4829 			/*
4830 			 * Can't get drive list for this node, so
4831 			 * return -1 causing this node to be removed
4832 			 * from cluster config and fixed.
4833 			 */
4834 			rval = -1;
4835 			goto out;
4836 		}
4837 	}
4838 
4839 	/* Count the number of drives for all nodes */
4840 	mnsr_node = master_mnsr_node;
4841 	while (mnsr_node) {
4842 		dr_cnt = 0;
4843 		dr = mnsr_node->mmn_mnsr->sr_drivechain;
4844 		while (dr) {
4845 			dr_cnt++;
4846 			dr = dr->dr_next;
4847 		}
4848 		mnsr_node->mmn_numdrives = dr_cnt;
4849 		mnsr_node = mnsr_node->mmn_next;
4850 	}
4851 
4852 	/* Count the number of drives for the master; also check flags */
4853 	all_drives_ok = 1;
4854 	dd_cnt = 0;
4855 	dd = master_dd;
4856 	while (dd) {
4857 		dd_cnt++;
4858 		if (!(dd->dd_flags & MD_DR_OK))
4859 			all_drives_ok = 0;
4860 		dd = dd->dd_next;
4861 	}
4862 
4863 	/* If all drives are ok, do quick check against number of drives */
4864 	if (all_drives_ok) {
4865 		/* If all nodes have same number of drives, almost done */
4866 		mnsr_node = master_mnsr_node;
4867 		while (mnsr_node) {
4868 			if (mnsr_node->mmn_numdrives != dd_cnt)
4869 				break;
4870 			mnsr_node = mnsr_node->mmn_next;
4871 		}
4872 		/* All nodes have same number of drives, just send flags */
4873 		if (mnsr_node == NULL) {
4874 			goto send_drive_list;
4875 		}
4876 	}
4877 
4878 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4879 	    "Begin detailed drive synchronization for set %s: %s"),
4880 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4881 
4882 	/* Detailed check required  */
4883 	mnsr_node = master_mnsr_node;
4884 	while (mnsr_node) {
4885 		/* Does slave node have less drives than master? */
4886 		if (mnsr_node->mmn_numdrives < dd_cnt) {
4887 			/* Yes - must determine which drive is missing */
4888 			if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
4889 			    &other_dd, ep)) {
4890 				/* RPC failure to !my node */
4891 				if ((mdanyrpcerror(ep)) &&
4892 				    (strcmp(mynode(), mnsr_node->mmn_nodename)
4893 				    != 0)) {
4894 					rval = 205;
4895 				} else {
4896 					/* Any other failure */
4897 					rval = -1;
4898 				}
4899 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4900 				    "Master node %s unable to "
4901 				    "retrieve drive list from node %s"),
4902 				    mynode(), mnsr_node->mmn_nodename);
4903 				goto out;
4904 			}
4905 			mnsr_node->mmn_dd = other_dd;
4906 			dd = master_dd;
4907 			while (dd) {
4908 				if (!(dd->dd_flags & MD_DR_OK)) {
4909 					dd = dd->dd_next;
4910 					continue;
4911 				}
4912 				other_dd = mnsr_node->mmn_dd;
4913 				while (other_dd) {
4914 					/* Convert to devids, when available */
4915 					if (strcmp(other_dd->dd_dnp->cname,
4916 					    dd->dd_dnp->cname) == 0) {
4917 						break;
4918 					}
4919 					other_dd = other_dd->dd_next;
4920 				}
4921 				/*
4922 				 * dd not found on slave so mark it
4923 				 * ADD for later deletion (drives in ADD
4924 				 * state are deleted later in this routine).
4925 				 */
4926 				if (other_dd == NULL) {
4927 					dd->dd_flags = MD_DR_ADD;
4928 				}
4929 				dd = dd->dd_next;
4930 			}
4931 
4932 		}
4933 		mnsr_node = mnsr_node->mmn_next;
4934 	}
4935 
4936 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4937 	    "Drive check completed for set %s: %s"),
4938 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4939 
4940 	dd = master_dd;
4941 	dd_prev = 0;
4942 	while (dd) {
4943 		/* Remove any ADD drives from list */
4944 		if (dd->dd_flags & MD_DR_ADD) {
4945 			if (dd_prev) {
4946 				dd_prev->dd_next = dd->dd_next;
4947 				dd->dd_next = NULL;
4948 				metafreedrivedesc(&dd);
4949 				dd = dd_prev->dd_next;
4950 			} else {
4951 				/*
4952 				 * If removing drive descriptor from head
4953 				 * of linked list, also change sd->sd_drvs.
4954 				 */
4955 				master_dd = sd->sd_drvs = dd->dd_next;
4956 				dd->dd_next = NULL;
4957 				metafreedrivedesc(&dd);
4958 				dd = master_dd;
4959 			}
4960 			/* dd setup in if/else above */
4961 			continue;
4962 		}
4963 		/*
4964 		 * If drive is marked DEL, check all other nodes.
4965 		 * If drive on another node is marked OK, mark drive OK
4966 		 * in master list.  If drive is marked DEL or doesn't exist
4967 		 * on all nodes, remove drive from list.
4968 		 */
4969 		if (dd->dd_flags & MD_DR_DEL) {
4970 			mnsr_node = master_mnsr_node;
4971 			while (mnsr_node) {
4972 				if (mnsr_node->mmn_dd == NULL) {
4973 				    if (clnt_getdrivedesc(
4974 					mnsr_node->mmn_nodename, sp,
4975 					&other_dd, ep)) {
4976 					    /* RPC failure to !my node */
4977 					    if ((mdanyrpcerror(ep)) &&
4978 						(strcmp(mynode(),
4979 						mnsr_node->mmn_nodename)
4980 						!= 0)) {
4981 						    rval = 205;
4982 					    } else {
4983 						    /* Any other failure */
4984 						    rval = -1;
4985 					    }
4986 					    mde_perror(ep, dgettext(TEXT_DOMAIN,
4987 						"Master node %s unable "
4988 						"to retrieve drive list from "
4989 						"node %s"), mynode(),
4990 						mnsr_node->mmn_nodename);
4991 					    goto out;
4992 				    }
4993 				    mnsr_node->mmn_dd = other_dd;
4994 				}
4995 				other_dd = mnsr_node->mmn_dd;
4996 				while (other_dd) {
4997 					/* Found drive (OK) from other node */
4998 					if (strcmp(dd->dd_dnp->cname,
4999 					    other_dd->dd_dnp->cname)
5000 					    == 0) {
5001 						/* Drive marked OK */
5002 						if (other_dd->dd_flags &
5003 						    MD_DR_OK) {
5004 						    dd->dd_flags = MD_DR_OK;
5005 						}
5006 						break;
5007 					}
5008 					other_dd = other_dd->dd_next;
5009 				}
5010 				if (dd->dd_flags == MD_DR_OK)
5011 					break;
5012 
5013 				mnsr_node = mnsr_node->mmn_next;
5014 			}
5015 			/*
5016 			 * If no node had this drive marked OK, delete it.
5017 			 */
5018 			if (dd->dd_flags & MD_DR_DEL) {
5019 				if (dd_prev) {
5020 					dd_prev->dd_next = dd->dd_next;
5021 					dd->dd_next = NULL;
5022 					metafreedrivedesc(&dd);
5023 					dd = dd_prev->dd_next;
5024 				} else {
5025 					/*
5026 					 * If removing drive descriptor from
5027 					 * head of linked list, also change
5028 					 * sd->sd_drvs.
5029 					 */
5030 					master_dd = sd->sd_drvs = dd->dd_next;
5031 					dd->dd_next = NULL;
5032 					metafreedrivedesc(&dd);
5033 					dd = master_dd;
5034 				}
5035 				/* dd setup in if/else above */
5036 				continue;
5037 			}
5038 		}
5039 		dd_prev = dd;
5040 		dd = dd->dd_next;
5041 	}
5042 
5043 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5044 	    "Setting drive states completed for set %s: %s"),
5045 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5046 
5047 send_drive_list:
5048 	/*
5049 	 * Set genid on all drives to be the highest value seen.
5050 	 */
5051 	dd = master_dd;
5052 	while (dd) {
5053 		dd->dd_genid = max_genid;
5054 		dd = dd->dd_next;
5055 	}
5056 	/*
5057 	 * Send updated drive list to all alive nodes.
5058 	 * Will also set genid on set and node records to have same
5059 	 * as the drive records.
5060 	 */
5061 	nd = sd->sd_nodelist;
5062 	while (nd) {
5063 		/* Skip non-alive nodes */
5064 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5065 			nd = nd->nd_next;
5066 			continue;
5067 		}
5068 		if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
5069 			/* RPC failure to another node */
5070 			if ((mdanyrpcerror(ep)) &&
5071 			    (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
5072 				rval = 205;
5073 			} else {
5074 				/* Any other failure */
5075 				rval = -1;
5076 			}
5077 			goto out;
5078 		}
5079 		nd = nd->nd_next;
5080 	}
5081 
5082 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5083 	    "Sent drive list to all nodes for set %s: %s"),
5084 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5085 
5086 	/*
5087 	 * If no drive records left in set and nodes had been joined,
5088 	 * withdraw the nodes.  Always reset the master and mark
5089 	 * all nodes as withdrawn on all nodes.
5090 	 */
5091 	if (master_dd == NULL) {
5092 		/* Reset new master flag since no longer master */
5093 		(void) memset(&sf, 0, sizeof (sf));
5094 		sf.sf_setno = sp->setno;
5095 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5096 		sf.sf_flags = MDDB_NM_RESET;
5097 		/* Use magic to help protect ioctl against attack. */
5098 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5099 		/* Ignore failure, failure to reset flag isn't catastrophic */
5100 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5101 		    &sf.sf_mde, NULL);
5102 
5103 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5104 		    "Reset new master flag for " "set %s: %s"),
5105 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5106 
5107 		nd = sd->sd_nodelist;
5108 		while (nd) {
5109 			/* Skip non-alive nodes  */
5110 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5111 				nd = nd->nd_next;
5112 				continue;
5113 			}
5114 
5115 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
5116 				/* RPC failure to another node */
5117 				if ((mdanyrpcerror(ep)) &&
5118 				    (sd->sd_mn_mynode->nd_nodeid !=
5119 				    nd->nd_nodeid)) {
5120 					rval = 205;
5121 				} else {
5122 					/* Any other failure */
5123 					rval = -1;
5124 				}
5125 				goto out;
5126 			}
5127 			set_locked = 1;
5128 
5129 			/* Withdraw node from set if owner */
5130 			if ((nd->nd_flags & MD_MN_NODE_OWN) &&
5131 			    (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
5132 				/* RPC failure to another node */
5133 				if ((mdanyrpcerror(ep)) &&
5134 				    (sd->sd_mn_mynode->nd_nodeid !=
5135 				    nd->nd_nodeid)) {
5136 					rval = 205;
5137 				} else {
5138 					/* Any other failure */
5139 					rval = -1;
5140 				}
5141 				goto out;
5142 			}
5143 
5144 			/* Mark all nodes as withdrawn on this node */
5145 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5146 			    sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
5147 				/* RPC failure to another node */
5148 				if ((mdanyrpcerror(ep)) &&
5149 				    (sd->sd_mn_mynode->nd_nodeid !=
5150 				    nd->nd_nodeid)) {
5151 					rval = 205;
5152 				} else {
5153 					/* Any other failure */
5154 					rval = -1;
5155 				}
5156 				goto out;
5157 			}
5158 
5159 			/* Resets master to no-master on this node */
5160 			if (clnt_mnsetmaster(nd->nd_nodename, sp,
5161 			    "", MD_MN_INVALID_NID, ep)) {
5162 				/* RPC failure to another node */
5163 				if ((mdanyrpcerror(ep)) &&
5164 				    (sd->sd_mn_mynode->nd_nodeid !=
5165 				    nd->nd_nodeid)) {
5166 					rval = 205;
5167 				} else {
5168 					/* Any other failure */
5169 					rval = -1;
5170 				}
5171 				goto out;
5172 			}
5173 
5174 			cl_sk = cl_get_setkey(sp->setno, sp->setname);
5175 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
5176 				/* RPC failure to another node */
5177 				if ((mdanyrpcerror(ep)) &&
5178 				    (sd->sd_mn_mynode->nd_nodeid !=
5179 				    nd->nd_nodeid)) {
5180 					rval = 205;
5181 				} else {
5182 					/* Any other failure */
5183 					rval = -1;
5184 				}
5185 				goto out;
5186 			}
5187 			set_locked = 0;
5188 			nd = nd->nd_next;
5189 		}
5190 	}
5191 
5192 out:
5193 	/*
5194 	 * If got here and set is still locked, then an error has
5195 	 * occurred and master_nodelist is still valid.
5196 	 * If error is not an RPC error, then unlock.
5197 	 * If error is an RPC error, skip unlocks since this could cause
5198 	 * yet another RPC timeout if a node has failed.
5199 	 * Ignore failures in unlock since unlock is just trying to
5200 	 * clean things up.
5201 	 */
5202 	if ((set_locked) && !(mdanyrpcerror(ep))) {
5203 		nd = master_nodelist;
5204 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
5205 		while (nd) {
5206 			/* Skip non-alive nodes */
5207 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5208 				nd = nd->nd_next;
5209 				continue;
5210 			}
5211 			/*
5212 			 * If clnt_unlock fails, just break out since next
5213 			 * reconfig cycle will reset the locks anyway.
5214 			 */
5215 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
5216 				break;
5217 			}
5218 			nd = nd->nd_next;
5219 		}
5220 		cl_set_setkey(NULL);
5221 	}
5222 	/* Free master_mnsr and drive descs */
5223 	mnsr_node = master_mnsr_node;
5224 	while (mnsr_node) {
5225 		master_mnsr_node = mnsr_node->mmn_next;
5226 		free_sr((md_set_record *)mnsr_node->mmn_mnsr);
5227 		free_rem_dd(mnsr_node->mmn_dd);
5228 		Free(mnsr_node);
5229 		mnsr_node = master_mnsr_node;
5230 	}
5231 
5232 	/* Frees sd->sd_drvs (which is also master_dd) */
5233 	metaflushsetname(sp);
5234 	return (rval);
5235 }
5236 
5237 /*
5238  * meta_mnsync_diskset_mddbs
5239  * Calling node is guaranteed to be an owner node.
5240  * Calling node is the master node.
5241  *
5242  * Master node verifies that ondisk mddb format matches its incore format.
5243  * If no nodes are joined to set, remove the change log entries.
5244  * If a node is joined to set, play the change log.
5245  *
5246  * Returns	 0 - Success
5247  *		 1 - Master unable to join to set.
5248  *		205 - Failure during RPC to another node
5249  *		-1 - Any other failure and ep is filled in.
5250  *			-1 return will eventually cause node to panic
5251  *			in a SunCluster environment.
5252  */
5253 int
5254 meta_mnsync_diskset_mddbs(
5255 	mdsetname_t	*sp,
5256 	md_error_t	*ep
5257 )
5258 {
5259 	md_set_desc		*sd;
5260 	mddb_config_t		c;
5261 	md_mn_msgclass_t	class;
5262 	mddb_setflags_config_t	sf;
5263 	md_mnnode_desc		*nd, *nd2;
5264 	md_error_t		xep = mdnullerror;
5265 	int			stale_set = 0;
5266 
5267 	/* If setname is there, set desc should exist. */
5268 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5269 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5270 		    "Unable to get set %s desc information"), sp->setname);
5271 		return (-1);
5272 	}
5273 
5274 	/* Are there drives in the set? */
5275 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5276 	    ep) == NULL) {
5277 		if (! mdisok(ep)) {
5278 			return (-1);
5279 		}
5280 		/* No drives in set -- nothing to sync up */
5281 		return (0);
5282 	}
5283 
5284 	/*
5285 	 * Is master node (which is this node) joined to set?
5286 	 * If master node isn't joined (which means that no nodes
5287 	 * are joined to diskset), remove the change log entries
5288 	 * since no need to replay them - all nodes will have same
5289 	 * view of mddbs since all nodes are reading in the mddbs
5290 	 * from disk.
5291 	 * There is also no need to sync up the master and ondisk mddbs
5292 	 * since master has no incore knowledge.
5293 	 * Need to join master to set in order to flush the change
5294 	 * log entries. Don't need to block I/O during join of master
5295 	 * to set since no other nodes are joined to set and so no I/O
5296 	 * can be occurring.
5297 	 */
5298 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
5299 		/* Join master to set */
5300 		if (clnt_joinset(mynode(), sp,
5301 		    MNSET_IN_RECONFIG, ep)) {
5302 			if (mdismddberror(ep, MDE_DB_STALE)) {
5303 				/*
5304 				 * If STALE, print message and continue on.
5305 				 * Don't do any writes or reads to mddbs
5306 				 * so don't clear change log.
5307 				 */
5308 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5309 				    "Join of master node to STALE set %s"),
5310 				    sp->setname);
5311 				stale_set = 1;
5312 				mdclrerror(ep);
5313 			} else if (mdismddberror(ep, MDE_DB_ACCOK)) {
5314 				/* ACCOK means mediator provided extra vote */
5315 				mdclrerror(ep);
5316 			} else {
5317 				/*
5318 				 * If master is unable to join set, print an
5319 				 * error message.  Don't return failure or node
5320 				 * will panic during cluster reconfig cycle.
5321 				 * Also, withdraw node from set in order to
5322 				 * cleanup from failed join attempt.
5323 				 */
5324 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5325 				    "Join of master node in set %s failed"),
5326 				    sp->setname);
5327 				if (clnt_withdrawset(mynode(), sp, &xep))
5328 					mdclrerror(&xep);
5329 				return (1);
5330 			}
5331 		}
5332 		/*
5333 		 * Master node successfully joined.
5334 		 * Set local copy of flags to OWN and
5335 		 * send owner flag to rpc.metad. If not stale,
5336 		 * flush the change log.
5337 		 */
5338 		sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
5339 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
5340 		    MNSET_IN_RECONFIG, ep)) {
5341 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5342 			    "Flag update of master node join in set %s failed"),
5343 			    sp->setname);
5344 			return (-1);
5345 		}
5346 
5347 		if (!stale_set) {
5348 			if (mdmn_reset_changelog(sp, ep,
5349 			    MDMN_CLF_RESETLOG) != 0) {
5350 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5351 				    "Unable to reset changelog."));
5352 				return (-1);
5353 			}
5354 			meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5355 			    "Removed changelog entries for set %s: %s"),
5356 			    sp->setname,
5357 			    meta_print_hrtime(gethrtime() - start_time));
5358 		}
5359 		/* Reset new master flag before return */
5360 		(void) memset(&sf, 0, sizeof (sf));
5361 		sf.sf_setno = sp->setno;
5362 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5363 		sf.sf_flags = MDDB_NM_RESET;
5364 		/* Use magic to help protect ioctl against attack. */
5365 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5366 		/* Ignore failure, failure to reset flag isn't catastrophic */
5367 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5368 		    &sf.sf_mde, NULL);
5369 
5370 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5371 		    "Reset new master flag for set %s: %s"),
5372 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5373 
5374 		return (0);
5375 	}
5376 
5377 	/*
5378 	 * Is master already joined to STALE set (< 50% mddbs avail)?
5379 	 * If so, can make no config changes to mddbs so don't check or play
5380 	 * changelog and don't sync master node to ondisk mddbs.
5381 	 * To get out of the stale state all nodes must be withdrawn
5382 	 * from set.  Then as nodes are re-joined, all nodes will
5383 	 * have same view of mddbs since all nodes are reading the
5384 	 * mddbs from disk.
5385 	 */
5386 	(void) memset(&c, 0, sizeof (c));
5387 	c.c_id = 0;
5388 	c.c_setno = sp->setno;
5389 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
5390 		(void) mdstealerror(ep, &c.c_mde);
5391 		return (-1);
5392 	}
5393 	if (c.c_flags & MDDB_C_STALE) {
5394 		return (0);
5395 	}
5396 
5397 	/*
5398 	 * If this node is NOT a newly chosen master, then there's
5399 	 * nothing else to do since the change log should be empty and
5400 	 * the ondisk and incore mddbs are already consistent.
5401 	 *
5402 	 * A newly chosen master is a node that was not the master
5403 	 * at the beginning of the reconfig cycle.  If a node is a new
5404 	 * master, then the new master state is reset after the ondisk
5405 	 * and incore mddbs are consistent and the change log has
5406 	 * been replayed.
5407 	 */
5408 	(void) memset(&sf, 0, sizeof (sf));
5409 	sf.sf_setno = sp->setno;
5410 	sf.sf_flags = MDDB_NM_GET;
5411 	/* Use magic to help protect ioctl against attack. */
5412 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5413 	if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
5414 	    ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
5415 		return (0);
5416 	}
5417 
5418 	/*
5419 	 * Now, sync up incore master view to ondisk mddbs.
5420 	 * This is needed in the case where a master node
5421 	 * had made a change to the mddb, but this change
5422 	 * may not have been relayed to the slaves yet.
5423 	 * So, the new master needs to verify that the ondisk
5424 	 * mddbs match what the new master has incore -
5425 	 * if different, new master rewrites all of the mddbs.
5426 	 * Then the new master will replay the changelog and the
5427 	 * new master will then execute what the old master had
5428 	 * done.
5429 	 *
5430 	 * Block all I/Os to disks in this diskset on all nodes in
5431 	 * the diskset.  This will allow the rewriting of the mddbs
5432 	 * (if needed), to proceed in a timely manner.
5433 	 *
5434 	 * If block of I/Os fail, return a -1.
5435 	 */
5436 
5437 	nd = sd->sd_nodelist;
5438 	while (nd) {
5439 		/* Skip non-alive and non-owner nodes  */
5440 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5441 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5442 			nd = nd->nd_next;
5443 			continue;
5444 		}
5445 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5446 		    MN_SUSP_IO, ep)) {
5447 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5448 			    "Unable to suspend I/O on node %s in set %s"),
5449 			    nd->nd_nodename, sp->setname);
5450 
5451 			/*
5452 			 * Resume all other nodes that had been suspended.
5453 			 * (Reconfig return step also resumes I/Os
5454 			 * for all sets.)
5455 			 */
5456 			nd2 = sd->sd_nodelist;
5457 			while (nd2) {
5458 				/* Stop when reaching failed node */
5459 				if (nd2->nd_nodeid == nd->nd_nodeid)
5460 					break;
5461 				/* Skip non-alive and non-owner nodes  */
5462 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5463 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5464 					nd2 = nd2->nd_next;
5465 					continue;
5466 				}
5467 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5468 					sp->setno, MN_RES_IO, &xep));
5469 				nd2 = nd2->nd_next;
5470 			}
5471 
5472 			/*
5473 			 * If an RPC failure on another node, return a 205.
5474 			 * Otherwise, exit with failure.
5475 			 */
5476 			if ((mdanyrpcerror(ep)) &&
5477 			    (sd->sd_mn_mynode->nd_nodeid !=
5478 			    nd->nd_nodeid)) {
5479 				return (205);
5480 			} else {
5481 				return (-1);
5482 			}
5483 
5484 		}
5485 		nd = nd->nd_next;
5486 	}
5487 
5488 	(void) memset(&c, 0, sizeof (c));
5489 	c.c_id = 0;
5490 	c.c_setno = sp->setno;
5491 	/* Master can't sync up to ondisk mddbs?  Kick it out of cluster */
5492 	if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
5493 		return (-1);
5494 
5495 	/*
5496 	 * Resume I/Os that were suspended above.
5497 	 */
5498 	nd = sd->sd_nodelist;
5499 	while (nd) {
5500 		/* Skip non-alive and non-owner nodes  */
5501 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5502 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5503 			nd = nd->nd_next;
5504 			continue;
5505 		}
5506 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5507 		    MN_RES_IO, ep)) {
5508 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5509 			    "Unable to resume I/O on node %s in set %s"),
5510 			    nd->nd_nodename, sp->setname);
5511 
5512 			/*
5513 			 * If an RPC failure then don't do any
5514 			 * more RPC calls, since one timeout is enough
5515 			 * to endure.  If RPC failure to another node, return
5516 			 * 205.  If RPC failure to my node, return -1.
5517 			 * If not an RPC failure, continue resuming the
5518 			 * rest of the nodes and then return -1.
5519 			 */
5520 			if (mdanyrpcerror(ep)) {
5521 				if (sd->sd_mn_mynode->nd_nodeid ==
5522 				    nd->nd_nodeid) {
5523 					return (-1);
5524 				} else {
5525 					return (205);
5526 				}
5527 			}
5528 
5529 			/*
5530 			 * If not an RPC error, continue resuming rest of
5531 			 * nodes, ignoring any failures except for an
5532 			 * RPC failure which constitutes an immediate exit.
5533 			 * Start in middle of list with failing node.
5534 			 */
5535 			nd2 = nd->nd_next;
5536 			while (nd2) {
5537 				/* Skip non-alive and non-owner nodes  */
5538 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5539 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5540 					nd2 = nd2->nd_next;
5541 					continue;
5542 				}
5543 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5544 					sp->setno, MN_RES_IO, &xep));
5545 				if (mdanyrpcerror(&xep)) {
5546 					return (-1);
5547 				}
5548 				nd2 = nd2->nd_next;
5549 			}
5550 		}
5551 		nd = nd->nd_next;
5552 	}
5553 
5554 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
5555 	    "checking/writing the mddb for set %s: %s"), sp->setname,
5556 	    meta_print_hrtime(gethrtime() - start_time));
5557 
5558 	/*
5559 	 * Send (aka replay) all messages we find in the changelog.
5560 	 * Flag the messages with
5561 	 *   MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
5562 	 *   MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
5563 	 */
5564 	for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
5565 		mdmn_changelog_record_t	*lr;
5566 		md_error_t	xep = mdnullerror;
5567 		md_mn_result_t	*resultp = NULL;
5568 		int		ret;
5569 
5570 		lr = mdmn_get_changelogrec(sp->setno, class);
5571 		if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
5572 			/* no entry for this class */
5573 			continue;
5574 		}
5575 
5576 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5577 		    "replaying message ID=(%d, 0x%llx-%d)\n"),
5578 		    MSGID_ELEMS(lr->lr_msg.msg_msgid));
5579 
5580 		ret = mdmn_send_message_with_msgid(
5581 			lr->lr_msg.msg_setno,
5582 			lr->lr_msg.msg_type,
5583 			lr->lr_msg.msg_flags |  MD_MSGF_REPLAY_MSG |
5584 						MD_MSGF_OVERRIDE_SUSPEND,
5585 			lr->lr_msg.msg_event_data,
5586 			lr->lr_msg.msg_event_size,
5587 			&resultp,
5588 			&lr->lr_msg.msg_msgid,
5589 			&xep);
5590 
5591 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5592 		    "mdmn_send_message returned %d\n"), ret);
5593 
5594 		if (resultp)
5595 			free_result(resultp);
5596 	}
5597 
5598 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5599 	    "Playing changelog completed for set %s: %s"),
5600 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5601 
5602 	/*
5603 	 * Now that new master has ondisk and incore mddbs in sync, reset
5604 	 * this node's new master kernel flag (for this set).  If this node
5605 	 * re-enters another reconfig cycle before the completion of this
5606 	 * reconfig cycle, this master node won't need to check if the ondisk
5607 	 * and incore mddbs are in sync since this node won't be considered
5608 	 * a new master (since this flag is being reset here in the middle of
5609 	 * step2).  This will save time during any subsequent reconfig
5610 	 * cycles as long as this node continues to be master.
5611 	 */
5612 	(void) memset(&sf, 0, sizeof (sf));
5613 	sf.sf_setno = sp->setno;
5614 	sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5615 	sf.sf_flags = MDDB_NM_RESET;
5616 	/* Use magic to help protect ioctl against attack. */
5617 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5618 	/* Ignore failure, since failure to reset flag isn't catastrophic */
5619 	(void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
5620 
5621 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5622 	    "Reset new master flag for set %s: %s"),
5623 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5624 
5625 	return (0);
5626 }
5627 
5628 /*
5629  * meta_mnjoin_all will join all starting nodes in the diskset.
5630  * A starting node is considered to be any node that is not
5631  * an owner of the set but is a member of the cluster.
5632  * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
5633  *
5634  * Caller is the Master node.
5635  *
5636  * Returns	 0 - Success
5637  *		205 - Failure during RPC to another node
5638  *		-1 - Any other failure and ep is filled in.
5639  */
5640 int
5641 meta_mnjoin_all(
5642 	mdsetname_t	*sp,
5643 	md_error_t	*ep
5644 )
5645 {
5646 	md_set_desc		*sd;
5647 	md_mnnode_desc		*nd, *nd2;
5648 	int			rval = 0;
5649 	int			stale_flag = 0;
5650 	mddb_config_t		c;
5651 	int			susp_res_flag = 0;
5652 	md_error_t		xep = mdnullerror;
5653 
5654 	/* If setname is there, set desc should exist. */
5655 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5656 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5657 		    "Unable to get set %s desc information"), sp->setname);
5658 		return (-1);
5659 	}
5660 
5661 	/* Are there drives in the set? */
5662 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5663 	    ep) == NULL) {
5664 		if (! mdisok(ep)) {
5665 			return (-1);
5666 		}
5667 		/* No drives in set -- nothing to join */
5668 		return (0);
5669 	}
5670 
5671 	/*
5672 	 * Is set currently stale?
5673 	 */
5674 	(void) memset(&c, 0, sizeof (c));
5675 	c.c_id = 0;
5676 	c.c_setno = sp->setno;
5677 	/* Ignore failure since master node may not be joined yet */
5678 	(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
5679 	if (c.c_flags & MDDB_C_STALE) {
5680 		stale_flag = MNSET_IS_STALE;
5681 	}
5682 
5683 	/*
5684 	 * If any nodes are going to be joined to diskset, then
5685 	 * suspend I/O to all disks in diskset so that nodes can join
5686 	 * (read in mddbs) in a reasonable amount of time even under
5687 	 * high I/O load.  Don't need to do this if set is STALE since
5688 	 * no I/O can be occurring to a STALE set.
5689 	 */
5690 	if (stale_flag != MNSET_IS_STALE) {
5691 		nd = sd->sd_nodelist;
5692 		while (nd) {
5693 			/* Found a node that will be joined to diskset */
5694 			if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5695 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5696 				/* Set flag that diskset should be suspended */
5697 				susp_res_flag = 1;
5698 				break;
5699 			}
5700 			nd = nd->nd_next;
5701 		}
5702 	}
5703 
5704 	if (susp_res_flag) {
5705 		/*
5706 		 * Block all I/Os to disks in this diskset on all joined
5707 		 * nodes in the diskset.
5708 		 * If block of I/Os fails due to an RPC failure on another
5709 		 * node, return 205; otherwise, return -1.
5710 		 */
5711 		nd = sd->sd_nodelist;
5712 		while (nd) {
5713 			/* Skip non-alive and non-owner nodes  */
5714 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5715 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5716 				nd = nd->nd_next;
5717 				continue;
5718 			}
5719 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5720 			    MN_SUSP_IO, ep)) {
5721 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5722 				    "Unable to suspend I/O on node %s"
5723 				    " in set %s"), nd->nd_nodename,
5724 				    sp->setname);
5725 				/*
5726 				 * Resume other nodes that had been suspended.
5727 				 * (Reconfig return step also resumes I/Os
5728 				 * for all sets.)
5729 				 */
5730 				nd2 = sd->sd_nodelist;
5731 				while (nd2) {
5732 					/* Stop when reaching failed node */
5733 					if (nd2->nd_nodeid == nd->nd_nodeid)
5734 						break;
5735 					/* Skip non-alive/non-owner nodes  */
5736 					if ((!(nd2->nd_flags &
5737 					    MD_MN_NODE_ALIVE)) ||
5738 					    (!(nd2->nd_flags &
5739 					    MD_MN_NODE_OWN))) {
5740 						nd2 = nd2->nd_next;
5741 						continue;
5742 					}
5743 					(void) (clnt_mn_susp_res_io(
5744 					    nd2->nd_nodename, sp->setno,
5745 					    MN_RES_IO, &xep));
5746 					nd2 = nd2->nd_next;
5747 				}
5748 
5749 				/*
5750 				 * If the suspend failed due to an
5751 				 * RPC failure on another node, return
5752 				 * a 205.
5753 				 * Otherwise, exit with failure.
5754 				 * The return reconfig step will resume
5755 				 * I/Os for all disksets.
5756 				 */
5757 				if ((mdanyrpcerror(ep)) &&
5758 				    (sd->sd_mn_mynode->nd_nodeid !=
5759 				    nd->nd_nodeid)) {
5760 					return (205);
5761 				} else {
5762 					return (-1);
5763 				}
5764 			}
5765 			nd = nd->nd_next;
5766 		}
5767 	}
5768 
5769 	nd = sd->sd_nodelist;
5770 	while (nd) {
5771 		/*
5772 		 * If a node is in the membership list but isn't joined
5773 		 * to the set, try to join the node.
5774 		 */
5775 		if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5776 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5777 			if (clnt_joinset(nd->nd_nodename, sp,
5778 			    (MNSET_IN_RECONFIG | stale_flag), ep)) {
5779 				/*
5780 				 * If RPC failure to another node
5781 				 * then exit without attempting anything else.
5782 				 * (Reconfig return step will resume I/Os
5783 				 * for all sets.)
5784 				 */
5785 				if (mdanyrpcerror(ep)) {
5786 					mde_perror(ep, "");
5787 					return (205);
5788 				}
5789 				/*
5790 				 * STALE and ACCOK failures aren't true
5791 				 * failures.  STALE means that <50% mddbs
5792 				 * are available. ACCOK means that the
5793 				 * mediator provided the extra vote.
5794 				 * If a true failure, then print messasge
5795 				 * and withdraw node from set in order to
5796 				 * cleanup from failed join attempt.
5797 				 */
5798 				if ((!mdismddberror(ep, MDE_DB_STALE)) &&
5799 				    (!mdismddberror(ep, MDE_DB_ACCOK))) {
5800 					mde_perror(ep,
5801 					    "WARNING: Unable to join node %s "
5802 					    "to set %s", nd->nd_nodename,
5803 					    sp->setname);
5804 					mdclrerror(ep);
5805 					if (clnt_withdrawset(nd->nd_nodename,
5806 					    sp, &xep))
5807 						mdclrerror(&xep);
5808 					nd = nd->nd_next;
5809 					continue;
5810 				}
5811 			}
5812 			/* Set owner flag even if STALE or ACCOK */
5813 			nd->nd_flags |= MD_MN_NODE_OWN;
5814 		}
5815 		nd = nd->nd_next;
5816 	}
5817 	/*
5818 	 * Resume I/Os if suspended above.
5819 	 */
5820 	if (susp_res_flag) {
5821 		nd = sd->sd_nodelist;
5822 		while (nd) {
5823 			/*
5824 			 * Skip non-alive and non-owner nodes
5825 			 * (this list doesn't include any of
5826 			 * the nodes that were joined).
5827 			 */
5828 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5829 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5830 				nd = nd->nd_next;
5831 				continue;
5832 			}
5833 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5834 			    MN_RES_IO, ep)) {
5835 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5836 				    "Unable to resume I/O on node %s"
5837 				    " in set %s"), nd->nd_nodename,
5838 				    sp->setname);
5839 
5840 				/*
5841 				 * If an RPC failure then don't do any
5842 				 * more RPC calls, since one timeout is enough
5843 				 * to endure.  If RPC failure to another node,
5844 				 * return 205.  If RPC failure to my node,
5845 				 * return -1.
5846 				 * (Reconfig return step will resume I/Os
5847 				 * for all sets.)
5848 				 * If not an RPC failure, continue resuming the
5849 				 * rest of the nodes and then return -1.
5850 				 */
5851 				if (mdanyrpcerror(ep)) {
5852 					if (sd->sd_mn_mynode->nd_nodeid ==
5853 					    nd->nd_nodeid) {
5854 						return (-1);
5855 					} else {
5856 						return (205);
5857 					}
5858 				}
5859 
5860 				/*
5861 				 * If not an RPC error, continue resuming rest
5862 				 * of nodes, ignoring any failures except for
5863 				 * an RPC failure which constitutes an
5864 				 * immediate exit.
5865 				 * Start in middle of list with failing node.
5866 				 */
5867 				nd2 = nd->nd_next;
5868 				while (nd2) {
5869 					/* Skip non-owner nodes  */
5870 					if ((!(nd2->nd_flags &
5871 					    MD_MN_NODE_ALIVE)) ||
5872 					    (!(nd2->nd_flags &
5873 					    MD_MN_NODE_OWN))) {
5874 						nd2 = nd2->nd_next;
5875 						continue;
5876 					}
5877 					(void) (clnt_mn_susp_res_io(
5878 					    nd2->nd_nodename, sp->setno,
5879 					    MN_RES_IO, &xep));
5880 					if (mdanyrpcerror(&xep)) {
5881 						return (-1);
5882 					}
5883 					nd2 = nd2->nd_next;
5884 				}
5885 			}
5886 			nd = nd->nd_next;
5887 		}
5888 	}
5889 
5890 	nd = sd->sd_nodelist;
5891 	while (nd) {
5892 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
5893 			nd = nd->nd_next;
5894 			continue;
5895 		}
5896 		/*
5897 		 * If 1 node fails - go ahead and update the rest except
5898 		 * in the case of an RPC failure, fail immediately.
5899 		 */
5900 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5901 		    sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
5902 			/* RPC failure to another node */
5903 			if (mdanyrpcerror(ep)) {
5904 				return (205);
5905 			}
5906 			nd = nd->nd_next;
5907 			rval = -1;
5908 			continue;
5909 		}
5910 		nd = nd->nd_next;
5911 	}
5912 
5913 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5914 	    "Join of all nodes completed for set %s: %s"),
5915 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5916 
5917 	return (rval);
5918 }
5919