xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_set_hst.c (revision bf85a12b7c81d0745d5a8aff65baeff50006cde9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Just in case we're not in a build environment, make sure that
29  * TEXT_DOMAIN gets set to something.
30  */
31 #if !defined(TEXT_DOMAIN)
32 #define	TEXT_DOMAIN "SYS_TEST"
33 #endif
34 
35 /*
36  * Metadevice diskset interfaces
37  */
38 
39 #include "meta_set_prv.h"
40 #include <meta.h>
41 #include <sys/lvm/md_crc.h>
42 #include <sys/time.h>
43 #include <sdssc.h>
44 
45 static int
add_db_sidenms(mdsetname_t * sp,md_error_t * ep)46 add_db_sidenms(
47 	mdsetname_t	*sp,
48 	md_error_t	*ep
49 )
50 {
51 	md_replicalist_t	*rlp = NULL;
52 	md_replicalist_t	*rl;
53 	int			rval = 0;
54 
55 	if (metareplicalist(sp, MD_FULLNAME_ONLY, &rlp, ep) < 0)
56 		return (-1);
57 
58 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
59 		md_replica_t	*r = rl->rl_repp;
60 
61 		/*
62 		 * This is not the first replica being added to the
63 		 * diskset so call with ADDSIDENMS_BCAST.  If this
64 		 * is a traditional diskset, the bcast flag is ignored
65 		 * since traditional disksets don't use the rpc.mdcommd.
66 		 */
67 		if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
68 		    DB_ADDSIDENMS_BCAST, ep)) {
69 			rval = -1;
70 			goto out;
71 		}
72 	}
73 
74 out:
75 	metafreereplicalist(rlp);
76 	return (rval);
77 }
78 
79 static int
add_drvs_to_hosts(mdsetname_t * sp,int node_c,char ** node_v,md_error_t * ep)80 add_drvs_to_hosts(
81 	mdsetname_t	*sp,
82 	int		node_c,
83 	char		**node_v,
84 	md_error_t	*ep
85 )
86 {
87 	int		i;
88 	md_set_desc	*sd;
89 	md_drive_desc	*dd;
90 	md_timeval32_t	now;
91 	ulong_t		genid;
92 
93 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
94 		return (-1);
95 
96 	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) {
97 		if (! mdisok(ep))
98 			return (-1);
99 		return (0);
100 	}
101 
102 	now = sd->sd_ctime;
103 	genid = sd->sd_genid - 1;
104 
105 	for (i = 0; i < node_c; i++) {
106 		if (clnt_adddrvs(node_v[i], sp, dd, now, genid, ep) == -1)
107 			return (-1);
108 	}
109 
110 	return (0);
111 }
112 
113 static int
add_md_sidenms(mdsetname_t * sp,side_t sideno,side_t otherside,md_error_t * ep)114 add_md_sidenms(mdsetname_t *sp, side_t sideno, side_t otherside, md_error_t *ep)
115 {
116 	mdnm_params_t	nm;
117 	char		*cname, *dname;
118 	side_t		tmp_sideno;
119 	minor_t		mnum;
120 	int		done, i;
121 	int		rval = 0;
122 	md_set_desc	*sd;
123 
124 	(void) memset(&nm, '\0', sizeof (nm));
125 	nm.key   = MD_KEYWILD;
126 
127 	if (!metaislocalset(sp)) {
128 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
129 			return (-1);
130 	}
131 	/* Use rpc.mdcommd to add md side info from all nodes */
132 	if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
133 	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
134 		md_mn_result_t			*resultp = NULL;
135 		md_mn_msg_meta_md_addside_t	md_as;
136 		int				send_rval;
137 
138 		md_as.msg_sideno = sideno;
139 		md_as.msg_otherside = otherside;
140 		/*
141 		 * If reconfig cycle has been started, this node is stuck in
142 		 * in the return step until this command has completed.  If
143 		 * mdcommd is suspended, ask send_message to fail (instead of
144 		 * retrying) so that metaset can finish allowing the
145 		 * reconfig cycle to proceed.
146 		 */
147 		send_rval = mdmn_send_message(sp->setno,
148 		    MD_MN_MSG_META_MD_ADDSIDE,
149 		    MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
150 		    0, (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t),
151 		    &resultp, ep);
152 		if (send_rval != 0) {
153 			(void) mdstealerror(ep, &(resultp->mmr_ep));
154 			if (resultp)
155 				free_result(resultp);
156 			return (-1);
157 		}
158 		if (resultp)
159 			free_result(resultp);
160 		return (0);
161 	} else {
162 		/*CONSTCOND*/
163 		while (1) {
164 			char	*drvnm = NULL;
165 
166 			nm.mde   = mdnullerror;
167 			nm.setno = sp->setno;
168 			nm.side  = otherside;
169 			if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0)
170 				return (mdstealerror(ep, &nm.mde));
171 
172 			if (nm.key == MD_KEYWILD)
173 				return (0);
174 
175 			/*
176 			 * Okay we have a valid key
177 			 * Let's see if it is hsp or not
178 			 */
179 			nm.devname = (uintptr_t)meta_getnmentbykey(sp->setno,
180 			    otherside, nm.key, &drvnm, NULL, NULL, ep);
181 			if (nm.devname == NULL || drvnm == NULL) {
182 				if (nm.devname)
183 					Free((void *)(uintptr_t)nm.devname);
184 				if (drvnm)
185 					Free((void *)(uintptr_t)drvnm);
186 				return (-1);
187 			}
188 
189 			/*
190 			 * If it is hsp add here
191 			 */
192 			if (strcmp(drvnm, MD_HOTSPARES) == 0) {
193 				if (add_name(sp, sideno, nm.key, MD_HOTSPARES,
194 				    minor(NODEV), (char *)(uintptr_t)nm.devname,
195 				    NULL, NULL, ep) == -1) {
196 					Free((void *)(uintptr_t)nm.devname);
197 					Free((void *)(uintptr_t)drvnm);
198 					return (-1);
199 				} else {
200 					Free((void *)(uintptr_t)nm.devname);
201 					Free((void *)(uintptr_t)drvnm);
202 					continue;
203 				}
204 			}
205 
206 			nm.side = sideno;
207 			if (MD_MNSET_DESC(sd)) {
208 				tmp_sideno = sideno;
209 			} else {
210 				tmp_sideno = sideno - 1;
211 			}
212 
213 			if ((done = meta_getnextside_devinfo(sp,
214 			    (char *)(uintptr_t)nm.devname, &tmp_sideno,
215 			    &cname, &dname, &mnum, ep)) == -1) {
216 				Free((void *)(uintptr_t)nm.devname);
217 				return (-1);
218 			}
219 
220 			assert(done == 1);
221 			Free((void *)(uintptr_t)nm.devname);
222 			Free((void *)(uintptr_t)drvnm);
223 
224 			/*
225 			 * The device reference count can be greater than 1 if
226 			 * more than one softpart is configured on top of the
227 			 * same device.  If this is the case then we want to
228 			 * increment the count to sync up with the other sides.
229 			 */
230 			for (i = 0; i < nm.ref_count; i++) {
231 				if (add_name(sp, sideno, nm.key, dname, mnum,
232 				    cname, NULL, NULL, ep) == -1)
233 					rval = -1;
234 			}
235 
236 			Free(cname);
237 			Free(dname);
238 
239 			if (rval != 0)
240 				return (rval);
241 		}
242 	}
243 
244 	/*NOTREACHED*/
245 }
246 
247 static int
check_setdrvs_againstnode(mdsetname_t * sp,char * node,md_error_t * ep)248 check_setdrvs_againstnode(mdsetname_t *sp, char *node, md_error_t *ep)
249 {
250 	mddrivename_t	*dp;
251 	md_drive_desc	*dd, *ddp;
252 
253 	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL)
254 		if (! mdisok(ep))
255 			return (-1);
256 
257 	for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
258 		dp = ddp->dd_dnp;
259 
260 		if (checkdrive_onnode(sp, dp, node, ep))
261 			return (-1);
262 	}
263 
264 	return (0);
265 }
266 
267 static int
create_multinode_set_on_hosts(mdsetname_t * sp,int node_c,char ** node_v,int new_set,md_error_t * ep)268 create_multinode_set_on_hosts(
269 	mdsetname_t	*sp,
270 	int		node_c,		/* Number of new nodes */
271 	char		**node_v,	/* Nodes which are being added */
272 	int		new_set,
273 	md_error_t	*ep
274 )
275 {
276 	int				i;
277 	md_set_desc			*sd;
278 	md_timeval32_t			now;
279 	ulong_t				genid;
280 	int				rval = 0;
281 	md_mnnode_desc			*nd, *ndm = NULL;
282 	md_mnnode_desc			*nd_prev, *nd_curr;
283 	int				nodecnt;
284 	mndiskset_membershiplist_t	*nl, *nl2;
285 
286 	if (!new_set) {
287 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
288 			return (-1);
289 		now = sd->sd_ctime;
290 		genid = sd->sd_genid - 1;
291 		if (sd->sd_drvs)
292 			genid--;
293 	} else {
294 		sd = Zalloc(sizeof (*sd));
295 
296 		if (meta_gettimeofday(&now) == -1) {
297 			(void) mdsyserror(ep, errno,
298 			    dgettext(TEXT_DOMAIN, "meta_gettimeofday()"));
299 			rval = -1;
300 			goto out;
301 		}
302 
303 		/* Put the new entries into the set */
304 		/*
305 		 * Get membershiplist from API routine.  If there's
306 		 * an error, fail to create set and pass back error.
307 		 */
308 		if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
309 			rval = -1;
310 			goto out;
311 		}
312 
313 		/*
314 		 * meta_set_addhosts has already verified that
315 		 * this node list is in the membership list
316 		 * so set ALIVE flag.
317 		 * Since this is a new set, all hosts being
318 		 * added are new to the set, so also set ADD flag.
319 		 */
320 		for (i = 0; i < node_c; i++) {
321 			nd = Zalloc(sizeof (*nd));
322 			(void) strcpy(nd->nd_nodename, node_v[i]);
323 			nd->nd_ctime = now;
324 			nd->nd_flags = (MD_MN_NODE_ALIVE |
325 			    MD_MN_NODE_ADD);
326 			nl2 = nl;
327 			while (nl2) {
328 				if (strcmp(nl2->msl_node_name,
329 				    node_v[i]) == 0) {
330 					nd->nd_nodeid = nl2->msl_node_id;
331 					(void) strcpy(nd->nd_priv_ic,
332 					    nl2->msl_node_addr);
333 					break;
334 				}
335 				nl2 = nl2->next;
336 			}
337 
338 			/*
339 			 * Nodelist must be kept in ascending
340 			 * nodeid order.
341 			 */
342 			if (sd->sd_nodelist == NULL) {
343 				/* Nothing in list, just add it */
344 				sd->sd_nodelist = nd;
345 			} else if (nd->nd_nodeid < sd->sd_nodelist->nd_nodeid) {
346 				/* Add to head of list */
347 				nd->nd_next = sd->sd_nodelist;
348 				sd->sd_nodelist = nd;
349 			} else {
350 				nd_curr = sd->sd_nodelist->nd_next;
351 				nd_prev = sd->sd_nodelist;
352 				/* Search for place ot add it */
353 				while (nd_curr) {
354 					if (nd->nd_nodeid <
355 					    nd_curr->nd_nodeid) {
356 						/* Add before nd_curr */
357 						nd->nd_next = nd_curr;
358 						nd_prev->nd_next = nd;
359 						break;
360 					}
361 					nd_prev = nd_curr;
362 					nd_curr = nd_curr->nd_next;
363 				}
364 				/* Add to end of list */
365 				if (nd_curr == NULL) {
366 					nd_prev->nd_next = nd;
367 				}
368 
369 			}
370 			/* Set master to be first node added */
371 			if (ndm == NULL)
372 				ndm = nd;
373 		}
374 
375 		meta_free_nodelist(nl);
376 		/*
377 		 * Creating mnset for first time.
378 		 * Set master to be invalid until first drive is
379 		 * in set.
380 		 */
381 		(void) strcpy(sd->sd_mn_master_nodenm, "");
382 		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
383 		sd->sd_mn_masternode = ndm;
384 		sd->sd_ctime = now;
385 		genid = sd->sd_genid = 0;
386 	}
387 
388 	/* Create the set where needed */
389 	for (i = 0; i < node_c; i++) {
390 		/*
391 		 * Create the set on each new node.  If the set already
392 		 * exists, then the node list being created on each new node
393 		 * is the current node list from before the new nodes
394 		 * were added.  If the set doesn't exist, then the node
395 		 * list being created on each new node is the entire
396 		 * new node list.
397 		 */
398 		if (clnt_mncreateset(node_v[i], sp, sd->sd_nodelist,
399 		    now, genid, sd->sd_mn_master_nodenm,
400 		    sd->sd_mn_master_nodeid, ep) == -1) {
401 			rval = -1;
402 			break;
403 		}
404 	}
405 
406 out:
407 	if (new_set) {
408 		nd = sd->sd_nodelist;
409 		while (nd) {
410 			sd->sd_nodelist = nd->nd_next;
411 			Free(nd);
412 			nd = sd->sd_nodelist;
413 		}
414 		Free(sd);
415 	}
416 
417 	if (rval != 0 || new_set)
418 		return (rval);
419 
420 	/*
421 	 * Add the drive records to the new sets
422 	 * and names for the new sides.
423 	 */
424 	return (add_drvs_to_hosts(sp, node_c, node_v, ep));
425 }
426 
427 
428 static int
create_traditional_set_on_hosts(mdsetname_t * sp,int node_c,char ** node_v,int new_set,md_error_t * ep)429 create_traditional_set_on_hosts(
430 	mdsetname_t	*sp,
431 	int		node_c,		/* Number of new nodes */
432 	char		**node_v,	/* Nodes which are being added */
433 	int		new_set,
434 	md_error_t	*ep
435 )
436 {
437 	int		i;
438 	md_set_desc	*sd;
439 	md_timeval32_t	now;
440 	ulong_t		genid;
441 	int		rval = 0;
442 
443 	if (!new_set) {
444 
445 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
446 			return (-1);
447 		now = sd->sd_ctime;
448 
449 		genid = sd->sd_genid;
450 
451 		if (sd->sd_drvs)
452 			genid--;
453 	} else {
454 		if (node_c > MD_MAXSIDES)
455 			return (mddserror(ep, MDE_DS_SIDENUMNOTAVAIL,
456 			    sp->setno, NULL, NULL, sp->setname));
457 
458 		sd = Zalloc(sizeof (*sd));
459 
460 		/* Put the new entries into the set */
461 		for (i = 0; i < node_c; i++) {
462 			(void) strcpy(sd->sd_nodes[i], node_v[i]);
463 		}
464 
465 		if (meta_gettimeofday(&now) == -1) {
466 			(void) mdsyserror(ep, errno, "meta_gettimeofday()");
467 			rval = -1;
468 			goto out;
469 		}
470 
471 		sd->sd_ctime = now;
472 		genid = sd->sd_genid = 0;
473 	}
474 
475 	/* Create the set where needed */
476 	for (i = 0; i < node_c; i++) {
477 		/*
478 		 * Create the set on each new host
479 		 */
480 		if (clnt_createset(node_v[i], sp, sd->sd_nodes, now, genid,
481 		    ep) == -1) {
482 			rval = -1;
483 			break;
484 		}
485 	}
486 
487 out:
488 	if (new_set)
489 		Free(sd);
490 
491 	if (rval != 0 || new_set)
492 		return (rval);
493 
494 	/*
495 	 * Add the drive records to the new sets
496 	 * and names for the new sides.
497 	 */
498 	return (add_drvs_to_hosts(sp, node_c, node_v, ep));
499 }
500 
501 static int
create_set_on_hosts(mdsetname_t * sp,int multi_node,int node_c,char ** node_v,int new_set,md_error_t * ep)502 create_set_on_hosts(
503 	mdsetname_t	*sp,
504 	int		multi_node,	/* Multi_node diskset or not? */
505 	int		node_c,		/* Number of new nodes */
506 	char		**node_v,	/* Nodes which are being added */
507 	int		new_set,
508 	md_error_t	*ep
509 )
510 {
511 	if (multi_node)
512 		return (create_multinode_set_on_hosts(sp, node_c, node_v,
513 		    new_set, ep));
514 	else
515 		return (create_traditional_set_on_hosts(sp, node_c, node_v,
516 		    new_set, ep));
517 }
518 
519 static int
create_set(mdsetname_t * sp,int multi_node,int node_c,char ** node_v,int auto_take,md_error_t * ep)520 create_set(
521 	mdsetname_t	*sp,
522 	int		multi_node,	/* Multi-node diskset or not? */
523 	int		node_c,
524 	char		**node_v,
525 	int		auto_take,
526 	md_error_t	*ep
527 )
528 {
529 	int		i;
530 	int		rval = 0;
531 	set_t		max_sets;
532 	set_t		setno;
533 	int		bool;
534 	uint_t		sr_flags;
535 	sigset_t	oldsigs;
536 	md_setkey_t	*cl_sk;
537 	int		rb_level = 0;
538 	md_error_t	xep = mdnullerror;
539 	rval_e		sdssc_rval;
540 	int		lock_flag = 0;
541 	int		sig_flag = 0;
542 
543 	if ((max_sets = get_max_sets(ep)) == 0)
544 		return (-1);
545 
546 	/* We must be a member of the set we are creating */
547 	if (! strinlst(mynode(), node_c, node_v))
548 		return (mddserror(ep, MDE_DS_SELFNOTIN,
549 		    sp->setno, mynode(), NULL, sp->setname));
550 
551 	/*
552 	 * If auto_take then we must be the only member of the set
553 	 * that we are creating.
554 	 */
555 	if (auto_take && node_c > 1)
556 		return (mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
557 		    sp->setname));
558 
559 	/*
560 	 * If we're part of SC3.0 we'll already have allocated the
561 	 * set number so we can skip the allocation algorithm used.
562 	 * Set number is unique across traditional and MN disksets.
563 	 */
564 	if ((sdssc_rval = sdssc_get_index(sp->setname, &setno))
565 	    == SDSSC_NOT_BOUND) {
566 
567 		for (i = 0; i < node_c; i++) {
568 			int	has_set;
569 
570 			/* Skip my node */
571 			if (strcmp(mynode(), node_v[i]) == 0)
572 				continue;
573 
574 			/*
575 			 * Make sure this set name is not used on the
576 			 * other hosts
577 			 */
578 			has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
579 			if (has_set < 0) {
580 				if (! mdiserror(ep, MDE_NO_SET)) {
581 					rval = -1;
582 					goto out;
583 				}
584 				mdclrerror(ep);
585 				continue;
586 			}
587 
588 			if (has_set) {
589 				(void) mddserror(ep, MDE_DS_NODEHASSET,
590 				    sp->setno, node_v[i], NULL, sp->setname);
591 				rval = -1;
592 				goto out;
593 			}
594 		}
595 
596 		for (setno = 1; setno < max_sets; setno++) {
597 			for (i = 0; i < node_c; i++) {
598 				if (clnt_setnumbusy(node_v[i], setno,
599 				    &bool, ep) == -1) {
600 					rval = -1;
601 					goto out;
602 				}
603 
604 				if (bool == TRUE)
605 					break;
606 			}
607 			if (i == node_c)
608 				break;
609 		}
610 	} else if (sdssc_rval != SDSSC_OKAY) {
611 		(void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL,
612 		    NULL, sp->setname);
613 		rval = -1;
614 		goto out;
615 	}
616 
617 	if (setno == max_sets) {
618 		(void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL,
619 		    NULL, sp->setname);
620 		rval = -1;
621 		goto out;
622 	}
623 
624 	sp->setno = setno;
625 
626 	/*
627 	 * Lock the set on current set members.
628 	 * Set locking done much earlier for MN diskset than for traditional
629 	 * diskset since lock_set is used to protect against
630 	 * other meta* commands running on the other nodes.
631 	 * Don't issue mdcommd SUSPEND command since there is nothing
632 	 * to suspend since there currently is no set.
633 	 */
634 	if (multi_node) {
635 		/* Make sure we are blocking all signals */
636 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
637 			mdclrerror(&xep);
638 		sig_flag = 1;
639 
640 		/* Lock the set on new set members */
641 		for (i = 0; i < node_c; i++) {
642 			if (clnt_lock_set(node_v[i], sp, ep)) {
643 				rval = -1;
644 				goto out;
645 			}
646 			lock_flag = 1;
647 		}
648 		/* Now have the diskset locked, verify set number is still ok */
649 		for (i = 0; i < node_c; i++) {
650 			if (clnt_setnumbusy(node_v[i], setno,
651 			    &bool, ep) == -1) {
652 				rval = -1;
653 				goto out;
654 			}
655 		}
656 	}
657 
658 
659 	if (meta_set_checkname(sp->setname, ep)) {
660 		rval = -1;
661 		goto out;
662 	}
663 
664 	for (i = 0; i < node_c; i++) {
665 		if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) {
666 			rval = -1;
667 			goto out;
668 		}
669 		if (bool == FALSE) {
670 			(void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
671 			    node_v[i], NULL, sp->setname);
672 			rval = -1;
673 			goto out;
674 		}
675 	}
676 
677 	/* END CHECK CODE */
678 
679 	/* Lock the set on new set members */
680 	if (!multi_node) {
681 		md_rb_sig_handling_on();
682 		sig_flag = 1;
683 		for (i = 0; i < node_c; i++) {
684 			if (clnt_lock_set(node_v[i], sp, ep)) {
685 				rval = -1;
686 				goto out;
687 			}
688 			lock_flag = 1;
689 		}
690 	}
691 
692 	RB_TEST(1, "create_set", ep)
693 
694 	RB_PREEMPT;
695 	rb_level = 1;	/* level 1 */
696 
697 	RB_TEST(2, "create_set", ep)
698 
699 	if ((rval = create_set_on_hosts(sp, multi_node, node_c, node_v,
700 	    1, ep)) == -1)
701 		goto rollback;
702 
703 	RB_TEST(3, "create_set", ep)
704 
705 	if (auto_take)
706 		sr_flags = MD_SR_OK | MD_SR_AUTO_TAKE;
707 	else
708 		sr_flags = MD_SR_OK;
709 
710 	/*
711 	 * Mark the set record MD_SR_OK
712 	 */
713 	for (i = 0; i < node_c; i++)
714 		if (clnt_upd_sr_flags(node_v[i], sp, sr_flags, ep))
715 			goto rollback;
716 
717 	rb_level = 2;	/* level 2 */
718 
719 	/*
720 	 * For MN diskset:
721 	 * On each added node, set the node record for that node
722 	 * to OK.  Then set all node records for the newly added
723 	 * nodes on all nodes to ok.
724 	 *
725 	 * By setting a node's own node record to ok first, even if
726 	 * the node adding the hosts panics, the rest of the nodes can
727 	 * determine the same node list during the choosing of the master
728 	 * during reconfig.  So, only nodes considered for mastership
729 	 * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set
730 	 * on that node's rpc.metad.  If all nodes have MD_SR_OK set,
731 	 * but no node has its own MD_MN_NODE_OK set, then the set will
732 	 * be removed during reconfig since a panic occurred during the
733 	 * creation of the initial diskset.
734 	 */
735 
736 	if (multi_node) {
737 		md_mnnode_desc	*nd, *saved_nd_next;
738 		md_set_desc	*sd;
739 
740 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
741 			goto rollback;
742 		}
743 
744 		for (i = 0; i < node_c; i++) {
745 			nd = sd->sd_nodelist;
746 			/* All nodes are guaranteed to be ALIVE */
747 			while (nd) {
748 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
749 					break;
750 				nd = nd->nd_next;
751 			}
752 			/* Something wrong, will pick this up in next loop */
753 			if (nd == NULL)
754 				continue;
755 
756 			/* Only changing my local cache of node list */
757 			saved_nd_next = nd->nd_next;
758 			nd->nd_next = NULL;
759 
760 			/* Set node record for added host to ok on that host */
761 			if (clnt_upd_nr_flags(node_v[i], sp,
762 			    nd, MD_NR_OK, NULL, ep)) {
763 				nd->nd_next = saved_nd_next;
764 				goto rollback;
765 			}
766 			nd->nd_next = saved_nd_next;
767 		}
768 
769 		/* Now set all node records on all nodes to be ok */
770 		nd = sd->sd_nodelist;
771 		/* All nodes are guaranteed to be ALIVE */
772 		while (nd) {
773 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
774 			    sd->sd_nodelist, MD_NR_OK, NULL, ep)) {
775 				goto rollback;
776 			}
777 			nd = nd->nd_next;
778 		}
779 	}
780 
781 	RB_TEST(4, "create_set", ep)
782 
783 out:
784 	if ((rval == 0) && multi_node) {
785 		/*
786 		 * Set successfully created.
787 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
788 		 * Send reinit command to mdcommd which forces it to get
789 		 * fresh set description.  Then send resume.
790 		 * Resume on class 0 will resume all classes.
791 		 */
792 		for (i = 0; i < node_c; i++) {
793 			/* Class is ignored for REINIT */
794 			if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
795 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
796 				if (rval == 0)
797 					(void) mdstealerror(ep, &xep);
798 				rval = -1;
799 				mde_perror(ep, dgettext(TEXT_DOMAIN,
800 				    "Unable to reinit rpc.mdcommd.\n"));
801 			}
802 		}
803 		for (i = 0; i < node_c; i++) {
804 			if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
805 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
806 				if (rval == 0)
807 					(void) mdstealerror(ep, &xep);
808 				rval = -1;
809 				mde_perror(ep, dgettext(TEXT_DOMAIN,
810 				    "Unable to resume rpc.mdcommd.\n"));
811 			}
812 		}
813 		meta_ping_mnset(sp->setno);
814 	}
815 	if (lock_flag) {
816 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
817 		for (i = 0; i < node_c; i++) {
818 			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
819 				if (rval == 0)
820 					(void) mdstealerror(ep, &xep);
821 				rval = -1;
822 			}
823 		}
824 		cl_set_setkey(NULL);
825 	}
826 
827 	if (sig_flag) {
828 		if (multi_node) {
829 			/* release signals back to what they were on entry */
830 			if (procsigs(FALSE, &oldsigs, &xep) < 0)
831 				mdclrerror(&xep);
832 		} else {
833 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
834 		}
835 	}
836 
837 	return (rval);
838 
839 rollback:
840 	/* all signals already blocked for MN disket */
841 	if (!multi_node) {
842 		/* Make sure we are blocking all signals */
843 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
844 			mdclrerror(&xep);
845 	}
846 
847 	rval = -1;
848 
849 	/*
850 	 * For MN diskset:
851 	 * On each added node (which is now each node to be deleted),
852 	 * set the node record for that node to DEL.  Then set all
853 	 * node records for the newly added (soon to be deleted) nodes
854 	 * on all nodes to ok.
855 	 *
856 	 * By setting a node's own node record to DEL first, even if
857 	 * the node doing the rollback panics, the rest of the nodes can
858 	 * determine the same node list during the choosing of the master
859 	 * during reconfig.
860 	 */
861 
862 	/* level 3 */
863 	if ((rb_level > 1) && (multi_node)) {
864 		md_mnnode_desc	*nd, *saved_nd_next;
865 		md_set_desc	*sd;
866 
867 		if ((sd = metaget_setdesc(sp, &xep)) == NULL) {
868 			mdclrerror(&xep);
869 		}
870 
871 		for (i = 0; i < node_c; i++) {
872 			nd = sd->sd_nodelist;
873 			/* All nodes are guaranteed to be ALIVE */
874 			while (nd) {
875 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
876 					break;
877 				nd = nd->nd_next;
878 			}
879 			/* Something wrong, will pick this up in next loop */
880 			if (nd == NULL)
881 				continue;
882 
883 			/* Only changing my local cache of node list */
884 			saved_nd_next = nd->nd_next;
885 			nd->nd_next = NULL;
886 
887 			/* Set node record for added host to DEL on that host */
888 			if (clnt_upd_nr_flags(node_v[i], sp,
889 			    nd, MD_NR_DEL, NULL, &xep)) {
890 				nd->nd_next = saved_nd_next;
891 				mdclrerror(&xep);
892 			}
893 			nd->nd_next = saved_nd_next;
894 		}
895 
896 		/* Now set all node records on all nodes to be DEL */
897 		nd = sd->sd_nodelist;
898 		/* All nodes are guaranteed to be ALIVE */
899 		while (nd) {
900 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
901 			    sd->sd_nodelist, MD_NR_DEL, NULL, &xep)) {
902 				mdclrerror(&xep);
903 			}
904 			nd = nd->nd_next;
905 		}
906 
907 		/* Mark set record on all hosts to be DELETED */
908 		for (i = 0; i < node_c; i++) {
909 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) {
910 				mdclrerror(&xep);
911 			}
912 		}
913 	}
914 	/* level 1 */
915 	if (rb_level > 0) {
916 		for (i = 0; i < node_c; i++) {
917 			if (clnt_delset(node_v[i], sp, &xep) == -1)
918 				mdclrerror(&xep);
919 		}
920 	}
921 
922 	/* level 0 */
923 	/* Don't test lock flag since guaranteed to be set if in rollback */
924 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
925 	for (i = 0; i < node_c; i++) {
926 		if (clnt_unlock_set(node_v[i], cl_sk, &xep))
927 			mdclrerror(&xep);
928 	}
929 	cl_set_setkey(NULL);
930 
931 	/* release signals back to what they were on entry */
932 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
933 		mdclrerror(&xep);
934 
935 	if ((sig_flag) && (!multi_node))
936 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
937 
938 	return (rval);
939 }
940 
941 static int
del_db_sidenms(mdsetname_t * sp,side_t sideno,md_error_t * ep)942 del_db_sidenms(
943 	mdsetname_t	*sp,
944 	side_t		sideno,
945 	md_error_t	*ep
946 )
947 {
948 	md_replicalist_t	*rlp = NULL;
949 	md_replicalist_t	*rl;
950 	int			rval = 0;
951 
952 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
953 		return (-1);
954 
955 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
956 		md_replica_t	*r = rl->rl_repp;
957 
958 		if (meta_db_delsidenm(sp, sideno, r->r_namep, r->r_blkno, ep)) {
959 			rval = -1;
960 			goto out;
961 		}
962 	}
963 
964 out:
965 	metafreereplicalist(rlp);
966 	return (rval);
967 }
968 
969 static int
del_drvs_from_hosts(mdsetname_t * sp,md_set_desc * sd,md_drive_desc * dd,int node_c,char ** node_v,int oha,md_error_t * ep)970 del_drvs_from_hosts(
971 	mdsetname_t	*sp,
972 	md_set_desc	*sd,
973 	md_drive_desc	*dd,
974 	int		node_c,
975 	char		**node_v,
976 	int		oha,
977 	md_error_t	*ep
978 )
979 {
980 	int 		i;
981 	md_mnnode_desc	*nd;
982 
983 	for (i = 0; i < node_c; i++) {
984 		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
985 			/*
986 			 * During OHA mode, don't issue RPCs to
987 			 * non-alive nodes since there is no reason to
988 			 * wait for RPC timeouts.
989 			 */
990 			nd = sd->sd_nodelist;
991 			while (nd) {
992 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
993 					break;
994 				nd = nd->nd_next;
995 			}
996 			if (nd == NULL) {
997 				return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
998 				    sp->setno, nd->nd_nodename,
999 				    NULL, sp->setname));
1000 			}
1001 
1002 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1003 				continue;
1004 			}
1005 			if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1006 				return (-1);
1007 			}
1008 		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1009 			/*
1010 			 * All nodes should be alive in non-oha mode.
1011 			 */
1012 			if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1013 				return (-1);
1014 			}
1015 		} else {
1016 			/*
1017 			 * For traditional diskset, issue the RPC and
1018 			 * ignore RPC failure if in OHA mode.
1019 			 */
1020 			if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1021 				if (oha == TRUE && mdanyrpcerror(ep)) {
1022 					mdclrerror(ep);
1023 					continue;
1024 				}
1025 				return (-1);
1026 			}
1027 		}
1028 	}
1029 
1030 	return (0);
1031 }
1032 
1033 static int
del_host_noset(mdsetname_t * sp,char ** anode,md_error_t * ep)1034 del_host_noset(
1035 	mdsetname_t	*sp,
1036 	char		**anode,
1037 	md_error_t	*ep
1038 )
1039 {
1040 	int		rval = 0;
1041 	md_setkey_t	*cl_sk;
1042 	md_drive_desc	*dd;
1043 	md_error_t	xep = mdnullerror;
1044 	md_set_desc	*sd;
1045 
1046 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1047 		return (-1);
1048 
1049 	/* Make sure we own the set */
1050 	if (meta_check_ownership(sp, ep) != 0)
1051 		return (-1);
1052 
1053 	/* Lock the set on our side */
1054 	if (clnt_lock_set(mynode(), sp, ep)) {
1055 		rval = -1;
1056 		goto out;
1057 	}
1058 
1059 	if (clnt_delhosts(mynode(), sp, 1, anode, ep)) {
1060 		rval = -1;
1061 		goto out;
1062 	}
1063 
1064 	if (!MD_MNSET_DESC(sd)) {
1065 		if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
1066 		    ep)) == NULL) {
1067 			if (! mdisok(ep)) {
1068 				rval = -1;
1069 				goto out;
1070 			}
1071 		}
1072 
1073 		/* If we have drives */
1074 		if (dd != NULL) {
1075 			if (clnt_del_drv_sidenms(mynode(), sp, ep)) {
1076 				rval = -1;
1077 				goto out;
1078 			}
1079 		}
1080 	}
1081 
1082 out:
1083 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1084 	if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1085 		if (rval == 0)
1086 			(void) mdstealerror(ep, &xep);
1087 		rval = -1;
1088 	}
1089 	cl_set_setkey(NULL);
1090 
1091 	metaflushsetname(sp);
1092 
1093 	return (rval);
1094 }
1095 
1096 static int
del_md_sidenms(mdsetname_t * sp,side_t sideno,md_error_t * ep)1097 del_md_sidenms(mdsetname_t *sp, side_t sideno, md_error_t *ep)
1098 {
1099 	mdnm_params_t		nm;
1100 	md_set_desc		*sd;
1101 	int			i;
1102 
1103 	if (!metaislocalset(sp)) {
1104 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1105 			return (-1);
1106 	}
1107 	/* Use rpc.mdcommd to add md side info from all nodes */
1108 	if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1109 	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1110 		md_mn_result_t			*resultp = NULL;
1111 		md_mn_msg_meta_md_delside_t	md_ds;
1112 		int				send_rval;
1113 
1114 		md_ds.msg_sideno = sideno;
1115 		/*
1116 		 * If reconfig cycle has been started, this node is stuck in
1117 		 * in the return step until this command has completed.  If
1118 		 * mdcommd is suspended, ask send_message to fail (instead of
1119 		 * retrying) so that metaset can finish allowing the
1120 		 * reconfig cycle to proceed.
1121 		 */
1122 		send_rval = mdmn_send_message(sp->setno,
1123 		    MD_MN_MSG_META_MD_DELSIDE,
1124 		    MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
1125 		    0, (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t),
1126 		    &resultp, ep);
1127 		if (send_rval != 0) {
1128 			(void) mdstealerror(ep, &(resultp->mmr_ep));
1129 			if (resultp)
1130 				free_result(resultp);
1131 			return (-1);
1132 		}
1133 		if (resultp)
1134 			free_result(resultp);
1135 	} else {
1136 		(void) memset(&nm, '\0', sizeof (nm));
1137 		nm.key   = MD_KEYWILD;
1138 
1139 		/*CONSTCOND*/
1140 		while (1) {
1141 			nm.mde   = mdnullerror;
1142 			nm.setno = sp->setno;
1143 			nm.side  = MD_SIDEWILD;
1144 			if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0)
1145 				return (mdstealerror(ep, &nm.mde));
1146 
1147 			if (nm.key == MD_KEYWILD)
1148 				return (0);
1149 
1150 			/*
1151 			 * The device reference count can be greater than 1 if
1152 			 * more than one softpart is configured on top of the
1153 			 * same device.  If this is the case then we want to
1154 			 * decrement the count to zero so the entry can be
1155 			 * actually removed.
1156 			 */
1157 			for (i = 0; i < nm.ref_count; i++) {
1158 				if (del_name(sp, sideno, nm.key, ep) == -1)
1159 					return (-1);
1160 			}
1161 		}
1162 	}
1163 	return (0);
1164 }
1165 
1166 static void
recreate_set(mdsetname_t * sp,md_set_desc * sd)1167 recreate_set(
1168 	mdsetname_t		*sp,
1169 	md_set_desc		*sd
1170 )
1171 {
1172 	int			i;
1173 	int			has_set;
1174 	md_error_t		xep = mdnullerror;
1175 	md_mnnode_desc		*nd;
1176 
1177 	if (MD_MNSET_DESC(sd)) {
1178 		nd = sd->sd_nodelist;
1179 		while (nd) {
1180 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1181 				nd = nd->nd_next;
1182 				continue;
1183 			}
1184 			has_set = nodehasset(sp, nd->nd_nodename,
1185 			    NHS_NST_EQ, &xep);
1186 
1187 			if (has_set >= 0) {
1188 				nd = nd->nd_next;
1189 				continue;
1190 			}
1191 
1192 			mdclrerror(&xep);
1193 
1194 			if (clnt_mncreateset(nd->nd_nodename, sp,
1195 			    sd->sd_nodelist,
1196 			    sd->sd_ctime, sd->sd_genid,
1197 			    sd->sd_mn_master_nodenm,
1198 			    sd->sd_mn_master_nodeid, &xep) == -1)
1199 				mdclrerror(&xep);
1200 			nd = nd->nd_next;
1201 		}
1202 	} else {
1203 		for (i = 0; i < MD_MAXSIDES; i++) {
1204 			/* Skip empty slots */
1205 			if (sd->sd_nodes[i][0] == '\0')
1206 				continue;
1207 
1208 			has_set = nodehasset(sp, sd->sd_nodes[i],
1209 			    NHS_NST_EQ, &xep);
1210 
1211 			if (has_set >= 0)
1212 				continue;
1213 
1214 			mdclrerror(&xep);
1215 
1216 			if (clnt_createset(sd->sd_nodes[i], sp, sd->sd_nodes,
1217 			    sd->sd_ctime, sd->sd_genid, &xep) == -1)
1218 				mdclrerror(&xep);
1219 		}
1220 	}
1221 }
1222 
1223 /*
1224  * If a MN diskset, set is already locked on all nodes via clnt_lock_set.
1225  */
1226 static int
del_set_nodrives(mdsetname_t * sp,int node_c,char ** node_v,int oha,md_error_t * ep)1227 del_set_nodrives(
1228 	mdsetname_t		*sp,
1229 	int			node_c,
1230 	char			**node_v,
1231 	int			oha,
1232 	md_error_t		*ep
1233 )
1234 {
1235 	md_set_desc		*sd;
1236 	int			i;
1237 	sigset_t		oldsigs;
1238 	md_setkey_t		*cl_sk;
1239 	int			rb_level = 0;
1240 	ulong_t			max_genid = 0;
1241 	int			rval = 0;
1242 	md_error_t		xep = mdnullerror;
1243 	md_mnnode_desc		*nd;
1244 	int			delete_end = 1;
1245 
1246 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1247 		return (-1);
1248 
1249 	if (MD_MNSET_DESC(sd)) {
1250 		/* Make sure we are blocking all signals */
1251 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1252 			mdclrerror(&xep);
1253 	} else {
1254 		md_rb_sig_handling_on();
1255 	}
1256 
1257 	/*
1258 	 * Lock the set on current set members for traditional disksets.
1259 	 */
1260 	if (!(MD_MNSET_DESC(sd))) {
1261 		for (i = 0; i < node_c; i++) {
1262 			/*
1263 			 * For traditional diskset, issue the RPC and
1264 			 * ignore RPC failure if in OHA mode.
1265 			 */
1266 			if (clnt_lock_set(node_v[i], sp, ep)) {
1267 				if (oha == TRUE && mdanyrpcerror(ep)) {
1268 					mdclrerror(ep);
1269 					continue;
1270 				}
1271 				rval = -1;
1272 				goto out;
1273 			}
1274 		}
1275 	}
1276 
1277 
1278 	RB_TEST(1, "deletehosts", ep)
1279 
1280 	RB_PREEMPT;
1281 	rb_level = 1;	/* level 1 */
1282 
1283 	RB_TEST(2, "deletehosts", ep)
1284 
1285 	/*
1286 	 * Mark the set record MD_SR_DEL
1287 	 */
1288 	for (i = 0; i < node_c; i++) {
1289 
1290 		RB_TEST(3, "deletehosts", ep)
1291 
1292 		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1293 			/*
1294 			 * During OHA mode, don't issue RPCs to
1295 			 * non-alive nodes since there is no reason to
1296 			 * wait for RPC timeouts.
1297 			 */
1298 			nd = sd->sd_nodelist;
1299 			while (nd) {
1300 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1301 					break;
1302 				nd = nd->nd_next;
1303 			}
1304 			if (nd == NULL) {
1305 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1306 				    sp->setno, nd->nd_nodename,
1307 				    NULL, sp->setname);
1308 				goto rollback;
1309 			}
1310 
1311 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1312 				continue;
1313 			}
1314 
1315 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1316 				goto rollback;
1317 			}
1318 		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1319 			/*
1320 			 * All nodes should be alive in non-oha mode.
1321 			 */
1322 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1323 				goto rollback;
1324 			}
1325 		} else {
1326 			/*
1327 			 * For traditional diskset, issue the RPC and
1328 			 * ignore RPC failure if in OHA mode.
1329 			 */
1330 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1331 				if (oha == TRUE && mdanyrpcerror(ep)) {
1332 					mdclrerror(ep);
1333 					continue;
1334 				}
1335 				goto rollback;
1336 			}
1337 		}
1338 
1339 		RB_TEST(4, "deletehosts", ep)
1340 	}
1341 
1342 	RB_TEST(5, "deletehosts", ep)
1343 
1344 	RB_PREEMPT;
1345 	rb_level = 2;	/* level 2 */
1346 
1347 	RB_TEST(6, "deletehosts", ep)
1348 
1349 	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR)
1350 		if (metad_isautotakebyname(sp->setname))
1351 			delete_end = 0;
1352 		else
1353 			goto rollback;
1354 
1355 	/* The set is OK to delete, make it so. */
1356 	for (i = 0; i < node_c; i++) {
1357 
1358 		RB_TEST(7, "deletehosts", ep)
1359 
1360 		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1361 			/*
1362 			 * During OHA mode, don't issue RPCs to
1363 			 * non-alive nodes since there is no reason to
1364 			 * wait for RPC timeouts.
1365 			 */
1366 			nd = sd->sd_nodelist;
1367 			while (nd) {
1368 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1369 					break;
1370 				nd = nd->nd_next;
1371 			}
1372 			if (nd == NULL) {
1373 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1374 				    sp->setno, nd->nd_nodename,
1375 				    NULL, sp->setname);
1376 				goto rollback;
1377 			}
1378 
1379 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1380 				continue;
1381 			}
1382 
1383 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1384 				goto rollback;
1385 			}
1386 		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1387 			/*
1388 			 * All nodes should be alive in non-oha mode.
1389 			 */
1390 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1391 				goto rollback;
1392 			}
1393 		} else {
1394 			/*
1395 			 * For traditional diskset, issue the RPC and
1396 			 * ignore RPC failure if in OHA mode.
1397 			 */
1398 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1399 				if (oha == TRUE && mdanyrpcerror(ep)) {
1400 					mdclrerror(ep);
1401 					continue;
1402 				}
1403 				goto rollback;
1404 			}
1405 		}
1406 
1407 		RB_TEST(8, "deletehosts", ep)
1408 	}
1409 
1410 	RB_TEST(9, "deletehosts", ep)
1411 
1412 out:
1413 	/*
1414 	 * Unlock the set on current set members
1415 	 * for traditional disksets.
1416 	 */
1417 	if (!(MD_MNSET_DESC(sd))) {
1418 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1419 		for (i = 0; i < node_c; i++) {
1420 			/*
1421 			 * For traditional diskset, issue the RPC and
1422 			 * ignore RPC failure if in OHA mode.
1423 			 */
1424 			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
1425 				if (oha == TRUE && mdanyrpcerror(&xep)) {
1426 					mdclrerror(&xep);
1427 					continue;
1428 				}
1429 				if (rval == 0)
1430 					(void) mdstealerror(ep, &xep);
1431 				rval = -1;
1432 			}
1433 		}
1434 		cl_set_setkey(NULL);
1435 	}
1436 
1437 	/*
1438 	 * A MN diskset has the clnt_locks held by meta_set_deletehosts so
1439 	 * don't flush that data until meta_set_deletehosts has finished
1440 	 * with it.  meta_set_deletehosts will handle the flush of the
1441 	 * setname.
1442 	 */
1443 	if (!(MD_MNSET_DESC(sd))) {
1444 		metaflushsetname(sp);
1445 	}
1446 
1447 	if (delete_end &&
1448 	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1449 		rval = -1;
1450 
1451 	if (MD_MNSET_DESC(sd)) {
1452 		/* release signals back to what they were on entry */
1453 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
1454 			mdclrerror(&xep);
1455 	} else {
1456 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1457 	}
1458 
1459 	return (rval);
1460 
1461 rollback:
1462 	/* all signals already blocked for MN disket */
1463 	if (!(MD_MNSET_DESC(sd))) {
1464 		/* Make sure we are blocking all signals */
1465 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1466 			mdclrerror(&xep);
1467 	}
1468 
1469 	rval = -1;
1470 
1471 	max_genid = sd->sd_genid;
1472 
1473 	/* level 2 */
1474 	if (rb_level > 1) {
1475 		recreate_set(sp, sd);
1476 		max_genid++;
1477 
1478 		if (delete_end)
1479 			(void) sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
1480 	}
1481 
1482 	/* level 1 */
1483 	if (rb_level > 0) {
1484 		max_genid++;
1485 		resync_genid(sp, sd, max_genid, node_c, node_v);
1486 	}
1487 
1488 	/* level 0 */
1489 	/*
1490 	 * Unlock the set on current set members
1491 	 * for traditional disksets.
1492 	 */
1493 	if (!(MD_MNSET_DESC(sd))) {
1494 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1495 		for (i = 0; i < node_c; i++) {
1496 			/*
1497 			 * For traditional diskset, issue the RPC and
1498 			 * ignore RPC failure if in OHA mode.
1499 			 */
1500 			if (clnt_unlock_set(node_v[i], cl_sk, &xep))
1501 				mdclrerror(&xep);
1502 		}
1503 		cl_set_setkey(NULL);
1504 	}
1505 
1506 	/* release signals back to what they were on entry */
1507 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1508 		mdclrerror(&xep);
1509 
1510 	/*
1511 	 * A MN diskset has the clnt_locks held by meta_set_deletehosts so
1512 	 * don't flush that data until meta_set_deletehosts has finished
1513 	 * with it.  meta_set_deletehosts will handle the flush of the
1514 	 * setname.
1515 	 */
1516 	if (!(MD_MNSET_DESC(sd))) {
1517 		metaflushsetname(sp);
1518 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1519 	}
1520 
1521 	return (rval);
1522 }
1523 
1524 /*
1525  * On entry:
1526  *   procsigs already called for MN diskset.
1527  *   md_rb_sig_handling already called for traditional diskset.
1528  */
1529 static int
del_set_on_hosts(mdsetname_t * sp,md_set_desc * sd,md_drive_desc * dd,int node_c,char ** node_v,int oha,md_error_t * ep)1530 del_set_on_hosts(
1531 	mdsetname_t		*sp,
1532 	md_set_desc		*sd,
1533 	md_drive_desc		*dd,
1534 	int			node_c,		/* Number of nodes */
1535 	char			**node_v,	/* Nodes being deleted */
1536 	int			oha,
1537 	md_error_t		*ep
1538 )
1539 {
1540 	int			i;
1541 	int			j;
1542 	side_t			sideno;
1543 	md_replicalist_t	*rlp = NULL;
1544 	sigset_t		oldsigs;
1545 	md_setkey_t		*cl_sk;
1546 	ulong_t			max_genid = 0;
1547 	int			rb_level = 1;	/* This is a special case */
1548 	md_error_t		xep = mdnullerror;
1549 	md_mnnode_desc		*nd;
1550 
1551 	RB_PREEMPT;
1552 
1553 	RB_TEST(7, "deletehosts", ep)
1554 
1555 	if (dd != NULL) {
1556 		/*
1557 		 * May need this to re-add sidenames on roll back.
1558 		 */
1559 		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
1560 		    ep) < 0)
1561 			goto rollback;
1562 
1563 		RB_TEST(8, "deletehosts", ep)
1564 
1565 		RB_PREEMPT;
1566 		rb_level = 2;	/* level 2 */
1567 
1568 		RB_TEST(9, "deletehosts", ep)
1569 
1570 		if (del_drvs_from_hosts(sp, sd, dd, node_c, node_v, oha, ep))
1571 			goto rollback;
1572 
1573 		RB_TEST(10, "deletehosts", ep)
1574 
1575 		RB_PREEMPT;
1576 		rb_level = 3;	/* level 3 */
1577 
1578 		RB_TEST(11, "deletehosts", ep)
1579 
1580 		/*
1581 		 * Delete the db replica sides
1582 		 * This is done before the next loop, so that
1583 		 * the db does not get unloaded before we are finished
1584 		 * deleting the sides.
1585 		 */
1586 		if (MD_MNSET_DESC(sd)) {
1587 			nd = sd->sd_nodelist;
1588 			while (nd) {
1589 				/* Skip hosts not being deleted */
1590 				if (! strinlst(nd->nd_nodename, node_c,
1591 				    node_v)) {
1592 					nd = nd->nd_next;
1593 					continue;
1594 				}
1595 
1596 				if (del_db_sidenms(sp, nd->nd_nodeid, ep))
1597 					goto rollback;
1598 
1599 				RB_TEST(12, "deletehosts", ep)
1600 				nd = nd->nd_next;
1601 			}
1602 		} else {
1603 			for (sideno = 0; sideno < MD_MAXSIDES; sideno++) {
1604 				/* Skip empty slots */
1605 				if (sd->sd_nodes[sideno][0] == '\0')
1606 					continue;
1607 
1608 				/* Skip hosts not being deleted */
1609 				if (! strinlst(sd->sd_nodes[sideno], node_c,
1610 				    node_v))
1611 					continue;
1612 
1613 				if (del_db_sidenms(sp, sideno, ep))
1614 					goto rollback;
1615 
1616 				RB_TEST(12, "deletehosts", ep)
1617 			}
1618 		}
1619 
1620 		RB_TEST(13, "deletehosts", ep)
1621 
1622 		RB_PREEMPT;
1623 		rb_level = 4;	/* level 4 */
1624 
1625 		RB_TEST(14, "deletehosts", ep)
1626 
1627 		/* Delete the names from the namespace */
1628 		if (MD_MNSET_DESC(sd)) {
1629 			nd = sd->sd_nodelist;
1630 			while (nd) {
1631 				/* Skip hosts not being deleted */
1632 				if (! strinlst(nd->nd_nodename, node_c,
1633 				    node_v)) {
1634 					nd = nd->nd_next;
1635 					continue;
1636 				}
1637 
1638 				if (del_md_sidenms(sp, nd->nd_nodeid, ep))
1639 					goto rollback;
1640 
1641 				RB_TEST(15, "deletehosts", ep)
1642 				nd = nd->nd_next;
1643 			}
1644 		} else {
1645 			for (sideno = 0; sideno < MD_MAXSIDES; sideno++) {
1646 				/* Skip empty slots */
1647 				if (sd->sd_nodes[sideno][0] == '\0')
1648 					continue;
1649 
1650 				/* Skip hosts not being deleted */
1651 				if (! strinlst(sd->sd_nodes[sideno], node_c,
1652 				    node_v))
1653 					continue;
1654 
1655 				if (del_md_sidenms(sp, sideno, ep))
1656 					goto rollback;
1657 
1658 				RB_TEST(15, "deletehosts", ep)
1659 			}
1660 		}
1661 	}
1662 
1663 	RB_TEST(16, "deletehosts", ep)
1664 
1665 	RB_PREEMPT;
1666 	rb_level = 5;	/* level 6 */
1667 
1668 	RB_TEST(17, "deletehosts", ep)
1669 
1670 	for (i = 0; i < node_c; i++) {
1671 		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1672 			/*
1673 			 * During OHA mode, don't issue RPCs to
1674 			 * non-alive nodes since there is no reason to
1675 			 * wait for RPC timeouts.
1676 			 */
1677 			nd = sd->sd_nodelist;
1678 			while (nd) {
1679 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1680 					break;
1681 				nd = nd->nd_next;
1682 			}
1683 			if (nd == NULL) {
1684 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1685 				    sp->setno, nd->nd_nodename,
1686 				    NULL, sp->setname);
1687 				goto rollback;
1688 			}
1689 
1690 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1691 				continue;
1692 			}
1693 
1694 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1695 				goto rollback;
1696 			}
1697 		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1698 			/*
1699 			 * All nodes should be alive in non-oha mode.
1700 			 */
1701 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1702 				goto rollback;
1703 			}
1704 		} else {
1705 			/*
1706 			 * For traditional diskset, issue the RPC and
1707 			 * ignore RPC failure if in OHA mode.
1708 			 */
1709 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1710 				if (oha == TRUE && mdanyrpcerror(ep)) {
1711 					mdclrerror(ep);
1712 					continue;
1713 				}
1714 				goto rollback;
1715 			}
1716 		}
1717 
1718 		RB_TEST(18, "deletehosts", ep)
1719 	}
1720 
1721 	metafreereplicalist(rlp);
1722 
1723 	if (MD_MNSET_DESC(sd)) {
1724 		/* release signals back to what they were on entry */
1725 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
1726 			mdclrerror(&xep);
1727 	} else {
1728 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1729 	}
1730 
1731 	return (0);
1732 
1733 rollback:
1734 	/* all signals already blocked for MN disket */
1735 	if (!(MD_MNSET_DESC(sd))) {
1736 		/* Make sure we are blocking all signals */
1737 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1738 			mdclrerror(&xep);
1739 	}
1740 
1741 	max_genid = sd->sd_genid;
1742 
1743 	/* level 5 */
1744 	if (rb_level > 4) {
1745 		recreate_set(sp, sd);
1746 		max_genid++;
1747 	}
1748 
1749 	/* level 2 */
1750 	if (rb_level > 1 && dd != NULL) {
1751 		/*
1752 		 * See if we have to re-add the drives specified.
1753 		 */
1754 		for (i = 0; i < node_c; i++) {
1755 			md_set_record	*sr;
1756 
1757 			if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1758 				/*
1759 				 * During OHA mode, don't issue RPCs to
1760 				 * non-alive nodes since there is no reason to
1761 				 * wait for RPC timeouts.
1762 				 */
1763 				nd = sd->sd_nodelist;
1764 				while (nd) {
1765 					if (strcmp(nd->nd_nodename, node_v[i])
1766 					    == 0)
1767 						break;
1768 					nd = nd->nd_next;
1769 				}
1770 				if (nd == NULL)
1771 					continue;
1772 
1773 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
1774 					continue;
1775 			}
1776 
1777 			/* Don't care if set record is MN or not */
1778 			if (clnt_getset(node_v[i], sp->setname,
1779 			    MD_SET_BAD, &sr, &xep) == -1) {
1780 				mdclrerror(&xep);
1781 				continue;
1782 			}
1783 
1784 			/* Drive already added, skip to next node */
1785 			if (sr->sr_drivechain != NULL) {
1786 				/*
1787 				 * Set record structure was allocated from RPC
1788 				 * routine getset so this structure is only of
1789 				 * size md_set_record even if the MN flag is
1790 				 * set.  So, clear the flag so that the free
1791 				 * code doesn't attempt to free a structure
1792 				 * the size of md_mnset_record.
1793 				 */
1794 				sr->sr_flags &= ~MD_SR_MN;
1795 				free_sr(sr);
1796 				continue;
1797 			}
1798 
1799 			if (clnt_adddrvs(node_v[i], sp, dd,
1800 			    sr->sr_ctime, sr->sr_genid, &xep) == -1)
1801 				mdclrerror(&xep);
1802 
1803 			if (clnt_upd_dr_flags(node_v[i], sp, dd,
1804 			    MD_DR_OK, &xep) == -1)
1805 				mdclrerror(&xep);
1806 
1807 			/*
1808 			 * Set record structure was allocated from RPC routine
1809 			 * getset so this structure is only of size
1810 			 * md_set_record even if the MN flag is set.  So,
1811 			 * clear the flag so that the free code doesn't
1812 			 * attempt to free a structure the size of
1813 			 * md_mnset_record.
1814 			 */
1815 			sr->sr_flags &= ~MD_SR_MN;
1816 			free_sr(sr);
1817 		}
1818 		max_genid += 3;
1819 	}
1820 
1821 	/* level 3 */
1822 	if (rb_level > 2 && dd != NULL) {
1823 		md_replicalist_t	*rl;
1824 
1825 		for (rl = rlp; rl != NULL; rl = rl->rl_next) {
1826 			md_replica_t	*r = rl->rl_repp;
1827 
1828 			/*
1829 			 * This is not the first replica being added to the
1830 			 * diskset so call with ADDSIDENMS_BCAST.  If this
1831 			 * is a traditional diskset, the bcast flag is ignored
1832 			 * since traditional disksets don't use the rpc.mdcommd.
1833 			 */
1834 			if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
1835 			    DB_ADDSIDENMS_BCAST, &xep))
1836 				mdclrerror(&xep);
1837 		}
1838 	}
1839 
1840 	/* level 4 */
1841 	if (rb_level > 3 && dd != NULL) {
1842 		int	nodeid_addsides = 0;
1843 		/*
1844 		 * Add the device names for the new sides into the namespace,
1845 		 * on all hosts not being deleted.
1846 		 */
1847 		if (MD_MNSET_DESC(sd)) {
1848 			nd = sd->sd_nodelist;
1849 			while (nd) {
1850 				/* Find a node that is not being deleted */
1851 				if (! strinlst(nd->nd_nodename, node_c,
1852 				    node_v)) {
1853 					nodeid_addsides = nd->nd_nodeid;
1854 					break;
1855 				}
1856 				nd = nd->nd_next;
1857 			}
1858 		} else {
1859 			for (j = 0; j < MD_MAXSIDES; j++) {
1860 				/* Skip empty slots */
1861 				if (sd->sd_nodes[j][0] == '\0')
1862 					continue;
1863 
1864 				/* Find a node that is not being deleted */
1865 				if (! strinlst(sd->sd_nodes[j], node_c,
1866 				    node_v))
1867 					break;
1868 			}
1869 			nodeid_addsides = j;
1870 		}
1871 
1872 		if (MD_MNSET_DESC(sd)) {
1873 			nd = sd->sd_nodelist;
1874 			while (nd) {
1875 				/* Skip nodes not being deleted */
1876 				if (!strinlst(nd->nd_nodename, node_c,
1877 				    node_v)) {
1878 					nd = nd->nd_next;
1879 					continue;
1880 				}
1881 
1882 				/* this side was just created, add the names */
1883 				if (add_md_sidenms(sp, nd->nd_nodeid,
1884 				    nodeid_addsides, &xep))
1885 					mdclrerror(&xep);
1886 				nd = nd->nd_next;
1887 			}
1888 		} else {
1889 			for (i = 0; i < MD_MAXSIDES; i++) {
1890 				/* Skip empty slots */
1891 				if (sd->sd_nodes[i][0] == '\0')
1892 					continue;
1893 
1894 				/* Skip nodes not being deleted */
1895 				if (!strinlst(sd->sd_nodes[i], node_c, node_v))
1896 					continue;
1897 
1898 				/* this side was just created, add the names */
1899 				if (add_md_sidenms(sp, i, nodeid_addsides,
1900 				    &xep))
1901 					mdclrerror(&xep);
1902 			}
1903 		}
1904 	}
1905 
1906 	/* level 1 */
1907 	if (rb_level > 0) {
1908 		max_genid++;
1909 		resync_genid(sp, sd, max_genid, node_c, node_v);
1910 	}
1911 
1912 	/* level 0 */
1913 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1914 	if (MD_MNSET_DESC(sd)) {
1915 		nd = sd->sd_nodelist;
1916 		while (nd) {
1917 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
1918 				continue;
1919 			/* To balance lock/unlock; can send to dead node */
1920 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
1921 				mdclrerror(&xep);
1922 			nd = nd->nd_next;
1923 		}
1924 	} else {
1925 		for (i = 0; i < MD_MAXSIDES; i++) {
1926 			/* Skip empty slots */
1927 			if (sd->sd_nodes[i][0] == '\0')
1928 				continue;
1929 
1930 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
1931 				mdclrerror(&xep);
1932 		}
1933 	}
1934 	cl_set_setkey(NULL);
1935 
1936 	/* release signals back to what they were on entry */
1937 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1938 		mdclrerror(&xep);
1939 
1940 	metafreereplicalist(rlp);
1941 
1942 	if (!(MD_MNSET_DESC(sd))) {
1943 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1944 	}
1945 
1946 	return (-1);
1947 }
1948 
1949 static int
make_sideno_sidenm(mdsetname_t * sp,mddrivename_t * dnp,side_t sideno,md_error_t * ep)1950 make_sideno_sidenm(
1951 	mdsetname_t	*sp,
1952 	mddrivename_t	*dnp,
1953 	side_t		sideno,
1954 	md_error_t	*ep
1955 )
1956 {
1957 	mdsidenames_t	*sn, **sn_next;
1958 	md_set_desc	*sd;
1959 	mdname_t	*np;
1960 	uint_t		rep_slice;
1961 	int		err = 0;
1962 
1963 	assert(dnp->side_names_key != MD_KEYWILD);
1964 
1965 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1966 		return (-1);
1967 
1968 	/* find the end of the link list */
1969 	for (sn = dnp->side_names; sn->next != NULL; sn = sn->next)
1970 		;
1971 	sn_next = &sn->next;
1972 
1973 	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
1974 		return (-1);
1975 
1976 	if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
1977 		return (-1);
1978 
1979 	sn = Zalloc(sizeof (*sn));
1980 	sn->sideno = sideno;
1981 
1982 	if (MD_MNSET_DESC(sd)) {
1983 		/*
1984 		 * For MO diskset the sideno is not an index into
1985 		 * the array of nodes.  Hence getside_devinfo is
1986 		 * used instead of meta_getnextside_devinfo.
1987 		 */
1988 		if (meta_getside_devinfo(sp, np->bname, sideno, &sn->cname,
1989 		    &sn->dname, &sn->mnum, ep) == -1)
1990 			err = -1;
1991 	} else {
1992 		/* decrement sideno, to look like the previous sideno */
1993 		sideno--;
1994 		if (meta_getnextside_devinfo(sp, np->bname, &sideno,
1995 		    &sn->cname, &sn->dname, &sn->mnum, ep) == -1)
1996 			err = -1;
1997 	}
1998 
1999 	if (err) {
2000 		Free(sn);
2001 		return (err);
2002 	}
2003 	assert(sn->sideno == sideno);
2004 
2005 	/* Add to the end of the linked list */
2006 	*sn_next = sn;
2007 	return (0);
2008 }
2009 
2010 static int
validate_nodes(mdsetname_t * sp,int node_c,char ** node_v,md_error_t * ep)2011 validate_nodes(
2012 	mdsetname_t	*sp,
2013 	int		node_c,
2014 	char		**node_v,
2015 	md_error_t	*ep
2016 )
2017 {
2018 	char		*hostname;
2019 	int		i;
2020 
2021 
2022 	for (i = 0; i < node_c; i++) {
2023 		if (strlen(node_v[i]) > (size_t)MD_MAX_NODENAME)
2024 			return (mddserror(ep, MDE_DS_NODENAMETOOLONG,
2025 			    sp->setno, node_v[i], NULL, sp->setname));
2026 		if (clnt_hostname(node_v[i], &hostname, ep))
2027 			return (-1);
2028 		if (strcmp(node_v[i], hostname) != 0) {
2029 			Free(hostname);
2030 			return (mddserror(ep, MDE_DS_NOTNODENAME, sp->setno,
2031 			    node_v[i], NULL, sp->setname));
2032 		}
2033 		Free(hostname);
2034 	}
2035 	return (0);
2036 }
2037 
2038 /*
2039  * Exported Entry Points
2040  */
2041 
2042 /*
2043  * Check the given disk set name for syntactic correctness.
2044  */
2045 int
meta_set_checkname(char * setname,md_error_t * ep)2046 meta_set_checkname(char *setname, md_error_t *ep)
2047 {
2048 	char	*cp;
2049 
2050 	if (strlen(setname) > (size_t)MD_MAX_SETNAME)
2051 		return (mddserror(ep, MDE_DS_SETNAMETOOLONG,
2052 		    MD_SET_BAD, NULL, NULL, setname));
2053 
2054 	for (cp = setname; *cp; cp++)
2055 		if (!isprint(*cp) || strchr(INVALID_IN_NAMES, *cp) != NULL)
2056 			return (mddserror(ep, MDE_DS_INVALIDSETNAME,
2057 			    MD_SET_BAD, NULL, NULL, setname));
2058 	return (0);
2059 }
2060 
2061 /*
2062  * Add host(s) to the multi-node diskset provided in sp.
2063  * 	- create set if non-existent.
2064  */
2065 static int
meta_multinode_set_addhosts(mdsetname_t * sp,int multi_node,int node_c,char ** node_v,int auto_take,md_error_t * ep)2066 meta_multinode_set_addhosts(
2067 	mdsetname_t	*sp,
2068 	int		multi_node,
2069 	int		node_c,
2070 	char		**node_v,
2071 	int		auto_take,
2072 	md_error_t	*ep
2073 )
2074 {
2075 	md_set_desc			*sd;
2076 	md_drive_desc			*dd, *p;
2077 	int				rval = 0;
2078 	int				bool;
2079 	int				nodeindex;
2080 	int 				i;
2081 	int				has_set;
2082 	sigset_t			oldsigs;
2083 	md_setkey_t			*cl_sk;
2084 	int				rb_level = 0;
2085 	md_error_t			xep = mdnullerror;
2086 	md_mnnode_desc			*nd, *nd_curr, *nd_prev;
2087 	md_timeval32_t			now;
2088 	int				nodecnt;
2089 	mndiskset_membershiplist_t	*nl, *nl2;
2090 	int				suspendall_flag = 0;
2091 	int				suspend1_flag = 0;
2092 	int				lock_flag = 0;
2093 	int				stale_flag = 0;
2094 	md_mnnode_desc			*saved_nd_next;
2095 	int				remote_sets_created = 0;
2096 
2097 	/*
2098 	 * Check membershiplist first.  If there's
2099 	 * an error, fail to create set and pass back error.
2100 	 */
2101 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
2102 		return (-1);
2103 	}
2104 	/* Verify that all nodes are in member list */
2105 	for (i = 0; i < node_c; i++) {
2106 		/*
2107 		 * If node in list isn't a member of the membership,
2108 		 * just return error.
2109 		 */
2110 		if (meta_is_member(node_v[i], NULL, nl) == 0) {
2111 			meta_free_nodelist(nl);
2112 			return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
2113 			    sp->setno, node_v[i], NULL, sp->setname));
2114 		}
2115 	}
2116 	/*
2117 	 * Node list is needed later, but there is a lot of error
2118 	 * checking and possible failures between here and there, so
2119 	 * just re-get the list later if there are no errors.
2120 	 */
2121 	meta_free_nodelist(nl);
2122 	nl = NULL;
2123 
2124 	/*
2125 	 * Verify that list of nodes being added contains no
2126 	 * duplicates.
2127 	 */
2128 	if (nodesuniq(sp, node_c, node_v, ep))
2129 		return (-1);
2130 
2131 	/*
2132 	 * Verify that each node being added thinks that its nodename
2133 	 * is the same as the nodename given.
2134 	 */
2135 	if (validate_nodes(sp, node_c, node_v, ep))
2136 		return (-1);
2137 
2138 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2139 		if (! mdiserror(ep, MDE_NO_SET))
2140 			return (-1);
2141 		mdclrerror(ep);
2142 		return (create_set(sp, multi_node, node_c, node_v, auto_take,
2143 		    ep));
2144 	} else {
2145 		/*
2146 		 * If this node and another node were both attempting to
2147 		 * create the same setname at the same time, and the other
2148 		 * node has just created the set on this node then sd would
2149 		 * be non-NULL, but sp->setno would be null (setno is filled
2150 		 * in by the create_set). If this is true, then fail since
2151 		 * the other node has already won this race.
2152 		 */
2153 		if (sp->setno == NULL) {
2154 			return (mddserror(ep, MDE_DS_NODEINSET,
2155 			    NULL, mynode(), NULL, sp->setname));
2156 		}
2157 	}
2158 
2159 	/* The auto_take behavior is inconsistent with multiple hosts. */
2160 	if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) {
2161 		(void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
2162 		    sp->setname);
2163 		return (-1);
2164 	}
2165 
2166 	/*
2167 	 * We already have the set.
2168 	 */
2169 
2170 	/* Make sure we own the set */
2171 	if (meta_check_ownership(sp, ep) != 0)
2172 		return (-1);
2173 
2174 	/*
2175 	 * The drive and node records are stored in the local mddbs of each
2176 	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
2177 	 * drive and node records from that node's local mddb and caches them
2178 	 * internally. Any process needing diskset information contacts its
2179 	 * local rpc.metad to get this information.  Since each node in the
2180 	 * diskset is independently reading the set information from its local
2181 	 * mddb, the set, drive and node records in the local mddbs must stay
2182 	 * in-sync, so that all nodes have a consistent view of the diskset.
2183 	 *
2184 	 * For a multinode diskset, explicitly verify that all nodes in the
2185 	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
2186 	 * fail this operation since all nodes must be ALIVE in order to add
2187 	 * the new node record to their local mddb.  If a panic of this node
2188 	 * leaves the local mddbs set, node and drive records out-of-sync, the
2189 	 * reconfig cycle will fix the local mddbs and force them back into
2190 	 * synchronization.
2191 	 */
2192 	nd = sd->sd_nodelist;
2193 	while (nd) {
2194 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2195 			return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
2196 			    sp->setno, nd->nd_nodename, NULL,
2197 			    sp->setname));
2198 		}
2199 		nd = nd->nd_next;
2200 	}
2201 
2202 	/*
2203 	 * Check if node is already in set.
2204 	 */
2205 	for (i = 0; i < node_c; i++) {
2206 		/* Is node already in set? */
2207 		nd = sd->sd_nodelist;
2208 		while (nd) {
2209 			if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2210 				break;
2211 			nd = nd->nd_next;
2212 		}
2213 		if (nd) {
2214 			return (mddserror(ep, MDE_DS_NODEINSET,
2215 			    sp->setno, node_v[i], NULL,
2216 			    sp->setname));
2217 		}
2218 	}
2219 
2220 	/*
2221 	 * Lock the set on current set members.
2222 	 * Set locking done much earlier for MN diskset than for traditional
2223 	 * diskset since lock_set and SUSPEND are used to protect against
2224 	 * other meta* commands running on the other nodes.
2225 	 */
2226 	/* Make sure we are blocking all signals */
2227 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2228 		mdclrerror(&xep);
2229 
2230 	nd = sd->sd_nodelist;
2231 	/* All nodes are guaranteed to be ALIVE */
2232 	while (nd) {
2233 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2234 			rval = -1;
2235 			goto out;
2236 		}
2237 		lock_flag = 1;
2238 		nd = nd->nd_next;
2239 	}
2240 	/*
2241 	 * Lock out other meta* commands by suspending
2242 	 * class 1 messages across the diskset.
2243 	 */
2244 	nd = sd->sd_nodelist;
2245 	/* Send suspend to nodes in nodelist before addhosts call */
2246 	/* All nodes are guaranteed to be ALIVE */
2247 	while (nd) {
2248 		if (clnt_mdcommdctl(nd->nd_nodename,
2249 		    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
2250 		    MD_MSCF_NO_FLAGS, ep)) {
2251 			rval = -1;
2252 			goto out;
2253 		}
2254 		suspend1_flag = 1;
2255 		nd = nd->nd_next;
2256 	}
2257 
2258 	/* Lock the set on new set members */
2259 	for (i = 0; i < node_c; i++) {
2260 		/* Already verified to be alive */
2261 		if (clnt_lock_set(node_v[i], sp, ep)) {
2262 			rval = -1;
2263 			goto out;
2264 		}
2265 		lock_flag = 1;
2266 	}
2267 
2268 	/*
2269 	 * Perform the required checks for new hosts
2270 	 */
2271 	for (i = 0; i < node_c; i++) {
2272 		/* Make sure this set name is not used on the other hosts */
2273 		has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
2274 		if (has_set < 0) {
2275 			if (! mdiserror(ep, MDE_NO_SET)) {
2276 				rval = -1;
2277 				goto out;
2278 			}
2279 			/* Keep on truck'n */
2280 			mdclrerror(ep);
2281 		} else if (has_set) {
2282 			(void) mddserror(ep, MDE_DS_NODEHASSET, sp->setno,
2283 			    node_v[i], NULL, sp->setname);
2284 			rval = -1;
2285 			goto out;
2286 		}
2287 
2288 		if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1) {
2289 			rval = -1;
2290 			goto out;
2291 		}
2292 
2293 		if (bool == TRUE) {
2294 			(void) mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno,
2295 			    node_v[i], NULL, sp->setname);
2296 			rval = -1;
2297 			goto out;
2298 		}
2299 
2300 		if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) {
2301 			rval = -1;
2302 			goto out;
2303 		}
2304 
2305 		if (bool == FALSE) {
2306 			(void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
2307 			    node_v[i], NULL, sp->setname);
2308 			rval = -1;
2309 			goto out;
2310 		}
2311 
2312 		if (check_setdrvs_againstnode(sp, node_v[i], ep)) {
2313 			rval = -1;
2314 			goto out;
2315 		}
2316 	}
2317 
2318 	/* Get drive descriptors for the set */
2319 	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) {
2320 		if (! mdisok(ep)) {
2321 			rval = -1;
2322 			goto out;
2323 		}
2324 	}
2325 
2326 	/* END CHECK CODE */
2327 
2328 	RB_TEST(1, "addhosts", ep)
2329 
2330 	RB_PREEMPT;
2331 	rb_level = 1;	/* level 1 */
2332 
2333 	RB_TEST(2, "addhosts", ep)
2334 
2335 	/*
2336 	 * Create the set where needed
2337 	 */
2338 	if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) {
2339 		goto rollback;
2340 	}
2341 
2342 	/*
2343 	 * Send suspend to rpc.mdcommd on nodes where a set has been
2344 	 * created since rpc.mdcommd must now be running on the remote nodes.
2345 	 */
2346 	remote_sets_created = 1;
2347 	for (i = 0; i < node_c; i++) {
2348 		/*
2349 		 * Lock out other meta* commands by suspending
2350 		 * class 1 messages across the diskset.
2351 		 */
2352 		if (clnt_mdcommdctl(node_v[i],
2353 		    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
2354 		    MD_MSCF_NO_FLAGS, ep)) {
2355 			rval = -1;
2356 			goto rollback;
2357 		}
2358 	}
2359 
2360 	/*
2361 	 * Merge the new entries into the set with the existing sides.
2362 	 * Get membershiplist from API routine.  If there's
2363 	 * an error, fail to create set and pass back error.
2364 	 */
2365 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
2366 		goto rollback;
2367 	}
2368 	if (meta_gettimeofday(&now) == -1) {
2369 		meta_free_nodelist(nl);
2370 		(void) mdsyserror(ep, errno,
2371 		    dgettext(TEXT_DOMAIN, "meta_gettimeofday()"));
2372 		goto rollback;
2373 	}
2374 	for (nodeindex = 0; nodeindex < node_c; nodeindex++) {
2375 		nd = Zalloc(sizeof (*nd));
2376 		(void) strcpy(nd->nd_nodename, node_v[nodeindex]);
2377 		nd->nd_ctime = now;
2378 		nl2 = nl;
2379 		while (nl2) {
2380 			if (strcmp(nl2->msl_node_name,
2381 			    node_v[nodeindex]) == 0) {
2382 				nd->nd_nodeid = nl2->msl_node_id;
2383 				(void) strcpy(nd->nd_priv_ic,
2384 				    nl2->msl_node_addr);
2385 				break;
2386 			}
2387 			nl2 = nl2->next;
2388 		}
2389 
2390 		/*
2391 		 * Nodelist must be kept in ascending nodeid order.
2392 		 */
2393 		if (sd->sd_nodelist == NULL) {
2394 			/* Nothing in list, just add it */
2395 			sd->sd_nodelist = nd;
2396 		} else if (nd->nd_nodeid <
2397 		    sd->sd_nodelist->nd_nodeid) {
2398 			/* Add to head of list */
2399 			nd->nd_next = sd->sd_nodelist;
2400 			sd->sd_nodelist = nd;
2401 		} else {
2402 			nd_curr = sd->sd_nodelist->nd_next;
2403 			nd_prev = sd->sd_nodelist;
2404 			/* Search for place to add it */
2405 			while (nd_curr) {
2406 				if (nd->nd_nodeid < nd_curr->nd_nodeid) {
2407 					/* Add before nd_curr */
2408 					nd->nd_next = nd_curr;
2409 					nd_prev->nd_next = nd;
2410 					break;
2411 				}
2412 				nd_prev = nd_curr;
2413 				nd_curr = nd_curr->nd_next;
2414 			}
2415 			/* Add to end of list */
2416 			if (nd_curr == NULL) {
2417 				nd_prev->nd_next = nd;
2418 			}
2419 
2420 		}
2421 		/* Node already verified to be in membership */
2422 		nd->nd_flags |= MD_MN_NODE_ALIVE;
2423 	}
2424 	meta_free_nodelist(nl);
2425 
2426 	/* If we have drives */
2427 	if (dd != NULL) {
2428 		/*
2429 		 * For all the hosts being added, create a sidename structure
2430 		 */
2431 		nd = sd->sd_nodelist;
2432 		while (nd) {
2433 			/* Skip nodes not being added */
2434 			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
2435 				nd = nd->nd_next;
2436 				continue;
2437 			}
2438 			for (p = dd; p != NULL; p = p->dd_next) {
2439 				if (make_sideno_sidenm(sp, p->dd_dnp,
2440 				    nd->nd_nodeid, ep) != 0)
2441 					goto rollback;
2442 			}
2443 			nd = nd->nd_next;
2444 		}
2445 
2446 		RB_PREEMPT;
2447 		rb_level = 2;   /* level 2 */
2448 
2449 		RB_TEST(4, "addhosts", ep)
2450 
2451 		/*
2452 		 * Add the new sidename for each drive to all the hosts
2453 		 *
2454 		 * If a multi-node diskset, each host only stores
2455 		 * the side information for itself.  So, only send
2456 		 * side information to the new hosts where each host
2457 		 * will add the appropriate side information to its
2458 		 * local mddb.
2459 		 */
2460 		nd = sd->sd_nodelist;
2461 		while (nd) {
2462 			/* Skip nodes not being added */
2463 			if (!strinlst(nd->nd_nodename, node_c,
2464 			    node_v)) {
2465 				nd = nd->nd_next;
2466 				continue;
2467 			}
2468 
2469 			/* Add side info to new hosts */
2470 			if (clnt_add_drv_sidenms(nd->nd_nodename,
2471 			    mynode(), sp, sd, node_c, node_v, ep))
2472 				goto rollback;
2473 
2474 			nd = nd->nd_next;
2475 		}
2476 
2477 		RB_TEST(5, "addhosts", ep)
2478 
2479 		RB_PREEMPT;
2480 		rb_level = 3;	/* level 3 */
2481 
2482 		RB_TEST(6, "addhosts", ep)
2483 
2484 		/*
2485 		 * Add the device names for the new sides into the namespace
2486 		 * for all hosts being added.  This is adding the side
2487 		 * names to the diskset's mddb so add sidenames for all
2488 		 * of the new hosts.
2489 		 */
2490 		nd = sd->sd_nodelist;
2491 		while (nd) {
2492 			/* Skip nodes not being added */
2493 			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
2494 				nd = nd->nd_next;
2495 				continue;
2496 			}
2497 
2498 			/* this side was just created, add the names */
2499 			if (add_md_sidenms(sp, nd->nd_nodeid,
2500 			    MD_SIDEWILD, ep))
2501 				goto rollback;
2502 
2503 			nd = nd->nd_next;
2504 		}
2505 
2506 		RB_TEST(7, "addhosts", ep)
2507 
2508 		RB_PREEMPT;
2509 		rb_level = 4;   /* level 4 */
2510 
2511 		RB_TEST(8, "addhosts", ep)
2512 
2513 		if (add_db_sidenms(sp, ep))
2514 			goto rollback;
2515 
2516 	} else {
2517 		RB_PREEMPT;
2518 		rb_level = 4;
2519 	}
2520 
2521 	RB_TEST(9, "addhosts", ep)
2522 
2523 	RB_PREEMPT;
2524 	rb_level = 5;	/* level 5 */
2525 
2526 	RB_TEST(10, "addhosts", ep)
2527 
2528 	if (dd != NULL) {
2529 		/*
2530 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
2531 		 * Start by suspending rpc.mdcommd (which drains it of all
2532 		 * messages), then change the nodelist followed by a reinit
2533 		 * and resume.
2534 		 */
2535 		nd = sd->sd_nodelist;
2536 		/* Send suspend_all to nodes in nodelist (existing + new) */
2537 		/* All nodes are guaranteed to be ALIVE */
2538 		while (nd) {
2539 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2540 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2541 				rval = -1;
2542 				goto rollback;
2543 			}
2544 			suspendall_flag = 1;
2545 			nd = nd->nd_next;
2546 		}
2547 	}
2548 
2549 	/* Add the node(s) to each host that is currently in the set */
2550 	nd = sd->sd_nodelist;
2551 	/* All nodes are guaranteed to be ALIVE */
2552 	while (nd) {
2553 		if (clnt_addhosts(nd->nd_nodename, sp, node_c, node_v, ep)) {
2554 			goto rollback;
2555 		}
2556 		nd = nd->nd_next;
2557 	}
2558 
2559 	RB_TEST(11, "addhosts", ep)
2560 
2561 	if (dd != NULL) {
2562 		/*
2563 		 * Mark the drives MD_DR_OK.
2564 		 */
2565 		nd = sd->sd_nodelist;
2566 		/* All nodes are guaranteed to be ALIVE */
2567 		while (nd) {
2568 			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
2569 			    MD_DR_OK, ep) == -1)
2570 				goto rollback;
2571 			nd = nd->nd_next;
2572 		}
2573 	}
2574 
2575 	RB_TEST(12, "addhosts", ep)
2576 
2577 	RB_PREEMPT;
2578 	rb_level = 6;   /* level 6 */
2579 
2580 	RB_TEST(13, "addhosts", ep)
2581 
2582 
2583 	/* Add the mediator information to all hosts in the set. */
2584 	nd = sd->sd_nodelist;
2585 	/* All nodes are guaranteed to be ALIVE */
2586 	while (nd) {
2587 		if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, ep))
2588 			goto rollback;
2589 		nd = nd->nd_next;
2590 	}
2591 
2592 	RB_TEST(14, "addhosts", ep)
2593 
2594 	/*
2595 	 * If a MN diskset and there are drives in the set,
2596 	 * set the master on the new nodes and
2597 	 * automatically join the new nodes into the set.
2598 	 */
2599 	if (dd != NULL) {
2600 		mddb_config_t   c;
2601 		/*
2602 		 * Is current set STALE?
2603 		 */
2604 		(void) memset(&c, 0, sizeof (c));
2605 		c.c_id = 0;
2606 		c.c_setno = sp->setno;
2607 		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2608 			(void) mdstealerror(ep, &c.c_mde);
2609 			rval = -1;
2610 			goto out;
2611 		}
2612 		if (c.c_flags & MDDB_C_STALE) {
2613 			stale_flag = MNSET_IS_STALE;
2614 		}
2615 
2616 		/* Set master on newly added nodes */
2617 		for (i = 0; i < node_c; i++) {
2618 			if (clnt_mnsetmaster(node_v[i], sp,
2619 			    sd->sd_mn_master_nodenm,
2620 			    sd->sd_mn_master_nodeid, ep)) {
2621 				goto rollback;
2622 			}
2623 		}
2624 		/* Join newly added nodes to diskset and set OWN flag */
2625 		for (i = 0; i < node_c; i++) {
2626 			if (clnt_joinset(node_v[i], sp, stale_flag, ep))
2627 				goto rollback;
2628 			nd = sd->sd_nodelist;
2629 			while (nd) {
2630 				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
2631 					nd->nd_flags |= MD_MN_NODE_OWN;
2632 					/*
2633 					 * Also set ADD flag since this flag
2634 					 * is already set in rpc.metad - it's
2635 					 * just not in the local copy.
2636 					 * Could flush local cache and call
2637 					 * metaget_setdesc, but this just
2638 					 * adds time.  Since this node knows
2639 					 * the state of the node flags in
2640 					 * rpc.metad, just set the ADD
2641 					 * flag and save time.
2642 					 */
2643 					nd->nd_flags |= MD_MN_NODE_ADD;
2644 					break;
2645 				}
2646 				nd = nd->nd_next;
2647 			}
2648 		}
2649 
2650 		/* Send new node flag list to all Owner nodes */
2651 		nd = sd->sd_nodelist;
2652 		while (nd) {
2653 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2654 				nd = nd->nd_next;
2655 				continue;
2656 			}
2657 			/*
2658 			 * Will effectively set OWN flag in records kept
2659 			 * cached in rpc.metad.  The ADD flag would have
2660 			 * already been set by the call to clnt_addhosts.
2661 			 */
2662 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2663 			    sd->sd_nodelist, MD_NR_SET, NULL, ep)) {
2664 				goto rollback;
2665 			}
2666 			nd = nd->nd_next;
2667 		}
2668 	}
2669 
2670 	/*
2671 	 * Mark the set record MD_SR_OK
2672 	 */
2673 	nd = sd->sd_nodelist;
2674 	/* All nodes are guaranteed to be ALIVE */
2675 	while (nd) {
2676 		if (clnt_upd_sr_flags(nd->nd_nodename, sp, MD_SR_OK,
2677 		    ep)) {
2678 			goto rollback;
2679 		}
2680 		nd = nd->nd_next;
2681 	}
2682 
2683 	/*
2684 	 * For MN diskset:
2685 	 * On each newly added node, set the node record for that node
2686 	 * to OK.  Then set all node records for the newly added
2687 	 * nodes on all nodes to ok.
2688 	 *
2689 	 * By setting a node's own node record to ok first, even if
2690 	 * the node adding the hosts panics, the rest of the nodes can
2691 	 * determine the same node list during the choosing of the master
2692 	 * during reconfig.  So, only nodes considered for mastership
2693 	 * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set
2694 	 * on that node's rpc.metad.  If all nodes have MD_SR_OK set,
2695 	 * but no node has its own MD_MN_NODE_OK set, then the set will
2696 	 * be removed during reconfig since a panic occurred during the
2697 	 * creation of the initial diskset.
2698 	 */
2699 
2700 	for (i = 0; i < node_c; i++) {
2701 		nd = sd->sd_nodelist;
2702 		/* All nodes are guaranteed to be ALIVE */
2703 		while (nd) {
2704 			if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2705 				break;
2706 			nd = nd->nd_next;
2707 		}
2708 		/* Something wrong, will pick this up in next loop */
2709 		if (nd == NULL)
2710 			continue;
2711 
2712 		/* Only changing my local cache of node list */
2713 		saved_nd_next = nd->nd_next;
2714 		nd->nd_next = NULL;
2715 
2716 		/* Set node record for added host to ok on that host */
2717 		if (clnt_upd_nr_flags(node_v[i], sp,
2718 		    nd, MD_NR_OK, NULL, ep)) {
2719 			nd->nd_next = saved_nd_next;
2720 			goto rollback;
2721 		}
2722 		nd->nd_next = saved_nd_next;
2723 	}
2724 
2725 	/* Now set all node records on all nodes to be ok */
2726 	nd = sd->sd_nodelist;
2727 	/* All nodes are guaranteed to be ALIVE */
2728 	while (nd) {
2729 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2730 		    sd->sd_nodelist, MD_NR_OK, NULL, ep)) {
2731 			goto rollback;
2732 		}
2733 		nd = nd->nd_next;
2734 	}
2735 
2736 	RB_TEST(15, "addhosts", ep)
2737 out:
2738 	/*
2739 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2740 	 * Send reinit command to mdcommd which forces it to get
2741 	 * fresh set description.  Then send resume.
2742 	 * Resume on class 0 will resume all classes, so can skip
2743 	 * doing an explicit resume of class1 (ignore suspend1_flag).
2744 	 */
2745 	if (suspendall_flag) {
2746 		/*
2747 		 * Don't know if nodelist contains the nodes being added
2748 		 * or not, so do reinit to nodes not being added (by skipping
2749 		 * any nodes in the nodelist being added) and then do
2750 		 * reinit to nodes being added if remote_sets_created is 1.
2751 		 */
2752 		nd = sd->sd_nodelist;
2753 		/* All nodes are guaranteed to be ALIVE */
2754 		while (nd) {
2755 			/* Skip nodes being added - handled later */
2756 			if (strinlst(nd->nd_nodename, node_c, node_v)) {
2757 				nd = nd->nd_next;
2758 				continue;
2759 			}
2760 			/* Class is ignored for REINIT */
2761 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2762 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2763 				if (rval == 0)
2764 					(void) mdstealerror(ep, &xep);
2765 				rval = -1;
2766 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2767 				    "Unable to reinit rpc.mdcommd.\n"));
2768 			}
2769 			nd = nd->nd_next;
2770 		}
2771 		/*
2772 		 * Send reinit to added nodes that had a set created since
2773 		 * rpc.mdcommd is running on the nodes with a set.
2774 		 */
2775 		if (remote_sets_created == 1) {
2776 			for (i = 0; i < node_c; i++) {
2777 				if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
2778 				    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2779 					if (rval == 0)
2780 						(void) mdstealerror(ep, &xep);
2781 					rval = -1;
2782 					mde_perror(ep, dgettext(TEXT_DOMAIN,
2783 					    "Unable to reinit rpc.mdcommd.\n"));
2784 				}
2785 			}
2786 		}
2787 	}
2788 	if ((suspend1_flag) || (suspendall_flag)) {
2789 		/*
2790 		 * Unlock diskset by resuming messages across the diskset.
2791 		 * Just resume all classes so that resume is the same whether
2792 		 * just one class was locked or all classes were locked.
2793 		 *
2794 		 * Don't know if nodelist contains the nodes being added
2795 		 * or not, so do resume_all to nodes not being added (by
2796 		 * skipping any nodes in the nodelist being added) and then do
2797 		 * resume_all to nodes being added if remote_sets_created is 1.
2798 		 */
2799 		nd = sd->sd_nodelist;
2800 		/* All nodes are guaranteed to be ALIVE */
2801 		while (nd) {
2802 			/* Skip nodes being added - handled later */
2803 			if (strinlst(nd->nd_nodename, node_c, node_v)) {
2804 				nd = nd->nd_next;
2805 				continue;
2806 			}
2807 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2808 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2809 				if (rval == 0)
2810 					(void) mdstealerror(ep, &xep);
2811 				rval = -1;
2812 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2813 				    "Unable to resume rpc.mdcommd.\n"));
2814 			}
2815 			nd = nd->nd_next;
2816 		}
2817 		/*
2818 		 * Send resume to added nodes that had a set created since
2819 		 * rpc.mdcommd is running on the nodes with a set.
2820 		 */
2821 		if (remote_sets_created == 1) {
2822 			for (i = 0; i < node_c; i++) {
2823 				/* Already verified to be alive */
2824 				if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
2825 				    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS,
2826 				    &xep)) {
2827 					if (rval == 0)
2828 						(void) mdstealerror(ep, &xep);
2829 					rval = -1;
2830 					mde_perror(ep, dgettext(TEXT_DOMAIN,
2831 					    "Unable to resume rpc.mdcommd.\n"));
2832 				}
2833 			}
2834 		}
2835 		meta_ping_mnset(sp->setno);
2836 		/*
2837 		 * Start a resync thread on the newly added nodes
2838 		 * if set is not stale. Also start a thread to update the
2839 		 * abr state of all soft partitions
2840 		 */
2841 		if (stale_flag != MNSET_IS_STALE) {
2842 			for (i = 0; i < node_c; i++) {
2843 				if (clnt_mn_mirror_resync_all(node_v[i],
2844 				    sp->setno, &xep)) {
2845 					if (rval == 0)
2846 						(void) mdstealerror(ep, &xep);
2847 					rval = -1;
2848 					mde_perror(ep, dgettext(TEXT_DOMAIN,
2849 					    "Unable to start resync "
2850 					    "thread.\n"));
2851 				}
2852 				if (clnt_mn_sp_update_abr(node_v[i],
2853 				    sp->setno, &xep)) {
2854 					if (rval == 0)
2855 						(void) mdstealerror(ep, &xep);
2856 					rval = -1;
2857 					mde_perror(ep, dgettext(TEXT_DOMAIN,
2858 					    "Unable to start sp update "
2859 					    "thread.\n"));
2860 				}
2861 			}
2862 		}
2863 	}
2864 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2865 	/*
2866 	 * Don't know if nodelist contains the nodes being added
2867 	 * or not, so do clnt_unlock_set to nodes not being added (by
2868 	 * skipping any nodes in the nodelist being added) and then do
2869 	 * clnt_unlock_set to nodes being added.
2870 	 */
2871 	if (lock_flag) {
2872 		nd = sd->sd_nodelist;
2873 		/* All nodes are guaranteed to be ALIVE */
2874 		while (nd) {
2875 			/* Skip hosts we get in the next loop */
2876 			if (strinlst(nd->nd_nodename, node_c, node_v)) {
2877 				nd = nd->nd_next;
2878 				continue;
2879 			}
2880 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2881 				if (rval == 0)
2882 					(void) mdstealerror(ep, &xep);
2883 				rval = -1;
2884 			}
2885 			nd = nd->nd_next;
2886 		}
2887 		for (i = 0; i < node_c; i++) {
2888 			/* Already verified to be alive */
2889 			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
2890 				if (rval == 0)
2891 					(void) mdstealerror(ep, &xep);
2892 				rval = -1;
2893 			}
2894 		}
2895 	}
2896 	cl_set_setkey(NULL);
2897 
2898 	metaflushsetname(sp);
2899 
2900 	/* release signals back to what they were on entry */
2901 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2902 		mdclrerror(&xep);
2903 
2904 	return (rval);
2905 
2906 rollback:
2907 	rval = -1;
2908 
2909 	/* level 6 */
2910 	if (rb_level > 5) {
2911 		/*
2912 		 * For each node being deleted, set DEL flag and
2913 		 * reset OK flag on that node first.
2914 		 * Until a node has turned off its own
2915 		 * rpc.metad's NODE_OK flag, that node could be
2916 		 * considered for master during a reconfig.
2917 		 */
2918 		for (i = 0; i < node_c; i++) {
2919 			nd = sd->sd_nodelist;
2920 			/* All nodes are guaranteed to be ALIVE */
2921 			while (nd) {
2922 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2923 					break;
2924 				nd = nd->nd_next;
2925 			}
2926 			/* Something wrong, handle this in next loop */
2927 			if (nd == NULL)
2928 				continue;
2929 
2930 			/* Only changing my local cache of node list */
2931 			saved_nd_next = nd->nd_next;
2932 			nd->nd_next = NULL;
2933 
2934 			/* Set flags for del host to DEL on that host */
2935 			if (clnt_upd_nr_flags(node_v[i], sp,
2936 			    nd, MD_NR_DEL, NULL, &xep)) {
2937 				mdclrerror(&xep);
2938 			}
2939 			nd->nd_next = saved_nd_next;
2940 		}
2941 
2942 		for (i = 0; i < node_c; i++) {
2943 			if (dd != NULL) {
2944 				/* Reset master on newly added node */
2945 				if (clnt_mnsetmaster(node_v[i], sp, "",
2946 				    MD_MN_INVALID_NID, &xep))
2947 					mdclrerror(&xep);
2948 				/* Withdraw set on newly added node */
2949 				if (clnt_withdrawset(node_v[i], sp, &xep))
2950 					mdclrerror(&xep);
2951 			}
2952 			/*
2953 			 * Turn off owner flag in nodes to be deleted
2954 			 * if there are drives in the set.
2955 			 * Also, turn off NODE_OK and turn on NODE_DEL
2956 			 * for nodes to be deleted.
2957 			 * These flags are used to set the node
2958 			 * record flags in all nodes in the set.
2959 			 */
2960 			nd = sd->sd_nodelist;
2961 			while (nd) {
2962 				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
2963 					if (dd != NULL) {
2964 						nd->nd_flags &= ~MD_MN_NODE_OWN;
2965 					}
2966 					nd->nd_flags |= MD_MN_NODE_DEL;
2967 					nd->nd_flags &= ~MD_MN_NODE_OK;
2968 					break;
2969 				}
2970 				nd = nd->nd_next;
2971 			}
2972 		}
2973 
2974 		/*
2975 		 * Now, reset owner and set delete flags for the deleted
2976 		 * nodes on all nodes.
2977 		 */
2978 		nd = sd->sd_nodelist;
2979 		while (nd) {
2980 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2981 			    sd->sd_nodelist, MD_NR_SET, NULL, &xep)) {
2982 				mdclrerror(&xep);
2983 			}
2984 			nd = nd->nd_next;
2985 		}
2986 
2987 		/*
2988 		 * On each node being deleted, set the set record
2989 		 * to be in DEL state.
2990 		 */
2991 		for (i = 0; i < node_c; i++) {
2992 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) {
2993 				mdclrerror(&xep);
2994 			}
2995 		}
2996 	}
2997 
2998 	/* level 5 */
2999 	if (rb_level > 4) {
3000 		nd = sd->sd_nodelist;
3001 		/* All nodes are guaranteed to be ALIVE */
3002 		while (nd) {
3003 			if (clnt_delhosts(nd->nd_nodename, sp, node_c,
3004 			    node_v, &xep) == -1)
3005 				mdclrerror(&xep);
3006 			nd = nd->nd_next;
3007 		}
3008 	}
3009 
3010 	/*
3011 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3012 	 * Send reinit command to mdcommd which forces it to get
3013 	 * fresh set description.  Then send resume.
3014 	 * Nodelist contains all nodes (existing + added).
3015 	 */
3016 	if (suspendall_flag) {
3017 		/* Send reinit */
3018 		nd = sd->sd_nodelist;
3019 		/* All nodes are guaranteed to be ALIVE */
3020 		/* Send reinit to nodes in nodelist before addhosts call */
3021 		while (nd) {
3022 			/*
3023 			 * Skip nodes being added if remote sets were not
3024 			 * created since rpc.mdcommd may not be running
3025 			 * on the remote nodes.
3026 			 */
3027 			if ((remote_sets_created == 0) &&
3028 			    (strinlst(nd->nd_nodename, node_c, node_v))) {
3029 				nd = nd->nd_next;
3030 				continue;
3031 			}
3032 			/* Class is ignored for REINIT */
3033 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3034 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3035 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
3036 				    "Unable to reinit rpc.mdcommd.\n"));
3037 				mdclrerror(&xep);
3038 			}
3039 			nd = nd->nd_next;
3040 		}
3041 
3042 		/* Send resume */
3043 		nd = sd->sd_nodelist;
3044 		/* All nodes are guaranteed to be ALIVE */
3045 		while (nd) {
3046 			/*
3047 			 * Skip nodes being added if remote sets were not
3048 			 * created since rpc.mdcommd may not be running
3049 			 * on the remote nodes.
3050 			 */
3051 			if ((remote_sets_created == 0) &&
3052 			    (strinlst(nd->nd_nodename, node_c, node_v))) {
3053 				nd = nd->nd_next;
3054 				continue;
3055 			}
3056 			/*
3057 			 * Resume all classes but class 1 so that lock is held
3058 			 * against meta* commands.
3059 			 * Send resume_all_but_1 to nodes in nodelist
3060 			 * before addhosts call.
3061 			 */
3062 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3063 			    sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1,
3064 			    &xep)) {
3065 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
3066 				    "Unable to resume rpc.mdcommd.\n"));
3067 				mdclrerror(&xep);
3068 			}
3069 			nd = nd->nd_next;
3070 		}
3071 		meta_ping_mnset(sp->setno);
3072 	}
3073 
3074 	/* level 4 */
3075 	/* Nodelist may or may not contain nodes being added. */
3076 	if (rb_level > 3 && dd != NULL) {
3077 		nd = sd->sd_nodelist;
3078 		while (nd) {
3079 			/* Skip nodes not being added */
3080 			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
3081 				nd = nd->nd_next;
3082 				continue;
3083 			}
3084 
3085 			if (del_db_sidenms(sp, nd->nd_nodeid, &xep))
3086 				mdclrerror(&xep);
3087 			nd = nd->nd_next;
3088 		}
3089 	}
3090 
3091 	/* level 3 */
3092 	/* Nodelist may or may not contain nodes being added. */
3093 	if (rb_level > 2 && dd != NULL) {
3094 		nd = sd->sd_nodelist;
3095 		while (nd) {
3096 			/* Skip nodes not being added */
3097 			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
3098 				nd = nd->nd_next;
3099 				continue;
3100 			}
3101 
3102 			if (del_md_sidenms(sp, nd->nd_nodeid, &xep))
3103 				mdclrerror(&xep);
3104 			nd = nd->nd_next;
3105 		}
3106 	}
3107 
3108 	/* level 1 */
3109 	if (rb_level > 0) {
3110 		if (dd != NULL) {
3111 			/* delete the drive records */
3112 			for (i = 0; i < node_c; i++) {
3113 				if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1)
3114 					mdclrerror(&xep);
3115 			}
3116 		}
3117 
3118 		/* delete the set record */
3119 		for (i = 0; i < node_c; i++) {
3120 			if (clnt_delset(node_v[i], sp, &xep) == -1)
3121 				mdclrerror(&xep);
3122 		}
3123 	}
3124 
3125 	/* level 0 */
3126 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3127 	/* Don't test lock flag since guaranteed to be set if in rollback */
3128 	/* Nodelist may or may not contain nodes being added. */
3129 	/*
3130 	 * Unlock diskset by resuming messages across the diskset.
3131 	 * Just resume all classes so that resume is the same whether
3132 	 * just one class was locked or all classes were locked.
3133 	 */
3134 	if ((suspend1_flag) || (suspendall_flag)) {
3135 		/* All nodes are guaranteed to be ALIVE */
3136 		nd = sd->sd_nodelist;
3137 		while (nd) {
3138 			/*
3139 			 * Skip nodes being added since remote sets
3140 			 * were either created and then deleted or
3141 			 * were never created.  Either way - rpc.mdcommd
3142 			 * may not be running on the remote node.
3143 			 */
3144 			if (strinlst(nd->nd_nodename, node_c, node_v)) {
3145 				nd = nd->nd_next;
3146 				continue;
3147 			}
3148 			if (clnt_mdcommdctl(nd->nd_nodename,
3149 			    COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
3150 			    MD_MSCF_NO_FLAGS, &xep)) {
3151 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
3152 				    "Unable to resume rpc.mdcommd.\n"));
3153 				mdclrerror(&xep);
3154 			}
3155 			nd = nd->nd_next;
3156 		}
3157 		meta_ping_mnset(sp->setno);
3158 	}
3159 	nd = sd->sd_nodelist;
3160 	/* All nodes are guaranteed to be ALIVE */
3161 	while (nd) {
3162 		/* Skip hosts we get in the next loop */
3163 		if (strinlst(nd->nd_nodename, node_c, node_v)) {
3164 			nd = nd->nd_next;
3165 			continue;
3166 		}
3167 
3168 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
3169 			mdclrerror(&xep);
3170 		nd = nd->nd_next;
3171 	}
3172 
3173 	for (i = 0; i < node_c; i++)
3174 		if (clnt_unlock_set(node_v[i], cl_sk, &xep))
3175 			mdclrerror(&xep);
3176 	cl_set_setkey(NULL);
3177 
3178 	/* release signals back to what they were on entry */
3179 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3180 		mdclrerror(&xep);
3181 
3182 	metaflushsetname(sp);
3183 
3184 	return (rval);
3185 }
3186 
3187 /*
3188  * Add host(s) to the traditional diskset provided in sp.
3189  *	- create set if non-existent.
3190  */
3191 static int
meta_traditional_set_addhosts(mdsetname_t * sp,int multi_node,int node_c,char ** node_v,int auto_take,md_error_t * ep)3192 meta_traditional_set_addhosts(
3193 	mdsetname_t	*sp,
3194 	int		multi_node,
3195 	int		node_c,
3196 	char		**node_v,
3197 	int		auto_take,
3198 	md_error_t	*ep
3199 )
3200 {
3201 	md_set_desc	*sd;
3202 	md_drive_desc	*dd, *p;
3203 	med_rec_t	medr;
3204 	med_rec_t	rb_medr;
3205 	int		rval = 0;
3206 	int		bool;
3207 	int		nodeindex;
3208 	int 		i;
3209 	int		has_set;
3210 	int		numsides;
3211 	sigset_t	oldsigs;
3212 	md_setkey_t	*cl_sk;
3213 	int		rb_level = 0;
3214 	md_error_t	xep = mdnullerror;
3215 	int		max_meds;
3216 
3217 	if (nodesuniq(sp, node_c, node_v, ep))
3218 		return (-1);
3219 
3220 	if (validate_nodes(sp, node_c, node_v, ep))
3221 		return (-1);
3222 
3223 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
3224 		if (! mdiserror(ep, MDE_NO_SET))
3225 			return (-1);
3226 		mdclrerror(ep);
3227 		return (create_set(sp, multi_node, node_c, node_v, auto_take,
3228 		    ep));
3229 	}
3230 
3231 	/* The auto_take behavior is inconsistent with multiple hosts. */
3232 	if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) {
3233 		(void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
3234 		    sp->setname);
3235 		return (-1);
3236 	}
3237 
3238 	/*
3239 	 * We already have the set.
3240 	 */
3241 
3242 	/* Make sure we own the set */
3243 	if (meta_check_ownership(sp, ep) != 0)
3244 		return (-1);
3245 
3246 	/*
3247 	 * Perform the required checks for new hosts
3248 	 */
3249 	for (i = 0; i < node_c; i++) {
3250 		if (getnodeside(node_v[i], sd) != MD_SIDEWILD)
3251 			return (mddserror(ep, MDE_DS_NODEINSET, sp->setno,
3252 			    node_v[i], NULL, sp->setname));
3253 
3254 		/* Make sure this set name is not used on the other hosts */
3255 		has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
3256 		if (has_set < 0) {
3257 			if (! mdiserror(ep, MDE_NO_SET))
3258 				return (-1);
3259 			/* Keep on truck'n */
3260 			mdclrerror(ep);
3261 		} else if (has_set)
3262 			return (mddserror(ep, MDE_DS_NODEHASSET, sp->setno,
3263 			    node_v[i], NULL, sp->setname));
3264 
3265 		if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1)
3266 			return (-1);
3267 
3268 		if (bool == TRUE)
3269 			return (mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno,
3270 			    node_v[i], NULL, sp->setname));
3271 
3272 		if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1)
3273 			return (-1);
3274 
3275 		if (bool == FALSE)
3276 			return (mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
3277 			    node_v[i], NULL, sp->setname));
3278 
3279 		if (check_setdrvs_againstnode(sp, node_v[i], ep))
3280 			return (-1);
3281 	}
3282 
3283 	/* Count the number of occupied slots */
3284 	numsides = 0;
3285 	for (i = 0; i < MD_MAXSIDES; i++) {
3286 		/* Count occupied slots */
3287 		if (sd->sd_nodes[i][0] != '\0')
3288 			numsides++;
3289 	}
3290 
3291 	/* Make sure we have space to add the new sides */
3292 	if ((numsides + node_c) > MD_MAXSIDES) {
3293 		(void) mddserror(ep, MDE_DS_SIDENUMNOTAVAIL, sp->setno, NULL,
3294 		    NULL, sp->setname);
3295 		return (-1);
3296 	}
3297 
3298 	/* Get drive descriptors for the set */
3299 	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL)
3300 		if (! mdisok(ep))
3301 			return (-1);
3302 
3303 	/* Setup the mediator record roll-back structure */
3304 	(void) memset(&rb_medr, '\0', sizeof (med_rec_t));
3305 	rb_medr.med_rec_mag = MED_REC_MAGIC;
3306 	rb_medr.med_rec_rev = MED_REC_REV;
3307 	rb_medr.med_rec_fl  = 0;
3308 	rb_medr.med_rec_sn  = sp->setno;
3309 	(void) strcpy(rb_medr.med_rec_snm, sp->setname);
3310 	for (i = 0; i < MD_MAXSIDES; i++)
3311 		(void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]);
3312 	rb_medr.med_rec_meds = sd->sd_med;	/* structure assigment */
3313 	(void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
3314 	rb_medr.med_rec_foff = 0;
3315 	crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);
3316 
3317 	if ((max_meds = get_max_meds(ep)) == 0)
3318 		return (-1);
3319 
3320 	/* END CHECK CODE */
3321 
3322 	md_rb_sig_handling_on();
3323 
3324 	/* Lock the set on current set members */
3325 	for (i = 0; i < MD_MAXSIDES; i++) {
3326 		/* Skip empty slots */
3327 		if (sd->sd_nodes[i][0] == '\0')
3328 			continue;
3329 
3330 		if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
3331 			rval = -1;
3332 			goto out;
3333 		}
3334 	}
3335 
3336 	/* Lock the set on new set members */
3337 	for (i = 0; i < node_c; i++) {
3338 		if (clnt_lock_set(node_v[i], sp, ep)) {
3339 			rval = -1;
3340 			goto out;
3341 		}
3342 	}
3343 
3344 	RB_TEST(1, "addhosts", ep)
3345 
3346 	RB_PREEMPT;
3347 	rb_level = 1;	/* level 1 */
3348 
3349 	RB_TEST(2, "addhosts", ep)
3350 
3351 	/*
3352 	 * Add the new hosts to the existing set record on the existing hosts
3353 	 */
3354 	for (i = 0; i < MD_MAXSIDES; i++) {
3355 		/* skip empty slots */
3356 		if (sd->sd_nodes[i][0] == '\0')
3357 			continue;
3358 
3359 		if (clnt_addhosts(sd->sd_nodes[i], sp, node_c, node_v, ep))
3360 			goto rollback;
3361 	}
3362 
3363 	RB_PREEMPT;
3364 	rb_level = 2;	/* level 2 */
3365 
3366 	RB_TEST(3, "addhosts", ep);
3367 
3368 	/* Merge the new entries into the set with the existing sides */
3369 	nodeindex = 0;
3370 	for (i = 0; i < MD_MAXSIDES; i++) {
3371 		/* Skip full slots */
3372 		if (sd->sd_nodes[i][0] != '\0')
3373 			continue;
3374 
3375 		(void) strcpy(sd->sd_nodes[i], node_v[nodeindex++]);
3376 		if (nodeindex == node_c)
3377 			break;
3378 	}
3379 
3380 	/* If we have drives */
3381 	if (dd != NULL) {
3382 		/*
3383 		 * For all the hosts being added, create a sidename structure
3384 		 */
3385 		for (i = 0; i < MD_MAXSIDES; i++) {
3386 			/* Skip empty slots */
3387 			if (sd->sd_nodes[i][0] == '\0')
3388 				continue;
3389 
3390 			/* Skip nodes not being added */
3391 			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3392 				continue;
3393 
3394 			for (p = dd; p != NULL; p = p->dd_next) {
3395 				if (make_sideno_sidenm(sp, p->dd_dnp, i,
3396 				    ep) != 0)
3397 					goto rollback;
3398 			}
3399 		}
3400 
3401 		/*
3402 		 * Add the new sidename for each drive to the existing hosts
3403 		 */
3404 		for (i = 0; i < MD_MAXSIDES; i++) {
3405 			/* Skip empty slots */
3406 			if (sd->sd_nodes[i][0] == '\0')
3407 				continue;
3408 
3409 			/* Skip nodes being added */
3410 			if (strinlst(sd->sd_nodes[i], node_c, node_v))
3411 				continue;
3412 
3413 			if (clnt_add_drv_sidenms(sd->sd_nodes[i], mynode(), sp,
3414 			    sd, node_c, node_v, ep)) {
3415 				goto rollback;
3416 			}
3417 		}
3418 
3419 		RB_TEST(4, "addhosts", ep)
3420 
3421 		RB_PREEMPT;
3422 		rb_level = 3;	/* level 3 */
3423 
3424 		RB_TEST(5, "addhosts", ep)
3425 
3426 		if (add_db_sidenms(sp, ep)) {
3427 			goto rollback;
3428 		}
3429 
3430 	} else {
3431 		RB_PREEMPT;
3432 		rb_level = 3;
3433 	}
3434 
3435 	RB_TEST(6, "addhosts", ep)
3436 
3437 	RB_PREEMPT;
3438 	rb_level = 4;	/* level 4 */
3439 
3440 	RB_TEST(7, "addhosts", ep)
3441 
3442 
3443 	/* create the set on the new nodes, this adds the drives as well */
3444 	if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) {
3445 		goto rollback;
3446 	}
3447 
3448 	RB_TEST(8, "addhosts", ep)
3449 
3450 	RB_PREEMPT;
3451 	rb_level = 5;	/* level 5 */
3452 
3453 	RB_TEST(9, "addhosts", ep)
3454 
3455 	if (dd != NULL) {
3456 
3457 		/*
3458 		 * Add the device entries for the new sides into the namespace.
3459 		 */
3460 		for (i = 0; i < MD_MAXSIDES; i++) {
3461 			/* Skip empty slots */
3462 			if (sd->sd_nodes[i][0] == '\0')
3463 				continue;
3464 
3465 			/* Skip nodes not being added */
3466 			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3467 				continue;
3468 
3469 			if (add_md_sidenms(sp, i, MD_SIDEWILD, ep))
3470 				goto rollback;
3471 		}
3472 	}
3473 
3474 	RB_TEST(10, "addhosts", ep)
3475 
3476 	RB_PREEMPT;
3477 	rb_level = 6;	/* level 6 */
3478 
3479 	RB_TEST(11, "addhosts", ep);
3480 
3481 	if (dd != NULL) {
3482 		/*
3483 		 * Mark the drives MD_DR_OK.
3484 		 */
3485 		for (i = 0; i < MD_MAXSIDES; i++) {
3486 			/* Skip empty slots */
3487 			if (sd->sd_nodes[i][0] == '\0')
3488 				continue;
3489 
3490 			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
3491 			    MD_DR_OK, ep) == -1) {
3492 				goto rollback;
3493 			}
3494 		}
3495 	}
3496 
3497 	RB_TEST(12, "addhosts", ep)
3498 
3499 	/* Bring the mediator record up to date with the set record */
3500 	medr = rb_medr;				/* structure assignment */
3501 	for (i = 0; i < MD_MAXSIDES; i++)
3502 		(void) strcpy(medr.med_rec_nodes[i], sd->sd_nodes[i]);
3503 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
3504 
3505 	/* Inform the mediator hosts of the new node list */
3506 	for (i = 0; i < max_meds; i++) {
3507 		if (sd->sd_med.n_lst[i].a_cnt == 0)
3508 			continue;
3509 
3510 		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
3511 			goto rollback;
3512 	}
3513 
3514 	/* Add the mediator information to all hosts in the set */
3515 	for (i = 0; i < MD_MAXSIDES; i++) {
3516 		/* Skip empty slots */
3517 		if (sd->sd_nodes[i][0] == '\0')
3518 			continue;
3519 
3520 		if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, ep))
3521 			goto rollback;
3522 	}
3523 
3524 	RB_TEST(13, "addhosts", ep)
3525 
3526 	/*
3527 	 * Mark the set record MD_SR_OK
3528 	 */
3529 	for (i = 0; i < MD_MAXSIDES; i++) {
3530 		/* Skip empty slots */
3531 		if (sd->sd_nodes[i][0] == '\0')
3532 			continue;
3533 
3534 		if (clnt_upd_sr_flags(sd->sd_nodes[i], sp, MD_SR_OK, ep))
3535 			goto rollback;
3536 	}
3537 
3538 	RB_TEST(14, "addhosts", ep)
3539 
3540 out:
3541 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3542 	for (i = 0; i < MD_MAXSIDES; i++) {
3543 		/* Skip empty slots */
3544 		if (sd->sd_nodes[i][0] == '\0')
3545 			continue;
3546 
3547 		/* Skip hosts we get in the next loop */
3548 		if (strinlst(sd->sd_nodes[i], node_c, node_v))
3549 			continue;
3550 
3551 		if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
3552 			if (rval == 0)
3553 				(void) mdstealerror(ep, &xep);
3554 			rval = -1;
3555 		}
3556 	}
3557 
3558 	if (rval == 0) {
3559 		for (i = 0; i < node_c; i++)
3560 			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
3561 				if (rval == 0)
3562 					(void) mdstealerror(ep, &xep);
3563 				rval = -1;
3564 			}
3565 	}
3566 	cl_set_setkey(NULL);
3567 
3568 	metaflushsetname(sp);
3569 
3570 	md_rb_sig_handling_off(md_got_sig(), md_which_sig());
3571 
3572 	return (rval);
3573 
3574 rollback:
3575 	/* Make sure we are blocking all signals */
3576 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
3577 		mdclrerror(&xep);
3578 
3579 	rval = -1;
3580 
3581 	/* level 6 */
3582 	if (rb_level > 5) {
3583 		for (i = 0; i < max_meds; i++) {
3584 			if (sd->sd_med.n_lst[i].a_cnt == 0)
3585 				continue;
3586 
3587 			if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
3588 			    &rb_medr, &xep))
3589 				mdclrerror(&xep);
3590 		}
3591 		if (dd != NULL) {
3592 			for (i = 0; i < MD_MAXSIDES; i++) {
3593 				/* Skip empty slots */
3594 				if (sd->sd_nodes[i][0] == '\0')
3595 					continue;
3596 
3597 				/* Skip nodes not being added */
3598 				if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3599 					continue;
3600 
3601 				if (del_md_sidenms(sp, i, &xep))
3602 					mdclrerror(&xep);
3603 			}
3604 		}
3605 	}
3606 
3607 	/* level 5 */
3608 	if (rb_level > 4) {
3609 		if (dd != NULL) {
3610 			/* delete the drive records */
3611 			for (i = 0; i < node_c; i++) {
3612 				if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1)
3613 					mdclrerror(&xep);
3614 			}
3615 		}
3616 		/* delete the set record on the 'new' hosts */
3617 		for (i = 0; i < node_c; i++) {
3618 			if (clnt_delset(node_v[i], sp, &xep) == -1)
3619 				mdclrerror(&xep);
3620 		}
3621 	}
3622 
3623 	/* level 4 */
3624 	if (rb_level > 3 && dd != NULL) {
3625 		for (i = 0; i < MD_MAXSIDES; i++) {
3626 			/* Skip empty slots */
3627 			if (sd->sd_nodes[i][0] == '\0')
3628 				continue;
3629 
3630 			/* Skip nodes not being added */
3631 			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3632 				continue;
3633 
3634 			if (del_db_sidenms(sp, i, &xep))
3635 				mdclrerror(&xep);
3636 		}
3637 	}
3638 
3639 	/* level 3 */
3640 	if (rb_level > 2 && dd != NULL) {
3641 		for (i = 0; i < MD_MAXSIDES; i++) {
3642 			/* Skip empty slots */
3643 			if (sd->sd_nodes[i][0] == '\0')
3644 				continue;
3645 
3646 			/* Skip nodes not being added */
3647 			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3648 				continue;
3649 
3650 			if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp,
3651 			    &xep) == -1)
3652 				mdclrerror(&xep);
3653 		}
3654 	}
3655 
3656 	/* level 2 */
3657 	if (rb_level > 1) {
3658 		for (i = 0; i < MD_MAXSIDES; i++) {
3659 			/* Skip empty slots */
3660 			if (sd->sd_nodes[i][0] == '\0')
3661 				continue;
3662 
3663 			if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v,
3664 			    &xep) == -1)
3665 				mdclrerror(&xep);
3666 		}
3667 	}
3668 
3669 	/* level 1 */
3670 	if (rb_level > 0) {
3671 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
3672 		for (i = 0; i < MD_MAXSIDES; i++) {
3673 			/* Skip empty slots */
3674 			if (sd->sd_nodes[i][0] == '\0')
3675 				continue;
3676 
3677 			/* Skip hosts we get in the next loop */
3678 			if (strinlst(sd->sd_nodes[i], node_c, node_v))
3679 				continue;
3680 
3681 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
3682 				mdclrerror(&xep);
3683 		}
3684 
3685 		for (i = 0; i < node_c; i++)
3686 			if (clnt_unlock_set(node_v[i], cl_sk, &xep))
3687 				mdclrerror(&xep);
3688 		cl_set_setkey(NULL);
3689 	}
3690 
3691 	/* release signals back to what they were on entry */
3692 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3693 		mdclrerror(&xep);
3694 
3695 	metaflushsetname(sp);
3696 
3697 	md_rb_sig_handling_off(md_got_sig(), md_which_sig());
3698 
3699 	return (rval);
3700 }
3701 
3702 /*
3703  * Add host(s) to the diskset provided in sp.
3704  * 	- create set if non-existent.
3705  */
3706 int
meta_set_addhosts(mdsetname_t * sp,int multi_node,int node_c,char ** node_v,int auto_take,md_error_t * ep)3707 meta_set_addhosts(
3708 	mdsetname_t	*sp,
3709 	int		multi_node,
3710 	int		node_c,
3711 	char		**node_v,
3712 	int		auto_take,
3713 	md_error_t	*ep
3714 )
3715 {
3716 	if (multi_node)
3717 		return (meta_multinode_set_addhosts(sp, multi_node, node_c,
3718 		    node_v, auto_take, ep));
3719 	else
3720 		return (meta_traditional_set_addhosts(sp, multi_node, node_c,
3721 		    node_v, auto_take, ep));
3722 }
3723 
3724 /*
3725  * Delete host(s) from the diskset provided in sp.
3726  * 	- destroy set if last host in set is removed.
3727  */
3728 int
meta_set_deletehosts(mdsetname_t * sp,int node_c,char ** node_v,int forceflg,md_error_t * ep)3729 meta_set_deletehosts(
3730 	mdsetname_t		*sp,
3731 	int			node_c,
3732 	char			**node_v,
3733 	int			forceflg,
3734 	md_error_t		*ep
3735 )
3736 {
3737 	md_set_desc		*sd;
3738 	md_drive_desc		*dd;
3739 	med_rec_t		medr;
3740 	med_rec_t		rb_medr;
3741 	int			i, j;
3742 	int			has_set;
3743 	int			numsides = 0;
3744 	int			oha = FALSE;
3745 	sigset_t		oldsigs;
3746 	mhd_mhiargs_t		mhiargs;
3747 	md_replicalist_t	*rlp = NULL;
3748 	md_setkey_t		*cl_sk;
3749 	ulong_t			max_genid = 0;
3750 	int			rval = 0;
3751 	int			rb_level = 0;
3752 	int			max_meds = 0;
3753 	md_error_t		xep = mdnullerror;
3754 	md_mnnode_desc		*nd;
3755 	md_mnnode_record	*nr;
3756 	int			delete_master = 0;
3757 	int			suspendall_flag = 0, suspendall_flag_rb = 0;
3758 	int			suspend1_flag = 0;
3759 	int			lock_flag = 0;
3760 	int			stale_flag = 0;
3761 	int			*node_id_list = NULL;
3762 	int			remote_sets_deleted = 0;
3763 
3764 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
3765 		return (-1);
3766 
3767 	/*
3768 	 * Verify that list of nodes being deleted contains no
3769 	 * duplicates.
3770 	 */
3771 	if (nodesuniq(sp, node_c, node_v, ep))
3772 		return (-1);
3773 
3774 	/* Make sure we own the set */
3775 	if (meta_check_ownership(sp, ep) != 0)
3776 		return (-1);
3777 
3778 	/*
3779 	 * The drive and node records are stored in the local mddbs of each
3780 	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
3781 	 * drive and node records from that node's local mddb and caches them
3782 	 * internally. Any process needing diskset information contacts its
3783 	 * local rpc.metad to get this information.  Since each node in the
3784 	 * diskset is independently reading the set information from its local
3785 	 * mddb, the set, drive and node records in the local mddbs must stay
3786 	 * in-sync, so that all nodes have a consistent view of the diskset.
3787 	 *
3788 	 * For a multinode diskset, explicitly verify that all nodes in the
3789 	 * diskset are ALIVE (i.e. are in the API membership list) if the
3790 	 * forceflag is FALSE.  (The case of forceflag being TRUE is handled
3791 	 * in the OHA check below.)
3792 	 *
3793 	 * If forceflag is FALSE and a node in the diskset is not in
3794 	 * the membership list, then fail this operation since all nodes must
3795 	 * be ALIVE in order to delete the node record from their local mddb.
3796 	 * If a panic of this node leaves the local mddbs set, node and drive
3797 	 * records out-of-sync, the reconfig cycle will fix the local mddbs
3798 	 * and force them back into synchronization.
3799 	 */
3800 	if ((forceflg == FALSE) && (MD_MNSET_DESC(sd))) {
3801 		nd = sd->sd_nodelist;
3802 		while (nd) {
3803 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3804 				return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
3805 				    sp->setno, nd->nd_nodename,
3806 				    NULL, sp->setname));
3807 			}
3808 			nd = nd->nd_next;
3809 		}
3810 	}
3811 
3812 
3813 	/*
3814 	 * Lock the set on current set members.
3815 	 * Set locking done much earlier for MN diskset than for traditional
3816 	 * diskset since lock_set and SUSPEND are used to protect against
3817 	 * other meta* commands running on the other nodes.
3818 	 */
3819 	if (MD_MNSET_DESC(sd)) {
3820 		/* Make sure we are blocking all signals */
3821 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
3822 			mdclrerror(&xep);
3823 
3824 		nd = sd->sd_nodelist;
3825 		while (nd) {
3826 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3827 				nd = nd->nd_next;
3828 				continue;
3829 			}
3830 
3831 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
3832 				rval = -1;
3833 				goto out2;
3834 			}
3835 			lock_flag = 1;
3836 			nd = nd->nd_next;
3837 		}
3838 		/*
3839 		 * Lock out other meta* commands by suspending
3840 		 * class 1 messages across the diskset.
3841 		 */
3842 		nd = sd->sd_nodelist;
3843 		while (nd) {
3844 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3845 				nd = nd->nd_next;
3846 				continue;
3847 			}
3848 			if (clnt_mdcommdctl(nd->nd_nodename,
3849 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
3850 			    MD_MSCF_NO_FLAGS, ep)) {
3851 				rval = -1;
3852 				goto out2;
3853 			}
3854 			suspend1_flag = 1;
3855 			nd = nd->nd_next;
3856 		}
3857 	}
3858 
3859 	for (i = 0; i < node_c; i++)
3860 		if (getnodeside(node_v[i], sd) == MD_SIDEWILD) {
3861 			(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
3862 			    node_v[i], NULL, sp->setname);
3863 			rval = -1;
3864 			goto out2;
3865 		}
3866 
3867 	/*
3868 	 * Count the number of nodes currently in the set.
3869 	 */
3870 	if (MD_MNSET_DESC(sd)) {
3871 		nd = sd->sd_nodelist;
3872 		while (nd) {
3873 			numsides++;
3874 			nd = nd->nd_next;
3875 		}
3876 	} else {
3877 		for (i = 0; i < MD_MAXSIDES; i++)
3878 			/* Count full slots */
3879 			if (sd->sd_nodes[i][0] != '\0')
3880 				numsides++;
3881 	}
3882 
3883 	/*
3884 	 * OHA mode == -f -h <hostname>
3885 	 * OHA is One Host Administration that occurs when the forceflag (-f)
3886 	 * is set and at least one host in the diskset isn't responding
3887 	 * to RPC requests.
3888 	 *
3889 	 * When in OHA mode, a node cannot delete itself from a diskset.
3890 	 * When in OHA mode, a node can delete a list of nodes from a diskset
3891 	 * even if some of the nodes in the diskset are unresponsive.
3892 	 *
3893 	 * For multinode diskset, only allow OHA mode when the nodes that
3894 	 * aren't responding in the diskset are not in the membership list
3895 	 * (i.e. nodes that aren't responding are not marked ALIVE).
3896 	 * Nodes that aren't in the membership list will be rejoining
3897 	 * the diskset through a reconfig cycle and the local mddb set
3898 	 * and node records can be reconciled during the reconfig cycle.
3899 	 *
3900 	 * If a node isn't responding, but is still in the membership list,
3901 	 * fail the request since the node may not be responding because
3902 	 * rpc.metad died and is restarting.  In this case, no reconfig
3903 	 * cycle will be started, so there's no way to recover if
3904 	 * the host delete operation was allowed.
3905 	 *
3906 	 * NOTE: if nodes that weren't in the membership when the OHA host
3907 	 * delete occurred are now the only nodes in membership list,
3908 	 * those nodes will see the old view of the diskset.  As soon as
3909 	 * a node re-enters the cluster that was present in the cluster
3910 	 * during the host deletion, the diskset will reflect the host
3911 	 * deletion on all nodes presently in the cluster.
3912 	 */
3913 	if (forceflg == TRUE) {
3914 		if (MD_MNSET_DESC(sd)) {
3915 			nd = sd->sd_nodelist;
3916 			while (nd) {
3917 				/*
3918 				 * If a node isn't ALIVE (in member list),
3919 				 * then allow a force-able delete in OHA mode.
3920 				 */
3921 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3922 					oha = TRUE;
3923 					break;
3924 				}
3925 				/*
3926 				 * Don't test for clnt_nullproc since already
3927 				 * tested the RPC connections by clnt_lock_set.
3928 				 */
3929 				nd = nd->nd_next;
3930 			}
3931 		} else {
3932 			for (i = 0; i < MD_MAXSIDES; i++) {
3933 				/* Skip empty slots */
3934 				if (sd->sd_nodes[i][0] == '\0')
3935 					continue;
3936 
3937 				if (clnt_nullproc(sd->sd_nodes[i], ep) == -1) {
3938 					/*
3939 					 * If we timeout to at least one
3940 					 * client, then we can allow OHA mode,
3941 					 * otherwise, we are in normal mode.
3942 					 */
3943 					if (mdanyrpcerror(ep)) {
3944 						mdclrerror(ep);
3945 						if (strinlst(sd->sd_nodes[i],
3946 						    node_c, node_v)) {
3947 							oha = TRUE;
3948 							break;
3949 						}
3950 					}
3951 				}
3952 			}
3953 		}
3954 	}
3955 
3956 	/*
3957 	 * Don't allow this for MN diskset since meta_set_destroy of 1 node
3958 	 * does NOT remove this node's node record from the other node's set
3959 	 * records in their local mddb.  This leaves a MN diskset in a very
3960 	 * messed up state.
3961 	 */
3962 	if (!(MD_MNSET_DESC(sd))) {
3963 		/* Destroy set */
3964 		if (forceflg == TRUE && node_c == 1 &&
3965 		    strcmp(mynode(), node_v[0]) == 0) {
3966 			/* Can return since !MN diskset so nothing to unlock */
3967 			return (meta_set_destroy(sp, TRUE, ep));
3968 		}
3969 	}
3970 
3971 
3972 	/*
3973 	 * In multinode diskset, can only delete self if this
3974 	 * is the last node in the set or if all nodes in
3975 	 * the set are being deleted.  The traditional diskset code
3976 	 * allows a node to delete itself (when there are other nodes
3977 	 * in the diskset) when using the force flag, but that code
3978 	 * path doesn't have the node remove itself from
3979 	 * the set node list on the other nodes.  Since this isn't
3980 	 * satisfactory for the multinode diskset, just don't
3981 	 * allow this operation.
3982 	 */
3983 	if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) &&
3984 	    strinlst(mynode(), node_c, node_v)) {
3985 		(void) mddserror(ep, MDE_DS_MNCANTDELSELF, sp->setno,
3986 		    mynode(), NULL, sp->setname);
3987 		rval = -1;
3988 		goto out2;
3989 	}
3990 
3991 	/*
3992 	 * In multinode diskset, don't allow deletion of master node unless
3993 	 * this is the only node left or unless all nodes are being
3994 	 * deleted since there is no way to switch
3995 	 * master ownership (unless via a cluster reconfig cycle).
3996 	 */
3997 	delete_master = strinlst(sd->sd_mn_master_nodenm, node_c, node_v);
3998 	if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) &&
3999 	    delete_master) {
4000 		(void) mddserror(ep, MDE_DS_CANTDELMASTER, sp->setno,
4001 		    sd->sd_mn_master_nodenm, NULL, sp->setname);
4002 		rval = -1;
4003 		goto out2;
4004 	}
4005 
4006 
4007 	/* Deleting self w/o forceflg */
4008 	if (forceflg == FALSE && numsides > 1 &&
4009 	    strinlst(mynode(), node_c, node_v)) {
4010 		(void) mddserror(ep, MDE_DS_CANTDELSELF, sp->setno,
4011 		    mynode(), NULL, sp->setname);
4012 		rval = -1;
4013 		goto out2;
4014 	}
4015 
4016 	/*
4017 	 * Setup the mediator record roll-back structure for a trad diskset.
4018 	 *
4019 	 * For a MN diskset, the deletion of a host in the diskset
4020 	 * does not cause an update of the mediator record.  If the
4021 	 * host deletion will cause the diskset to be removed (this is
4022 	 * the last host being removed or all hosts are being removed)
4023 	 * then the mediator record must have already been removed by the
4024 	 * user or this delete host operation will fail (a check for
4025 	 * this is done later in this routine).
4026 	 */
4027 	if (!(MD_MNSET_DESC(sd))) {
4028 		(void) memset(&rb_medr, '\0', sizeof (med_rec_t));
4029 		rb_medr.med_rec_mag = MED_REC_MAGIC;
4030 		rb_medr.med_rec_rev = MED_REC_REV;
4031 		rb_medr.med_rec_fl = 0;
4032 		rb_medr.med_rec_sn  = sp->setno;
4033 		(void) strcpy(rb_medr.med_rec_snm, sp->setname);
4034 		for (i = 0; i < MD_MAXSIDES; i++)
4035 			(void) strcpy(rb_medr.med_rec_nodes[i],
4036 			    sd->sd_nodes[i]);
4037 		rb_medr.med_rec_meds = sd->sd_med;  /* structure assigment */
4038 		(void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
4039 		rb_medr.med_rec_foff = 0;
4040 		crcgen(&rb_medr, &rb_medr.med_rec_cks,
4041 		    sizeof (med_rec_t), NULL);
4042 
4043 		/* Bring the mediator record up to date with the set record */
4044 		medr = rb_medr;			/* structure assignment */
4045 
4046 		if ((max_meds = get_max_meds(ep)) == 0) {
4047 			rval = -1;
4048 			goto out2;
4049 		}
4050 	}
4051 
4052 	/*
4053 	 * For traditional diskset:
4054 	 * Check to see if all the hosts we are trying to delete the set from
4055 	 * have a set "setname" that is the same as ours, i.e. - same name,
4056 	 * same time stamp, same genid.  We only do this if forceflg is not
4057 	 * specified or we are in OHA mode.
4058 	 */
4059 	if (!(MD_MNSET_DESC(sd)) && (forceflg == FALSE || oha == TRUE)) {
4060 		int	fix_node_v = FALSE;
4061 		int	j;
4062 
4063 		for (i = 0; i < node_c; i++) {
4064 			/* We skip this side */
4065 			if (strcmp(mynode(), node_v[i]) == 0)
4066 				continue;
4067 
4068 			has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep);
4069 
4070 			if (has_set < 0) {
4071 				char	 *anode[1];
4072 
4073 				/*
4074 				 * Can't talk to the host only allowed in OHA
4075 				 * mode.
4076 				 */
4077 				if (oha == TRUE && mdanyrpcerror(ep)) {
4078 					mdclrerror(ep);
4079 					continue;
4080 				}
4081 
4082 				/*
4083 				 * We got an error we do not, or are not,
4084 				 * prepared to handle.
4085 				 */
4086 				if (! mdiserror(ep, MDE_NO_SET) &&
4087 				    ! mdismddberror(ep, MDE_DB_NODB)) {
4088 					rval = -1;
4089 					goto out2;
4090 				}
4091 				mdclrerror(ep);
4092 
4093 				/*
4094 				 * If we got here: both hosts are up; a host in
4095 				 * our set record does not have the set. So we
4096 				 * delete the host from our set and invalidate
4097 				 * the node.
4098 				 */
4099 				anode[0] = Strdup(node_v[i]);
4100 
4101 				rval = del_host_noset(sp, anode, ep);
4102 
4103 				/*
4104 				 * If we delete a host, make sure the mediator
4105 				 * hosts are made aware of this.
4106 				 */
4107 				for (j = 0; j < MD_MAXSIDES; j++) {
4108 					if (strcmp(medr.med_rec_nodes[j],
4109 					    node_v[i]) != 0)
4110 						continue;
4111 					(void) memset(&medr.med_rec_nodes[j],
4112 					    '\0', sizeof (md_node_nm_t));
4113 				}
4114 				crcgen(&medr, &medr.med_rec_cks,
4115 				    sizeof (med_rec_t), NULL);
4116 
4117 				rb_medr = medr;		/* struct assignment */
4118 
4119 				Free(anode[0]);
4120 
4121 				if (rval == -1)
4122 					goto out2;
4123 
4124 				node_v[i][0] = '\0';
4125 				fix_node_v = TRUE;
4126 				continue;
4127 			}
4128 
4129 			/*
4130 			 * If we can talk to the host, and they do not have the
4131 			 * exact set, then we disallow the operation.
4132 			 */
4133 			if (has_set == FALSE) {
4134 				(void) mddserror(ep, MDE_DS_NODENOSET,
4135 				    sp->setno, node_v[i], NULL, sp->setname);
4136 				rval = -1;
4137 				goto out2;
4138 			}
4139 		}
4140 
4141 		/*
4142 		 * Here we prune the node_v's that were invalidated above.
4143 		 */
4144 		if (fix_node_v == TRUE) {
4145 			i = 0;
4146 			while (i < node_c) {
4147 				if (node_v[i][0] == '\0') {
4148 					for (j = i; (j + 1) < node_c; j++)
4149 						node_v[j] = node_v[j + 1];
4150 					node_c--;
4151 				}
4152 				i++;
4153 			}
4154 			/*
4155 			 * If we are left with no nodes, then we have
4156 			 * completed the operation.
4157 			 */
4158 			if (node_c == 0) {
4159 				/*
4160 				 * Inform the mediator hosts of the new node
4161 				 * list
4162 				 */
4163 				for (i = 0; i < max_meds; i++) {
4164 					if (sd->sd_med.n_lst[i].a_cnt == 0)
4165 						continue;
4166 
4167 					if (clnt_med_upd_rec(
4168 					    &sd->sd_med.n_lst[i], sp, &medr,
4169 					    ep))
4170 						mdclrerror(ep);
4171 				}
4172 				rval = 0;
4173 				goto out2;
4174 			}
4175 		}
4176 	}
4177 
4178 	/*
4179 	 * For multinode diskset:
4180 	 * If forceflag is FALSE then check to see if all the hosts we
4181 	 * are trying to delete the set from have a set "setname" that
4182 	 * is the same as ours, i.e. - same name, same time stamp, same genid.
4183 	 * If forceflag is TRUE, then we don't care if the hosts being
4184 	 * deleted have the same set information or not since user is forcing
4185 	 * those hosts to be deleted.
4186 	 */
4187 	if ((MD_MNSET_DESC(sd)) && (forceflg == FALSE)) {
4188 		for (i = 0; i < node_c; i++) {
4189 			/* We skip this node since comparing against it */
4190 			if (strcmp(mynode(), node_v[i]) == 0)
4191 				continue;
4192 
4193 			has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep);
4194 
4195 			if (has_set < 0) {
4196 				rval = -1;
4197 				goto out2;
4198 			}
4199 
4200 			/*
4201 			 * If we can talk to the host, and they do not have the
4202 			 * exact set, then we disallow the operation.
4203 			 */
4204 			if (has_set == FALSE) {
4205 				(void) mddserror(ep, MDE_DS_NODENOSET,
4206 				    sp->setno, node_v[i], NULL, sp->setname);
4207 				rval = -1;
4208 				goto out2;
4209 			}
4210 		}
4211 	}
4212 
4213 	/*
4214 	 * For traditional diskset:
4215 	 * Can't allow user to delete their node (without deleting all nodes)
4216 	 * out of a set in OHA mode, would leave a real mess.
4217 	 * This action was already failed above for a MN diskset.
4218 	 */
4219 	if (!(MD_MNSET_DESC(sd)) && (oha == TRUE) &&
4220 	    strinlst(mynode(), node_c, node_v)) {
4221 		/* Can directly return since !MN diskset; nothing to unlock */
4222 		return (mddserror(ep, MDE_DS_OHACANTDELSELF, sp->setno,
4223 		    mynode(), NULL, sp->setname));
4224 	}
4225 
4226 
4227 	/* Get the drive descriptors for this set */
4228 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4229 	    ep)) == NULL) {
4230 		if (! mdisok(ep)) {
4231 			rval = -1;
4232 			goto out2;
4233 		}
4234 	}
4235 
4236 	/*
4237 	 * We have been asked to delete all the hosts in the set, i.e. - delete
4238 	 * the whole set.
4239 	 */
4240 	if (node_c == numsides) {
4241 		/*
4242 		 * This is only a valid operation if all drives have been
4243 		 * removed first.
4244 		 */
4245 
4246 		if (dd != NULL) {
4247 			(void) mddserror(ep, MDE_DS_HASDRIVES, sp->setno,
4248 			    NULL, NULL, sp->setname);
4249 			rval = -1;
4250 			goto out2;
4251 		}
4252 
4253 		/*
4254 		 * If a mediator is currently associated with this set,
4255 		 * fail the deletion of the last host(s).
4256 		 */
4257 		if (sd->sd_med.n_cnt != 0) {
4258 			(void) mddserror(ep, MDE_DS_HASMED, sp->setno,
4259 			    NULL, NULL, sp->setname);
4260 			rval = -1;
4261 			goto out2;
4262 		}
4263 
4264 		if (! mdisok(ep)) {
4265 			rval = -1;
4266 			goto out2;
4267 		}
4268 
4269 		rval = del_set_nodrives(sp, node_c, node_v, oha, ep);
4270 		remote_sets_deleted = 1;
4271 		goto out2;
4272 	}
4273 
4274 	/*
4275 	 * Get timeout values in case we need to roll back
4276 	 */
4277 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
4278 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) != 0) {
4279 		rval = -1;
4280 		goto out2;
4281 	}
4282 
4283 	if (dd != NULL) {
4284 		/*
4285 		 * We need this around for re-adding DB side names later.
4286 		 */
4287 		if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
4288 			rval = -1;
4289 			goto out2;
4290 		}
4291 
4292 		/*
4293 		 * Alloc nodeid list if drives are present in diskset.
4294 		 * nodeid list is used to reset mirror owners if the
4295 		 * owner is a deleted node.
4296 		 */
4297 		if (MD_MNSET_DESC(sd)) {
4298 			node_id_list = Zalloc(sizeof (int) * node_c);
4299 		}
4300 	}
4301 
4302 	/* Lock the set on current set members */
4303 	if (!(MD_MNSET_DESC(sd))) {
4304 		md_rb_sig_handling_on();
4305 		for (i = 0; i < MD_MAXSIDES; i++) {
4306 			/* Skip empty slots */
4307 			if (sd->sd_nodes[i][0] == '\0')
4308 				continue;
4309 
4310 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
4311 				if (oha == TRUE && mdanyrpcerror(ep)) {
4312 					mdclrerror(ep);
4313 					continue;
4314 				}
4315 				rval = -1;
4316 				goto out2;
4317 			}
4318 			lock_flag = 1;
4319 		}
4320 	}
4321 
4322 	RB_TEST(1, "deletehosts", ep)
4323 
4324 	RB_PREEMPT;
4325 	rb_level = 1;	/* level 1 */
4326 
4327 	RB_TEST(2, "deletehosts", ep)
4328 
4329 	if (MD_MNSET_DESC(sd)) {
4330 		md_mnnode_desc		*saved_nd_next;
4331 		mddb_config_t		c;
4332 
4333 		if (dd != NULL) {
4334 			/*
4335 			 * Notify rpc.mdcommd on all nodes of a nodelist change.
4336 			 * Start by suspending rpc.mdcommd (which drains it of
4337 			 * all messages), then change the nodelist followed
4338 			 * by a reinit and resume.
4339 			 */
4340 			nd = sd->sd_nodelist;
4341 			while (nd) {
4342 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4343 					nd = nd->nd_next;
4344 					continue;
4345 				}
4346 				if (clnt_mdcommdctl(nd->nd_nodename,
4347 				    COMMDCTL_SUSPEND, sp,
4348 				    MD_MSG_CLASS0,
4349 				    MD_MSCF_NO_FLAGS, ep)) {
4350 					rval = -1;
4351 					goto out2;
4352 				}
4353 				suspendall_flag = 1;
4354 				nd = nd->nd_next;
4355 			}
4356 			/*
4357 			 * Is current set STALE?
4358 			 * Need to know this if delete host fails and node
4359 			 * is re-joined to diskset.
4360 			 */
4361 			(void) memset(&c, 0, sizeof (c));
4362 			c.c_id = 0;
4363 			c.c_setno = sp->setno;
4364 			if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
4365 				(void) mdstealerror(ep, &c.c_mde);
4366 				rval = -1;
4367 				goto out2;
4368 			}
4369 			if (c.c_flags & MDDB_C_STALE) {
4370 				stale_flag = MNSET_IS_STALE;
4371 			}
4372 		}
4373 
4374 		/*
4375 		 * For each node being deleted, set DEL flag and
4376 		 * reset OK flag on that node first.
4377 		 * Until a node has turned off its own
4378 		 * rpc.metad's NODE_OK flag, that node could be
4379 		 * considered for master during a reconfig.
4380 		 */
4381 		for (i = 0; i < node_c; i++) {
4382 			/*
4383 			 * During OHA mode, don't issue RPCs to
4384 			 * non-alive nodes since there is no reason to
4385 			 * wait for RPC timeouts.
4386 			 */
4387 			nd = sd->sd_nodelist;
4388 			while (nd) {
4389 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
4390 					break;
4391 				nd = nd->nd_next;
4392 			}
4393 			/* Something wrong, handle this in next loop */
4394 			if (nd == NULL)
4395 				continue;
4396 
4397 			/* If node_id_list is alloc'd, fill in for later use */
4398 			if (node_id_list)
4399 				node_id_list[i] = nd->nd_nodeid;
4400 
4401 			/* All nodes are guaranteed to be ALIVE unless OHA */
4402 			if ((oha == TRUE) &&
4403 			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4404 				continue;
4405 			}
4406 
4407 			/* Only changing my local cache of node list */
4408 			saved_nd_next = nd->nd_next;
4409 			nd->nd_next = NULL;
4410 
4411 			/* Set flags for del host to DEL on that host */
4412 			if (clnt_upd_nr_flags(node_v[i], sp,
4413 			    nd, MD_NR_DEL, NULL, ep)) {
4414 				nd->nd_next = saved_nd_next;
4415 				goto rollback;
4416 			}
4417 			nd->nd_next = saved_nd_next;
4418 		}
4419 		for (i = 0; i < node_c; i++) {
4420 			/*
4421 			 * Turn off owner flag in nodes to be deleted
4422 			 * if this node has been joined.
4423 			 * Also, turn off NODE_OK and turn on NODE_DEL
4424 			 * for nodes to be deleted.
4425 			 * These flags are used to set the node
4426 			 * record flags in all nodes in the set.
4427 			 * Only withdraw nodes that are joined.
4428 			 */
4429 			nd = sd->sd_nodelist;
4430 			while (nd) {
4431 				/*
4432 				 * Don't communicate with non-ALIVE node if
4433 				 * in OHA - but set flags in master list so
4434 				 * alive nodes are updated correctly.
4435 				 */
4436 				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
4437 					if ((oha == TRUE) && (!(nd->nd_flags &
4438 					    MD_MN_NODE_ALIVE))) {
4439 						nd->nd_flags |= MD_MN_NODE_DEL;
4440 						nd->nd_flags &= ~MD_MN_NODE_OK;
4441 						nd = nd->nd_next;
4442 						continue;
4443 					}
4444 					if (nd->nd_flags & MD_MN_NODE_OWN) {
4445 						/*
4446 						 * Going to set locally cached
4447 						 * node flags to rollback join
4448 						 * so in case of error, the
4449 						 * rollback code knows which
4450 						 * nodes to re-join.  rpc.metad
4451 						 * ignores the RB_JOIN flag.
4452 						 */
4453 						nd->nd_flags |=
4454 						    MD_MN_NODE_RB_JOIN;
4455 						nd->nd_flags &= ~MD_MN_NODE_OWN;
4456 
4457 						/*
4458 						 * Be careful in ordering of
4459 						 * following steps so that
4460 						 * recovery from a panic
4461 						 * between the steps is viable.
4462 						 * Only reset master info in
4463 						 * rpc.metad - don't reset
4464 						 * local cached info which will
4465 						 * be used to set master info
4466 						 * back if failure (rollback).
4467 						 */
4468 						if (clnt_withdrawset(
4469 						    nd->nd_nodename, sp, ep))
4470 							goto rollback;
4471 
4472 						/*
4473 						 * Reset master on deleted node
4474 						 */
4475 						if (clnt_mnsetmaster(node_v[i],
4476 						    sp, "", MD_MN_INVALID_NID,
4477 						    ep))
4478 							goto rollback;
4479 					}
4480 
4481 					nd->nd_flags |= MD_MN_NODE_DEL;
4482 					nd->nd_flags &= ~MD_MN_NODE_OK;
4483 				}
4484 				nd = nd->nd_next;
4485 			}
4486 		}
4487 
4488 		/*
4489 		 * Now, reset owner and set delete flags for the
4490 		 * deleted nodes on all nodes.
4491 		 */
4492 		nd = sd->sd_nodelist;
4493 		while (nd) {
4494 			/* Skip non-ALIVE node if in OHA */
4495 			if ((oha == TRUE) &&
4496 			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4497 				nd = nd->nd_next;
4498 				continue;
4499 			}
4500 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4501 			    sd->sd_nodelist, MD_NR_SET, NULL, ep)) {
4502 				goto rollback;
4503 			}
4504 			nd = nd->nd_next;
4505 		}
4506 		/*
4507 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
4508 		 * Send reinit command to mdcommd which forces it to get
4509 		 * fresh set description.
4510 		 */
4511 		if (suspendall_flag) {
4512 			/* Send reinit */
4513 			nd = sd->sd_nodelist;
4514 			while (nd) {
4515 				if ((oha == TRUE) &&
4516 				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4517 					nd = nd->nd_next;
4518 					continue;
4519 				}
4520 				/* Class is ignored for REINIT */
4521 				if (clnt_mdcommdctl(nd->nd_nodename,
4522 				    COMMDCTL_REINIT, sp, NULL,
4523 				    MD_MSCF_NO_FLAGS, ep)) {
4524 					mde_perror(ep, dgettext(TEXT_DOMAIN,
4525 					    "Unable to reinit rpc.mdcommd.\n"));
4526 					goto rollback;
4527 				}
4528 				nd = nd->nd_next;
4529 			}
4530 			/* Send resume */
4531 			nd = sd->sd_nodelist;
4532 			while (nd) {
4533 				if ((oha == TRUE) &&
4534 				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4535 					nd = nd->nd_next;
4536 					continue;
4537 				}
4538 				if (clnt_mdcommdctl(nd->nd_nodename,
4539 				    COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
4540 				    MD_MSCF_DONT_RESUME_CLASS1, ep)) {
4541 					mde_perror(ep, dgettext(TEXT_DOMAIN,
4542 					    "Unable to resume rpc.mdcommd.\n"));
4543 					goto rollback;
4544 				}
4545 				nd = nd->nd_next;
4546 			}
4547 			meta_ping_mnset(sp->setno);
4548 		}
4549 	}
4550 
4551 
4552 	/*
4553 	 * Mark the set record MD_SR_DEL on the hosts we are deleting
4554 	 * If a MN diskset and OHA mode, don't issue RPC to nodes that
4555 	 * are not ALIVE.
4556 	 * If a MN diskset and not in OHA mode, then all nodes must respond
4557 	 * to RPC (be alive) or this routine will return failure.
4558 	 * If a traditional diskset, all RPC failures if in OHA mode.
4559 	 */
4560 	for (i = 0; i < node_c; i++) {
4561 
4562 		RB_TEST(3, "deletehosts", ep)
4563 
4564 		if ((MD_MNSET_DESC(sd)) && (oha == TRUE)) {
4565 			/*
4566 			 * During OHA mode, don't issue RPCs to
4567 			 * non-alive nodes since there is no reason to
4568 			 * wait for RPC timeouts.
4569 			 */
4570 			nd = sd->sd_nodelist;
4571 			while (nd) {
4572 				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
4573 					break;
4574 				}
4575 				nd = nd->nd_next;
4576 			}
4577 			if (nd == NULL) {
4578 				(void) mddserror(ep, MDE_DS_NODENOTINSET,
4579 				    sp->setno, node_v[i], NULL, sp->setname);
4580 				goto rollback;
4581 			} else if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4582 				/* Skip non-ALIVE node if in OHA mode */
4583 				continue;
4584 			} else {
4585 				if (clnt_upd_sr_flags(node_v[i], sp,
4586 				    MD_SR_DEL, ep)) {
4587 					goto rollback;
4588 				}
4589 			}
4590 		} else if ((MD_MNSET_DESC(sd)) && (oha == FALSE)) {
4591 			/*
4592 			 * All nodes should be alive in non-oha mode.
4593 			 */
4594 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
4595 				goto rollback;
4596 			}
4597 		} else {
4598 			/*
4599 			 * For traditional diskset, issue the RPC and
4600 			 * ignore RPC failure if in OHA mode.
4601 			 */
4602 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
4603 				if (oha == TRUE && mdanyrpcerror(ep)) {
4604 					mdclrerror(ep);
4605 					continue;
4606 				}
4607 				goto rollback;
4608 			}
4609 		}
4610 
4611 		RB_TEST(4, "deletehosts", ep)
4612 	}
4613 
4614 	RB_TEST(5, "deletehosts", ep)
4615 
4616 	RB_PREEMPT;
4617 	rb_level = 2;	/* level 2 */
4618 
4619 	RB_TEST(6, "deletehosts", ep)
4620 
4621 	/* Delete the set on the hosts we are deleting */
4622 	if (del_set_on_hosts(sp, sd, dd, node_c, node_v, oha, ep)) {
4623 		if (node_id_list)
4624 			Free(node_id_list);
4625 		/*
4626 		 * Failure during del_set_on_hosts would have recreated
4627 		 * the diskset on the remote hosts, but for multi-owner
4628 		 * disksets need to set node flags properly and REINIT and
4629 		 * RESUME rpc.mdcommd, so just let the rollback code
4630 		 * do this.
4631 		 */
4632 		if (MD_MNSET_DESC(sd))
4633 			goto rollback;
4634 		return (-1);
4635 	}
4636 	remote_sets_deleted = 1;
4637 
4638 	RB_TEST(19, "deletehosts", ep)
4639 
4640 	RB_PREEMPT;
4641 	rb_level = 3;	/* level 3 */
4642 
4643 	RB_TEST(20, "deletehosts", ep)
4644 
4645 	/* Delete the host from sets on hosts not being deleted */
4646 	if (MD_MNSET_DESC(sd)) {
4647 		nd = sd->sd_nodelist;
4648 		/* All nodes are guaranteed to be ALIVE unless in oha mode */
4649 		while (nd) {
4650 			/*
4651 			 * During OHA mode, don't issue RPCs to
4652 			 * non-alive nodes since there is no reason to
4653 			 * wait for RPC timeouts.
4654 			 */
4655 			if ((oha == TRUE) &&
4656 			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4657 				nd = nd->nd_next;
4658 				continue;
4659 			}
4660 
4661 			/* Skip nodes being deleted */
4662 			if (strinlst(nd->nd_nodename, node_c, node_v)) {
4663 				nd = nd->nd_next;
4664 				continue;
4665 			}
4666 			if (clnt_delhosts(nd->nd_nodename, sp, node_c, node_v,
4667 			    ep) == -1) {
4668 				goto rollback;
4669 			}
4670 
4671 			RB_TEST(21, "deletehosts", ep)
4672 			nd = nd->nd_next;
4673 		}
4674 	} else {
4675 		for (i = 0; i < MD_MAXSIDES; i++) {
4676 			/* Skip empty slots */
4677 			if (sd->sd_nodes[i][0] == '\0')
4678 				continue;
4679 
4680 			/* Skip nodes being deleted */
4681 			if (strinlst(sd->sd_nodes[i], node_c, node_v))
4682 				continue;
4683 
4684 			if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v,
4685 			    ep) == -1) {
4686 				if (oha == TRUE && mdanyrpcerror(ep)) {
4687 					mdclrerror(ep);
4688 					continue;
4689 				}
4690 				goto rollback;
4691 			}
4692 
4693 			RB_TEST(21, "deletehosts", ep)
4694 		}
4695 	}
4696 
4697 	/* We have drives */
4698 	if (dd != NULL) {
4699 		RB_TEST(22, "deletehosts", ep)
4700 
4701 		RB_PREEMPT;
4702 		rb_level = 4;	/* level 4 */
4703 
4704 		RB_TEST(23, "deletehosts", ep)
4705 
4706 		/*
4707 		 * Delete the old sidename for each drive on all the hosts.
4708 		 * If a multi-node diskset, each host only stores
4709 		 * the side information for itself.  So, a multi-node
4710 		 * diskset doesn't delete the old sidename for
4711 		 * an old host.
4712 		 *
4713 		 * If a MN diskset, reset owners of mirrors that are
4714 		 * owned by the deleted nodes.
4715 		 */
4716 		if (!(MD_MNSET_DESC(sd))) {
4717 			for (i = 0; i < MD_MAXSIDES; i++) {
4718 				/* Skip empty slots */
4719 				if (sd->sd_nodes[i][0] == '\0')
4720 					continue;
4721 
4722 				/* Skip nodes being deleted */
4723 				if (strinlst(sd->sd_nodes[i], node_c, node_v))
4724 					continue;
4725 
4726 				if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp,
4727 				    ep)) {
4728 					if (oha == TRUE && mdanyrpcerror(ep)) {
4729 						mdclrerror(ep);
4730 						continue;
4731 					}
4732 					metaflushsetname(sp);
4733 					goto rollback;
4734 				}
4735 
4736 				RB_TEST(24, "deletehosts", ep)
4737 			}
4738 		} else {
4739 			nd = sd->sd_nodelist;
4740 			/* All nodes guaranteed ALIVE unless in oha mode */
4741 			while (nd) {
4742 				/*
4743 				 * If mirror owner was set to a deleted node,
4744 				 * then each existing node resets mirror owner
4745 				 * to NULL.
4746 				 *
4747 				 * During OHA mode, don't issue RPCs to
4748 				 * non-alive nodes since there is no reason to
4749 				 * wait for RPC timeouts.
4750 				 */
4751 				if ((oha == TRUE) &&
4752 				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4753 					nd = nd->nd_next;
4754 					continue;
4755 				}
4756 
4757 				/* Skip nodes being deleted */
4758 				if (strinlst(nd->nd_nodename, node_c, node_v)) {
4759 					nd = nd->nd_next;
4760 					continue;
4761 				}
4762 
4763 				/*
4764 				 * If mirror owner is a deleted node, reset
4765 				 * mirror owners to NULL.  If an error occurs,
4766 				 * print a warning and continue.  Don't fail
4767 				 * metaset because of mirror owner reset
4768 				 * problem since next node to grab mirror
4769 				 * will resolve this issue.  Before next node
4770 				 * grabs mirrors, metaset will show the deleted
4771 				 * node as owner which is why an attempt to
4772 				 * reset the mirror owner is made.
4773 				 */
4774 				if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
4775 				    node_c, &node_id_list[0], &xep) == -1) {
4776 					mde_perror(&xep, dgettext(TEXT_DOMAIN,
4777 					    "Unable to reset mirror owner on"
4778 					    " node %s\n"), nd->nd_nodename);
4779 					mdclrerror(&xep);
4780 				}
4781 
4782 				RB_TEST(21, "deletehosts", ep)
4783 				nd = nd->nd_next;
4784 			}
4785 		}
4786 	}
4787 
4788 	RB_TEST(25, "deletehosts", ep)
4789 
4790 	RB_PREEMPT;
4791 	rb_level = 4;	/* level 4 */
4792 
4793 	RB_TEST(26, "deletehosts", ep)
4794 
4795 	/*
4796 	 * Bring the mediator record up to date with the set record for
4797 	 * traditional diskset.
4798 	 */
4799 	if (!(MD_MNSET_DESC(sd))) {
4800 		medr = rb_medr;			/* structure assignment */
4801 		for (i = 0; i < MD_MAXSIDES; i++) {
4802 			if (strinlst(sd->sd_nodes[i], node_c, node_v))
4803 				(void) memset(&medr.med_rec_nodes[i],
4804 				    '\0', sizeof (md_node_nm_t));
4805 			else
4806 				(void) strcpy(medr.med_rec_nodes[i],
4807 				    sd->sd_nodes[i]);
4808 		}
4809 		crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
4810 
4811 		/* Inform the mediator hosts of the new node list */
4812 		for (i = 0; i < max_meds; i++) {
4813 			if (sd->sd_med.n_lst[i].a_cnt == 0)
4814 				continue;
4815 
4816 			if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
4817 			    &medr, ep)) {
4818 				if (oha == TRUE && mdanyrpcerror(ep)) {
4819 					mdclrerror(ep);
4820 					continue;
4821 				}
4822 				goto rollback;
4823 			}
4824 		}
4825 	}
4826 
4827 	RB_TEST(27, "deletehosts", ep)
4828 
4829 	/*
4830 	 * For traditional diskset:
4831 	 * We are deleting ourselves out of the set and we have drives to
4832 	 * consider; so we need to halt the set, release the drives and
4833 	 * reset the timeout.  **** THIS IS A ONE WAY TICKET, NO ROLL BACK
4834 	 * IS POSSIBLE AS SOON AS THE HALT SET COMPLETES, SO THIS IS DONE
4835 	 * WITH ALL SIGNALS BLOCKED AND LAST ****
4836 	 *
4837 	 * This situation cannot occur in a MN diskset since a node can't
4838 	 * delete itself unless all nodes are being deleted and a diskset
4839 	 * cannot contain any drives if all nodes are being deleted.
4840 	 * So, don't even test for this if a MN diskset.
4841 	 */
4842 	if (!(MD_MNSET_DESC(sd)) && (dd != NULL) &&
4843 	    strinlst(mynode(), node_c, node_v)) {
4844 		/* Make sure we are blocking all signals */
4845 		if (procsigs(TRUE, &oldsigs, ep) < 0) {
4846 			rval = -1;
4847 			goto out1;
4848 		}
4849 
4850 		if (halt_set(sp, ep)) {
4851 			rval = -1;
4852 			goto out1;
4853 		}
4854 
4855 		if (rel_own_bydd(sp, dd, FALSE, ep))
4856 			rval = -1;
4857 
4858 out1:
4859 		/* release signals back to what they were on entry */
4860 		if (procsigs(FALSE, &oldsigs, &xep) < 0) {
4861 			if (rval == 0)
4862 				(void) mdstealerror(ep, &xep);
4863 			rval = -1;
4864 		}
4865 	}
4866 
4867 out2:
4868 	/*
4869 	 * Unlock diskset by resuming messages across the diskset.
4870 	 * Just resume all classes so that resume is the same whether
4871 	 * just one class was locked or all classes were locked.
4872 	 */
4873 	if ((suspend1_flag) || (suspendall_flag)) {
4874 		/* Send resume */
4875 		nd = sd->sd_nodelist;
4876 		while (nd) {
4877 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4878 				nd = nd->nd_next;
4879 				continue;
4880 			}
4881 			/*
4882 			 * Skip nodes being deleted if remote set
4883 			 * was deleted since rpc.mdcommd may no longer
4884 			 * be running on remote node.
4885 			 */
4886 			if ((remote_sets_deleted == 1) &&
4887 			    (strinlst(nd->nd_nodename, node_c, node_v))) {
4888 				nd = nd->nd_next;
4889 				continue;
4890 			}
4891 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
4892 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
4893 				if (rval == 0)
4894 					(void) mdstealerror(ep, &xep);
4895 				rval = -1;
4896 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4897 				    "Unable to resume rpc.mdcommd.\n"));
4898 			}
4899 			nd = nd->nd_next;
4900 		}
4901 		meta_ping_mnset(sp->setno);
4902 	}
4903 
4904 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4905 	if (lock_flag) {
4906 		if (MD_MNSET_DESC(sd)) {
4907 			nd = sd->sd_nodelist;
4908 			while (nd) {
4909 				/*
4910 				 * During OHA mode, don't issue RPCs to
4911 				 * non-alive nodes since there is no reason to
4912 				 * wait for RPC timeouts.
4913 				 */
4914 				if ((oha == TRUE) &&
4915 				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4916 					nd = nd->nd_next;
4917 					continue;
4918 				}
4919 				if (clnt_unlock_set(nd->nd_nodename,
4920 				    cl_sk, &xep)) {
4921 					if (rval == 0)
4922 						(void) mdstealerror(ep, &xep);
4923 					rval = -1;
4924 				}
4925 				nd = nd->nd_next;
4926 			}
4927 		} else {
4928 			for (i = 0; i < MD_MAXSIDES; i++) {
4929 				/* Skip empty slots */
4930 				if (sd->sd_nodes[i][0] == '\0')
4931 					continue;
4932 
4933 				if (clnt_unlock_set(sd->sd_nodes[i],
4934 				    cl_sk, &xep)) {
4935 					if (oha == TRUE &&
4936 					    mdanyrpcerror(&xep)) {
4937 						mdclrerror(&xep);
4938 						continue;
4939 					}
4940 					if (rval == 0)
4941 						(void) mdstealerror(ep, &xep);
4942 					rval = -1;
4943 				}
4944 			}
4945 		}
4946 	}
4947 	cl_set_setkey(NULL);
4948 
4949 out3:
4950 	metafreereplicalist(rlp);
4951 	if (node_id_list)
4952 		Free(node_id_list);
4953 
4954 	metaflushsetname(sp);
4955 
4956 	if (MD_MNSET_DESC(sd)) {
4957 		/* release signals back to what they were on entry */
4958 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
4959 			mdclrerror(&xep);
4960 	} else {
4961 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
4962 	}
4963 
4964 
4965 	return (rval);
4966 
4967 rollback:
4968 	/* all signals already blocked for MN diskset */
4969 	if (!(MD_MNSET_DESC(sd))) {
4970 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
4971 			mdclrerror(&xep);
4972 	}
4973 
4974 	rval = -1;
4975 
4976 	max_genid = sd->sd_genid;
4977 
4978 
4979 	/*
4980 	 * Send reinit command to rpc.mdcommd which forces it to get
4981 	 * fresh set description and resume all classes but class 0.
4982 	 * Don't send any commands to rpc.mdcommd if set on that node
4983 	 * has been removed.
4984 	 */
4985 	if (suspendall_flag) {
4986 		/* Send reinit */
4987 		nd = sd->sd_nodelist;
4988 		while (nd) {
4989 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4990 				nd = nd->nd_next;
4991 				continue;
4992 			}
4993 			/*
4994 			 * If the remote set was deleted, rpc.mdcommd
4995 			 * may no longer be running so send nothing to it.
4996 			 */
4997 			if ((remote_sets_deleted == 1) &&
4998 			    (strinlst(nd->nd_nodename, node_c, node_v))) {
4999 				nd = nd->nd_next;
5000 				continue;
5001 			}
5002 			/* Class is ignored for REINIT */
5003 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
5004 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
5005 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5006 				    "Unable to reinit rpc.mdcommd.\n"));
5007 				mdclrerror(&xep);
5008 			}
5009 			nd = nd->nd_next;
5010 		}
5011 		/* Send resume */
5012 		nd = sd->sd_nodelist;
5013 		while (nd) {
5014 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5015 				nd = nd->nd_next;
5016 				continue;
5017 			}
5018 			/*
5019 			 * If the remote set was deleted, rpc.mdcommd
5020 			 * may no longer be running so send nothing to it.
5021 			 */
5022 			if ((remote_sets_deleted == 1) &&
5023 			    (strinlst(nd->nd_nodename, node_c, node_v))) {
5024 				nd = nd->nd_next;
5025 				continue;
5026 			}
5027 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
5028 			    sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1,
5029 			    &xep)) {
5030 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5031 				    "Unable to resume rpc.mdcommd.\n"));
5032 				mdclrerror(&xep);
5033 			}
5034 			nd = nd->nd_next;
5035 		}
5036 		meta_ping_mnset(sp->setno);
5037 	}
5038 
5039 	/* level 2 */
5040 	if (rb_level > 1) {
5041 		md_set_record		*sr;
5042 		md_replicalist_t	*rl;
5043 
5044 		recreate_set(sp, sd);
5045 
5046 		/*
5047 		 * Lock out other meta* commands on nodes with the newly
5048 		 * re-created sets by suspending class 1 messages
5049 		 * across the diskset.
5050 		 */
5051 		nd = sd->sd_nodelist;
5052 		while (nd) {
5053 			/* Skip nodes not being deleted */
5054 			if (!(strinlst(nd->nd_nodename, node_c, node_v))) {
5055 				nd = nd->nd_next;
5056 				continue;
5057 			}
5058 			/* Suspend commd on nodes with re-created sets */
5059 			if (clnt_mdcommdctl(nd->nd_nodename,
5060 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
5061 			    MD_MSCF_NO_FLAGS, &xep)) {
5062 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5063 				    "Unable to suspend rpc.mdcommd.\n"));
5064 				mdclrerror(&xep);
5065 			}
5066 			nd = nd->nd_next;
5067 		}
5068 
5069 		max_genid++;
5070 
5071 		/*
5072 		 * See if we have to re-add the drives specified.
5073 		 */
5074 		for (i = 0; i < node_c; i++) {
5075 			if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
5076 				/*
5077 				 * During OHA mode, don't issue RPCs to
5078 				 * non-alive nodes since there is no reason to
5079 				 * wait for RPC timeouts.
5080 				 */
5081 				nd = sd->sd_nodelist;
5082 				while (nd) {
5083 					if (strcmp(nd->nd_nodename, node_v[i])
5084 					    == 0) {
5085 						break;
5086 					}
5087 					nd = nd->nd_next;
5088 				}
5089 				if (nd == 0)
5090 					continue;
5091 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
5092 					continue;
5093 			}
5094 
5095 			/* Don't care if set record is MN or not */
5096 			if (clnt_getset(node_v[i], sp->setname, MD_SET_BAD, &sr,
5097 			    &xep) == -1) {
5098 				mdclrerror(&xep);
5099 				continue;
5100 			}
5101 
5102 			/* Drive already added, skip to next node */
5103 			if (sr->sr_drivechain != NULL) {
5104 				/*
5105 				 * Set record structure was allocated from RPC
5106 				 * routine getset so this structure is only of
5107 				 * size md_set_record even if the MN flag is
5108 				 * set.  So, clear the flag so that the free
5109 				 * code doesn't attempt to free a structure
5110 				 * the size of md_mnset_record.
5111 				 */
5112 				sr->sr_flags &= ~MD_SR_MN;
5113 				free_sr(sr);
5114 				continue;
5115 			}
5116 
5117 			if (clnt_adddrvs(node_v[i], sp, dd, sr->sr_ctime,
5118 			    sr->sr_genid, &xep) == -1)
5119 				mdclrerror(&xep);
5120 
5121 			if (clnt_upd_dr_flags(node_v[i], sp, dd, MD_DR_OK,
5122 			    &xep) == -1)
5123 				mdclrerror(&xep);
5124 
5125 			/*
5126 			 * Set record structure was allocated from RPC routine
5127 			 * getset so this structure is only of size
5128 			 * md_set_record even if the MN flag is set.  So,
5129 			 * clear the flag so that the free code doesn't
5130 			 * attempt to free a structure the size of
5131 			 * md_mnset_record.
5132 			 */
5133 			sr->sr_flags &= ~MD_SR_MN;
5134 			free_sr(sr);
5135 		}
5136 		max_genid += 3;
5137 
5138 		for (rl = rlp; rl != NULL; rl = rl->rl_next) {
5139 			md_replica_t	*r = rl->rl_repp;
5140 			/*
5141 			 * This is not the first replica being added to the
5142 			 * diskset so call with ADDSIDENMS_BCAST.  If this
5143 			 * is a traditional diskset, the bcast flag is ignored
5144 			 * since traditional disksets don't use the rpc.mdcommd.
5145 			 */
5146 			if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
5147 			    DB_ADDSIDENMS_BCAST, &xep))
5148 				mdclrerror(&xep);
5149 		}
5150 
5151 		/*
5152 		 * Add the device names for the new sides into the namespace,
5153 		 * on all hosts not being deleted.
5154 		 */
5155 		if (MD_MNSET_DESC(sd)) {
5156 			nd = sd->sd_nodelist;
5157 			while (nd) {
5158 				/* Find a node that is not being deleted */
5159 				if (!strinlst(nd->nd_nodename, node_c,
5160 				    node_v)) {
5161 					j = nd->nd_nodeid;
5162 					break;
5163 				}
5164 				nd = nd->nd_next;
5165 			}
5166 		} else {
5167 			for (j = 0; j < MD_MAXSIDES; j++) {
5168 				/* Skip empty slots */
5169 				if (sd->sd_nodes[j][0] == '\0')
5170 					continue;
5171 
5172 				/* Find a node that is not being deleted */
5173 				if (!strinlst(sd->sd_nodes[j], node_c, node_v))
5174 					break;
5175 			}
5176 		}
5177 
5178 		if (MD_MNSET_DESC(sd)) {
5179 			nd = sd->sd_nodelist;
5180 			while (nd) {
5181 				/* Skip nodes not being deleted */
5182 				if (!strinlst(nd->nd_nodename, node_c,
5183 				    node_v)) {
5184 					nd = nd->nd_next;
5185 					continue;
5186 				}
5187 
5188 				/* this side was just created, add the names */
5189 				if (add_md_sidenms(sp, nd->nd_nodeid, j, &xep))
5190 					mdclrerror(&xep);
5191 				nd = nd->nd_next;
5192 			}
5193 		} else {
5194 			for (i = 0; i < MD_MAXSIDES; i++) {
5195 				/* Skip empty slots */
5196 				if (sd->sd_nodes[i][0] == '\0')
5197 					continue;
5198 
5199 				/* Skip nodes not being deleted */
5200 				if (!strinlst(sd->sd_nodes[i], node_c, node_v))
5201 					continue;
5202 
5203 				/* this side was just created, add the names */
5204 				if (add_md_sidenms(sp, i, j, &xep))
5205 					mdclrerror(&xep);
5206 			}
5207 		}
5208 	}
5209 
5210 	/* level 4 */
5211 	if (rb_level > 3 && dd != NULL) {
5212 		/*
5213 		 * Add the new sidename for each drive to all the hosts
5214 		 * Multi-node disksets only store the sidename for
5215 		 * that host, so there is nothing to re-add.
5216 		 */
5217 		if (!(MD_MNSET_DESC(sd))) {
5218 			for (j = 0; j < MD_MAXSIDES; j++) {
5219 				/* Skip empty slots */
5220 				if (sd->sd_nodes[j][0] == '\0')
5221 					continue;
5222 
5223 				/* Skip nodes not being deleted */
5224 				if (!strinlst(sd->sd_nodes[j], node_c, node_v))
5225 					break;
5226 			}
5227 			for (i = 0; i < MD_MAXSIDES; i++) {
5228 				/* Skip empty slots */
5229 				if (sd->sd_nodes[i][0] == '\0')
5230 					continue;
5231 
5232 				if (clnt_add_drv_sidenms(sd->sd_nodes[i],
5233 				    sd->sd_nodes[j], sp, sd, node_c, node_v,
5234 				    &xep))
5235 					mdclrerror(&xep);
5236 			}
5237 		}
5238 
5239 	}
5240 
5241 	/* level 5 */
5242 	if ((rb_level > 4) && (!(MD_MNSET_DESC(sd)))) {
5243 		/* rollback the mediator record */
5244 		for (i = 0; i < max_meds; i++) {
5245 			if (sd->sd_med.n_lst[i].a_cnt == 0)
5246 				continue;
5247 
5248 			if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
5249 			    &rb_medr, &xep))
5250 				mdclrerror(&xep);
5251 		}
5252 	}
5253 
5254 	/* level 3 */
5255 	if (rb_level > 2) {
5256 		md_set_record		*sr;
5257 		md_mnset_record		*mnsr;
5258 
5259 		if (MD_MNSET_DESC(sd)) {
5260 			nd = sd->sd_nodelist;
5261 			/*
5262 			 * During OHA mode, don't issue RPCs to
5263 			 * non-alive nodes since there is no reason to
5264 			 * wait for RPC timeouts.
5265 			 */
5266 			while (nd) {
5267 				if ((oha == TRUE) &&
5268 				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5269 					nd = nd->nd_next;
5270 					continue;
5271 				}
5272 				/* Record should be for a multi-node diskset */
5273 				if (clnt_mngetset(nd->nd_nodename, sp->setname,
5274 				    MD_SET_BAD, &mnsr, &xep) == -1) {
5275 					mdclrerror(&xep);
5276 					nd = nd->nd_next;
5277 					continue;
5278 				}
5279 
5280 				has_set = 1;
5281 
5282 				nr = mnsr->sr_nodechain;
5283 				while (nr) {
5284 					if (nd->nd_nodeid == nr->nr_nodeid) {
5285 						break;
5286 					}
5287 					nr = nr->nr_next;
5288 				}
5289 				if (nr == NULL)
5290 					has_set = 0;
5291 
5292 				free_sr((struct md_set_record *)mnsr);
5293 				if (has_set) {
5294 					nd = nd->nd_next;
5295 					continue;
5296 				}
5297 
5298 				if (clnt_addhosts(nd->nd_nodename, sp, node_c,
5299 				    node_v, &xep) == -1)
5300 					mdclrerror(&xep);
5301 
5302 				nd = nd->nd_next;
5303 			}
5304 		} else {
5305 			for (i = 0; i < MD_MAXSIDES; i++) {
5306 				/* Skip empty slots */
5307 				if (sd->sd_nodes[i][0] == '\0')
5308 					continue;
5309 
5310 				/* Record should be for a non-multi-node set */
5311 				if (clnt_getset(sd->sd_nodes[i], sp->setname,
5312 				    MD_SET_BAD, &sr, &xep) == -1) {
5313 					mdclrerror(&xep);
5314 					continue;
5315 				}
5316 
5317 				/*
5318 				 * Set record structure was allocated from RPC
5319 				 * routine getset so this structure is only of
5320 				 * size md_set_record even if the MN flag is
5321 				 * set.  So, clear the flag so that the free
5322 				 * code doesn't attempt to free a structure
5323 				 * the size of md_mnset_record.
5324 				 */
5325 				if (MD_MNSET_REC(sr)) {
5326 					sr->sr_flags &= ~MD_SR_MN;
5327 					free_sr(sr);
5328 					continue;
5329 				}
5330 
5331 				has_set = 1;
5332 				for (j = 0; j < MD_MAXSIDES; j++) {
5333 					/* Skip empty slots */
5334 					if (sd->sd_nodes[j][0] == '\0')
5335 						continue;
5336 
5337 					if (sr->sr_nodes[j][0] == '\0') {
5338 						has_set = 0;
5339 						break;
5340 					}
5341 				}
5342 
5343 				free_sr(sr);
5344 				if (has_set)
5345 					continue;
5346 
5347 				if (clnt_addhosts(sd->sd_nodes[i], sp, node_c,
5348 				    node_v, &xep) == -1)
5349 					mdclrerror(&xep);
5350 			}
5351 		}
5352 		max_genid++;
5353 	}
5354 
5355 	/* level 1 */
5356 	if (rb_level > 0) {
5357 		max_genid++;
5358 		/* Sets MD_SR_OK on given nodes. */
5359 		resync_genid(sp, sd, max_genid, node_c, node_v);
5360 
5361 		/*
5362 		 * For MN diskset:
5363 		 * On each newly re-added node, set the node record for that
5364 		 * node to OK.  Then set all node records for the newly added
5365 		 * nodes on all nodes to ok.
5366 		 *
5367 		 * By setting a node's own node record to ok first, even if
5368 		 * the node re-adding the hosts panics, the rest of the nodes
5369 		 * can determine the same node list during the choosing of the
5370 		 * master during reconfig.  So, only nodes considered for
5371 		 * mastership are nodes that have both MD_MN_NODE_OK and
5372 		 * MD_SR_OK set on that node's rpc.metad.  If all nodes have
5373 		 * MD_SR_OK set, but no node has its own MD_MN_NODE_OK set,
5374 		 * then the set will be removed during reconfig since a panic
5375 		 * occurred during the re-creation of the deletion of
5376 		 * the initial diskset.
5377 		 */
5378 		if (MD_MNSET_DESC(sd)) {
5379 			md_mnnode_desc	*saved_nd_next;
5380 			if (dd != NULL) {
5381 				/*
5382 				 * Notify rpc.mdcommd on all nodes of a
5383 				 * nodelist change.  Start by suspending
5384 				 * rpc.mdcommd (which drains it of all
5385 				 * messages), then change the nodelist
5386 				 * followed by a reinit and resume.
5387 				 */
5388 				nd = sd->sd_nodelist;
5389 				while (nd) {
5390 					if (!(nd->nd_flags &
5391 					    MD_MN_NODE_ALIVE)) {
5392 						nd = nd->nd_next;
5393 						continue;
5394 					}
5395 					if (clnt_mdcommdctl(nd->nd_nodename,
5396 					    COMMDCTL_SUSPEND, sp,
5397 					    MD_MSG_CLASS0,
5398 					    MD_MSCF_NO_FLAGS, &xep)) {
5399 						mde_perror(&xep,
5400 						    dgettext(TEXT_DOMAIN,
5401 						    "Unable to suspend "
5402 						    "rpc.mdcommd.\n"));
5403 						mdclrerror(&xep);
5404 					}
5405 					suspendall_flag_rb = 1;
5406 					nd = nd->nd_next;
5407 				}
5408 			}
5409 			for (i = 0; i < node_c; i++) {
5410 				/*
5411 				 * During OHA mode, don't issue RPCs to
5412 				 * non-alive nodes since there is no reason to
5413 				 * wait for RPC timeouts.
5414 				 */
5415 				nd = sd->sd_nodelist;
5416 				while (nd) {
5417 					if (strcmp(nd->nd_nodename, node_v[i])
5418 					    == 0)
5419 						break;
5420 					nd = nd->nd_next;
5421 				}
5422 				/* Something wrong, finish this in next loop */
5423 				if (nd == NULL)
5424 					continue;
5425 
5426 				if ((oha == TRUE) &&
5427 				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5428 					continue;
5429 				}
5430 
5431 				if (dd != NULL) {
5432 					/* Set master on re-joining node. */
5433 					if (clnt_mnsetmaster(node_v[i], sp,
5434 					    sd->sd_mn_master_nodenm,
5435 					    sd->sd_mn_master_nodeid, &xep)) {
5436 						mdclrerror(&xep);
5437 					}
5438 
5439 					/*
5440 					 * Re-join set to same state as
5441 					 * before - stale or non-stale.
5442 					 */
5443 					if (clnt_joinset(node_v[i], sp,
5444 					    stale_flag, &xep)) {
5445 						mdclrerror(&xep);
5446 					}
5447 				}
5448 
5449 				/* Only changing my local cache of node list */
5450 				saved_nd_next = nd->nd_next;
5451 				nd->nd_next = NULL;
5452 
5453 				/* Set record for host to ok on that host */
5454 				if (clnt_upd_nr_flags(node_v[i], sp,
5455 				    nd, MD_NR_OK, NULL, &xep)) {
5456 					mdclrerror(&xep);
5457 				}
5458 				nd->nd_next = saved_nd_next;
5459 			}
5460 
5461 			/* Now set all node records on all nodes to be ok */
5462 			nd = sd->sd_nodelist;
5463 			while (nd) {
5464 				/*
5465 				 * During OHA mode, don't issue RPCs to
5466 				 * non-alive nodes since there is no reason to
5467 				 * wait for RPC timeouts.
5468 				 */
5469 				if ((oha == TRUE) &&
5470 				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5471 					nd = nd->nd_next;
5472 					continue;
5473 				}
5474 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5475 				    sd->sd_nodelist, MD_NR_OK, NULL, &xep)) {
5476 					mdclrerror(&xep);
5477 				}
5478 				nd = nd->nd_next;
5479 			}
5480 		}
5481 	}
5482 
5483 	/*
5484 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
5485 	 * Send reinit command to mdcommd which forces it to get
5486 	 * fresh set description.
5487 	 */
5488 	if (suspendall_flag_rb) {
5489 		/* Send reinit */
5490 		nd = sd->sd_nodelist;
5491 		while (nd) {
5492 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5493 				nd = nd->nd_next;
5494 				continue;
5495 			}
5496 
5497 			/* Class is ignored for REINIT */
5498 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
5499 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
5500 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5501 				    "Unable to reinit rpc.mdcommd.\n"));
5502 				mdclrerror(&xep);
5503 			}
5504 			nd = nd->nd_next;
5505 		}
5506 	}
5507 
5508 	/*
5509 	 * Unlock diskset by resuming messages across the diskset.
5510 	 * Just resume all classes so that resume is the same whether
5511 	 * just one class was locked or all classes were locked.
5512 	 */
5513 	if ((suspend1_flag) || (suspendall_flag) || (suspendall_flag_rb)) {
5514 		/* Send resume */
5515 		nd = sd->sd_nodelist;
5516 		while (nd) {
5517 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5518 				nd = nd->nd_next;
5519 				continue;
5520 			}
5521 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
5522 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
5523 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5524 				    "Unable to resume rpc.mdcommd.\n"));
5525 			}
5526 			nd = nd->nd_next;
5527 		}
5528 		meta_ping_mnset(sp->setno);
5529 	}
5530 
5531 	/*
5532 	 * Start a resync thread on the re-added nodes
5533 	 * if set is not stale. Also start a thread to update the
5534 	 * abr state of all soft partitions
5535 	 */
5536 	if (stale_flag != MNSET_IS_STALE) {
5537 		for (i = 0; i < node_c; i++) {
5538 			/*
5539 			 * During OHA mode, don't issue RPCs to
5540 			 * non-alive nodes since there is no reason to
5541 			 * wait for RPC timeouts.
5542 			 */
5543 			nd = sd->sd_nodelist;
5544 			while (nd) {
5545 				if (strcmp(nd->nd_nodename, node_v[i])
5546 				    == 0)
5547 					break;
5548 				nd = nd->nd_next;
5549 			}
5550 			if (nd == NULL)
5551 				continue;
5552 
5553 			if ((oha == TRUE) &&
5554 			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5555 				continue;
5556 			}
5557 
5558 			if (dd != 0) {
5559 				if (clnt_mn_mirror_resync_all(node_v[i],
5560 				    sp->setno, &xep)) {
5561 					mde_perror(ep, dgettext(TEXT_DOMAIN,
5562 					    "Unable to start resync "
5563 					    "thread.\n"));
5564 				}
5565 				if (clnt_mn_sp_update_abr(node_v[i],
5566 				    sp->setno, &xep)) {
5567 					mde_perror(ep, dgettext(TEXT_DOMAIN,
5568 					    "Unable to start sp update "
5569 					    "thread.\n"));
5570 				}
5571 			}
5572 		}
5573 	}
5574 
5575 	/* level 0 */
5576 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
5577 	/* Don't test lock flag since guaranteed to be set if in rollback */
5578 	if (MD_MNSET_DESC(sd)) {
5579 		nd = sd->sd_nodelist;
5580 		while (nd) {
5581 			/*
5582 			 * During OHA mode, don't issue RPCs to
5583 			 * non-alive nodes since there is no reason to
5584 			 * wait for RPC timeouts.
5585 			 */
5586 			if ((oha == TRUE) &&
5587 			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5588 				nd = nd->nd_next;
5589 				continue;
5590 			}
5591 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
5592 				mdclrerror(&xep);
5593 			nd = nd->nd_next;
5594 		}
5595 	} else {
5596 		for (i = 0; i < MD_MAXSIDES; i++) {
5597 			/* Skip empty slots */
5598 			if (sd->sd_nodes[i][0] == '\0')
5599 				continue;
5600 
5601 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
5602 				mdclrerror(&xep);
5603 		}
5604 	}
5605 	cl_set_setkey(NULL);
5606 
5607 	/* release signals back to what they were on entry */
5608 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
5609 		mdclrerror(&xep);
5610 
5611 	metafreereplicalist(rlp);
5612 	if (node_id_list)
5613 		Free(node_id_list);
5614 
5615 	metaflushsetname(sp);
5616 
5617 	if (!(MD_MNSET_DESC(sd))) {
5618 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
5619 	}
5620 
5621 	return (rval);
5622 }
5623 
5624 int
meta_set_auto_take(mdsetname_t * sp,int take_val,md_error_t * ep)5625 meta_set_auto_take(
5626 	mdsetname_t	*sp,
5627 	int		take_val,
5628 	md_error_t	*ep
5629 )
5630 {
5631 	int		i;
5632 	md_set_desc	*sd;
5633 	int		rval = 0;
5634 	md_setkey_t	*cl_sk;
5635 	md_error_t	xep = mdnullerror;
5636 	char		*hostname;
5637 	md_drive_desc	*dd;
5638 
5639 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
5640 		return (-1);
5641 
5642 	/* Make sure we own the set */
5643 	if (meta_check_ownership(sp, ep) != 0)
5644 		return (-1);
5645 
5646 	hostname = mynode();
5647 
5648 	/* Lock the set on our side */
5649 	if (clnt_lock_set(hostname, sp, ep)) {
5650 		rval = -1;
5651 		goto out;
5652 	}
5653 
5654 	if (take_val) {
5655 		/* enable auto_take but only if it is not already set */
5656 		if (! (sd->sd_flags & MD_SR_AUTO_TAKE)) {
5657 			/* verify that we're the only host in the set */
5658 			for (i = 0; i < MD_MAXSIDES; i++) {
5659 				if (sd->sd_nodes[i] == NULL ||
5660 				    sd->sd_nodes[i][0] == '\0')
5661 					continue;
5662 
5663 				if (strcmp(sd->sd_nodes[i], hostname) != 0) {
5664 					(void) mddserror(ep, MDE_DS_SINGLEHOST,
5665 					    sp->setno, NULL, NULL, sp->setname);
5666 					rval = -1;
5667 					goto out;
5668 				}
5669 			}
5670 
5671 			if (clnt_enable_sr_flags(hostname, sp,
5672 			    MD_SR_AUTO_TAKE, ep))
5673 				rval = -1;
5674 
5675 			/* Disable SCSI reservations */
5676 			if (sd->sd_flags & MD_SR_MB_DEVID)
5677 				dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
5678 				    PRINT_FAST, &xep);
5679 			else
5680 				dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
5681 				    &xep);
5682 
5683 			if (! mdisok(&xep))
5684 				mdclrerror(&xep);
5685 
5686 			if (dd != NULL) {
5687 				if (rel_own_bydd(sp, dd, TRUE, &xep))
5688 					mdclrerror(&xep);
5689 			}
5690 		}
5691 
5692 	} else {
5693 		/* disable auto_take, if set, or error */
5694 		if (sd->sd_flags & MD_SR_AUTO_TAKE) {
5695 			if (clnt_disable_sr_flags(hostname, sp,
5696 			    MD_SR_AUTO_TAKE, ep))
5697 				rval = -1;
5698 
5699 			/* Enable SCSI reservations */
5700 			if (sd->sd_flags & MD_SR_MB_DEVID)
5701 				dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
5702 				    PRINT_FAST, &xep);
5703 			else
5704 				dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
5705 				    &xep);
5706 
5707 			if (! mdisok(&xep))
5708 				mdclrerror(&xep);
5709 
5710 			if (dd != NULL) {
5711 				mhd_mhiargs_t	mhiargs = defmhiargs;
5712 
5713 				if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
5714 					mdclrerror(&xep);
5715 			}
5716 		} else {
5717 			(void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno,
5718 			    NULL, NULL, sp->setname);
5719 			rval = -1;
5720 		}
5721 	}
5722 
5723 out:
5724 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
5725 	if (clnt_unlock_set(hostname, cl_sk, &xep)) {
5726 		if (rval == 0)
5727 			(void) mdstealerror(ep, &xep);
5728 		rval = -1;
5729 	}
5730 	cl_set_setkey(NULL);
5731 
5732 	return (rval);
5733 }
5734