xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_set_drv.c (revision ab3487b0f99a72f54cfcffc43f7efd2c4bb2d608)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Metadevice diskset interfaces
30  */
31 
32 #include <meta.h>
33 #include <mdmn_changelog.h>
34 #include "meta_set_prv.h"
35 #include "meta_repartition.h"
36 
37 static int
check_setnodes_againstdrivelist(mdsetname_t * sp,mddrivenamelist_t * dnlp,md_error_t * ep)38 check_setnodes_againstdrivelist(
39 	mdsetname_t		*sp,
40 	mddrivenamelist_t	*dnlp,
41 	md_error_t		*ep
42 )
43 {
44 	md_set_desc		*sd;
45 	mddrivenamelist_t	*p;
46 	int 			i;
47 	md_mnnode_desc		*nd;
48 
49 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
50 		return (-1);
51 
52 	if (MD_MNSET_DESC(sd)) {
53 		nd = sd->sd_nodelist;
54 		while (nd) {
55 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
56 				nd = nd->nd_next;
57 				continue;
58 			}
59 			for (p = dnlp; p != NULL; p = p->next)
60 				if (checkdrive_onnode(sp, p->drivenamep,
61 				    nd->nd_nodename, ep))
62 					return (-1);
63 			nd = nd->nd_next;
64 		}
65 	} else {
66 		for (i = 0; i < MD_MAXSIDES; i++) {
67 			/* Skip empty slots */
68 			if (sd->sd_nodes[i][0] == '\0')
69 				continue;
70 
71 			for (p = dnlp; p != NULL; p = p->next)
72 				if (checkdrive_onnode(sp, p->drivenamep,
73 				    sd->sd_nodes[i], ep))
74 					return (-1);
75 		}
76 	}
77 	return (0);
78 }
79 
80 static int
drvsuniq(mdsetname_t * sp,mddrivenamelist_t * dnlp,md_error_t * ep)81 drvsuniq(mdsetname_t *sp, mddrivenamelist_t *dnlp, md_error_t *ep)
82 {
83 	mddrivenamelist_t *dl1, *dl2;
84 	mddrivename_t *dn1, *dn2;
85 
86 	for (dl1 = dnlp; dl1 != NULL; dl1 = dl1->next) {
87 		dn1 = dl1->drivenamep;
88 
89 		for (dl2 = dl1->next; dl2 != NULL; dl2 = dl2->next) {
90 			dn2 = dl2->drivenamep;
91 			if (strcmp(dn1->cname, dn2->cname) != 0)
92 				continue;
93 
94 			return (mddserror(ep, MDE_DS_DUPDRIVE, sp->setno,
95 			    NULL, dn1->cname, sp->setname));
96 		}
97 	}
98 	return (0);
99 }
100 
101 static md_drive_desc *
metaget_drivedesc_fromdrivelist(mdsetname_t * sp,mddrivenamelist_t * dnlp,uint_t flags,md_error_t * ep)102 metaget_drivedesc_fromdrivelist(
103 	mdsetname_t		*sp,
104 	mddrivenamelist_t	*dnlp,
105 	uint_t			flags,
106 	md_error_t		*ep
107 )
108 {
109 	mddrivenamelist_t	*p;
110 	md_drive_desc		*dd = NULL;
111 	md_set_desc		*sd;
112 
113 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
114 		return (NULL);
115 
116 	for (p = dnlp; p != NULL; p = p->next) {
117 		(void) metadrivedesc_append(&dd, p->drivenamep, 0, 0,
118 		    sd->sd_ctime, sd->sd_genid, flags);
119 	}
120 
121 	return (dd);
122 }
123 
124 /*
125  * Exported Entry Points
126  */
127 
128 int
meta_make_sidenmlist(mdsetname_t * sp,mddrivename_t * dnp,int import_flag,md_im_drive_info_t * midp,md_error_t * ep)129 meta_make_sidenmlist(
130 	mdsetname_t		*sp,
131 	mddrivename_t		*dnp,
132 	int			import_flag, /* flags partial import */
133 	md_im_drive_info_t	*midp,	/* import drive information */
134 	md_error_t		*ep
135 )
136 {
137 	mdsidenames_t		*sn, **sn_next;
138 	mdname_t		*np;
139 	int			done;
140 	side_t			sideno = MD_SIDEWILD;
141 	uint_t			rep_slice;
142 	char			*bname;
143 
144 	if (!import_flag) {
145 		/*
146 		 * Normal (aka NOT partial import) code path.
147 		 */
148 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
149 			return (-1);
150 		}
151 
152 		dnp->side_names_key = MD_KEYWILD;
153 
154 		if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
155 			return (-1);
156 		bname = Strdup(np->bname);
157 	} else {
158 		/*
159 		 * When doing a partial import, we'll get the needed
160 		 * information from somewhere other than the system.
161 		 */
162 		dnp->side_names_key = MD_KEYWILD;
163 		bname = Strdup(midp->mid_devname);
164 	}
165 	metaflushsidenames(dnp);
166 	sn_next = &dnp->side_names;
167 	/*CONSTCOND*/
168 	while (1) {
169 		sn = Zalloc(sizeof (*sn));
170 
171 		if ((done = meta_getnextside_devinfo(sp, bname, &sideno,
172 		    &sn->cname, &sn->dname, &sn->mnum, ep)) == -1) {
173 			if (import_flag) {
174 				mdclrerror(ep);
175 				sn->dname = Strdup(midp->mid_driver_name);
176 				sn->mnum = midp->mid_mnum;
177 			} else {
178 				Free(sn);
179 				Free(bname);
180 				return (-1);
181 			}
182 		}
183 
184 		if (done == 0) {
185 			Free(sn);
186 			Free(bname);
187 			return (0);
188 		}
189 
190 		sn->sideno = sideno;
191 
192 		/* Add to the end of the linked list */
193 		assert(*sn_next == NULL);
194 		*sn_next = sn;
195 		sn_next = &sn->next;
196 	}
197 	/*NOTREACHED*/
198 }
199 
200 int
meta_set_adddrives(mdsetname_t * sp,mddrivenamelist_t * dnlp,daddr_t dbsize,int force_label,md_error_t * ep)201 meta_set_adddrives(
202 	mdsetname_t		*sp,
203 	mddrivenamelist_t	*dnlp,
204 	daddr_t			dbsize,
205 	int			force_label,
206 	md_error_t		*ep
207 )
208 {
209 	md_set_desc		*sd;
210 	md_drive_desc		*dd = NULL, *curdd = NULL, *ddp;
211 	int			i;
212 	mddrivenamelist_t	*p;
213 	mhd_mhiargs_t		mhiargs;
214 	int			rval = 0;
215 	md_timeval32_t		now;
216 	sigset_t		oldsigs;
217 	ulong_t			genid;
218 	ulong_t			max_genid = 0;
219 	md_setkey_t		*cl_sk;
220 	int			rb_level = 0;
221 	md_error_t		xep = mdnullerror;
222 	md_mnnode_desc		*nd;
223 	int			suspendall_flag = 0;
224 	int			suspend1_flag = 0;
225 	int			lock_flag = 0;
226 	int			flush_set_onerr = 0;
227 	md_replicalist_t	*rlp = NULL, *rl;
228 
229 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
230 		return (-1);
231 
232 	/* Make sure we own the set */
233 	if (meta_check_ownership(sp, ep) != 0)
234 		return (-1);
235 
236 	/*
237 	 * The drive and node records are stored in the local mddbs of each
238 	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
239 	 * drive and node records from that node's local mddb and caches them
240 	 * internally. Any process needing diskset information contacts its
241 	 * local rpc.metad to get this information.  Since each node in the
242 	 * diskset is independently reading the set information from its local
243 	 * mddb, the set, drive and node records in the local mddbs must stay
244 	 * in-sync, so that all nodes have a consistent view of the diskset.
245 	 *
246 	 * For a multinode diskset, explicitly verify that all nodes in the
247 	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
248 	 * fail this operation since all nodes must be ALIVE in order to add
249 	 * the new drive record to their local mddb.  If a panic of this node
250 	 * leaves the local mddbs set, node and drive records out-of-sync, the
251 	 * reconfig cycle will fix the local mddbs and force them back into
252 	 * synchronization.
253 	 */
254 	if (MD_MNSET_DESC(sd)) {
255 		nd = sd->sd_nodelist;
256 		while (nd) {
257 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
258 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
259 					sp->setno,
260 					nd->nd_nodename, NULL, sp->setname);
261 				return (-1);
262 			}
263 			nd = nd->nd_next;
264 		}
265 	}
266 
267 	if (drvsuniq(sp, dnlp, ep) == -1)
268 		return (-1);
269 
270 	/*
271 	 * Lock the set on current set members.
272 	 * Set locking done much earlier for MN diskset than for traditional
273 	 * diskset since lock_set and SUSPEND are used to protect against
274 	 * other meta* commands running on the other nodes.
275 	 */
276 	if (MD_MNSET_DESC(sd)) {
277 		/* Make sure we are blocking all signals */
278 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
279 			mdclrerror(&xep);
280 
281 		nd = sd->sd_nodelist;
282 		/* All nodes are guaranteed to be ALIVE */
283 		while (nd) {
284 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
285 				rval = -1;
286 				goto out;
287 			}
288 			lock_flag = 1;
289 			nd = nd->nd_next;
290 		}
291 		/*
292 		 * Lock out other meta* commands by suspending
293 		 * class 1 messages across the diskset.
294 		 */
295 		nd = sd->sd_nodelist;
296 		/* All nodes are guaranteed to be ALIVE */
297 		while (nd) {
298 			if (clnt_mdcommdctl(nd->nd_nodename,
299 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
300 			    MD_MSCF_NO_FLAGS, ep)) {
301 				rval = -1;
302 				goto out;
303 			}
304 			suspend1_flag = 1;
305 			nd = nd->nd_next;
306 		}
307 	}
308 
309 	if (check_setnodes_againstdrivelist(sp, dnlp, ep)) {
310 		rval = -1;
311 		goto out;
312 	}
313 
314 	for (p = dnlp; p != NULL; p = p->next) {
315 		mdsetname_t	*tmp;
316 
317 		if (meta_is_drive_in_anyset(p->drivenamep, &tmp, FALSE,
318 		    ep) == -1) {
319 			rval = -1;
320 			goto out;
321 		}
322 
323 		if (tmp != NULL) {
324 			(void) mddserror(ep, MDE_DS_DRIVEINSET, sp->setno,
325 			    tmp->setname, p->drivenamep->cname, sp->setname);
326 			rval = -1;
327 			goto out;
328 		}
329 	}
330 
331 	/* END CHECK CODE */
332 
333 	/*
334 	 * This is a separate loop (from above) so that we validate all the
335 	 * drives handed to us before we repartition any one drive.
336 	 */
337 	for (p = dnlp; p != NULL; p = p->next) {
338 		if (meta_repartition_drive(sp,
339 		    p->drivenamep, force_label == TRUE ? MD_REPART_FORCE : 0,
340 		    NULL, /* Don't return the VTOC. */
341 		    ep) != 0) {
342 			rval = -1;
343 			goto out;
344 		}
345 		/*
346 		 * Create the names for the drives we are adding per side.
347 		 */
348 		if (meta_make_sidenmlist(sp, p->drivenamep, 0, NULL,
349 		    ep) == -1) {
350 			rval = -1;
351 			goto out;
352 		}
353 	}
354 
355 	/*
356 	 * Get the list of drives descriptors that we are adding.
357 	 */
358 	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
359 
360 	if (! mdisok(ep)) {
361 		rval = -1;
362 		goto out;
363 	}
364 
365 	/*
366 	 * Get the set timeout information.
367 	 */
368 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
369 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
370 		rval = -1;
371 		goto out;
372 	}
373 
374 	/*
375 	 * Get timestamp and generation id for new records
376 	 */
377 	now = sd->sd_ctime;
378 	genid = sd->sd_genid;
379 
380 
381 	/* At this point, in case of error, set should be flushed. */
382 	flush_set_onerr = 1;
383 
384 	/* Lock the set on current set members */
385 	if (!(MD_MNSET_DESC(sd))) {
386 		md_rb_sig_handling_on();
387 		for (i = 0; i < MD_MAXSIDES; i++) {
388 			/* Skip empty slots */
389 			if (sd->sd_nodes[i][0] == '\0')
390 				continue;
391 
392 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
393 				rval = -1;
394 				goto out;
395 			}
396 			lock_flag = 1;
397 		}
398 	}
399 
400 	/*
401 	 * Get drive descriptors for the drives that are currently in the set.
402 	 */
403 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
404 	if (! mdisok(ep))
405 		goto rollback;
406 
407 	/*
408 	 * If first drive being added to set, set the mastership
409 	 * of the multinode diskset to be this node.
410 	 * Only set it on this node.  If all goes well
411 	 * and there are no errors, the mastership of this node will be set
412 	 * on all nodes in user space and in the kernel.
413 	 */
414 	if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
415 		if (clnt_mnsetmaster(mynode(), sp,
416 		    sd->sd_mn_mynode->nd_nodename,
417 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
418 			goto rollback;
419 		}
420 		/*
421 		 * Set this up in my local cache of the set desc so that
422 		 * the set descriptor won't have to be gotten again from
423 		 * rpc.metad.  If it is flushed and gotten again, these
424 		 * values will be set in sr2setdesc.
425 		 */
426 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
427 		(void) strcpy(sd->sd_mn_master_nodenm,
428 		    sd->sd_mn_mynode->nd_nodename);
429 		sd->sd_mn_am_i_master = 1;
430 	}
431 
432 	RB_TEST(1, "adddrives", ep)
433 
434 	RB_PREEMPT;
435 	rb_level = 1;	/* level 1 */
436 
437 	RB_TEST(2, "adddrives", ep)
438 
439 	/*
440 	 * Add the drive records for the drives that we are adding to
441 	 * each host in the set.  Marks the drive as MD_DR_ADD.
442 	 */
443 	if (MD_MNSET_DESC(sd)) {
444 		nd = sd->sd_nodelist;
445 		/* All nodes are guaranteed to be ALIVE */
446 		while (nd) {
447 			if (clnt_adddrvs(nd->nd_nodename, sp, dd, now, genid,
448 			    ep) == -1)
449 				goto rollback;
450 
451 			RB_TEST(3, "adddrives", ep)
452 			nd = nd->nd_next;
453 		}
454 	} else {
455 		for (i = 0; i < MD_MAXSIDES; i++) {
456 			/* Skip empty slots */
457 			if (sd->sd_nodes[i][0] == '\0')
458 				continue;
459 
460 			if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, now, genid,
461 			    ep) == -1)
462 				goto rollback;
463 
464 			RB_TEST(3, "adddrives", ep)
465 		}
466 	}
467 
468 	RB_TEST(4, "adddrives", ep)
469 
470 	RB_PREEMPT;
471 	rb_level = 2;	/* level 2 */
472 
473 	RB_TEST(5, "adddrives", ep)
474 
475 	/*
476 	 * Take ownership of the added drives.
477 	 */
478 	if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
479 		if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
480 			goto rollback;
481 	}
482 
483 	/*
484 	 * If this is not a MN set and the state flags do not indicate the
485 	 * presence of devids, update the set records on all nodes.
486 	 */
487 	if (!(sd->sd_flags & MD_SR_MB_DEVID) && !(MD_MNSET_DESC(sd))) {
488 		if (meta_update_mb(sp, dd, ep) == 0) {
489 			mdclrerror(ep);
490 
491 			/* update the sr_flags on all hosts */
492 			for (i = 0; i < MD_MAXSIDES; i++) {
493 				if (sd->sd_nodes[i][0] == '\0')
494 					continue;
495 
496 				if (clnt_upd_sr_flags(sd->sd_nodes[i],
497 				    sp, (sd->sd_flags | MD_SR_MB_DEVID), ep))
498 					goto rollback;
499 			}
500 		}
501 	}
502 
503 	RB_TEST(6, "adddrives", ep)
504 
505 	RB_PREEMPT;
506 	rb_level = 3;	/* level 3 */
507 
508 	RB_TEST(7, "adddrives", ep)
509 
510 	/*
511 	 * Balance the DB's according to the list of existing drives and the
512 	 * list of added drives.
513 	 */
514 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
515 		goto rollback;
516 
517 	/*
518 	 * Slam a dummy master block on all the disks that we are adding
519 	 * that don't have replicas on them.
520 	 * Used by diskset import if the disksets are remotely replicated
521 	 */
522 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
523 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
524 			uint_t		rep_slice;
525 			int		fd = -1;
526 			mdname_t	*np = NULL;
527 			char		*drive_name;
528 
529 			drive_name = ddp->dd_dnp->cname;
530 
531 			for (rl = rlp; rl != NULL; rl = rl->rl_next) {
532 				char	*rep_name;
533 
534 				rep_name =
535 				    rl->rl_repp->r_namep->drivenamep->cname;
536 
537 				if (strcmp(drive_name, rep_name) == 0) {
538 					/*
539 					 * Disk has a replica on it so don't
540 					 * add dummy master block.
541 					 */
542 					break;
543 				}
544 			}
545 			if (rl == NULL) {
546 				/*
547 				 * Drive doesn't have a replica on it so
548 				 * we need a dummy master block. Add it.
549 				 */
550 				if (meta_replicaslice(ddp->dd_dnp, &rep_slice,
551 				    &xep) != 0) {
552 					mdclrerror(&xep);
553 					continue;
554 				}
555 
556 				if ((np = metaslicename(ddp->dd_dnp, rep_slice,
557 				    &xep)) == NULL) {
558 					mdclrerror(&xep);
559 					continue;
560 				}
561 
562 				if ((fd = open(np->rname, O_RDWR)) >= 0) {
563 					meta_mkdummymaster(sp, fd, 16);
564 					(void) close(fd);
565 				}
566 			}
567 		}
568 	}
569 
570 	if ((curdd == NULL) && (MD_MNSET_DESC(sd))) {
571 		/*
572 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
573 		 * Start by suspending rpc.mdcommd (which drains it of all
574 		 * messages), then change the nodelist followed by a reinit
575 		 * and resume.
576 		 */
577 		nd = sd->sd_nodelist;
578 		/* All nodes are guaranteed to be ALIVE */
579 		while (nd) {
580 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
581 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
582 				rval = -1;
583 				goto out;
584 			}
585 			suspendall_flag = 1;
586 			nd = nd->nd_next;
587 		}
588 	}
589 
590 	/*
591 	 * If a MN diskset and this is the first disk(s) being added
592 	 * to set, then pre-allocate change log records here.
593 	 * When the other nodes are joined into the MN diskset, the
594 	 * USER records will just be snarfed in.
595 	 */
596 	if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
597 		if (mdmn_allocate_changelog(sp, ep) != 0)
598 			goto rollback;
599 	}
600 
601 	/*
602 	 * Mark the drives MD_DR_OK.
603 	 * If first drive being added to MN diskset, then set
604 	 * master on all nodes to be this node and then join
605 	 * all alive nodes (nodes in membership list) to set.
606 	 */
607 	if (MD_MNSET_DESC(sd)) {
608 		nd = sd->sd_nodelist;
609 		/* All nodes are guaranteed to be ALIVE */
610 		while (nd) {
611 			/* don't set master on this node - done earlier */
612 			if ((curdd == NULL) && (nd->nd_nodeid !=
613 			    sd->sd_mn_mynode->nd_nodeid)) {
614 				/*
615 				 * Set master on all alive nodes since
616 				 * all alive nodes will become joined nodes.
617 				 */
618 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
619 				    sd->sd_mn_mynode->nd_nodename,
620 				    sd->sd_mn_mynode->nd_nodeid, ep)) {
621 					goto rollback;
622 				}
623 			}
624 
625 			if (curdd == NULL) {
626 				/*
627 				 * No special flags for join set.  Since
628 				 * all nodes are joining if 1st drive is being
629 				 * added to set then all nodes will be either
630 				 * STALE or non-STALE and each node can
631 				 * determine this on its own.
632 				 */
633 				if (clnt_joinset(nd->nd_nodename, sp,
634 				    NULL, ep)) {
635 					goto rollback;
636 				}
637 				/* Sets join node flag on all nodes in list */
638 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
639 				    sd->sd_nodelist, MD_NR_JOIN, NULL, ep)) {
640 					goto rollback;
641 				}
642 			}
643 
644 			/*
645 			 * Set MD_DR_OK as last thing before unlock.
646 			 * In case of panic on this node, recovery
647 			 * code can check for MD_DR_OK to determine
648 			 * status of diskset.
649 			 */
650 			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
651 			    MD_DR_OK, ep) == -1)
652 				goto rollback;
653 
654 
655 			RB_TEST(8, "adddrives", ep)
656 			nd = nd->nd_next;
657 		}
658 	} else {
659 		for (i = 0; i < MD_MAXSIDES; i++) {
660 			/* Skip empty slots */
661 			if (sd->sd_nodes[i][0] == '\0')
662 				continue;
663 
664 			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, MD_DR_OK,
665 			    ep) == -1)
666 				goto rollback;
667 
668 			RB_TEST(8, "adddrives", ep)
669 		}
670 	}
671 
672 	RB_TEST(9, "adddrives", ep)
673 
674 out:
675 	/*
676 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
677 	 * Send reinit command to mdcommd which forces it to get
678 	 * fresh set description.
679 	 */
680 	if (suspendall_flag) {
681 		/* Send reinit */
682 		nd = sd->sd_nodelist;
683 		/* All nodes are guaranteed to be ALIVE */
684 		while (nd) {
685 			/* Class is ignored for REINIT */
686 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
687 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
688 				if (rval == 0)
689 					(void) mdstealerror(ep, &xep);
690 				rval = -1;
691 				mde_perror(ep, dgettext(TEXT_DOMAIN,
692 				    "Unable to reinit rpc.mdcommd.\n"));
693 			}
694 			nd = nd->nd_next;
695 		}
696 	}
697 	/*
698 	 * Unlock diskset by resuming messages across the diskset.
699 	 * Just resume all classes so that resume is the same whether
700 	 * just one class was locked or all classes were locked.
701 	 */
702 	if ((suspend1_flag) || (suspendall_flag)) {
703 		nd = sd->sd_nodelist;
704 		/* All nodes are guaranteed to be ALIVE */
705 		while (nd) {
706 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
707 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
708 				if (rval == 0)
709 					(void) mdstealerror(ep, &xep);
710 				rval = -1;
711 				mde_perror(ep, dgettext(TEXT_DOMAIN,
712 				    "Unable to resume rpc.mdcommd.\n"));
713 			}
714 			nd = nd->nd_next;
715 		}
716 		meta_ping_mnset(sp->setno);
717 	}
718 
719 	if (lock_flag) {
720 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
721 		if (MD_MNSET_DESC(sd)) {
722 			nd = sd->sd_nodelist;
723 			/* All nodes are guaranteed to be ALIVE */
724 			while (nd) {
725 				if (clnt_unlock_set(nd->nd_nodename,
726 				    cl_sk, &xep)) {
727 					if (rval == 0)
728 						(void) mdstealerror(ep, &xep);
729 					rval = -1;
730 				}
731 				nd = nd->nd_next;
732 			}
733 		} else {
734 			for (i = 0; i < MD_MAXSIDES; i++) {
735 				/* Skip empty slots */
736 				if (sd->sd_nodes[i][0] == '\0')
737 					continue;
738 
739 				if (clnt_unlock_set(sd->sd_nodes[i],
740 				    cl_sk, &xep)) {
741 					if (rval == 0)
742 						(void) mdstealerror(ep, &xep);
743 					rval = -1;
744 				}
745 			}
746 		}
747 		cl_set_setkey(NULL);
748 	}
749 
750 	metafreedrivedesc(&dd);
751 
752 	if (flush_set_onerr) {
753 		metaflushsetname(sp);
754 		if (!(MD_MNSET_DESC(sd))) {
755 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
756 		}
757 	}
758 
759 	if (MD_MNSET_DESC(sd)) {
760 		/* release signals back to what they were on entry */
761 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
762 			mdclrerror(&xep);
763 	}
764 
765 	return (rval);
766 
767 rollback:
768 	/* all signals already blocked for MN disket */
769 	if (!(MD_MNSET_DESC(sd))) {
770 		/* Make sure we are blocking all signals */
771 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
772 			mdclrerror(&xep);
773 	}
774 
775 	rval = -1;
776 
777 	max_genid = sd->sd_genid;
778 
779 	/* level 3 */
780 	if (rb_level > 2) {
781 		/*
782 		 * Since the add drive operation is failing, need
783 		 * to reset config back to the way it was
784 		 * before the add drive opration.
785 		 * If a MN diskset and this is the first drive being added,
786 		 * then reset master on all ALIVE nodes (which is all nodes)
787 		 * since the master would have not been set previously.
788 		 * Don't reset master on this node, since this
789 		 * is done later.
790 		 * This is ok to fail since next node to add first
791 		 * disk to diskset will also set the master on all nodes.
792 		 *
793 		 * Also, if this is the first drive being added,
794 		 * need to have each node withdraw itself from the set.
795 		 */
796 		if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
797 			nd = sd->sd_nodelist;
798 			/* All nodes are guaranteed to be ALIVE */
799 			while (nd) {
800 				/*
801 				 * Be careful with ordering in case of
802 				 * panic between the steps and the
803 				 * effect on recovery during reconfig.
804 				 */
805 				if (clnt_withdrawset(nd->nd_nodename, sp, &xep))
806 					mdclrerror(&xep);
807 
808 				/* Sets withdraw flag on all nodes in list */
809 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
810 				    sd->sd_nodelist, MD_NR_WITHDRAW,
811 				    NULL, &xep)) {
812 					mdclrerror(&xep);
813 				}
814 
815 				/* Skip this node */
816 				if (nd->nd_nodeid ==
817 				    sd->sd_mn_mynode->nd_nodeid) {
818 					nd = nd->nd_next;
819 					continue;
820 				}
821 				/* Reset master on all of the other nodes. */
822 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
823 				    "", MD_MN_INVALID_NID, &xep))
824 					mdclrerror(&xep);
825 				nd = nd->nd_next;
826 			}
827 		}
828 	}
829 
830 	/*
831 	 * Send resume command to mdcommd.  Don't send reinit command
832 	 * since nodelist should not have changed.
833 	 * If suspendall_flag is set, then user would have been adding
834 	 * first drives to set.  Since this failed, there is certainly
835 	 * no reinit message to send to rpc.commd since no nodes will
836 	 * be joined to set at the end of this metaset command.
837 	 */
838 	if (suspendall_flag) {
839 		/* Send resume */
840 		nd = sd->sd_nodelist;
841 		/* All nodes are guaranteed to be ALIVE */
842 		while (nd) {
843 			/*
844 			 * Resume all classes but class 1 so that lock is held
845 			 * against meta* commands.
846 			 * To later resume class1, must issue a class0 resume.
847 			 */
848 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
849 			    sp, MD_MSG_CLASS0,
850 			    MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
851 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
852 				    "Unable to resume rpc.mdcommd.\n"));
853 				mdclrerror(&xep);
854 			}
855 			nd = nd->nd_next;
856 		}
857 		meta_ping_mnset(sp->setno);
858 	}
859 
860 	/* level 3 */
861 	if (rb_level > 2) {
862 		mdnamelist_t	*nlp;
863 		mdname_t	*np;
864 
865 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
866 			uint_t	rep_slice;
867 
868 			if ((meta_replicaslice(ddp->dd_dnp,
869 			    &rep_slice, &xep) != 0) ||
870 			    ((np = metaslicename(ddp->dd_dnp, rep_slice,
871 				&xep)) == NULL)) {
872 				mdclrerror(&xep);
873 				continue;
874 			}
875 			nlp = NULL;
876 			(void) metanamelist_append(&nlp, np);
877 
878 			if (meta_db_detach(sp, nlp,
879 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, &xep))
880 				mdclrerror(&xep);
881 
882 			metafreenamelist(nlp);
883 		}
884 
885 		/* Re-balance */
886 		if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
887 			mdclrerror(&xep);
888 
889 		/* Only if we are adding the first drive */
890 		/* Handled MN diskset above. */
891 		if ((curdd == NULL) && !(MD_MNSET_DESC(sd))) {
892 			if (clnt_stimeout(mynode(), sp, &defmhiargs,
893 			    &xep) == -1)
894 				mdclrerror(&xep);
895 
896 			/* This is needed because of a corner case */
897 			if (halt_set(sp, &xep))
898 				mdclrerror(&xep);
899 		}
900 		max_genid++;
901 	}
902 
903 	/* level 2 */
904 	if (rb_level > 1) {
905 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
906 			if (rel_own_bydd(sp, dd, TRUE, &xep))
907 				mdclrerror(&xep);
908 		}
909 	}
910 
911 	/* level 1 */
912 	if (rb_level > 0) {
913 		if (MD_MNSET_DESC(sd)) {
914 			nd = sd->sd_nodelist;
915 			/* All nodes are guaranteed to be ALIVE */
916 			while (nd) {
917 				if (clnt_deldrvs(nd->nd_nodename, sp, dd,
918 				    &xep) == -1)
919 					mdclrerror(&xep);
920 				nd = nd->nd_next;
921 			}
922 		} else {
923 			for (i = 0; i < MD_MAXSIDES; i++) {
924 				/* Skip empty slots */
925 				if (sd->sd_nodes[i][0] == '\0')
926 					continue;
927 
928 				if (clnt_deldrvs(sd->sd_nodes[i], sp, dd,
929 				    &xep) == -1)
930 					mdclrerror(&xep);
931 			}
932 		}
933 		max_genid += 2;
934 		resync_genid(sp, sd, max_genid, 0, NULL);
935 	}
936 
937 	if ((suspend1_flag) || (suspendall_flag)) {
938 		/* Send resume */
939 		nd = sd->sd_nodelist;
940 		/* All nodes are guaranteed to be ALIVE */
941 		while (nd) {
942 			/*
943 			 * Just resume all classes so that resume is the
944 			 * same whether just one class was locked or all
945 			 * classes were locked.
946 			 */
947 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
948 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
949 				mdclrerror(&xep);
950 			}
951 			nd = nd->nd_next;
952 		}
953 		meta_ping_mnset(sp->setno);
954 	}
955 
956 	/* level 0 */
957 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
958 	/* Don't test lock flag since guaranteed to be set if in rollback */
959 	if (MD_MNSET_DESC(sd)) {
960 		/*
961 		 * Since the add drive operation is failing, need
962 		 * to reset config back to the way it was
963 		 * before the add drive opration.
964 		 * If a MN diskset and this is the first drive being
965 		 * added, then reset master on this node since
966 		 * the master would have not been set previously.
967 		 * This is ok to fail since next node to add first
968 		 * disk to diskset will also set the master on all nodes.
969 		 */
970 		if (curdd == NULL) {
971 			/* Reset master on mynode */
972 			if (clnt_mnsetmaster(mynode(), sp, "",
973 			    MD_MN_INVALID_NID, &xep))
974 				mdclrerror(&xep);
975 		}
976 		nd = sd->sd_nodelist;
977 		/* All nodes are guaranteed to be ALIVE */
978 		while (nd) {
979 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
980 				mdclrerror(&xep);
981 			nd = nd->nd_next;
982 		}
983 	} else {
984 		for (i = 0; i < MD_MAXSIDES; i++) {
985 			/* Skip empty slots */
986 			if (sd->sd_nodes[i][0] == '\0')
987 				continue;
988 
989 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
990 				mdclrerror(&xep);
991 		}
992 	}
993 	cl_set_setkey(NULL);
994 
995 	/* release signals back to what they were on entry */
996 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
997 		mdclrerror(&xep);
998 
999 	metafreedrivedesc(&dd);
1000 
1001 	if (flush_set_onerr) {
1002 		metaflushsetname(sp);
1003 		if (!(MD_MNSET_DESC(sd))) {
1004 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1005 		}
1006 	}
1007 
1008 	return (rval);
1009 }
1010 
1011 /*
1012  * Add drives routine used during import of a diskset.
1013  */
1014 int
meta_imp_set_adddrives(mdsetname_t * sp,mddrivenamelist_t * dnlp,md_im_set_desc_t * misp,md_error_t * ep)1015 meta_imp_set_adddrives(
1016 	mdsetname_t		*sp,
1017 	mddrivenamelist_t	*dnlp,
1018 	md_im_set_desc_t	*misp,
1019 	md_error_t		*ep
1020 )
1021 {
1022 	md_set_desc		*sd;
1023 	mddrivenamelist_t	*p;
1024 	md_drive_desc		*dd = NULL, *ddp;
1025 	int			flush_set_onerr = 0;
1026 	md_timeval32_t		now;
1027 	ulong_t			genid;
1028 	mhd_mhiargs_t		mhiargs;
1029 	md_im_replica_info_t	*mirp;
1030 	md_im_drive_info_t	*midp;
1031 	int			rval = 0;
1032 	sigset_t		oldsigs;
1033 	ulong_t			max_genid = 0;
1034 	int			rb_level = 0;
1035 	md_error_t		xep = mdnullerror;
1036 
1037 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1038 		return (-1);
1039 
1040 	for (p = dnlp; p != NULL; p = p->next) {
1041 		int		imp_flag = 0;
1042 
1043 		/*
1044 		 * If we have a partial diskset, meta_make_sidenmlist will
1045 		 * need information from midp to complete making the
1046 		 * side name structure.
1047 		 */
1048 		if (misp->mis_partial) {
1049 			imp_flag = MDDB_C_IMPORT;
1050 			for (midp = misp->mis_drives; midp != NULL;
1051 			    midp = midp->mid_next) {
1052 				if (midp->mid_dnp == p->drivenamep)
1053 					break;
1054 			}
1055 			if (midp == NULL) {
1056 				(void) mddserror(ep, MDE_DS_SETNOTIMP,
1057 				    MD_SET_BAD, mynode(), NULL, sp->setname);
1058 				rval = -1;
1059 				goto out;
1060 			}
1061 		}
1062 		/*
1063 		 * Create the names for the drives we are adding per side.
1064 		 */
1065 		if (meta_make_sidenmlist(sp, p->drivenamep, imp_flag,
1066 		    midp, ep) == -1) {
1067 			rval = -1;
1068 			goto out;
1069 		}
1070 	}
1071 
1072 	/*
1073 	 * Get the list of drives descriptors that we are adding.
1074 	 */
1075 	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
1076 
1077 	if (! mdisok(ep)) {
1078 		rval = -1;
1079 		goto out;
1080 	}
1081 
1082 	/*
1083 	 * Get the set timeout information.
1084 	 */
1085 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
1086 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
1087 		rval = -1;
1088 		goto out;
1089 	}
1090 
1091 	/*
1092 	 * Get timestamp and generation id for new records
1093 	 */
1094 	now = sd->sd_ctime;
1095 	genid = sd->sd_genid;
1096 
1097 	/* At this point, in case of error, set should be flushed. */
1098 	flush_set_onerr = 1;
1099 
1100 	rb_level = 1;   /* level 1 */
1101 
1102 	for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) {
1103 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1104 			if (ddp->dd_dnp == midp->mid_dnp) {
1105 				/* same disk */
1106 				ddp->dd_dnp->devid =
1107 				    devid_str_encode(midp->mid_devid,
1108 				    midp->mid_minor_name);
1109 
1110 				ddp->dd_dbcnt = 0;
1111 				mirp = midp->mid_replicas;
1112 				if (mirp) {
1113 					ddp->dd_dbsize = mirp->mir_length;
1114 					for (; mirp != NULL;
1115 					    mirp = mirp->mir_next) {
1116 						ddp->dd_dbcnt++;
1117 					}
1118 				}
1119 				if ((midp->mid_available &
1120 				    MD_IM_DISK_NOT_AVAILABLE) &&
1121 				    (misp->mis_flags & MD_IM_SET_REPLICATED)) {
1122 					ddp->dd_flags = MD_DR_UNRSLV_REPLICATED;
1123 				}
1124 			}
1125 		}
1126 	}
1127 
1128 	/*
1129 	 * Add the drive records for the drives that we are adding to
1130 	 * each host in the set.  Marks the drive records as MD_DR_ADD.
1131 	 * May also mark a drive record as MD_DR_UNRSLV_REPLICATED if
1132 	 * this flag was set in the dd_flags for that drive.
1133 	 */
1134 	if (clnt_imp_adddrvs(mynode(), sp, dd, now, genid, ep) == -1)
1135 		goto rollback;
1136 
1137 	rb_level = 2;   /* level 2 */
1138 
1139 	/*
1140 	 * Take ownership of the added drives.
1141 	 */
1142 	if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
1143 		goto rollback;
1144 
1145 out:
1146 	metafreedrivedesc(&dd);
1147 
1148 	if (flush_set_onerr) {
1149 		metaflushsetname(sp);
1150 	}
1151 
1152 	return (rval);
1153 
1154 rollback:
1155 	/* Make sure we are blocking all signals */
1156 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
1157 		mdclrerror(&xep);
1158 
1159 	rval = -1;
1160 
1161 	max_genid = sd->sd_genid;
1162 
1163 	/* level 2 */
1164 	if (rb_level > 1) {
1165 		if (!MD_ATSET_DESC(sd)) {
1166 			if (rel_own_bydd(sp, dd, TRUE, &xep)) {
1167 				mdclrerror(&xep);
1168 			}
1169 		}
1170 	}
1171 
1172 	/* level 1 */
1173 	if (rb_level > 0) {
1174 		if (clnt_deldrvs(mynode(), sp, dd, &xep) == -1) {
1175 			mdclrerror(&xep);
1176 		}
1177 		max_genid += 2;
1178 		resync_genid(sp, sd, max_genid, 0, NULL);
1179 	}
1180 
1181 	/* level 0 */
1182 
1183 	/* release signals back to what they were on entry */
1184 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1185 		mdclrerror(&xep);
1186 
1187 	metafreedrivedesc(&dd);
1188 
1189 	if (flush_set_onerr) {
1190 		metaflushsetname(sp);
1191 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1192 	}
1193 
1194 	return (rval);
1195 }
1196 
1197 int
meta_set_deletedrives(mdsetname_t * sp,mddrivenamelist_t * dnlp,int forceflg,md_error_t * ep)1198 meta_set_deletedrives(
1199 	mdsetname_t		*sp,
1200 	mddrivenamelist_t	*dnlp,
1201 	int			forceflg,
1202 	md_error_t		*ep
1203 )
1204 {
1205 	md_set_desc		*sd;
1206 	md_drive_desc		*ddp, *dd = NULL, *curdd = NULL;
1207 	md_replicalist_t	*rlp = NULL, *rl;
1208 	mddrivenamelist_t	*p;
1209 	int			deldrvcnt = 0;
1210 	int			rval = 0;
1211 	mhd_mhiargs_t		mhiargs;
1212 	int			i;
1213 	sigset_t		oldsigs;
1214 	md_setkey_t		*cl_sk;
1215 	ulong_t			max_genid = 0;
1216 	int			rb_level = 0;
1217 	md_error_t		xep = mdnullerror;
1218 	md_mnnode_desc		*nd;
1219 	int			has_set;
1220 	int			current_drv_cnt = 0;
1221 	int			suspendall_flag = 0, suspendall_flag_rb = 0;
1222 	int			suspend1_flag = 0;
1223 	int			lock_flag = 0;
1224 	bool_t			stale_bool = FALSE;
1225 	int			flush_set_onerr = 0;
1226 	mdnamelist_t		*nlp;
1227 	mdname_t		*np;
1228 
1229 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1230 		return (-1);
1231 
1232 	/* Make sure we own the set */
1233 	if (meta_check_ownership(sp, ep) != 0)
1234 		return (-1);
1235 
1236 	if (drvsuniq(sp, dnlp, ep) == -1)
1237 		return (-1);
1238 
1239 	/*
1240 	 * Check and see if all the nodes have the set.
1241 	 *
1242 	 * The drive and node records are stored in the local mddbs of each
1243 	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
1244 	 * drive and node records from that node's local mddb and caches them
1245 	 * internally. Any process needing diskset information contacts its
1246 	 * local rpc.metad to get this information.  Since each node in the
1247 	 * diskset is independently reading the set information from its local
1248 	 * mddb, the set, drive and node records in the local mddbs must stay
1249 	 * in-sync, so that all nodes have a consistent view of the diskset.
1250 	 *
1251 	 * For a multinode diskset, explicitly verify that all nodes in the
1252 	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
1253 	 * fail this operation since all nodes must be ALIVE in order to delete
1254 	 * a drive record from their local mddb.  If a panic of this node
1255 	 * leaves the local mddbs set, node and drive records out-of-sync, the
1256 	 * reconfig cycle will fix the local mddbs and force them back into
1257 	 * synchronization.
1258 	 */
1259 	if (MD_MNSET_DESC(sd)) {
1260 		nd = sd->sd_nodelist;
1261 		while (nd) {
1262 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1263 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1264 					sp->setno,
1265 					nd->nd_nodename, NULL, sp->setname);
1266 				return (-1);
1267 			}
1268 			nd = nd->nd_next;
1269 		}
1270 
1271 		/* Make sure we are blocking all signals */
1272 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1273 			mdclrerror(&xep);
1274 
1275 		/*
1276 		 * Lock the set on current set members.
1277 		 * Set locking done much earlier for MN diskset than for
1278 		 * traditional diskset since lock_set and SUSPEND are used
1279 		 * to protect against other meta* commands running on the
1280 		 * other nodes.
1281 		 */
1282 		nd = sd->sd_nodelist;
1283 		/* All nodes are guaranteed to be ALIVE */
1284 		while (nd) {
1285 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1286 				rval = -1;
1287 				goto out;
1288 			}
1289 			lock_flag = 1;
1290 			nd = nd->nd_next;
1291 		}
1292 		/*
1293 		 * Lock out other meta* commands by suspending
1294 		 * class 1 messages across the diskset.
1295 		 */
1296 		nd = sd->sd_nodelist;
1297 		/* All nodes are guaranteed to be ALIVE */
1298 		while (nd) {
1299 			if (clnt_mdcommdctl(nd->nd_nodename,
1300 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1301 			    MD_MSCF_NO_FLAGS, ep)) {
1302 				rval = -1;
1303 				goto out;
1304 			}
1305 			suspend1_flag = 1;
1306 			nd = nd->nd_next;
1307 		}
1308 
1309 		nd = sd->sd_nodelist;
1310 		/* All nodes are guaranteed to be ALIVE */
1311 		while (nd) {
1312 			if (strcmp(nd->nd_nodename, mynode()) == 0) {
1313 				nd = nd->nd_next;
1314 				continue;
1315 			}
1316 
1317 			has_set = nodehasset(sp, nd->nd_nodename,
1318 				    NHS_NSTG_EQ, ep);
1319 			if (has_set < 0) {
1320 				rval = -1;
1321 				goto out;
1322 			}
1323 
1324 			if (! has_set) {
1325 				(void) mddserror(ep, MDE_DS_NODENOSET,
1326 					sp->setno, nd->nd_nodename,
1327 					NULL, sp->setname);
1328 				rval = -1;
1329 				goto out;
1330 			}
1331 			nd = nd->nd_next;
1332 		}
1333 	} else {
1334 		for (i = 0; i < MD_MAXSIDES; i++) {
1335 			/* Skip empty slots */
1336 			if (sd->sd_nodes[i][0] == '\0')
1337 				continue;
1338 
1339 			if (strcmp(sd->sd_nodes[i], mynode()) == 0)
1340 				continue;
1341 
1342 			has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NSTG_EQ,
1343 				ep);
1344 			if (has_set < 0) {
1345 				/*
1346 				 * Can directly return since !MN diskset;
1347 				 * nothing to unlock.
1348 				 */
1349 				return (-1);
1350 			}
1351 
1352 			if (! has_set) {
1353 				/*
1354 				 * Can directly return since !MN diskset;
1355 				 * nothing to unlock.
1356 				 */
1357 				return (mddserror(ep, MDE_DS_NODENOSET,
1358 				    sp->setno, sd->sd_nodes[i], NULL,
1359 				    sp->setname));
1360 			}
1361 		}
1362 	}
1363 
1364 	for (p = dnlp; p != NULL; p = p->next) {
1365 		int		is_it;
1366 		mddrivename_t	*dnp;
1367 
1368 		dnp = p->drivenamep;
1369 
1370 		if ((is_it = meta_is_drive_in_thisset(sp, dnp, FALSE, ep))
1371 		    == -1) {
1372 			rval = -1;
1373 			goto out;
1374 		}
1375 
1376 		if (! is_it) {
1377 			(void) mddserror(ep, MDE_DS_DRIVENOTINSET, sp->setno,
1378 			    NULL, dnp->cname, sp->setname);
1379 			rval = -1;
1380 			goto out;
1381 		}
1382 
1383 		if ((meta_check_drive_inuse(sp, dnp, FALSE, ep)) == -1) {
1384 			rval = -1;
1385 			goto out;
1386 		}
1387 
1388 		deldrvcnt++;
1389 	}
1390 	current_drv_cnt = deldrvcnt;
1391 
1392 	/*
1393 	 * Get drive descriptors for the drives that are currently in the set.
1394 	 */
1395 	curdd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
1396 	if (! mdisok(ep)) {
1397 		rval = -1;
1398 		goto out;
1399 	}
1400 
1401 	/*
1402 	 * Decrement the the delete drive count for each drive currently in the
1403 	 * set.
1404 	 */
1405 	for (ddp = curdd; ddp != NULL; ddp = ddp->dd_next)
1406 		deldrvcnt--;
1407 
1408 	/*
1409 	 * If the count of drives we are deleting is equal to the drives in the
1410 	 * set, and we haven't specified forceflg, return an error
1411 	 */
1412 	if (deldrvcnt == 0 && forceflg == FALSE) {
1413 		(void) mderror(ep, MDE_FORCE_DEL_ALL_DRV, NULL);
1414 		rval = -1;
1415 		goto out;
1416 	}
1417 
1418 	/*
1419 	 * Get the list of drive descriptors that we are deleting.
1420 	 */
1421 	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_DEL, ep);
1422 	if (! mdisok(ep)) {
1423 		rval = -1;
1424 		goto out;
1425 	}
1426 
1427 	/*
1428 	 * Get the set timeout information in case we have to roll back.
1429 	 */
1430 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
1431 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
1432 		rval = -1;
1433 		goto out;
1434 	}
1435 
1436 	/* At this point, in case of error, set should be flushed. */
1437 	flush_set_onerr = 1;
1438 
1439 	/* END CHECK CODE */
1440 
1441 	/* Lock the set on current set members */
1442 	if (!(MD_MNSET_DESC(sd))) {
1443 		md_rb_sig_handling_on();
1444 		for (i = 0; i < MD_MAXSIDES; i++) {
1445 			/* Skip empty slots */
1446 			if (sd->sd_nodes[i][0] == '\0')
1447 				continue;
1448 
1449 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1450 				rval = -1;
1451 				goto out;
1452 			}
1453 			lock_flag = 1;
1454 		}
1455 	}
1456 
1457 	if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
1458 		mddb_config_t		c;
1459 		/*
1460 		 * Is current set STALE?
1461 		 */
1462 		(void) memset(&c, 0, sizeof (c));
1463 		c.c_id = 0;
1464 		c.c_setno = sp->setno;
1465 		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1466 			(void) mdstealerror(ep, &c.c_mde);
1467 			rval = -1;
1468 			goto out;
1469 		}
1470 		if (c.c_flags & MDDB_C_STALE) {
1471 			stale_bool = TRUE;
1472 		}
1473 	}
1474 
1475 	RB_TEST(1, "deletedrives", ep)
1476 
1477 	RB_PREEMPT;
1478 	rb_level = 1;	/* level 1 */
1479 
1480 	RB_TEST(2, "deletedrives", ep)
1481 
1482 	/*
1483 	 * Mark the drives MD_DR_DEL
1484 	 */
1485 	if (MD_MNSET_DESC(sd)) {
1486 		nd = sd->sd_nodelist;
1487 		/* All nodes are guaranteed to be ALIVE */
1488 		while (nd) {
1489 			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
1490 			    MD_DR_DEL, ep) == -1)
1491 				goto rollback;
1492 
1493 			RB_TEST(3, "deletedrives", ep)
1494 			nd = nd->nd_next;
1495 		}
1496 	} else {
1497 		for (i = 0; i < MD_MAXSIDES; i++) {
1498 			/* Skip empty slots */
1499 			if (sd->sd_nodes[i][0] == '\0')
1500 				continue;
1501 
1502 			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
1503 			    MD_DR_DEL, ep) == -1)
1504 				goto rollback;
1505 
1506 			RB_TEST(3, "deletedrives", ep)
1507 		}
1508 	}
1509 
1510 	RB_TEST(4, "deletedrives", ep)
1511 
1512 	RB_PREEMPT;
1513 	rb_level = 2;	/* level 2 */
1514 
1515 	RB_TEST(5, "deletedrives", ep)
1516 
1517 	/*
1518 	 * Balance the DB's according to the list of existing drives and the
1519 	 * list of deleted drives.
1520 	 */
1521 	if (meta_db_balance(sp, dd, curdd, 0, ep) == -1)
1522 		goto rollback;
1523 
1524 	/*
1525 	 * If the drive(s) to be deleted cannot be accessed,
1526 	 * they haven't really been deleted yet. Check and delete now
1527 	 * if need be.
1528 	 */
1529 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
1530 		nlp = NULL;
1531 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1532 			char	*delete_name;
1533 
1534 			delete_name = ddp->dd_dnp->cname;
1535 
1536 			for (rl = rlp; rl != NULL; rl = rl->rl_next) {
1537 				char	*cur_name;
1538 
1539 				cur_name =
1540 				    rl->rl_repp->r_namep->drivenamep->cname;
1541 
1542 				if (strcmp(delete_name, cur_name) == 0) {
1543 					/* put it on the delete list */
1544 					np = rl->rl_repp->r_namep;
1545 					(void) metanamelist_append(&nlp, np);
1546 
1547 				}
1548 			}
1549 		}
1550 
1551 		if (nlp != NULL) {
1552 			if (meta_db_detach(sp, nlp,
1553 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
1554 			    ep) == -1) {
1555 				metafreenamelist(nlp);
1556 				goto rollback;
1557 			}
1558 			metafreenamelist(nlp);
1559 		}
1560 	}
1561 
1562 	RB_TEST(6, "deletedrives", ep)
1563 
1564 	RB_PREEMPT;
1565 	rb_level = 3;	/* level 3 */
1566 
1567 	RB_TEST(7, "deletedrives", ep)
1568 
1569 	/*
1570 	 * Cannot suspend set until after meta_db_balance since
1571 	 * meta_db_balance uses META_DB_ATTACH/DETACH messages.
1572 	 */
1573 	if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
1574 		/*
1575 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
1576 		 * Start by suspending rpc.mdcommd (which drains it of all
1577 		 * messages), then change the nodelist followed by a reinit
1578 		 * and resume.
1579 		 */
1580 		nd = sd->sd_nodelist;
1581 		/* All nodes are guaranteed to be ALIVE */
1582 		while (nd) {
1583 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
1584 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
1585 				rval = -1;
1586 				goto out;
1587 			}
1588 			suspendall_flag = 1;
1589 			nd = nd->nd_next;
1590 		}
1591 	}
1592 
1593 	/*
1594 	 * Remove the drive records for the drives that were deleted from
1595 	 * each host in the set.  This removes the record and dr_flags.
1596 	 */
1597 	if (MD_MNSET_DESC(sd)) {
1598 		nd = sd->sd_nodelist;
1599 		/* All nodes are guaranteed to be ALIVE */
1600 		while (nd) {
1601 			if (clnt_deldrvs(nd->nd_nodename, sp, dd, ep) == -1)
1602 				goto rollback;
1603 
1604 			RB_TEST(8, "deletedrives", ep)
1605 			nd = nd->nd_next;
1606 		}
1607 	} else {
1608 		for (i = 0; i < MD_MAXSIDES; i++) {
1609 			/* Skip empty slots */
1610 			if (sd->sd_nodes[i][0] == '\0')
1611 				continue;
1612 
1613 			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep) == -1)
1614 				goto rollback;
1615 
1616 			RB_TEST(8, "deletedrives", ep)
1617 		}
1618 	}
1619 
1620 	RB_TEST(9, "deletedrives", ep)
1621 
1622 	RB_PREEMPT;
1623 	rb_level = 4;	/* level 4 */
1624 
1625 	RB_TEST(10, "deletedrives", ep)
1626 
1627 	if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
1628 		if (rel_own_bydd(sp, dd, TRUE, ep))
1629 			goto rollback;
1630 	}
1631 
1632 	/* If we deleted all the drives, then we need to halt the set. */
1633 	if (deldrvcnt == 0) {
1634 		RB_TEST(11, "deletedrives", ep)
1635 
1636 		RB_PREEMPT;
1637 		rb_level = 5;	/* level 5 */
1638 
1639 		RB_TEST(12, "deletedrives", ep)
1640 
1641 		if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1)
1642 			goto rollback;
1643 
1644 		RB_TEST(13, "deletedrives", ep)
1645 
1646 		RB_PREEMPT;
1647 		rb_level = 6;	/* level 6 */
1648 
1649 		RB_TEST(14, "deletedrives", ep)
1650 
1651 		/* Halt MN diskset on all nodes by having node withdraw */
1652 		if (MD_MNSET_DESC(sd)) {
1653 			nd = sd->sd_nodelist;
1654 			/* All nodes are guaranteed to be ALIVE */
1655 			while (nd) {
1656 				/* Only withdraw nodes that are joined */
1657 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
1658 					nd = nd->nd_next;
1659 					continue;
1660 				}
1661 				/*
1662 				 * Going to set locally cached node flags to
1663 				 * rollback join so in case of error, the
1664 				 * rollback code knows which nodes to re-join.
1665 				 */
1666 				nd->nd_flags |= MD_MN_NODE_RB_JOIN;
1667 
1668 				/*
1669 				 * Be careful in ordering of following steps
1670 				 * so that recovery from a panic between
1671 				 * the steps is viable.
1672 				 * Only reset master info in rpc.metad -
1673 				 * don't reset local cached information
1674 				 * which will be used to set master information
1675 				 * back in case of failure (rollback).
1676 				 */
1677 				if (clnt_withdrawset(nd->nd_nodename, sp, ep))
1678 					goto rollback;
1679 				/* Sets withdraw flag on all nodes in list */
1680 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
1681 				    sd->sd_nodelist, MD_NR_WITHDRAW,
1682 				    NULL, ep)) {
1683 					goto rollback;
1684 				}
1685 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
1686 				    "", MD_MN_INVALID_NID, ep)) {
1687 					goto rollback;
1688 				}
1689 				nd = nd->nd_next;
1690 			}
1691 		} else {
1692 			if (halt_set(sp, ep))
1693 				goto rollback;
1694 		}
1695 
1696 		RB_TEST(15, "deletedrives", ep)
1697 	}
1698 
1699 	RB_TEST(16, "deletedrives", ep)
1700 
1701 out:
1702 	/*
1703 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
1704 	 * Send reinit command to mdcommd which forces it to get
1705 	 * fresh set description.
1706 	 */
1707 	if (suspendall_flag) {
1708 		/* Send reinit */
1709 		nd = sd->sd_nodelist;
1710 		/* All nodes are guaranteed to be ALIVE */
1711 		while (nd) {
1712 			/* Class is ignored for REINIT */
1713 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
1714 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
1715 				if (rval == 0)
1716 					(void) mdstealerror(ep, &xep);
1717 				rval = -1;
1718 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1719 				    "Unable to reinit rpc.mdcommd.\n"));
1720 			}
1721 			nd = nd->nd_next;
1722 		}
1723 	}
1724 
1725 	/*
1726 	 * Just resume all classes so that resume is the same whether
1727 	 * just one class was locked or all classes were locked.
1728 	 */
1729 	if ((suspend1_flag) || (suspendall_flag)) {
1730 		/* Send resume */
1731 		nd = sd->sd_nodelist;
1732 		/* All nodes are guaranteed to be ALIVE */
1733 		while (nd) {
1734 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1735 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1736 				if (rval == 0)
1737 					(void) mdstealerror(ep, &xep);
1738 				rval = -1;
1739 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1740 				    "Unable to resume rpc.mdcommd.\n"));
1741 			}
1742 			nd = nd->nd_next;
1743 		}
1744 		meta_ping_mnset(sp->setno);
1745 	}
1746 	if (lock_flag) {
1747 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1748 		if (MD_MNSET_DESC(sd)) {
1749 			nd = sd->sd_nodelist;
1750 			/* All nodes are guaranteed to be ALIVE */
1751 			while (nd) {
1752 				if (clnt_unlock_set(nd->nd_nodename,
1753 				    cl_sk, &xep)) {
1754 					if (rval == 0)
1755 						(void) mdstealerror(ep, &xep);
1756 					rval = -1;
1757 				}
1758 				nd = nd->nd_next;
1759 			}
1760 		} else {
1761 			for (i = 0; i < MD_MAXSIDES; i++) {
1762 				/* Skip empty slots */
1763 				if (sd->sd_nodes[i][0] == '\0')
1764 					continue;
1765 
1766 				if (clnt_unlock_set(sd->sd_nodes[i],
1767 				    cl_sk, &xep)) {
1768 					if (rval == 0)
1769 						(void) mdstealerror(ep, &xep);
1770 					rval = -1;
1771 				}
1772 			}
1773 		}
1774 		cl_set_setkey(NULL);
1775 	}
1776 
1777 	metafreedrivedesc(&dd);
1778 
1779 	if (flush_set_onerr) {
1780 		metaflushsetname(sp);
1781 		if (!(MD_MNSET_DESC(sd))) {
1782 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1783 		}
1784 	}
1785 
1786 	if (MD_MNSET_DESC(sd)) {
1787 		/* release signals back to what they were on entry */
1788 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
1789 			mdclrerror(&xep);
1790 	}
1791 
1792 	return (rval);
1793 
1794 rollback:
1795 	/* all signals already blocked for MN disket */
1796 	if (!(MD_MNSET_DESC(sd))) {
1797 		/* Make sure we are blocking all signals */
1798 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1799 			mdclrerror(&xep);
1800 	}
1801 
1802 	rval = -1;
1803 
1804 	max_genid = sd->sd_genid;
1805 
1806 	/* Set the master on all nodes first thing */
1807 	if (rb_level > 5) {
1808 		if (MD_MNSET_DESC(sd)) {
1809 			nd = sd->sd_nodelist;
1810 			/* All nodes are guaranteed to be ALIVE */
1811 			while (nd) {
1812 				if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
1813 					continue;
1814 				}
1815 				/*
1816 				 * Set master on all re-joining nodes to be
1817 				 * my cached view of master.
1818 				 */
1819 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
1820 				    sd->sd_mn_master_nodenm,
1821 				    sd->sd_mn_master_nodeid, &xep)) {
1822 					mdclrerror(&xep);
1823 				}
1824 			}
1825 		}
1826 	}
1827 
1828 	/* level 3 */
1829 	if (rb_level > 2) {
1830 		md_set_record		*sr;
1831 		md_mnset_record		*mnsr;
1832 		md_drive_record		*dr;
1833 		int			sr_drive_cnt;
1834 
1835 		/*
1836 		 * See if we have to re-add the drives specified.
1837 		 */
1838 		if (MD_MNSET_DESC(sd)) {
1839 			nd = sd->sd_nodelist;
1840 			/* All nodes are guaranteed to be ALIVE */
1841 			while (nd) {
1842 				/*
1843 				 * Must get current set record from each
1844 				 * node to see what else must be done
1845 				 * to recover.
1846 				 * Record should be for a multi-node diskset.
1847 				 */
1848 				if (clnt_mngetset(nd->nd_nodename, sp->setname,
1849 				    MD_SET_BAD, &mnsr, &xep) == -1) {
1850 					mdclrerror(&xep);
1851 					nd = nd->nd_next;
1852 					continue;
1853 				}
1854 
1855 				/*
1856 				 * If all drives are already there, skip
1857 				 * to next node.
1858 				 */
1859 				sr_drive_cnt = 0;
1860 				dr = mnsr->sr_drivechain;
1861 				while (dr) {
1862 					sr_drive_cnt++;
1863 					dr = dr->dr_next;
1864 				}
1865 				if (sr_drive_cnt == current_drv_cnt) {
1866 					free_sr((md_set_record *)mnsr);
1867 					nd = nd->nd_next;
1868 					continue;
1869 				}
1870 
1871 				/* Readd all drives */
1872 				if (clnt_adddrvs(nd->nd_nodename, sp, dd,
1873 				    mnsr->sr_ctime, mnsr->sr_genid, &xep) == -1)
1874 					mdclrerror(&xep);
1875 
1876 				free_sr((struct md_set_record *)mnsr);
1877 				nd = nd->nd_next;
1878 			}
1879 		} else {
1880 			for (i = 0; i < MD_MAXSIDES; i++) {
1881 				/* Skip empty slots */
1882 				if (sd->sd_nodes[i][0] == '\0')
1883 					continue;
1884 
1885 				/* Record should be for a non-multi-node set */
1886 				if (clnt_getset(sd->sd_nodes[i], sp->setname,
1887 				    MD_SET_BAD, &sr, &xep) == -1) {
1888 					mdclrerror(&xep);
1889 					continue;
1890 				}
1891 
1892 				/*
1893 				 * Set record structure was allocated from RPC
1894 				 * routine getset so this structure is only of
1895 				 * size md_set_record even if the MN flag is
1896 				 * set.  So, clear the flag so that the free
1897 				 * code doesn't attempt to free a structure
1898 				 * the size of md_mnset_record.
1899 				 */
1900 				if (MD_MNSET_REC(sr)) {
1901 					sr->sr_flags &= ~MD_SR_MN;
1902 					free_sr(sr);
1903 					continue;
1904 				}
1905 
1906 				/* Drive already added, skip to next node */
1907 				if (sr->sr_drivechain != NULL) {
1908 					free_sr(sr);
1909 					continue;
1910 				}
1911 
1912 				if (clnt_adddrvs(sd->sd_nodes[i], sp, dd,
1913 				    sr->sr_ctime, sr->sr_genid, &xep) == -1)
1914 					mdclrerror(&xep);
1915 
1916 				free_sr(sr);
1917 			}
1918 		}
1919 		max_genid += 2;
1920 	}
1921 
1922 	/*
1923 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
1924 	 * At this point in time, don't know which nodes are joined
1925 	 * to the set.  So, send a reinit command to mdcommd
1926 	 * which forces it to get fresh set description.  Then send resume.
1927 	 *
1928 	 * Later, this code will use rpc.mdcommd messages to reattach disks
1929 	 * and then rpc.mdcommd may be suspended again, rest of the nodes
1930 	 * joined, rpc.mdcommd reinited and then resumed.
1931 	 */
1932 	if (suspendall_flag) {
1933 		/* Send reinit */
1934 		nd = sd->sd_nodelist;
1935 		/* All nodes are guaranteed to be ALIVE */
1936 		while (nd) {
1937 			/* Class is ignored for REINIT */
1938 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
1939 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
1940 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
1941 				    "Unable to reinit rpc.mdcommd.\n"));
1942 				mdclrerror(&xep);
1943 			}
1944 			nd = nd->nd_next;
1945 		}
1946 
1947 		/* Send resume */
1948 		nd = sd->sd_nodelist;
1949 		/* All nodes are guaranteed to be ALIVE */
1950 		while (nd) {
1951 			/*
1952 			 * Resume all classes but class 1 so that lock is held
1953 			 * against meta* commands.
1954 			 * To later resume class1, must issue a class0 resume.
1955 			 */
1956 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1957 			    sp, MD_MSG_CLASS0,
1958 			    MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
1959 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
1960 				    "Unable to resume rpc.mdcommd.\n"));
1961 				mdclrerror(&xep);
1962 			}
1963 			nd = nd->nd_next;
1964 		}
1965 		meta_ping_mnset(sp->setno);
1966 	}
1967 
1968 	/* level 2 */
1969 	if (rb_level > 1) {
1970 		mdnamelist_t	*nlp;
1971 		mdname_t	*np;
1972 
1973 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1974 			uint_t	rep_slice;
1975 
1976 			if ((meta_replicaslice(ddp->dd_dnp,
1977 			    &rep_slice, &xep) != 0) ||
1978 			    ((np = metaslicename(ddp->dd_dnp, rep_slice,
1979 				&xep)) == NULL)) {
1980 				mdclrerror(&xep);
1981 				continue;
1982 			}
1983 			nlp = NULL;
1984 			(void) metanamelist_append(&nlp, np);
1985 
1986 			if (meta_db_attach(sp, nlp,
1987 			    (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
1988 			    &sd->sd_ctime, ddp->dd_dbcnt, ddp->dd_dbsize,
1989 			    NULL, &xep) == -1)
1990 				mdclrerror(&xep);
1991 
1992 			metafreenamelist(nlp);
1993 		}
1994 		/* Re-balance */
1995 		if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
1996 			mdclrerror(&xep);
1997 	}
1998 
1999 	/* level 4 */
2000 	if (rb_level > 3) {
2001 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
2002 			if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
2003 				mdclrerror(&xep);
2004 		}
2005 	}
2006 
2007 	/* level 5 */
2008 	if (rb_level > 4) {
2009 		if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1)
2010 			mdclrerror(&xep);
2011 	}
2012 
2013 	/*
2014 	 * If at least one node needs to be rejoined to MN diskset,
2015 	 * then suspend commd again.
2016 	 */
2017 	if (MD_MNSET_DESC(sd)) {
2018 		nd = sd->sd_nodelist;
2019 		/* All nodes are guaranteed to be ALIVE */
2020 		while (nd) {
2021 			if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
2022 				nd = nd->nd_next;
2023 				continue;
2024 			}
2025 			break;
2026 		}
2027 		if (nd) {
2028 			/*
2029 			 * Found node that will be rejoined so
2030 			 * notify rpc.mdcommd on all nodes of a nodelist change.
2031 			 * Start by suspending rpc.mdcommd (which drains it of
2032 			 * all messages), then change the nodelist followed by
2033 			 * a reinit and resume.
2034 			 */
2035 			nd = sd->sd_nodelist;
2036 			/* All nodes are guaranteed to be ALIVE */
2037 			while (nd) {
2038 				if (clnt_mdcommdctl(nd->nd_nodename,
2039 				    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0,
2040 				    MD_MSCF_NO_FLAGS, &xep)) {
2041 					mdclrerror(&xep);
2042 				}
2043 				suspendall_flag_rb = 1;
2044 				nd = nd->nd_next;
2045 			}
2046 		}
2047 	}
2048 
2049 
2050 
2051 	/* level 6 */
2052 	if (rb_level > 5) {
2053 		if (MD_MNSET_DESC(sd)) {
2054 			int	join_flags = 0;
2055 
2056 			nd = sd->sd_nodelist;
2057 			/* All nodes are guaranteed to be ALIVE */
2058 			while (nd) {
2059 				/* Only rejoin nodes that were joined before */
2060 				if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
2061 					nd = nd->nd_next;
2062 					continue;
2063 				}
2064 				/*
2065 				 * Rejoin nodes to same state as before -
2066 				 * either STALE or non-STALE.
2067 				 */
2068 				if (stale_bool == TRUE)
2069 					join_flags = MNSET_IS_STALE;
2070 				if (clnt_joinset(nd->nd_nodename, sp,
2071 				    join_flags, &xep))
2072 					mdclrerror(&xep);
2073 				/* Sets OWN flag on all nodes in list */
2074 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2075 				    sd->sd_nodelist, MD_NR_JOIN, NULL, &xep)) {
2076 					mdclrerror(&xep);
2077 				}
2078 				nd = nd->nd_next;
2079 			}
2080 		} else {
2081 			if (setup_db_bydd(sp, dd, TRUE, &xep) == -1)
2082 				mdclrerror(&xep);
2083 
2084 			/* No special flag for traditional diskset */
2085 			if (snarf_set(sp, NULL, &xep))
2086 				mdclrerror(&xep);
2087 		}
2088 	}
2089 
2090 	/* level 1 */
2091 	if (rb_level > 0) {
2092 		/*
2093 		 * Mark the drives as OK.
2094 		 */
2095 		if (MD_MNSET_DESC(sd)) {
2096 			nd = sd->sd_nodelist;
2097 			/* All nodes are guaranteed to be ALIVE */
2098 			while (nd) {
2099 				/*
2100 				 * Must be last action before unlock.
2101 				 * In case of panic, recovery code checks
2102 				 * for MD_DR_OK to know that drive
2103 				 * and possible master are fully added back.
2104 				 */
2105 				if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
2106 				    MD_DR_OK, &xep) == -1)
2107 					mdclrerror(&xep);
2108 				nd = nd->nd_next;
2109 			}
2110 		} else {
2111 			for (i = 0; i < MD_MAXSIDES; i++) {
2112 				/* Skip empty slots */
2113 				if (sd->sd_nodes[i][0] == '\0')
2114 					continue;
2115 
2116 				if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
2117 				    MD_DR_OK, &xep) == -1)
2118 					mdclrerror(&xep);
2119 
2120 			}
2121 		}
2122 		max_genid += 2;
2123 		resync_genid(sp, sd, max_genid, 0, NULL);
2124 	}
2125 	/*
2126 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2127 	 * Send a reinit command to mdcommd which forces it to get
2128 	 * fresh set description.
2129 	 */
2130 	if (suspendall_flag_rb) {
2131 		/* Send reinit */
2132 		nd = sd->sd_nodelist;
2133 		/* All nodes are guaranteed to be ALIVE */
2134 		while (nd) {
2135 			/* Class is ignored for REINIT */
2136 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2137 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2138 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
2139 				    "Unable to reinit rpc.mdcommd.\n"));
2140 				mdclrerror(&xep);
2141 			}
2142 			nd = nd->nd_next;
2143 		}
2144 	}
2145 
2146 	/*
2147 	 * Just resume all classes so that resume is the same whether
2148 	 * just one class was locked or all classes were locked.
2149 	 */
2150 	if ((suspend1_flag) || (suspendall_flag_rb) || (suspendall_flag)) {
2151 		/* Send resume */
2152 		nd = sd->sd_nodelist;
2153 		/* All nodes are guaranteed to be ALIVE */
2154 		while (nd) {
2155 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2156 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2157 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
2158 				    "Unable to resume rpc.mdcommd.\n"));
2159 				mdclrerror(&xep);
2160 			}
2161 			nd = nd->nd_next;
2162 		}
2163 		meta_ping_mnset(sp->setno);
2164 	}
2165 
2166 
2167 	/* level 0 */
2168 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2169 	/* Don't test lock flag since guaranteed to be set if in rollback */
2170 	if (MD_MNSET_DESC(sd)) {
2171 		nd = sd->sd_nodelist;
2172 		/* All nodes are guaranteed to be ALIVE */
2173 		while (nd) {
2174 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
2175 				mdclrerror(&xep);
2176 			nd = nd->nd_next;
2177 		}
2178 	} else {
2179 		for (i = 0; i < MD_MAXSIDES; i++) {
2180 			/* Skip empty slots */
2181 			if (sd->sd_nodes[i][0] == '\0')
2182 				continue;
2183 
2184 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
2185 				mdclrerror(&xep);
2186 		}
2187 	}
2188 	cl_set_setkey(NULL);
2189 
2190 	/* release signals back to what they were on entry */
2191 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2192 		mdclrerror(&xep);
2193 
2194 	metafreedrivedesc(&dd);
2195 
2196 	if (flush_set_onerr) {
2197 		metaflushsetname(sp);
2198 		if (!(MD_MNSET_DESC(sd))) {
2199 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
2200 		}
2201 	}
2202 
2203 	return (rval);
2204 }
2205