1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 /*
29 * Metadevice diskset interfaces
30 */
31
32 #include <meta.h>
33 #include <mdmn_changelog.h>
34 #include "meta_set_prv.h"
35 #include "meta_repartition.h"
36
37 static int
check_setnodes_againstdrivelist(mdsetname_t * sp,mddrivenamelist_t * dnlp,md_error_t * ep)38 check_setnodes_againstdrivelist(
39 mdsetname_t *sp,
40 mddrivenamelist_t *dnlp,
41 md_error_t *ep
42 )
43 {
44 md_set_desc *sd;
45 mddrivenamelist_t *p;
46 int i;
47 md_mnnode_desc *nd;
48
49 if ((sd = metaget_setdesc(sp, ep)) == NULL)
50 return (-1);
51
52 if (MD_MNSET_DESC(sd)) {
53 nd = sd->sd_nodelist;
54 while (nd) {
55 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
56 nd = nd->nd_next;
57 continue;
58 }
59 for (p = dnlp; p != NULL; p = p->next)
60 if (checkdrive_onnode(sp, p->drivenamep,
61 nd->nd_nodename, ep))
62 return (-1);
63 nd = nd->nd_next;
64 }
65 } else {
66 for (i = 0; i < MD_MAXSIDES; i++) {
67 /* Skip empty slots */
68 if (sd->sd_nodes[i][0] == '\0')
69 continue;
70
71 for (p = dnlp; p != NULL; p = p->next)
72 if (checkdrive_onnode(sp, p->drivenamep,
73 sd->sd_nodes[i], ep))
74 return (-1);
75 }
76 }
77 return (0);
78 }
79
80 static int
drvsuniq(mdsetname_t * sp,mddrivenamelist_t * dnlp,md_error_t * ep)81 drvsuniq(mdsetname_t *sp, mddrivenamelist_t *dnlp, md_error_t *ep)
82 {
83 mddrivenamelist_t *dl1, *dl2;
84 mddrivename_t *dn1, *dn2;
85
86 for (dl1 = dnlp; dl1 != NULL; dl1 = dl1->next) {
87 dn1 = dl1->drivenamep;
88
89 for (dl2 = dl1->next; dl2 != NULL; dl2 = dl2->next) {
90 dn2 = dl2->drivenamep;
91 if (strcmp(dn1->cname, dn2->cname) != 0)
92 continue;
93
94 return (mddserror(ep, MDE_DS_DUPDRIVE, sp->setno,
95 NULL, dn1->cname, sp->setname));
96 }
97 }
98 return (0);
99 }
100
101 static md_drive_desc *
metaget_drivedesc_fromdrivelist(mdsetname_t * sp,mddrivenamelist_t * dnlp,uint_t flags,md_error_t * ep)102 metaget_drivedesc_fromdrivelist(
103 mdsetname_t *sp,
104 mddrivenamelist_t *dnlp,
105 uint_t flags,
106 md_error_t *ep
107 )
108 {
109 mddrivenamelist_t *p;
110 md_drive_desc *dd = NULL;
111 md_set_desc *sd;
112
113 if ((sd = metaget_setdesc(sp, ep)) == NULL)
114 return (NULL);
115
116 for (p = dnlp; p != NULL; p = p->next) {
117 (void) metadrivedesc_append(&dd, p->drivenamep, 0, 0,
118 sd->sd_ctime, sd->sd_genid, flags);
119 }
120
121 return (dd);
122 }
123
124 /*
125 * Exported Entry Points
126 */
127
128 int
meta_make_sidenmlist(mdsetname_t * sp,mddrivename_t * dnp,int import_flag,md_im_drive_info_t * midp,md_error_t * ep)129 meta_make_sidenmlist(
130 mdsetname_t *sp,
131 mddrivename_t *dnp,
132 int import_flag, /* flags partial import */
133 md_im_drive_info_t *midp, /* import drive information */
134 md_error_t *ep
135 )
136 {
137 mdsidenames_t *sn, **sn_next;
138 mdname_t *np;
139 int done;
140 side_t sideno = MD_SIDEWILD;
141 uint_t rep_slice;
142 char *bname;
143
144 if (!import_flag) {
145 /*
146 * Normal (aka NOT partial import) code path.
147 */
148 if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
149 return (-1);
150 }
151
152 dnp->side_names_key = MD_KEYWILD;
153
154 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
155 return (-1);
156 bname = Strdup(np->bname);
157 } else {
158 /*
159 * When doing a partial import, we'll get the needed
160 * information from somewhere other than the system.
161 */
162 dnp->side_names_key = MD_KEYWILD;
163 bname = Strdup(midp->mid_devname);
164 }
165 metaflushsidenames(dnp);
166 sn_next = &dnp->side_names;
167 /*CONSTCOND*/
168 while (1) {
169 sn = Zalloc(sizeof (*sn));
170
171 if ((done = meta_getnextside_devinfo(sp, bname, &sideno,
172 &sn->cname, &sn->dname, &sn->mnum, ep)) == -1) {
173 if (import_flag) {
174 mdclrerror(ep);
175 sn->dname = Strdup(midp->mid_driver_name);
176 sn->mnum = midp->mid_mnum;
177 } else {
178 Free(sn);
179 Free(bname);
180 return (-1);
181 }
182 }
183
184 if (done == 0) {
185 Free(sn);
186 Free(bname);
187 return (0);
188 }
189
190 sn->sideno = sideno;
191
192 /* Add to the end of the linked list */
193 assert(*sn_next == NULL);
194 *sn_next = sn;
195 sn_next = &sn->next;
196 }
197 /*NOTREACHED*/
198 }
199
200 int
meta_set_adddrives(mdsetname_t * sp,mddrivenamelist_t * dnlp,daddr_t dbsize,int force_label,md_error_t * ep)201 meta_set_adddrives(
202 mdsetname_t *sp,
203 mddrivenamelist_t *dnlp,
204 daddr_t dbsize,
205 int force_label,
206 md_error_t *ep
207 )
208 {
209 md_set_desc *sd;
210 md_drive_desc *dd = NULL, *curdd = NULL, *ddp;
211 int i;
212 mddrivenamelist_t *p;
213 mhd_mhiargs_t mhiargs;
214 int rval = 0;
215 md_timeval32_t now;
216 sigset_t oldsigs;
217 ulong_t genid;
218 ulong_t max_genid = 0;
219 md_setkey_t *cl_sk;
220 int rb_level = 0;
221 md_error_t xep = mdnullerror;
222 md_mnnode_desc *nd;
223 int suspendall_flag = 0;
224 int suspend1_flag = 0;
225 int lock_flag = 0;
226 int flush_set_onerr = 0;
227 md_replicalist_t *rlp = NULL, *rl;
228
229 if ((sd = metaget_setdesc(sp, ep)) == NULL)
230 return (-1);
231
232 /* Make sure we own the set */
233 if (meta_check_ownership(sp, ep) != 0)
234 return (-1);
235
236 /*
237 * The drive and node records are stored in the local mddbs of each
238 * node in the diskset. Each node's rpc.metad daemon reads in the set,
239 * drive and node records from that node's local mddb and caches them
240 * internally. Any process needing diskset information contacts its
241 * local rpc.metad to get this information. Since each node in the
242 * diskset is independently reading the set information from its local
243 * mddb, the set, drive and node records in the local mddbs must stay
244 * in-sync, so that all nodes have a consistent view of the diskset.
245 *
246 * For a multinode diskset, explicitly verify that all nodes in the
247 * diskset are ALIVE (i.e. are in the API membership list). Otherwise,
248 * fail this operation since all nodes must be ALIVE in order to add
249 * the new drive record to their local mddb. If a panic of this node
250 * leaves the local mddbs set, node and drive records out-of-sync, the
251 * reconfig cycle will fix the local mddbs and force them back into
252 * synchronization.
253 */
254 if (MD_MNSET_DESC(sd)) {
255 nd = sd->sd_nodelist;
256 while (nd) {
257 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
258 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
259 sp->setno,
260 nd->nd_nodename, NULL, sp->setname);
261 return (-1);
262 }
263 nd = nd->nd_next;
264 }
265 }
266
267 if (drvsuniq(sp, dnlp, ep) == -1)
268 return (-1);
269
270 /*
271 * Lock the set on current set members.
272 * Set locking done much earlier for MN diskset than for traditional
273 * diskset since lock_set and SUSPEND are used to protect against
274 * other meta* commands running on the other nodes.
275 */
276 if (MD_MNSET_DESC(sd)) {
277 /* Make sure we are blocking all signals */
278 if (procsigs(TRUE, &oldsigs, &xep) < 0)
279 mdclrerror(&xep);
280
281 nd = sd->sd_nodelist;
282 /* All nodes are guaranteed to be ALIVE */
283 while (nd) {
284 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
285 rval = -1;
286 goto out;
287 }
288 lock_flag = 1;
289 nd = nd->nd_next;
290 }
291 /*
292 * Lock out other meta* commands by suspending
293 * class 1 messages across the diskset.
294 */
295 nd = sd->sd_nodelist;
296 /* All nodes are guaranteed to be ALIVE */
297 while (nd) {
298 if (clnt_mdcommdctl(nd->nd_nodename,
299 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
300 MD_MSCF_NO_FLAGS, ep)) {
301 rval = -1;
302 goto out;
303 }
304 suspend1_flag = 1;
305 nd = nd->nd_next;
306 }
307 }
308
309 if (check_setnodes_againstdrivelist(sp, dnlp, ep)) {
310 rval = -1;
311 goto out;
312 }
313
314 for (p = dnlp; p != NULL; p = p->next) {
315 mdsetname_t *tmp;
316
317 if (meta_is_drive_in_anyset(p->drivenamep, &tmp, FALSE,
318 ep) == -1) {
319 rval = -1;
320 goto out;
321 }
322
323 if (tmp != NULL) {
324 (void) mddserror(ep, MDE_DS_DRIVEINSET, sp->setno,
325 tmp->setname, p->drivenamep->cname, sp->setname);
326 rval = -1;
327 goto out;
328 }
329 }
330
331 /* END CHECK CODE */
332
333 /*
334 * This is a separate loop (from above) so that we validate all the
335 * drives handed to us before we repartition any one drive.
336 */
337 for (p = dnlp; p != NULL; p = p->next) {
338 if (meta_repartition_drive(sp,
339 p->drivenamep, force_label == TRUE ? MD_REPART_FORCE : 0,
340 NULL, /* Don't return the VTOC. */
341 ep) != 0) {
342 rval = -1;
343 goto out;
344 }
345 /*
346 * Create the names for the drives we are adding per side.
347 */
348 if (meta_make_sidenmlist(sp, p->drivenamep, 0, NULL,
349 ep) == -1) {
350 rval = -1;
351 goto out;
352 }
353 }
354
355 /*
356 * Get the list of drives descriptors that we are adding.
357 */
358 dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
359
360 if (! mdisok(ep)) {
361 rval = -1;
362 goto out;
363 }
364
365 /*
366 * Get the set timeout information.
367 */
368 (void) memset(&mhiargs, '\0', sizeof (mhiargs));
369 if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
370 rval = -1;
371 goto out;
372 }
373
374 /*
375 * Get timestamp and generation id for new records
376 */
377 now = sd->sd_ctime;
378 genid = sd->sd_genid;
379
380
381 /* At this point, in case of error, set should be flushed. */
382 flush_set_onerr = 1;
383
384 /* Lock the set on current set members */
385 if (!(MD_MNSET_DESC(sd))) {
386 md_rb_sig_handling_on();
387 for (i = 0; i < MD_MAXSIDES; i++) {
388 /* Skip empty slots */
389 if (sd->sd_nodes[i][0] == '\0')
390 continue;
391
392 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
393 rval = -1;
394 goto out;
395 }
396 lock_flag = 1;
397 }
398 }
399
400 /*
401 * Get drive descriptors for the drives that are currently in the set.
402 */
403 curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
404 if (! mdisok(ep))
405 goto rollback;
406
407 /*
408 * If first drive being added to set, set the mastership
409 * of the multinode diskset to be this node.
410 * Only set it on this node. If all goes well
411 * and there are no errors, the mastership of this node will be set
412 * on all nodes in user space and in the kernel.
413 */
414 if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
415 if (clnt_mnsetmaster(mynode(), sp,
416 sd->sd_mn_mynode->nd_nodename,
417 sd->sd_mn_mynode->nd_nodeid, ep)) {
418 goto rollback;
419 }
420 /*
421 * Set this up in my local cache of the set desc so that
422 * the set descriptor won't have to be gotten again from
423 * rpc.metad. If it is flushed and gotten again, these
424 * values will be set in sr2setdesc.
425 */
426 sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
427 (void) strcpy(sd->sd_mn_master_nodenm,
428 sd->sd_mn_mynode->nd_nodename);
429 sd->sd_mn_am_i_master = 1;
430 }
431
432 RB_TEST(1, "adddrives", ep)
433
434 RB_PREEMPT;
435 rb_level = 1; /* level 1 */
436
437 RB_TEST(2, "adddrives", ep)
438
439 /*
440 * Add the drive records for the drives that we are adding to
441 * each host in the set. Marks the drive as MD_DR_ADD.
442 */
443 if (MD_MNSET_DESC(sd)) {
444 nd = sd->sd_nodelist;
445 /* All nodes are guaranteed to be ALIVE */
446 while (nd) {
447 if (clnt_adddrvs(nd->nd_nodename, sp, dd, now, genid,
448 ep) == -1)
449 goto rollback;
450
451 RB_TEST(3, "adddrives", ep)
452 nd = nd->nd_next;
453 }
454 } else {
455 for (i = 0; i < MD_MAXSIDES; i++) {
456 /* Skip empty slots */
457 if (sd->sd_nodes[i][0] == '\0')
458 continue;
459
460 if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, now, genid,
461 ep) == -1)
462 goto rollback;
463
464 RB_TEST(3, "adddrives", ep)
465 }
466 }
467
468 RB_TEST(4, "adddrives", ep)
469
470 RB_PREEMPT;
471 rb_level = 2; /* level 2 */
472
473 RB_TEST(5, "adddrives", ep)
474
475 /*
476 * Take ownership of the added drives.
477 */
478 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
479 if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
480 goto rollback;
481 }
482
483 /*
484 * If this is not a MN set and the state flags do not indicate the
485 * presence of devids, update the set records on all nodes.
486 */
487 if (!(sd->sd_flags & MD_SR_MB_DEVID) && !(MD_MNSET_DESC(sd))) {
488 if (meta_update_mb(sp, dd, ep) == 0) {
489 mdclrerror(ep);
490
491 /* update the sr_flags on all hosts */
492 for (i = 0; i < MD_MAXSIDES; i++) {
493 if (sd->sd_nodes[i][0] == '\0')
494 continue;
495
496 if (clnt_upd_sr_flags(sd->sd_nodes[i],
497 sp, (sd->sd_flags | MD_SR_MB_DEVID), ep))
498 goto rollback;
499 }
500 }
501 }
502
503 RB_TEST(6, "adddrives", ep)
504
505 RB_PREEMPT;
506 rb_level = 3; /* level 3 */
507
508 RB_TEST(7, "adddrives", ep)
509
510 /*
511 * Balance the DB's according to the list of existing drives and the
512 * list of added drives.
513 */
514 if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
515 goto rollback;
516
517 /*
518 * Slam a dummy master block on all the disks that we are adding
519 * that don't have replicas on them.
520 * Used by diskset import if the disksets are remotely replicated
521 */
522 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
523 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
524 uint_t rep_slice;
525 int fd = -1;
526 mdname_t *np = NULL;
527 char *drive_name;
528
529 drive_name = ddp->dd_dnp->cname;
530
531 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
532 char *rep_name;
533
534 rep_name =
535 rl->rl_repp->r_namep->drivenamep->cname;
536
537 if (strcmp(drive_name, rep_name) == 0) {
538 /*
539 * Disk has a replica on it so don't
540 * add dummy master block.
541 */
542 break;
543 }
544 }
545 if (rl == NULL) {
546 /*
547 * Drive doesn't have a replica on it so
548 * we need a dummy master block. Add it.
549 */
550 if (meta_replicaslice(ddp->dd_dnp, &rep_slice,
551 &xep) != 0) {
552 mdclrerror(&xep);
553 continue;
554 }
555
556 if ((np = metaslicename(ddp->dd_dnp, rep_slice,
557 &xep)) == NULL) {
558 mdclrerror(&xep);
559 continue;
560 }
561
562 if ((fd = open(np->rname, O_RDWR)) >= 0) {
563 meta_mkdummymaster(sp, fd, 16);
564 (void) close(fd);
565 }
566 }
567 }
568 }
569
570 if ((curdd == NULL) && (MD_MNSET_DESC(sd))) {
571 /*
572 * Notify rpc.mdcommd on all nodes of a nodelist change.
573 * Start by suspending rpc.mdcommd (which drains it of all
574 * messages), then change the nodelist followed by a reinit
575 * and resume.
576 */
577 nd = sd->sd_nodelist;
578 /* All nodes are guaranteed to be ALIVE */
579 while (nd) {
580 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
581 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
582 rval = -1;
583 goto out;
584 }
585 suspendall_flag = 1;
586 nd = nd->nd_next;
587 }
588 }
589
590 /*
591 * If a MN diskset and this is the first disk(s) being added
592 * to set, then pre-allocate change log records here.
593 * When the other nodes are joined into the MN diskset, the
594 * USER records will just be snarfed in.
595 */
596 if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
597 if (mdmn_allocate_changelog(sp, ep) != 0)
598 goto rollback;
599 }
600
601 /*
602 * Mark the drives MD_DR_OK.
603 * If first drive being added to MN diskset, then set
604 * master on all nodes to be this node and then join
605 * all alive nodes (nodes in membership list) to set.
606 */
607 if (MD_MNSET_DESC(sd)) {
608 nd = sd->sd_nodelist;
609 /* All nodes are guaranteed to be ALIVE */
610 while (nd) {
611 /* don't set master on this node - done earlier */
612 if ((curdd == NULL) && (nd->nd_nodeid !=
613 sd->sd_mn_mynode->nd_nodeid)) {
614 /*
615 * Set master on all alive nodes since
616 * all alive nodes will become joined nodes.
617 */
618 if (clnt_mnsetmaster(nd->nd_nodename, sp,
619 sd->sd_mn_mynode->nd_nodename,
620 sd->sd_mn_mynode->nd_nodeid, ep)) {
621 goto rollback;
622 }
623 }
624
625 if (curdd == NULL) {
626 /*
627 * No special flags for join set. Since
628 * all nodes are joining if 1st drive is being
629 * added to set then all nodes will be either
630 * STALE or non-STALE and each node can
631 * determine this on its own.
632 */
633 if (clnt_joinset(nd->nd_nodename, sp,
634 NULL, ep)) {
635 goto rollback;
636 }
637 /* Sets join node flag on all nodes in list */
638 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
639 sd->sd_nodelist, MD_NR_JOIN, NULL, ep)) {
640 goto rollback;
641 }
642 }
643
644 /*
645 * Set MD_DR_OK as last thing before unlock.
646 * In case of panic on this node, recovery
647 * code can check for MD_DR_OK to determine
648 * status of diskset.
649 */
650 if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
651 MD_DR_OK, ep) == -1)
652 goto rollback;
653
654
655 RB_TEST(8, "adddrives", ep)
656 nd = nd->nd_next;
657 }
658 } else {
659 for (i = 0; i < MD_MAXSIDES; i++) {
660 /* Skip empty slots */
661 if (sd->sd_nodes[i][0] == '\0')
662 continue;
663
664 if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, MD_DR_OK,
665 ep) == -1)
666 goto rollback;
667
668 RB_TEST(8, "adddrives", ep)
669 }
670 }
671
672 RB_TEST(9, "adddrives", ep)
673
674 out:
675 /*
676 * Notify rpc.mdcommd on all nodes of a nodelist change.
677 * Send reinit command to mdcommd which forces it to get
678 * fresh set description.
679 */
680 if (suspendall_flag) {
681 /* Send reinit */
682 nd = sd->sd_nodelist;
683 /* All nodes are guaranteed to be ALIVE */
684 while (nd) {
685 /* Class is ignored for REINIT */
686 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
687 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
688 if (rval == 0)
689 (void) mdstealerror(ep, &xep);
690 rval = -1;
691 mde_perror(ep, dgettext(TEXT_DOMAIN,
692 "Unable to reinit rpc.mdcommd.\n"));
693 }
694 nd = nd->nd_next;
695 }
696 }
697 /*
698 * Unlock diskset by resuming messages across the diskset.
699 * Just resume all classes so that resume is the same whether
700 * just one class was locked or all classes were locked.
701 */
702 if ((suspend1_flag) || (suspendall_flag)) {
703 nd = sd->sd_nodelist;
704 /* All nodes are guaranteed to be ALIVE */
705 while (nd) {
706 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
707 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
708 if (rval == 0)
709 (void) mdstealerror(ep, &xep);
710 rval = -1;
711 mde_perror(ep, dgettext(TEXT_DOMAIN,
712 "Unable to resume rpc.mdcommd.\n"));
713 }
714 nd = nd->nd_next;
715 }
716 meta_ping_mnset(sp->setno);
717 }
718
719 if (lock_flag) {
720 cl_sk = cl_get_setkey(sp->setno, sp->setname);
721 if (MD_MNSET_DESC(sd)) {
722 nd = sd->sd_nodelist;
723 /* All nodes are guaranteed to be ALIVE */
724 while (nd) {
725 if (clnt_unlock_set(nd->nd_nodename,
726 cl_sk, &xep)) {
727 if (rval == 0)
728 (void) mdstealerror(ep, &xep);
729 rval = -1;
730 }
731 nd = nd->nd_next;
732 }
733 } else {
734 for (i = 0; i < MD_MAXSIDES; i++) {
735 /* Skip empty slots */
736 if (sd->sd_nodes[i][0] == '\0')
737 continue;
738
739 if (clnt_unlock_set(sd->sd_nodes[i],
740 cl_sk, &xep)) {
741 if (rval == 0)
742 (void) mdstealerror(ep, &xep);
743 rval = -1;
744 }
745 }
746 }
747 cl_set_setkey(NULL);
748 }
749
750 metafreedrivedesc(&dd);
751
752 if (flush_set_onerr) {
753 metaflushsetname(sp);
754 if (!(MD_MNSET_DESC(sd))) {
755 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
756 }
757 }
758
759 if (MD_MNSET_DESC(sd)) {
760 /* release signals back to what they were on entry */
761 if (procsigs(FALSE, &oldsigs, &xep) < 0)
762 mdclrerror(&xep);
763 }
764
765 return (rval);
766
767 rollback:
768 /* all signals already blocked for MN disket */
769 if (!(MD_MNSET_DESC(sd))) {
770 /* Make sure we are blocking all signals */
771 if (procsigs(TRUE, &oldsigs, &xep) < 0)
772 mdclrerror(&xep);
773 }
774
775 rval = -1;
776
777 max_genid = sd->sd_genid;
778
779 /* level 3 */
780 if (rb_level > 2) {
781 /*
782 * Since the add drive operation is failing, need
783 * to reset config back to the way it was
784 * before the add drive opration.
785 * If a MN diskset and this is the first drive being added,
786 * then reset master on all ALIVE nodes (which is all nodes)
787 * since the master would have not been set previously.
788 * Don't reset master on this node, since this
789 * is done later.
790 * This is ok to fail since next node to add first
791 * disk to diskset will also set the master on all nodes.
792 *
793 * Also, if this is the first drive being added,
794 * need to have each node withdraw itself from the set.
795 */
796 if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
797 nd = sd->sd_nodelist;
798 /* All nodes are guaranteed to be ALIVE */
799 while (nd) {
800 /*
801 * Be careful with ordering in case of
802 * panic between the steps and the
803 * effect on recovery during reconfig.
804 */
805 if (clnt_withdrawset(nd->nd_nodename, sp, &xep))
806 mdclrerror(&xep);
807
808 /* Sets withdraw flag on all nodes in list */
809 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
810 sd->sd_nodelist, MD_NR_WITHDRAW,
811 NULL, &xep)) {
812 mdclrerror(&xep);
813 }
814
815 /* Skip this node */
816 if (nd->nd_nodeid ==
817 sd->sd_mn_mynode->nd_nodeid) {
818 nd = nd->nd_next;
819 continue;
820 }
821 /* Reset master on all of the other nodes. */
822 if (clnt_mnsetmaster(nd->nd_nodename, sp,
823 "", MD_MN_INVALID_NID, &xep))
824 mdclrerror(&xep);
825 nd = nd->nd_next;
826 }
827 }
828 }
829
830 /*
831 * Send resume command to mdcommd. Don't send reinit command
832 * since nodelist should not have changed.
833 * If suspendall_flag is set, then user would have been adding
834 * first drives to set. Since this failed, there is certainly
835 * no reinit message to send to rpc.commd since no nodes will
836 * be joined to set at the end of this metaset command.
837 */
838 if (suspendall_flag) {
839 /* Send resume */
840 nd = sd->sd_nodelist;
841 /* All nodes are guaranteed to be ALIVE */
842 while (nd) {
843 /*
844 * Resume all classes but class 1 so that lock is held
845 * against meta* commands.
846 * To later resume class1, must issue a class0 resume.
847 */
848 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
849 sp, MD_MSG_CLASS0,
850 MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
851 mde_perror(&xep, dgettext(TEXT_DOMAIN,
852 "Unable to resume rpc.mdcommd.\n"));
853 mdclrerror(&xep);
854 }
855 nd = nd->nd_next;
856 }
857 meta_ping_mnset(sp->setno);
858 }
859
860 /* level 3 */
861 if (rb_level > 2) {
862 mdnamelist_t *nlp;
863 mdname_t *np;
864
865 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
866 uint_t rep_slice;
867
868 if ((meta_replicaslice(ddp->dd_dnp,
869 &rep_slice, &xep) != 0) ||
870 ((np = metaslicename(ddp->dd_dnp, rep_slice,
871 &xep)) == NULL)) {
872 mdclrerror(&xep);
873 continue;
874 }
875 nlp = NULL;
876 (void) metanamelist_append(&nlp, np);
877
878 if (meta_db_detach(sp, nlp,
879 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, &xep))
880 mdclrerror(&xep);
881
882 metafreenamelist(nlp);
883 }
884
885 /* Re-balance */
886 if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
887 mdclrerror(&xep);
888
889 /* Only if we are adding the first drive */
890 /* Handled MN diskset above. */
891 if ((curdd == NULL) && !(MD_MNSET_DESC(sd))) {
892 if (clnt_stimeout(mynode(), sp, &defmhiargs,
893 &xep) == -1)
894 mdclrerror(&xep);
895
896 /* This is needed because of a corner case */
897 if (halt_set(sp, &xep))
898 mdclrerror(&xep);
899 }
900 max_genid++;
901 }
902
903 /* level 2 */
904 if (rb_level > 1) {
905 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
906 if (rel_own_bydd(sp, dd, TRUE, &xep))
907 mdclrerror(&xep);
908 }
909 }
910
911 /* level 1 */
912 if (rb_level > 0) {
913 if (MD_MNSET_DESC(sd)) {
914 nd = sd->sd_nodelist;
915 /* All nodes are guaranteed to be ALIVE */
916 while (nd) {
917 if (clnt_deldrvs(nd->nd_nodename, sp, dd,
918 &xep) == -1)
919 mdclrerror(&xep);
920 nd = nd->nd_next;
921 }
922 } else {
923 for (i = 0; i < MD_MAXSIDES; i++) {
924 /* Skip empty slots */
925 if (sd->sd_nodes[i][0] == '\0')
926 continue;
927
928 if (clnt_deldrvs(sd->sd_nodes[i], sp, dd,
929 &xep) == -1)
930 mdclrerror(&xep);
931 }
932 }
933 max_genid += 2;
934 resync_genid(sp, sd, max_genid, 0, NULL);
935 }
936
937 if ((suspend1_flag) || (suspendall_flag)) {
938 /* Send resume */
939 nd = sd->sd_nodelist;
940 /* All nodes are guaranteed to be ALIVE */
941 while (nd) {
942 /*
943 * Just resume all classes so that resume is the
944 * same whether just one class was locked or all
945 * classes were locked.
946 */
947 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
948 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
949 mdclrerror(&xep);
950 }
951 nd = nd->nd_next;
952 }
953 meta_ping_mnset(sp->setno);
954 }
955
956 /* level 0 */
957 cl_sk = cl_get_setkey(sp->setno, sp->setname);
958 /* Don't test lock flag since guaranteed to be set if in rollback */
959 if (MD_MNSET_DESC(sd)) {
960 /*
961 * Since the add drive operation is failing, need
962 * to reset config back to the way it was
963 * before the add drive opration.
964 * If a MN diskset and this is the first drive being
965 * added, then reset master on this node since
966 * the master would have not been set previously.
967 * This is ok to fail since next node to add first
968 * disk to diskset will also set the master on all nodes.
969 */
970 if (curdd == NULL) {
971 /* Reset master on mynode */
972 if (clnt_mnsetmaster(mynode(), sp, "",
973 MD_MN_INVALID_NID, &xep))
974 mdclrerror(&xep);
975 }
976 nd = sd->sd_nodelist;
977 /* All nodes are guaranteed to be ALIVE */
978 while (nd) {
979 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
980 mdclrerror(&xep);
981 nd = nd->nd_next;
982 }
983 } else {
984 for (i = 0; i < MD_MAXSIDES; i++) {
985 /* Skip empty slots */
986 if (sd->sd_nodes[i][0] == '\0')
987 continue;
988
989 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
990 mdclrerror(&xep);
991 }
992 }
993 cl_set_setkey(NULL);
994
995 /* release signals back to what they were on entry */
996 if (procsigs(FALSE, &oldsigs, &xep) < 0)
997 mdclrerror(&xep);
998
999 metafreedrivedesc(&dd);
1000
1001 if (flush_set_onerr) {
1002 metaflushsetname(sp);
1003 if (!(MD_MNSET_DESC(sd))) {
1004 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1005 }
1006 }
1007
1008 return (rval);
1009 }
1010
1011 /*
1012 * Add drives routine used during import of a diskset.
1013 */
1014 int
meta_imp_set_adddrives(mdsetname_t * sp,mddrivenamelist_t * dnlp,md_im_set_desc_t * misp,md_error_t * ep)1015 meta_imp_set_adddrives(
1016 mdsetname_t *sp,
1017 mddrivenamelist_t *dnlp,
1018 md_im_set_desc_t *misp,
1019 md_error_t *ep
1020 )
1021 {
1022 md_set_desc *sd;
1023 mddrivenamelist_t *p;
1024 md_drive_desc *dd = NULL, *ddp;
1025 int flush_set_onerr = 0;
1026 md_timeval32_t now;
1027 ulong_t genid;
1028 mhd_mhiargs_t mhiargs;
1029 md_im_replica_info_t *mirp;
1030 md_im_drive_info_t *midp;
1031 int rval = 0;
1032 sigset_t oldsigs;
1033 ulong_t max_genid = 0;
1034 int rb_level = 0;
1035 md_error_t xep = mdnullerror;
1036
1037 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1038 return (-1);
1039
1040 for (p = dnlp; p != NULL; p = p->next) {
1041 int imp_flag = 0;
1042
1043 /*
1044 * If we have a partial diskset, meta_make_sidenmlist will
1045 * need information from midp to complete making the
1046 * side name structure.
1047 */
1048 if (misp->mis_partial) {
1049 imp_flag = MDDB_C_IMPORT;
1050 for (midp = misp->mis_drives; midp != NULL;
1051 midp = midp->mid_next) {
1052 if (midp->mid_dnp == p->drivenamep)
1053 break;
1054 }
1055 if (midp == NULL) {
1056 (void) mddserror(ep, MDE_DS_SETNOTIMP,
1057 MD_SET_BAD, mynode(), NULL, sp->setname);
1058 rval = -1;
1059 goto out;
1060 }
1061 }
1062 /*
1063 * Create the names for the drives we are adding per side.
1064 */
1065 if (meta_make_sidenmlist(sp, p->drivenamep, imp_flag,
1066 midp, ep) == -1) {
1067 rval = -1;
1068 goto out;
1069 }
1070 }
1071
1072 /*
1073 * Get the list of drives descriptors that we are adding.
1074 */
1075 dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
1076
1077 if (! mdisok(ep)) {
1078 rval = -1;
1079 goto out;
1080 }
1081
1082 /*
1083 * Get the set timeout information.
1084 */
1085 (void) memset(&mhiargs, '\0', sizeof (mhiargs));
1086 if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
1087 rval = -1;
1088 goto out;
1089 }
1090
1091 /*
1092 * Get timestamp and generation id for new records
1093 */
1094 now = sd->sd_ctime;
1095 genid = sd->sd_genid;
1096
1097 /* At this point, in case of error, set should be flushed. */
1098 flush_set_onerr = 1;
1099
1100 rb_level = 1; /* level 1 */
1101
1102 for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) {
1103 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1104 if (ddp->dd_dnp == midp->mid_dnp) {
1105 /* same disk */
1106 ddp->dd_dnp->devid =
1107 devid_str_encode(midp->mid_devid,
1108 midp->mid_minor_name);
1109
1110 ddp->dd_dbcnt = 0;
1111 mirp = midp->mid_replicas;
1112 if (mirp) {
1113 ddp->dd_dbsize = mirp->mir_length;
1114 for (; mirp != NULL;
1115 mirp = mirp->mir_next) {
1116 ddp->dd_dbcnt++;
1117 }
1118 }
1119 if ((midp->mid_available &
1120 MD_IM_DISK_NOT_AVAILABLE) &&
1121 (misp->mis_flags & MD_IM_SET_REPLICATED)) {
1122 ddp->dd_flags = MD_DR_UNRSLV_REPLICATED;
1123 }
1124 }
1125 }
1126 }
1127
1128 /*
1129 * Add the drive records for the drives that we are adding to
1130 * each host in the set. Marks the drive records as MD_DR_ADD.
1131 * May also mark a drive record as MD_DR_UNRSLV_REPLICATED if
1132 * this flag was set in the dd_flags for that drive.
1133 */
1134 if (clnt_imp_adddrvs(mynode(), sp, dd, now, genid, ep) == -1)
1135 goto rollback;
1136
1137 rb_level = 2; /* level 2 */
1138
1139 /*
1140 * Take ownership of the added drives.
1141 */
1142 if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
1143 goto rollback;
1144
1145 out:
1146 metafreedrivedesc(&dd);
1147
1148 if (flush_set_onerr) {
1149 metaflushsetname(sp);
1150 }
1151
1152 return (rval);
1153
1154 rollback:
1155 /* Make sure we are blocking all signals */
1156 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1157 mdclrerror(&xep);
1158
1159 rval = -1;
1160
1161 max_genid = sd->sd_genid;
1162
1163 /* level 2 */
1164 if (rb_level > 1) {
1165 if (!MD_ATSET_DESC(sd)) {
1166 if (rel_own_bydd(sp, dd, TRUE, &xep)) {
1167 mdclrerror(&xep);
1168 }
1169 }
1170 }
1171
1172 /* level 1 */
1173 if (rb_level > 0) {
1174 if (clnt_deldrvs(mynode(), sp, dd, &xep) == -1) {
1175 mdclrerror(&xep);
1176 }
1177 max_genid += 2;
1178 resync_genid(sp, sd, max_genid, 0, NULL);
1179 }
1180
1181 /* level 0 */
1182
1183 /* release signals back to what they were on entry */
1184 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1185 mdclrerror(&xep);
1186
1187 metafreedrivedesc(&dd);
1188
1189 if (flush_set_onerr) {
1190 metaflushsetname(sp);
1191 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1192 }
1193
1194 return (rval);
1195 }
1196
1197 int
meta_set_deletedrives(mdsetname_t * sp,mddrivenamelist_t * dnlp,int forceflg,md_error_t * ep)1198 meta_set_deletedrives(
1199 mdsetname_t *sp,
1200 mddrivenamelist_t *dnlp,
1201 int forceflg,
1202 md_error_t *ep
1203 )
1204 {
1205 md_set_desc *sd;
1206 md_drive_desc *ddp, *dd = NULL, *curdd = NULL;
1207 md_replicalist_t *rlp = NULL, *rl;
1208 mddrivenamelist_t *p;
1209 int deldrvcnt = 0;
1210 int rval = 0;
1211 mhd_mhiargs_t mhiargs;
1212 int i;
1213 sigset_t oldsigs;
1214 md_setkey_t *cl_sk;
1215 ulong_t max_genid = 0;
1216 int rb_level = 0;
1217 md_error_t xep = mdnullerror;
1218 md_mnnode_desc *nd;
1219 int has_set;
1220 int current_drv_cnt = 0;
1221 int suspendall_flag = 0, suspendall_flag_rb = 0;
1222 int suspend1_flag = 0;
1223 int lock_flag = 0;
1224 bool_t stale_bool = FALSE;
1225 int flush_set_onerr = 0;
1226 mdnamelist_t *nlp;
1227 mdname_t *np;
1228
1229 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1230 return (-1);
1231
1232 /* Make sure we own the set */
1233 if (meta_check_ownership(sp, ep) != 0)
1234 return (-1);
1235
1236 if (drvsuniq(sp, dnlp, ep) == -1)
1237 return (-1);
1238
1239 /*
1240 * Check and see if all the nodes have the set.
1241 *
1242 * The drive and node records are stored in the local mddbs of each
1243 * node in the diskset. Each node's rpc.metad daemon reads in the set,
1244 * drive and node records from that node's local mddb and caches them
1245 * internally. Any process needing diskset information contacts its
1246 * local rpc.metad to get this information. Since each node in the
1247 * diskset is independently reading the set information from its local
1248 * mddb, the set, drive and node records in the local mddbs must stay
1249 * in-sync, so that all nodes have a consistent view of the diskset.
1250 *
1251 * For a multinode diskset, explicitly verify that all nodes in the
1252 * diskset are ALIVE (i.e. are in the API membership list). Otherwise,
1253 * fail this operation since all nodes must be ALIVE in order to delete
1254 * a drive record from their local mddb. If a panic of this node
1255 * leaves the local mddbs set, node and drive records out-of-sync, the
1256 * reconfig cycle will fix the local mddbs and force them back into
1257 * synchronization.
1258 */
1259 if (MD_MNSET_DESC(sd)) {
1260 nd = sd->sd_nodelist;
1261 while (nd) {
1262 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1263 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1264 sp->setno,
1265 nd->nd_nodename, NULL, sp->setname);
1266 return (-1);
1267 }
1268 nd = nd->nd_next;
1269 }
1270
1271 /* Make sure we are blocking all signals */
1272 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1273 mdclrerror(&xep);
1274
1275 /*
1276 * Lock the set on current set members.
1277 * Set locking done much earlier for MN diskset than for
1278 * traditional diskset since lock_set and SUSPEND are used
1279 * to protect against other meta* commands running on the
1280 * other nodes.
1281 */
1282 nd = sd->sd_nodelist;
1283 /* All nodes are guaranteed to be ALIVE */
1284 while (nd) {
1285 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1286 rval = -1;
1287 goto out;
1288 }
1289 lock_flag = 1;
1290 nd = nd->nd_next;
1291 }
1292 /*
1293 * Lock out other meta* commands by suspending
1294 * class 1 messages across the diskset.
1295 */
1296 nd = sd->sd_nodelist;
1297 /* All nodes are guaranteed to be ALIVE */
1298 while (nd) {
1299 if (clnt_mdcommdctl(nd->nd_nodename,
1300 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1301 MD_MSCF_NO_FLAGS, ep)) {
1302 rval = -1;
1303 goto out;
1304 }
1305 suspend1_flag = 1;
1306 nd = nd->nd_next;
1307 }
1308
1309 nd = sd->sd_nodelist;
1310 /* All nodes are guaranteed to be ALIVE */
1311 while (nd) {
1312 if (strcmp(nd->nd_nodename, mynode()) == 0) {
1313 nd = nd->nd_next;
1314 continue;
1315 }
1316
1317 has_set = nodehasset(sp, nd->nd_nodename,
1318 NHS_NSTG_EQ, ep);
1319 if (has_set < 0) {
1320 rval = -1;
1321 goto out;
1322 }
1323
1324 if (! has_set) {
1325 (void) mddserror(ep, MDE_DS_NODENOSET,
1326 sp->setno, nd->nd_nodename,
1327 NULL, sp->setname);
1328 rval = -1;
1329 goto out;
1330 }
1331 nd = nd->nd_next;
1332 }
1333 } else {
1334 for (i = 0; i < MD_MAXSIDES; i++) {
1335 /* Skip empty slots */
1336 if (sd->sd_nodes[i][0] == '\0')
1337 continue;
1338
1339 if (strcmp(sd->sd_nodes[i], mynode()) == 0)
1340 continue;
1341
1342 has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NSTG_EQ,
1343 ep);
1344 if (has_set < 0) {
1345 /*
1346 * Can directly return since !MN diskset;
1347 * nothing to unlock.
1348 */
1349 return (-1);
1350 }
1351
1352 if (! has_set) {
1353 /*
1354 * Can directly return since !MN diskset;
1355 * nothing to unlock.
1356 */
1357 return (mddserror(ep, MDE_DS_NODENOSET,
1358 sp->setno, sd->sd_nodes[i], NULL,
1359 sp->setname));
1360 }
1361 }
1362 }
1363
1364 for (p = dnlp; p != NULL; p = p->next) {
1365 int is_it;
1366 mddrivename_t *dnp;
1367
1368 dnp = p->drivenamep;
1369
1370 if ((is_it = meta_is_drive_in_thisset(sp, dnp, FALSE, ep))
1371 == -1) {
1372 rval = -1;
1373 goto out;
1374 }
1375
1376 if (! is_it) {
1377 (void) mddserror(ep, MDE_DS_DRIVENOTINSET, sp->setno,
1378 NULL, dnp->cname, sp->setname);
1379 rval = -1;
1380 goto out;
1381 }
1382
1383 if ((meta_check_drive_inuse(sp, dnp, FALSE, ep)) == -1) {
1384 rval = -1;
1385 goto out;
1386 }
1387
1388 deldrvcnt++;
1389 }
1390 current_drv_cnt = deldrvcnt;
1391
1392 /*
1393 * Get drive descriptors for the drives that are currently in the set.
1394 */
1395 curdd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
1396 if (! mdisok(ep)) {
1397 rval = -1;
1398 goto out;
1399 }
1400
1401 /*
1402 * Decrement the the delete drive count for each drive currently in the
1403 * set.
1404 */
1405 for (ddp = curdd; ddp != NULL; ddp = ddp->dd_next)
1406 deldrvcnt--;
1407
1408 /*
1409 * If the count of drives we are deleting is equal to the drives in the
1410 * set, and we haven't specified forceflg, return an error
1411 */
1412 if (deldrvcnt == 0 && forceflg == FALSE) {
1413 (void) mderror(ep, MDE_FORCE_DEL_ALL_DRV, NULL);
1414 rval = -1;
1415 goto out;
1416 }
1417
1418 /*
1419 * Get the list of drive descriptors that we are deleting.
1420 */
1421 dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_DEL, ep);
1422 if (! mdisok(ep)) {
1423 rval = -1;
1424 goto out;
1425 }
1426
1427 /*
1428 * Get the set timeout information in case we have to roll back.
1429 */
1430 (void) memset(&mhiargs, '\0', sizeof (mhiargs));
1431 if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
1432 rval = -1;
1433 goto out;
1434 }
1435
1436 /* At this point, in case of error, set should be flushed. */
1437 flush_set_onerr = 1;
1438
1439 /* END CHECK CODE */
1440
1441 /* Lock the set on current set members */
1442 if (!(MD_MNSET_DESC(sd))) {
1443 md_rb_sig_handling_on();
1444 for (i = 0; i < MD_MAXSIDES; i++) {
1445 /* Skip empty slots */
1446 if (sd->sd_nodes[i][0] == '\0')
1447 continue;
1448
1449 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1450 rval = -1;
1451 goto out;
1452 }
1453 lock_flag = 1;
1454 }
1455 }
1456
1457 if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
1458 mddb_config_t c;
1459 /*
1460 * Is current set STALE?
1461 */
1462 (void) memset(&c, 0, sizeof (c));
1463 c.c_id = 0;
1464 c.c_setno = sp->setno;
1465 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1466 (void) mdstealerror(ep, &c.c_mde);
1467 rval = -1;
1468 goto out;
1469 }
1470 if (c.c_flags & MDDB_C_STALE) {
1471 stale_bool = TRUE;
1472 }
1473 }
1474
1475 RB_TEST(1, "deletedrives", ep)
1476
1477 RB_PREEMPT;
1478 rb_level = 1; /* level 1 */
1479
1480 RB_TEST(2, "deletedrives", ep)
1481
1482 /*
1483 * Mark the drives MD_DR_DEL
1484 */
1485 if (MD_MNSET_DESC(sd)) {
1486 nd = sd->sd_nodelist;
1487 /* All nodes are guaranteed to be ALIVE */
1488 while (nd) {
1489 if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
1490 MD_DR_DEL, ep) == -1)
1491 goto rollback;
1492
1493 RB_TEST(3, "deletedrives", ep)
1494 nd = nd->nd_next;
1495 }
1496 } else {
1497 for (i = 0; i < MD_MAXSIDES; i++) {
1498 /* Skip empty slots */
1499 if (sd->sd_nodes[i][0] == '\0')
1500 continue;
1501
1502 if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
1503 MD_DR_DEL, ep) == -1)
1504 goto rollback;
1505
1506 RB_TEST(3, "deletedrives", ep)
1507 }
1508 }
1509
1510 RB_TEST(4, "deletedrives", ep)
1511
1512 RB_PREEMPT;
1513 rb_level = 2; /* level 2 */
1514
1515 RB_TEST(5, "deletedrives", ep)
1516
1517 /*
1518 * Balance the DB's according to the list of existing drives and the
1519 * list of deleted drives.
1520 */
1521 if (meta_db_balance(sp, dd, curdd, 0, ep) == -1)
1522 goto rollback;
1523
1524 /*
1525 * If the drive(s) to be deleted cannot be accessed,
1526 * they haven't really been deleted yet. Check and delete now
1527 * if need be.
1528 */
1529 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
1530 nlp = NULL;
1531 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1532 char *delete_name;
1533
1534 delete_name = ddp->dd_dnp->cname;
1535
1536 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
1537 char *cur_name;
1538
1539 cur_name =
1540 rl->rl_repp->r_namep->drivenamep->cname;
1541
1542 if (strcmp(delete_name, cur_name) == 0) {
1543 /* put it on the delete list */
1544 np = rl->rl_repp->r_namep;
1545 (void) metanamelist_append(&nlp, np);
1546
1547 }
1548 }
1549 }
1550
1551 if (nlp != NULL) {
1552 if (meta_db_detach(sp, nlp,
1553 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
1554 ep) == -1) {
1555 metafreenamelist(nlp);
1556 goto rollback;
1557 }
1558 metafreenamelist(nlp);
1559 }
1560 }
1561
1562 RB_TEST(6, "deletedrives", ep)
1563
1564 RB_PREEMPT;
1565 rb_level = 3; /* level 3 */
1566
1567 RB_TEST(7, "deletedrives", ep)
1568
1569 /*
1570 * Cannot suspend set until after meta_db_balance since
1571 * meta_db_balance uses META_DB_ATTACH/DETACH messages.
1572 */
1573 if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
1574 /*
1575 * Notify rpc.mdcommd on all nodes of a nodelist change.
1576 * Start by suspending rpc.mdcommd (which drains it of all
1577 * messages), then change the nodelist followed by a reinit
1578 * and resume.
1579 */
1580 nd = sd->sd_nodelist;
1581 /* All nodes are guaranteed to be ALIVE */
1582 while (nd) {
1583 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
1584 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
1585 rval = -1;
1586 goto out;
1587 }
1588 suspendall_flag = 1;
1589 nd = nd->nd_next;
1590 }
1591 }
1592
1593 /*
1594 * Remove the drive records for the drives that were deleted from
1595 * each host in the set. This removes the record and dr_flags.
1596 */
1597 if (MD_MNSET_DESC(sd)) {
1598 nd = sd->sd_nodelist;
1599 /* All nodes are guaranteed to be ALIVE */
1600 while (nd) {
1601 if (clnt_deldrvs(nd->nd_nodename, sp, dd, ep) == -1)
1602 goto rollback;
1603
1604 RB_TEST(8, "deletedrives", ep)
1605 nd = nd->nd_next;
1606 }
1607 } else {
1608 for (i = 0; i < MD_MAXSIDES; i++) {
1609 /* Skip empty slots */
1610 if (sd->sd_nodes[i][0] == '\0')
1611 continue;
1612
1613 if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep) == -1)
1614 goto rollback;
1615
1616 RB_TEST(8, "deletedrives", ep)
1617 }
1618 }
1619
1620 RB_TEST(9, "deletedrives", ep)
1621
1622 RB_PREEMPT;
1623 rb_level = 4; /* level 4 */
1624
1625 RB_TEST(10, "deletedrives", ep)
1626
1627 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
1628 if (rel_own_bydd(sp, dd, TRUE, ep))
1629 goto rollback;
1630 }
1631
1632 /* If we deleted all the drives, then we need to halt the set. */
1633 if (deldrvcnt == 0) {
1634 RB_TEST(11, "deletedrives", ep)
1635
1636 RB_PREEMPT;
1637 rb_level = 5; /* level 5 */
1638
1639 RB_TEST(12, "deletedrives", ep)
1640
1641 if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1)
1642 goto rollback;
1643
1644 RB_TEST(13, "deletedrives", ep)
1645
1646 RB_PREEMPT;
1647 rb_level = 6; /* level 6 */
1648
1649 RB_TEST(14, "deletedrives", ep)
1650
1651 /* Halt MN diskset on all nodes by having node withdraw */
1652 if (MD_MNSET_DESC(sd)) {
1653 nd = sd->sd_nodelist;
1654 /* All nodes are guaranteed to be ALIVE */
1655 while (nd) {
1656 /* Only withdraw nodes that are joined */
1657 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
1658 nd = nd->nd_next;
1659 continue;
1660 }
1661 /*
1662 * Going to set locally cached node flags to
1663 * rollback join so in case of error, the
1664 * rollback code knows which nodes to re-join.
1665 */
1666 nd->nd_flags |= MD_MN_NODE_RB_JOIN;
1667
1668 /*
1669 * Be careful in ordering of following steps
1670 * so that recovery from a panic between
1671 * the steps is viable.
1672 * Only reset master info in rpc.metad -
1673 * don't reset local cached information
1674 * which will be used to set master information
1675 * back in case of failure (rollback).
1676 */
1677 if (clnt_withdrawset(nd->nd_nodename, sp, ep))
1678 goto rollback;
1679 /* Sets withdraw flag on all nodes in list */
1680 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
1681 sd->sd_nodelist, MD_NR_WITHDRAW,
1682 NULL, ep)) {
1683 goto rollback;
1684 }
1685 if (clnt_mnsetmaster(nd->nd_nodename, sp,
1686 "", MD_MN_INVALID_NID, ep)) {
1687 goto rollback;
1688 }
1689 nd = nd->nd_next;
1690 }
1691 } else {
1692 if (halt_set(sp, ep))
1693 goto rollback;
1694 }
1695
1696 RB_TEST(15, "deletedrives", ep)
1697 }
1698
1699 RB_TEST(16, "deletedrives", ep)
1700
1701 out:
1702 /*
1703 * Notify rpc.mdcommd on all nodes of a nodelist change.
1704 * Send reinit command to mdcommd which forces it to get
1705 * fresh set description.
1706 */
1707 if (suspendall_flag) {
1708 /* Send reinit */
1709 nd = sd->sd_nodelist;
1710 /* All nodes are guaranteed to be ALIVE */
1711 while (nd) {
1712 /* Class is ignored for REINIT */
1713 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
1714 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
1715 if (rval == 0)
1716 (void) mdstealerror(ep, &xep);
1717 rval = -1;
1718 mde_perror(ep, dgettext(TEXT_DOMAIN,
1719 "Unable to reinit rpc.mdcommd.\n"));
1720 }
1721 nd = nd->nd_next;
1722 }
1723 }
1724
1725 /*
1726 * Just resume all classes so that resume is the same whether
1727 * just one class was locked or all classes were locked.
1728 */
1729 if ((suspend1_flag) || (suspendall_flag)) {
1730 /* Send resume */
1731 nd = sd->sd_nodelist;
1732 /* All nodes are guaranteed to be ALIVE */
1733 while (nd) {
1734 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1735 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1736 if (rval == 0)
1737 (void) mdstealerror(ep, &xep);
1738 rval = -1;
1739 mde_perror(ep, dgettext(TEXT_DOMAIN,
1740 "Unable to resume rpc.mdcommd.\n"));
1741 }
1742 nd = nd->nd_next;
1743 }
1744 meta_ping_mnset(sp->setno);
1745 }
1746 if (lock_flag) {
1747 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1748 if (MD_MNSET_DESC(sd)) {
1749 nd = sd->sd_nodelist;
1750 /* All nodes are guaranteed to be ALIVE */
1751 while (nd) {
1752 if (clnt_unlock_set(nd->nd_nodename,
1753 cl_sk, &xep)) {
1754 if (rval == 0)
1755 (void) mdstealerror(ep, &xep);
1756 rval = -1;
1757 }
1758 nd = nd->nd_next;
1759 }
1760 } else {
1761 for (i = 0; i < MD_MAXSIDES; i++) {
1762 /* Skip empty slots */
1763 if (sd->sd_nodes[i][0] == '\0')
1764 continue;
1765
1766 if (clnt_unlock_set(sd->sd_nodes[i],
1767 cl_sk, &xep)) {
1768 if (rval == 0)
1769 (void) mdstealerror(ep, &xep);
1770 rval = -1;
1771 }
1772 }
1773 }
1774 cl_set_setkey(NULL);
1775 }
1776
1777 metafreedrivedesc(&dd);
1778
1779 if (flush_set_onerr) {
1780 metaflushsetname(sp);
1781 if (!(MD_MNSET_DESC(sd))) {
1782 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1783 }
1784 }
1785
1786 if (MD_MNSET_DESC(sd)) {
1787 /* release signals back to what they were on entry */
1788 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1789 mdclrerror(&xep);
1790 }
1791
1792 return (rval);
1793
1794 rollback:
1795 /* all signals already blocked for MN disket */
1796 if (!(MD_MNSET_DESC(sd))) {
1797 /* Make sure we are blocking all signals */
1798 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1799 mdclrerror(&xep);
1800 }
1801
1802 rval = -1;
1803
1804 max_genid = sd->sd_genid;
1805
1806 /* Set the master on all nodes first thing */
1807 if (rb_level > 5) {
1808 if (MD_MNSET_DESC(sd)) {
1809 nd = sd->sd_nodelist;
1810 /* All nodes are guaranteed to be ALIVE */
1811 while (nd) {
1812 if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
1813 continue;
1814 }
1815 /*
1816 * Set master on all re-joining nodes to be
1817 * my cached view of master.
1818 */
1819 if (clnt_mnsetmaster(nd->nd_nodename, sp,
1820 sd->sd_mn_master_nodenm,
1821 sd->sd_mn_master_nodeid, &xep)) {
1822 mdclrerror(&xep);
1823 }
1824 }
1825 }
1826 }
1827
1828 /* level 3 */
1829 if (rb_level > 2) {
1830 md_set_record *sr;
1831 md_mnset_record *mnsr;
1832 md_drive_record *dr;
1833 int sr_drive_cnt;
1834
1835 /*
1836 * See if we have to re-add the drives specified.
1837 */
1838 if (MD_MNSET_DESC(sd)) {
1839 nd = sd->sd_nodelist;
1840 /* All nodes are guaranteed to be ALIVE */
1841 while (nd) {
1842 /*
1843 * Must get current set record from each
1844 * node to see what else must be done
1845 * to recover.
1846 * Record should be for a multi-node diskset.
1847 */
1848 if (clnt_mngetset(nd->nd_nodename, sp->setname,
1849 MD_SET_BAD, &mnsr, &xep) == -1) {
1850 mdclrerror(&xep);
1851 nd = nd->nd_next;
1852 continue;
1853 }
1854
1855 /*
1856 * If all drives are already there, skip
1857 * to next node.
1858 */
1859 sr_drive_cnt = 0;
1860 dr = mnsr->sr_drivechain;
1861 while (dr) {
1862 sr_drive_cnt++;
1863 dr = dr->dr_next;
1864 }
1865 if (sr_drive_cnt == current_drv_cnt) {
1866 free_sr((md_set_record *)mnsr);
1867 nd = nd->nd_next;
1868 continue;
1869 }
1870
1871 /* Readd all drives */
1872 if (clnt_adddrvs(nd->nd_nodename, sp, dd,
1873 mnsr->sr_ctime, mnsr->sr_genid, &xep) == -1)
1874 mdclrerror(&xep);
1875
1876 free_sr((struct md_set_record *)mnsr);
1877 nd = nd->nd_next;
1878 }
1879 } else {
1880 for (i = 0; i < MD_MAXSIDES; i++) {
1881 /* Skip empty slots */
1882 if (sd->sd_nodes[i][0] == '\0')
1883 continue;
1884
1885 /* Record should be for a non-multi-node set */
1886 if (clnt_getset(sd->sd_nodes[i], sp->setname,
1887 MD_SET_BAD, &sr, &xep) == -1) {
1888 mdclrerror(&xep);
1889 continue;
1890 }
1891
1892 /*
1893 * Set record structure was allocated from RPC
1894 * routine getset so this structure is only of
1895 * size md_set_record even if the MN flag is
1896 * set. So, clear the flag so that the free
1897 * code doesn't attempt to free a structure
1898 * the size of md_mnset_record.
1899 */
1900 if (MD_MNSET_REC(sr)) {
1901 sr->sr_flags &= ~MD_SR_MN;
1902 free_sr(sr);
1903 continue;
1904 }
1905
1906 /* Drive already added, skip to next node */
1907 if (sr->sr_drivechain != NULL) {
1908 free_sr(sr);
1909 continue;
1910 }
1911
1912 if (clnt_adddrvs(sd->sd_nodes[i], sp, dd,
1913 sr->sr_ctime, sr->sr_genid, &xep) == -1)
1914 mdclrerror(&xep);
1915
1916 free_sr(sr);
1917 }
1918 }
1919 max_genid += 2;
1920 }
1921
1922 /*
1923 * Notify rpc.mdcommd on all nodes of a nodelist change.
1924 * At this point in time, don't know which nodes are joined
1925 * to the set. So, send a reinit command to mdcommd
1926 * which forces it to get fresh set description. Then send resume.
1927 *
1928 * Later, this code will use rpc.mdcommd messages to reattach disks
1929 * and then rpc.mdcommd may be suspended again, rest of the nodes
1930 * joined, rpc.mdcommd reinited and then resumed.
1931 */
1932 if (suspendall_flag) {
1933 /* Send reinit */
1934 nd = sd->sd_nodelist;
1935 /* All nodes are guaranteed to be ALIVE */
1936 while (nd) {
1937 /* Class is ignored for REINIT */
1938 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
1939 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
1940 mde_perror(&xep, dgettext(TEXT_DOMAIN,
1941 "Unable to reinit rpc.mdcommd.\n"));
1942 mdclrerror(&xep);
1943 }
1944 nd = nd->nd_next;
1945 }
1946
1947 /* Send resume */
1948 nd = sd->sd_nodelist;
1949 /* All nodes are guaranteed to be ALIVE */
1950 while (nd) {
1951 /*
1952 * Resume all classes but class 1 so that lock is held
1953 * against meta* commands.
1954 * To later resume class1, must issue a class0 resume.
1955 */
1956 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1957 sp, MD_MSG_CLASS0,
1958 MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
1959 mde_perror(&xep, dgettext(TEXT_DOMAIN,
1960 "Unable to resume rpc.mdcommd.\n"));
1961 mdclrerror(&xep);
1962 }
1963 nd = nd->nd_next;
1964 }
1965 meta_ping_mnset(sp->setno);
1966 }
1967
1968 /* level 2 */
1969 if (rb_level > 1) {
1970 mdnamelist_t *nlp;
1971 mdname_t *np;
1972
1973 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1974 uint_t rep_slice;
1975
1976 if ((meta_replicaslice(ddp->dd_dnp,
1977 &rep_slice, &xep) != 0) ||
1978 ((np = metaslicename(ddp->dd_dnp, rep_slice,
1979 &xep)) == NULL)) {
1980 mdclrerror(&xep);
1981 continue;
1982 }
1983 nlp = NULL;
1984 (void) metanamelist_append(&nlp, np);
1985
1986 if (meta_db_attach(sp, nlp,
1987 (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
1988 &sd->sd_ctime, ddp->dd_dbcnt, ddp->dd_dbsize,
1989 NULL, &xep) == -1)
1990 mdclrerror(&xep);
1991
1992 metafreenamelist(nlp);
1993 }
1994 /* Re-balance */
1995 if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
1996 mdclrerror(&xep);
1997 }
1998
1999 /* level 4 */
2000 if (rb_level > 3) {
2001 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
2002 if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
2003 mdclrerror(&xep);
2004 }
2005 }
2006
2007 /* level 5 */
2008 if (rb_level > 4) {
2009 if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1)
2010 mdclrerror(&xep);
2011 }
2012
2013 /*
2014 * If at least one node needs to be rejoined to MN diskset,
2015 * then suspend commd again.
2016 */
2017 if (MD_MNSET_DESC(sd)) {
2018 nd = sd->sd_nodelist;
2019 /* All nodes are guaranteed to be ALIVE */
2020 while (nd) {
2021 if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
2022 nd = nd->nd_next;
2023 continue;
2024 }
2025 break;
2026 }
2027 if (nd) {
2028 /*
2029 * Found node that will be rejoined so
2030 * notify rpc.mdcommd on all nodes of a nodelist change.
2031 * Start by suspending rpc.mdcommd (which drains it of
2032 * all messages), then change the nodelist followed by
2033 * a reinit and resume.
2034 */
2035 nd = sd->sd_nodelist;
2036 /* All nodes are guaranteed to be ALIVE */
2037 while (nd) {
2038 if (clnt_mdcommdctl(nd->nd_nodename,
2039 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0,
2040 MD_MSCF_NO_FLAGS, &xep)) {
2041 mdclrerror(&xep);
2042 }
2043 suspendall_flag_rb = 1;
2044 nd = nd->nd_next;
2045 }
2046 }
2047 }
2048
2049
2050
2051 /* level 6 */
2052 if (rb_level > 5) {
2053 if (MD_MNSET_DESC(sd)) {
2054 int join_flags = 0;
2055
2056 nd = sd->sd_nodelist;
2057 /* All nodes are guaranteed to be ALIVE */
2058 while (nd) {
2059 /* Only rejoin nodes that were joined before */
2060 if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
2061 nd = nd->nd_next;
2062 continue;
2063 }
2064 /*
2065 * Rejoin nodes to same state as before -
2066 * either STALE or non-STALE.
2067 */
2068 if (stale_bool == TRUE)
2069 join_flags = MNSET_IS_STALE;
2070 if (clnt_joinset(nd->nd_nodename, sp,
2071 join_flags, &xep))
2072 mdclrerror(&xep);
2073 /* Sets OWN flag on all nodes in list */
2074 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2075 sd->sd_nodelist, MD_NR_JOIN, NULL, &xep)) {
2076 mdclrerror(&xep);
2077 }
2078 nd = nd->nd_next;
2079 }
2080 } else {
2081 if (setup_db_bydd(sp, dd, TRUE, &xep) == -1)
2082 mdclrerror(&xep);
2083
2084 /* No special flag for traditional diskset */
2085 if (snarf_set(sp, NULL, &xep))
2086 mdclrerror(&xep);
2087 }
2088 }
2089
2090 /* level 1 */
2091 if (rb_level > 0) {
2092 /*
2093 * Mark the drives as OK.
2094 */
2095 if (MD_MNSET_DESC(sd)) {
2096 nd = sd->sd_nodelist;
2097 /* All nodes are guaranteed to be ALIVE */
2098 while (nd) {
2099 /*
2100 * Must be last action before unlock.
2101 * In case of panic, recovery code checks
2102 * for MD_DR_OK to know that drive
2103 * and possible master are fully added back.
2104 */
2105 if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
2106 MD_DR_OK, &xep) == -1)
2107 mdclrerror(&xep);
2108 nd = nd->nd_next;
2109 }
2110 } else {
2111 for (i = 0; i < MD_MAXSIDES; i++) {
2112 /* Skip empty slots */
2113 if (sd->sd_nodes[i][0] == '\0')
2114 continue;
2115
2116 if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
2117 MD_DR_OK, &xep) == -1)
2118 mdclrerror(&xep);
2119
2120 }
2121 }
2122 max_genid += 2;
2123 resync_genid(sp, sd, max_genid, 0, NULL);
2124 }
2125 /*
2126 * Notify rpc.mdcommd on all nodes of a nodelist change.
2127 * Send a reinit command to mdcommd which forces it to get
2128 * fresh set description.
2129 */
2130 if (suspendall_flag_rb) {
2131 /* Send reinit */
2132 nd = sd->sd_nodelist;
2133 /* All nodes are guaranteed to be ALIVE */
2134 while (nd) {
2135 /* Class is ignored for REINIT */
2136 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2137 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2138 mde_perror(&xep, dgettext(TEXT_DOMAIN,
2139 "Unable to reinit rpc.mdcommd.\n"));
2140 mdclrerror(&xep);
2141 }
2142 nd = nd->nd_next;
2143 }
2144 }
2145
2146 /*
2147 * Just resume all classes so that resume is the same whether
2148 * just one class was locked or all classes were locked.
2149 */
2150 if ((suspend1_flag) || (suspendall_flag_rb) || (suspendall_flag)) {
2151 /* Send resume */
2152 nd = sd->sd_nodelist;
2153 /* All nodes are guaranteed to be ALIVE */
2154 while (nd) {
2155 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2156 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2157 mde_perror(&xep, dgettext(TEXT_DOMAIN,
2158 "Unable to resume rpc.mdcommd.\n"));
2159 mdclrerror(&xep);
2160 }
2161 nd = nd->nd_next;
2162 }
2163 meta_ping_mnset(sp->setno);
2164 }
2165
2166
2167 /* level 0 */
2168 cl_sk = cl_get_setkey(sp->setno, sp->setname);
2169 /* Don't test lock flag since guaranteed to be set if in rollback */
2170 if (MD_MNSET_DESC(sd)) {
2171 nd = sd->sd_nodelist;
2172 /* All nodes are guaranteed to be ALIVE */
2173 while (nd) {
2174 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
2175 mdclrerror(&xep);
2176 nd = nd->nd_next;
2177 }
2178 } else {
2179 for (i = 0; i < MD_MAXSIDES; i++) {
2180 /* Skip empty slots */
2181 if (sd->sd_nodes[i][0] == '\0')
2182 continue;
2183
2184 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
2185 mdclrerror(&xep);
2186 }
2187 }
2188 cl_set_setkey(NULL);
2189
2190 /* release signals back to what they were on entry */
2191 if (procsigs(FALSE, &oldsigs, &xep) < 0)
2192 mdclrerror(&xep);
2193
2194 metafreedrivedesc(&dd);
2195
2196 if (flush_set_onerr) {
2197 metaflushsetname(sp);
2198 if (!(MD_MNSET_DESC(sd))) {
2199 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
2200 }
2201 }
2202
2203 return (rval);
2204 }
2205