1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Just in case we're not in a build environment, make sure that
29 * TEXT_DOMAIN gets set to something.
30 */
31 #if !defined(TEXT_DOMAIN)
32 #define TEXT_DOMAIN "SYS_TEST"
33 #endif
34
35 /*
36 * Metadevice diskset interfaces
37 */
38
39 #include "meta_set_prv.h"
40 #include <meta.h>
41 #include <sys/lvm/md_crc.h>
42 #include <sys/time.h>
43 #include <sdssc.h>
44
45 static int
add_db_sidenms(mdsetname_t * sp,md_error_t * ep)46 add_db_sidenms(
47 mdsetname_t *sp,
48 md_error_t *ep
49 )
50 {
51 md_replicalist_t *rlp = NULL;
52 md_replicalist_t *rl;
53 int rval = 0;
54
55 if (metareplicalist(sp, MD_FULLNAME_ONLY, &rlp, ep) < 0)
56 return (-1);
57
58 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
59 md_replica_t *r = rl->rl_repp;
60
61 /*
62 * This is not the first replica being added to the
63 * diskset so call with ADDSIDENMS_BCAST. If this
64 * is a traditional diskset, the bcast flag is ignored
65 * since traditional disksets don't use the rpc.mdcommd.
66 */
67 if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
68 DB_ADDSIDENMS_BCAST, ep)) {
69 rval = -1;
70 goto out;
71 }
72 }
73
74 out:
75 metafreereplicalist(rlp);
76 return (rval);
77 }
78
79 static int
add_drvs_to_hosts(mdsetname_t * sp,int node_c,char ** node_v,md_error_t * ep)80 add_drvs_to_hosts(
81 mdsetname_t *sp,
82 int node_c,
83 char **node_v,
84 md_error_t *ep
85 )
86 {
87 int i;
88 md_set_desc *sd;
89 md_drive_desc *dd;
90 md_timeval32_t now;
91 ulong_t genid;
92
93 if ((sd = metaget_setdesc(sp, ep)) == NULL)
94 return (-1);
95
96 if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) {
97 if (! mdisok(ep))
98 return (-1);
99 return (0);
100 }
101
102 now = sd->sd_ctime;
103 genid = sd->sd_genid - 1;
104
105 for (i = 0; i < node_c; i++) {
106 if (clnt_adddrvs(node_v[i], sp, dd, now, genid, ep) == -1)
107 return (-1);
108 }
109
110 return (0);
111 }
112
113 static int
add_md_sidenms(mdsetname_t * sp,side_t sideno,side_t otherside,md_error_t * ep)114 add_md_sidenms(mdsetname_t *sp, side_t sideno, side_t otherside, md_error_t *ep)
115 {
116 mdnm_params_t nm;
117 char *cname, *dname;
118 side_t tmp_sideno;
119 minor_t mnum;
120 int done, i;
121 int rval = 0;
122 md_set_desc *sd;
123
124 (void) memset(&nm, '\0', sizeof (nm));
125 nm.key = MD_KEYWILD;
126
127 if (!metaislocalset(sp)) {
128 if ((sd = metaget_setdesc(sp, ep)) == NULL)
129 return (-1);
130 }
131 /* Use rpc.mdcommd to add md side info from all nodes */
132 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
133 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
134 md_mn_result_t *resultp = NULL;
135 md_mn_msg_meta_md_addside_t md_as;
136 int send_rval;
137
138 md_as.msg_sideno = sideno;
139 md_as.msg_otherside = otherside;
140 /*
141 * If reconfig cycle has been started, this node is stuck in
142 * in the return step until this command has completed. If
143 * mdcommd is suspended, ask send_message to fail (instead of
144 * retrying) so that metaset can finish allowing the
145 * reconfig cycle to proceed.
146 */
147 send_rval = mdmn_send_message(sp->setno,
148 MD_MN_MSG_META_MD_ADDSIDE,
149 MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
150 0, (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t),
151 &resultp, ep);
152 if (send_rval != 0) {
153 (void) mdstealerror(ep, &(resultp->mmr_ep));
154 if (resultp)
155 free_result(resultp);
156 return (-1);
157 }
158 if (resultp)
159 free_result(resultp);
160 return (0);
161 } else {
162 /*CONSTCOND*/
163 while (1) {
164 char *drvnm = NULL;
165
166 nm.mde = mdnullerror;
167 nm.setno = sp->setno;
168 nm.side = otherside;
169 if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0)
170 return (mdstealerror(ep, &nm.mde));
171
172 if (nm.key == MD_KEYWILD)
173 return (0);
174
175 /*
176 * Okay we have a valid key
177 * Let's see if it is hsp or not
178 */
179 nm.devname = (uintptr_t)meta_getnmentbykey(sp->setno,
180 otherside, nm.key, &drvnm, NULL, NULL, ep);
181 if (nm.devname == NULL || drvnm == NULL) {
182 if (nm.devname)
183 Free((void *)(uintptr_t)nm.devname);
184 if (drvnm)
185 Free((void *)(uintptr_t)drvnm);
186 return (-1);
187 }
188
189 /*
190 * If it is hsp add here
191 */
192 if (strcmp(drvnm, MD_HOTSPARES) == 0) {
193 if (add_name(sp, sideno, nm.key, MD_HOTSPARES,
194 minor(NODEV), (char *)(uintptr_t)nm.devname,
195 NULL, NULL, ep) == -1) {
196 Free((void *)(uintptr_t)nm.devname);
197 Free((void *)(uintptr_t)drvnm);
198 return (-1);
199 } else {
200 Free((void *)(uintptr_t)nm.devname);
201 Free((void *)(uintptr_t)drvnm);
202 continue;
203 }
204 }
205
206 nm.side = sideno;
207 if (MD_MNSET_DESC(sd)) {
208 tmp_sideno = sideno;
209 } else {
210 tmp_sideno = sideno - 1;
211 }
212
213 if ((done = meta_getnextside_devinfo(sp,
214 (char *)(uintptr_t)nm.devname, &tmp_sideno,
215 &cname, &dname, &mnum, ep)) == -1) {
216 Free((void *)(uintptr_t)nm.devname);
217 return (-1);
218 }
219
220 assert(done == 1);
221 Free((void *)(uintptr_t)nm.devname);
222 Free((void *)(uintptr_t)drvnm);
223
224 /*
225 * The device reference count can be greater than 1 if
226 * more than one softpart is configured on top of the
227 * same device. If this is the case then we want to
228 * increment the count to sync up with the other sides.
229 */
230 for (i = 0; i < nm.ref_count; i++) {
231 if (add_name(sp, sideno, nm.key, dname, mnum,
232 cname, NULL, NULL, ep) == -1)
233 rval = -1;
234 }
235
236 Free(cname);
237 Free(dname);
238
239 if (rval != 0)
240 return (rval);
241 }
242 }
243
244 /*NOTREACHED*/
245 }
246
247 static int
check_setdrvs_againstnode(mdsetname_t * sp,char * node,md_error_t * ep)248 check_setdrvs_againstnode(mdsetname_t *sp, char *node, md_error_t *ep)
249 {
250 mddrivename_t *dp;
251 md_drive_desc *dd, *ddp;
252
253 if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL)
254 if (! mdisok(ep))
255 return (-1);
256
257 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
258 dp = ddp->dd_dnp;
259
260 if (checkdrive_onnode(sp, dp, node, ep))
261 return (-1);
262 }
263
264 return (0);
265 }
266
267 static int
create_multinode_set_on_hosts(mdsetname_t * sp,int node_c,char ** node_v,int new_set,md_error_t * ep)268 create_multinode_set_on_hosts(
269 mdsetname_t *sp,
270 int node_c, /* Number of new nodes */
271 char **node_v, /* Nodes which are being added */
272 int new_set,
273 md_error_t *ep
274 )
275 {
276 int i;
277 md_set_desc *sd;
278 md_timeval32_t now;
279 ulong_t genid;
280 int rval = 0;
281 md_mnnode_desc *nd, *ndm = NULL;
282 md_mnnode_desc *nd_prev, *nd_curr;
283 int nodecnt;
284 mndiskset_membershiplist_t *nl, *nl2;
285
286 if (!new_set) {
287 if ((sd = metaget_setdesc(sp, ep)) == NULL)
288 return (-1);
289 now = sd->sd_ctime;
290 genid = sd->sd_genid - 1;
291 if (sd->sd_drvs)
292 genid--;
293 } else {
294 sd = Zalloc(sizeof (*sd));
295
296 if (meta_gettimeofday(&now) == -1) {
297 (void) mdsyserror(ep, errno,
298 dgettext(TEXT_DOMAIN, "meta_gettimeofday()"));
299 rval = -1;
300 goto out;
301 }
302
303 /* Put the new entries into the set */
304 /*
305 * Get membershiplist from API routine. If there's
306 * an error, fail to create set and pass back error.
307 */
308 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
309 rval = -1;
310 goto out;
311 }
312
313 /*
314 * meta_set_addhosts has already verified that
315 * this node list is in the membership list
316 * so set ALIVE flag.
317 * Since this is a new set, all hosts being
318 * added are new to the set, so also set ADD flag.
319 */
320 for (i = 0; i < node_c; i++) {
321 nd = Zalloc(sizeof (*nd));
322 (void) strcpy(nd->nd_nodename, node_v[i]);
323 nd->nd_ctime = now;
324 nd->nd_flags = (MD_MN_NODE_ALIVE |
325 MD_MN_NODE_ADD);
326 nl2 = nl;
327 while (nl2) {
328 if (strcmp(nl2->msl_node_name,
329 node_v[i]) == 0) {
330 nd->nd_nodeid = nl2->msl_node_id;
331 (void) strcpy(nd->nd_priv_ic,
332 nl2->msl_node_addr);
333 break;
334 }
335 nl2 = nl2->next;
336 }
337
338 /*
339 * Nodelist must be kept in ascending
340 * nodeid order.
341 */
342 if (sd->sd_nodelist == NULL) {
343 /* Nothing in list, just add it */
344 sd->sd_nodelist = nd;
345 } else if (nd->nd_nodeid < sd->sd_nodelist->nd_nodeid) {
346 /* Add to head of list */
347 nd->nd_next = sd->sd_nodelist;
348 sd->sd_nodelist = nd;
349 } else {
350 nd_curr = sd->sd_nodelist->nd_next;
351 nd_prev = sd->sd_nodelist;
352 /* Search for place ot add it */
353 while (nd_curr) {
354 if (nd->nd_nodeid <
355 nd_curr->nd_nodeid) {
356 /* Add before nd_curr */
357 nd->nd_next = nd_curr;
358 nd_prev->nd_next = nd;
359 break;
360 }
361 nd_prev = nd_curr;
362 nd_curr = nd_curr->nd_next;
363 }
364 /* Add to end of list */
365 if (nd_curr == NULL) {
366 nd_prev->nd_next = nd;
367 }
368
369 }
370 /* Set master to be first node added */
371 if (ndm == NULL)
372 ndm = nd;
373 }
374
375 meta_free_nodelist(nl);
376 /*
377 * Creating mnset for first time.
378 * Set master to be invalid until first drive is
379 * in set.
380 */
381 (void) strcpy(sd->sd_mn_master_nodenm, "");
382 sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
383 sd->sd_mn_masternode = ndm;
384 sd->sd_ctime = now;
385 genid = sd->sd_genid = 0;
386 }
387
388 /* Create the set where needed */
389 for (i = 0; i < node_c; i++) {
390 /*
391 * Create the set on each new node. If the set already
392 * exists, then the node list being created on each new node
393 * is the current node list from before the new nodes
394 * were added. If the set doesn't exist, then the node
395 * list being created on each new node is the entire
396 * new node list.
397 */
398 if (clnt_mncreateset(node_v[i], sp, sd->sd_nodelist,
399 now, genid, sd->sd_mn_master_nodenm,
400 sd->sd_mn_master_nodeid, ep) == -1) {
401 rval = -1;
402 break;
403 }
404 }
405
406 out:
407 if (new_set) {
408 nd = sd->sd_nodelist;
409 while (nd) {
410 sd->sd_nodelist = nd->nd_next;
411 Free(nd);
412 nd = sd->sd_nodelist;
413 }
414 Free(sd);
415 }
416
417 if (rval != 0 || new_set)
418 return (rval);
419
420 /*
421 * Add the drive records to the new sets
422 * and names for the new sides.
423 */
424 return (add_drvs_to_hosts(sp, node_c, node_v, ep));
425 }
426
427
428 static int
create_traditional_set_on_hosts(mdsetname_t * sp,int node_c,char ** node_v,int new_set,md_error_t * ep)429 create_traditional_set_on_hosts(
430 mdsetname_t *sp,
431 int node_c, /* Number of new nodes */
432 char **node_v, /* Nodes which are being added */
433 int new_set,
434 md_error_t *ep
435 )
436 {
437 int i;
438 md_set_desc *sd;
439 md_timeval32_t now;
440 ulong_t genid;
441 int rval = 0;
442
443 if (!new_set) {
444
445 if ((sd = metaget_setdesc(sp, ep)) == NULL)
446 return (-1);
447 now = sd->sd_ctime;
448
449 genid = sd->sd_genid;
450
451 if (sd->sd_drvs)
452 genid--;
453 } else {
454 if (node_c > MD_MAXSIDES)
455 return (mddserror(ep, MDE_DS_SIDENUMNOTAVAIL,
456 sp->setno, NULL, NULL, sp->setname));
457
458 sd = Zalloc(sizeof (*sd));
459
460 /* Put the new entries into the set */
461 for (i = 0; i < node_c; i++) {
462 (void) strcpy(sd->sd_nodes[i], node_v[i]);
463 }
464
465 if (meta_gettimeofday(&now) == -1) {
466 (void) mdsyserror(ep, errno, "meta_gettimeofday()");
467 rval = -1;
468 goto out;
469 }
470
471 sd->sd_ctime = now;
472 genid = sd->sd_genid = 0;
473 }
474
475 /* Create the set where needed */
476 for (i = 0; i < node_c; i++) {
477 /*
478 * Create the set on each new host
479 */
480 if (clnt_createset(node_v[i], sp, sd->sd_nodes, now, genid,
481 ep) == -1) {
482 rval = -1;
483 break;
484 }
485 }
486
487 out:
488 if (new_set)
489 Free(sd);
490
491 if (rval != 0 || new_set)
492 return (rval);
493
494 /*
495 * Add the drive records to the new sets
496 * and names for the new sides.
497 */
498 return (add_drvs_to_hosts(sp, node_c, node_v, ep));
499 }
500
501 static int
create_set_on_hosts(mdsetname_t * sp,int multi_node,int node_c,char ** node_v,int new_set,md_error_t * ep)502 create_set_on_hosts(
503 mdsetname_t *sp,
504 int multi_node, /* Multi_node diskset or not? */
505 int node_c, /* Number of new nodes */
506 char **node_v, /* Nodes which are being added */
507 int new_set,
508 md_error_t *ep
509 )
510 {
511 if (multi_node)
512 return (create_multinode_set_on_hosts(sp, node_c, node_v,
513 new_set, ep));
514 else
515 return (create_traditional_set_on_hosts(sp, node_c, node_v,
516 new_set, ep));
517 }
518
519 static int
create_set(mdsetname_t * sp,int multi_node,int node_c,char ** node_v,int auto_take,md_error_t * ep)520 create_set(
521 mdsetname_t *sp,
522 int multi_node, /* Multi-node diskset or not? */
523 int node_c,
524 char **node_v,
525 int auto_take,
526 md_error_t *ep
527 )
528 {
529 int i;
530 int rval = 0;
531 set_t max_sets;
532 set_t setno;
533 int bool;
534 uint_t sr_flags;
535 sigset_t oldsigs;
536 md_setkey_t *cl_sk;
537 int rb_level = 0;
538 md_error_t xep = mdnullerror;
539 rval_e sdssc_rval;
540 int lock_flag = 0;
541 int sig_flag = 0;
542
543 if ((max_sets = get_max_sets(ep)) == 0)
544 return (-1);
545
546 /* We must be a member of the set we are creating */
547 if (! strinlst(mynode(), node_c, node_v))
548 return (mddserror(ep, MDE_DS_SELFNOTIN,
549 sp->setno, mynode(), NULL, sp->setname));
550
551 /*
552 * If auto_take then we must be the only member of the set
553 * that we are creating.
554 */
555 if (auto_take && node_c > 1)
556 return (mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
557 sp->setname));
558
559 /*
560 * If we're part of SC3.0 we'll already have allocated the
561 * set number so we can skip the allocation algorithm used.
562 * Set number is unique across traditional and MN disksets.
563 */
564 if ((sdssc_rval = sdssc_get_index(sp->setname, &setno))
565 == SDSSC_NOT_BOUND) {
566
567 for (i = 0; i < node_c; i++) {
568 int has_set;
569
570 /* Skip my node */
571 if (strcmp(mynode(), node_v[i]) == 0)
572 continue;
573
574 /*
575 * Make sure this set name is not used on the
576 * other hosts
577 */
578 has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
579 if (has_set < 0) {
580 if (! mdiserror(ep, MDE_NO_SET)) {
581 rval = -1;
582 goto out;
583 }
584 mdclrerror(ep);
585 continue;
586 }
587
588 if (has_set) {
589 (void) mddserror(ep, MDE_DS_NODEHASSET,
590 sp->setno, node_v[i], NULL, sp->setname);
591 rval = -1;
592 goto out;
593 }
594 }
595
596 for (setno = 1; setno < max_sets; setno++) {
597 for (i = 0; i < node_c; i++) {
598 if (clnt_setnumbusy(node_v[i], setno,
599 &bool, ep) == -1) {
600 rval = -1;
601 goto out;
602 }
603
604 if (bool == TRUE)
605 break;
606 }
607 if (i == node_c)
608 break;
609 }
610 } else if (sdssc_rval != SDSSC_OKAY) {
611 (void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL,
612 NULL, sp->setname);
613 rval = -1;
614 goto out;
615 }
616
617 if (setno == max_sets) {
618 (void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL,
619 NULL, sp->setname);
620 rval = -1;
621 goto out;
622 }
623
624 sp->setno = setno;
625
626 /*
627 * Lock the set on current set members.
628 * Set locking done much earlier for MN diskset than for traditional
629 * diskset since lock_set is used to protect against
630 * other meta* commands running on the other nodes.
631 * Don't issue mdcommd SUSPEND command since there is nothing
632 * to suspend since there currently is no set.
633 */
634 if (multi_node) {
635 /* Make sure we are blocking all signals */
636 if (procsigs(TRUE, &oldsigs, &xep) < 0)
637 mdclrerror(&xep);
638 sig_flag = 1;
639
640 /* Lock the set on new set members */
641 for (i = 0; i < node_c; i++) {
642 if (clnt_lock_set(node_v[i], sp, ep)) {
643 rval = -1;
644 goto out;
645 }
646 lock_flag = 1;
647 }
648 /* Now have the diskset locked, verify set number is still ok */
649 for (i = 0; i < node_c; i++) {
650 if (clnt_setnumbusy(node_v[i], setno,
651 &bool, ep) == -1) {
652 rval = -1;
653 goto out;
654 }
655 }
656 }
657
658
659 if (meta_set_checkname(sp->setname, ep)) {
660 rval = -1;
661 goto out;
662 }
663
664 for (i = 0; i < node_c; i++) {
665 if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) {
666 rval = -1;
667 goto out;
668 }
669 if (bool == FALSE) {
670 (void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
671 node_v[i], NULL, sp->setname);
672 rval = -1;
673 goto out;
674 }
675 }
676
677 /* END CHECK CODE */
678
679 /* Lock the set on new set members */
680 if (!multi_node) {
681 md_rb_sig_handling_on();
682 sig_flag = 1;
683 for (i = 0; i < node_c; i++) {
684 if (clnt_lock_set(node_v[i], sp, ep)) {
685 rval = -1;
686 goto out;
687 }
688 lock_flag = 1;
689 }
690 }
691
692 RB_TEST(1, "create_set", ep)
693
694 RB_PREEMPT;
695 rb_level = 1; /* level 1 */
696
697 RB_TEST(2, "create_set", ep)
698
699 if ((rval = create_set_on_hosts(sp, multi_node, node_c, node_v,
700 1, ep)) == -1)
701 goto rollback;
702
703 RB_TEST(3, "create_set", ep)
704
705 if (auto_take)
706 sr_flags = MD_SR_OK | MD_SR_AUTO_TAKE;
707 else
708 sr_flags = MD_SR_OK;
709
710 /*
711 * Mark the set record MD_SR_OK
712 */
713 for (i = 0; i < node_c; i++)
714 if (clnt_upd_sr_flags(node_v[i], sp, sr_flags, ep))
715 goto rollback;
716
717 rb_level = 2; /* level 2 */
718
719 /*
720 * For MN diskset:
721 * On each added node, set the node record for that node
722 * to OK. Then set all node records for the newly added
723 * nodes on all nodes to ok.
724 *
725 * By setting a node's own node record to ok first, even if
726 * the node adding the hosts panics, the rest of the nodes can
727 * determine the same node list during the choosing of the master
728 * during reconfig. So, only nodes considered for mastership
729 * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set
730 * on that node's rpc.metad. If all nodes have MD_SR_OK set,
731 * but no node has its own MD_MN_NODE_OK set, then the set will
732 * be removed during reconfig since a panic occurred during the
733 * creation of the initial diskset.
734 */
735
736 if (multi_node) {
737 md_mnnode_desc *nd, *saved_nd_next;
738 md_set_desc *sd;
739
740 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
741 goto rollback;
742 }
743
744 for (i = 0; i < node_c; i++) {
745 nd = sd->sd_nodelist;
746 /* All nodes are guaranteed to be ALIVE */
747 while (nd) {
748 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
749 break;
750 nd = nd->nd_next;
751 }
752 /* Something wrong, will pick this up in next loop */
753 if (nd == NULL)
754 continue;
755
756 /* Only changing my local cache of node list */
757 saved_nd_next = nd->nd_next;
758 nd->nd_next = NULL;
759
760 /* Set node record for added host to ok on that host */
761 if (clnt_upd_nr_flags(node_v[i], sp,
762 nd, MD_NR_OK, NULL, ep)) {
763 nd->nd_next = saved_nd_next;
764 goto rollback;
765 }
766 nd->nd_next = saved_nd_next;
767 }
768
769 /* Now set all node records on all nodes to be ok */
770 nd = sd->sd_nodelist;
771 /* All nodes are guaranteed to be ALIVE */
772 while (nd) {
773 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
774 sd->sd_nodelist, MD_NR_OK, NULL, ep)) {
775 goto rollback;
776 }
777 nd = nd->nd_next;
778 }
779 }
780
781 RB_TEST(4, "create_set", ep)
782
783 out:
784 if ((rval == 0) && multi_node) {
785 /*
786 * Set successfully created.
787 * Notify rpc.mdcommd on all nodes of a nodelist change.
788 * Send reinit command to mdcommd which forces it to get
789 * fresh set description. Then send resume.
790 * Resume on class 0 will resume all classes.
791 */
792 for (i = 0; i < node_c; i++) {
793 /* Class is ignored for REINIT */
794 if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
795 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
796 if (rval == 0)
797 (void) mdstealerror(ep, &xep);
798 rval = -1;
799 mde_perror(ep, dgettext(TEXT_DOMAIN,
800 "Unable to reinit rpc.mdcommd.\n"));
801 }
802 }
803 for (i = 0; i < node_c; i++) {
804 if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
805 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
806 if (rval == 0)
807 (void) mdstealerror(ep, &xep);
808 rval = -1;
809 mde_perror(ep, dgettext(TEXT_DOMAIN,
810 "Unable to resume rpc.mdcommd.\n"));
811 }
812 }
813 meta_ping_mnset(sp->setno);
814 }
815 if (lock_flag) {
816 cl_sk = cl_get_setkey(sp->setno, sp->setname);
817 for (i = 0; i < node_c; i++) {
818 if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
819 if (rval == 0)
820 (void) mdstealerror(ep, &xep);
821 rval = -1;
822 }
823 }
824 cl_set_setkey(NULL);
825 }
826
827 if (sig_flag) {
828 if (multi_node) {
829 /* release signals back to what they were on entry */
830 if (procsigs(FALSE, &oldsigs, &xep) < 0)
831 mdclrerror(&xep);
832 } else {
833 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
834 }
835 }
836
837 return (rval);
838
839 rollback:
840 /* all signals already blocked for MN disket */
841 if (!multi_node) {
842 /* Make sure we are blocking all signals */
843 if (procsigs(TRUE, &oldsigs, &xep) < 0)
844 mdclrerror(&xep);
845 }
846
847 rval = -1;
848
849 /*
850 * For MN diskset:
851 * On each added node (which is now each node to be deleted),
852 * set the node record for that node to DEL. Then set all
853 * node records for the newly added (soon to be deleted) nodes
854 * on all nodes to ok.
855 *
856 * By setting a node's own node record to DEL first, even if
857 * the node doing the rollback panics, the rest of the nodes can
858 * determine the same node list during the choosing of the master
859 * during reconfig.
860 */
861
862 /* level 3 */
863 if ((rb_level > 1) && (multi_node)) {
864 md_mnnode_desc *nd, *saved_nd_next;
865 md_set_desc *sd;
866
867 if ((sd = metaget_setdesc(sp, &xep)) == NULL) {
868 mdclrerror(&xep);
869 }
870
871 for (i = 0; i < node_c; i++) {
872 nd = sd->sd_nodelist;
873 /* All nodes are guaranteed to be ALIVE */
874 while (nd) {
875 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
876 break;
877 nd = nd->nd_next;
878 }
879 /* Something wrong, will pick this up in next loop */
880 if (nd == NULL)
881 continue;
882
883 /* Only changing my local cache of node list */
884 saved_nd_next = nd->nd_next;
885 nd->nd_next = NULL;
886
887 /* Set node record for added host to DEL on that host */
888 if (clnt_upd_nr_flags(node_v[i], sp,
889 nd, MD_NR_DEL, NULL, &xep)) {
890 nd->nd_next = saved_nd_next;
891 mdclrerror(&xep);
892 }
893 nd->nd_next = saved_nd_next;
894 }
895
896 /* Now set all node records on all nodes to be DEL */
897 nd = sd->sd_nodelist;
898 /* All nodes are guaranteed to be ALIVE */
899 while (nd) {
900 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
901 sd->sd_nodelist, MD_NR_DEL, NULL, &xep)) {
902 mdclrerror(&xep);
903 }
904 nd = nd->nd_next;
905 }
906
907 /* Mark set record on all hosts to be DELETED */
908 for (i = 0; i < node_c; i++) {
909 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) {
910 mdclrerror(&xep);
911 }
912 }
913 }
914 /* level 1 */
915 if (rb_level > 0) {
916 for (i = 0; i < node_c; i++) {
917 if (clnt_delset(node_v[i], sp, &xep) == -1)
918 mdclrerror(&xep);
919 }
920 }
921
922 /* level 0 */
923 /* Don't test lock flag since guaranteed to be set if in rollback */
924 cl_sk = cl_get_setkey(sp->setno, sp->setname);
925 for (i = 0; i < node_c; i++) {
926 if (clnt_unlock_set(node_v[i], cl_sk, &xep))
927 mdclrerror(&xep);
928 }
929 cl_set_setkey(NULL);
930
931 /* release signals back to what they were on entry */
932 if (procsigs(FALSE, &oldsigs, &xep) < 0)
933 mdclrerror(&xep);
934
935 if ((sig_flag) && (!multi_node))
936 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
937
938 return (rval);
939 }
940
941 static int
del_db_sidenms(mdsetname_t * sp,side_t sideno,md_error_t * ep)942 del_db_sidenms(
943 mdsetname_t *sp,
944 side_t sideno,
945 md_error_t *ep
946 )
947 {
948 md_replicalist_t *rlp = NULL;
949 md_replicalist_t *rl;
950 int rval = 0;
951
952 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
953 return (-1);
954
955 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
956 md_replica_t *r = rl->rl_repp;
957
958 if (meta_db_delsidenm(sp, sideno, r->r_namep, r->r_blkno, ep)) {
959 rval = -1;
960 goto out;
961 }
962 }
963
964 out:
965 metafreereplicalist(rlp);
966 return (rval);
967 }
968
969 static int
del_drvs_from_hosts(mdsetname_t * sp,md_set_desc * sd,md_drive_desc * dd,int node_c,char ** node_v,int oha,md_error_t * ep)970 del_drvs_from_hosts(
971 mdsetname_t *sp,
972 md_set_desc *sd,
973 md_drive_desc *dd,
974 int node_c,
975 char **node_v,
976 int oha,
977 md_error_t *ep
978 )
979 {
980 int i;
981 md_mnnode_desc *nd;
982
983 for (i = 0; i < node_c; i++) {
984 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
985 /*
986 * During OHA mode, don't issue RPCs to
987 * non-alive nodes since there is no reason to
988 * wait for RPC timeouts.
989 */
990 nd = sd->sd_nodelist;
991 while (nd) {
992 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
993 break;
994 nd = nd->nd_next;
995 }
996 if (nd == NULL) {
997 return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
998 sp->setno, nd->nd_nodename,
999 NULL, sp->setname));
1000 }
1001
1002 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1003 continue;
1004 }
1005 if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1006 return (-1);
1007 }
1008 } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1009 /*
1010 * All nodes should be alive in non-oha mode.
1011 */
1012 if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1013 return (-1);
1014 }
1015 } else {
1016 /*
1017 * For traditional diskset, issue the RPC and
1018 * ignore RPC failure if in OHA mode.
1019 */
1020 if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1021 if (oha == TRUE && mdanyrpcerror(ep)) {
1022 mdclrerror(ep);
1023 continue;
1024 }
1025 return (-1);
1026 }
1027 }
1028 }
1029
1030 return (0);
1031 }
1032
1033 static int
del_host_noset(mdsetname_t * sp,char ** anode,md_error_t * ep)1034 del_host_noset(
1035 mdsetname_t *sp,
1036 char **anode,
1037 md_error_t *ep
1038 )
1039 {
1040 int rval = 0;
1041 md_setkey_t *cl_sk;
1042 md_drive_desc *dd;
1043 md_error_t xep = mdnullerror;
1044 md_set_desc *sd;
1045
1046 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1047 return (-1);
1048
1049 /* Make sure we own the set */
1050 if (meta_check_ownership(sp, ep) != 0)
1051 return (-1);
1052
1053 /* Lock the set on our side */
1054 if (clnt_lock_set(mynode(), sp, ep)) {
1055 rval = -1;
1056 goto out;
1057 }
1058
1059 if (clnt_delhosts(mynode(), sp, 1, anode, ep)) {
1060 rval = -1;
1061 goto out;
1062 }
1063
1064 if (!MD_MNSET_DESC(sd)) {
1065 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
1066 ep)) == NULL) {
1067 if (! mdisok(ep)) {
1068 rval = -1;
1069 goto out;
1070 }
1071 }
1072
1073 /* If we have drives */
1074 if (dd != NULL) {
1075 if (clnt_del_drv_sidenms(mynode(), sp, ep)) {
1076 rval = -1;
1077 goto out;
1078 }
1079 }
1080 }
1081
1082 out:
1083 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1084 if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1085 if (rval == 0)
1086 (void) mdstealerror(ep, &xep);
1087 rval = -1;
1088 }
1089 cl_set_setkey(NULL);
1090
1091 metaflushsetname(sp);
1092
1093 return (rval);
1094 }
1095
1096 static int
del_md_sidenms(mdsetname_t * sp,side_t sideno,md_error_t * ep)1097 del_md_sidenms(mdsetname_t *sp, side_t sideno, md_error_t *ep)
1098 {
1099 mdnm_params_t nm;
1100 md_set_desc *sd;
1101 int i;
1102
1103 if (!metaislocalset(sp)) {
1104 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1105 return (-1);
1106 }
1107 /* Use rpc.mdcommd to add md side info from all nodes */
1108 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1109 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1110 md_mn_result_t *resultp = NULL;
1111 md_mn_msg_meta_md_delside_t md_ds;
1112 int send_rval;
1113
1114 md_ds.msg_sideno = sideno;
1115 /*
1116 * If reconfig cycle has been started, this node is stuck in
1117 * in the return step until this command has completed. If
1118 * mdcommd is suspended, ask send_message to fail (instead of
1119 * retrying) so that metaset can finish allowing the
1120 * reconfig cycle to proceed.
1121 */
1122 send_rval = mdmn_send_message(sp->setno,
1123 MD_MN_MSG_META_MD_DELSIDE,
1124 MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
1125 0, (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t),
1126 &resultp, ep);
1127 if (send_rval != 0) {
1128 (void) mdstealerror(ep, &(resultp->mmr_ep));
1129 if (resultp)
1130 free_result(resultp);
1131 return (-1);
1132 }
1133 if (resultp)
1134 free_result(resultp);
1135 } else {
1136 (void) memset(&nm, '\0', sizeof (nm));
1137 nm.key = MD_KEYWILD;
1138
1139 /*CONSTCOND*/
1140 while (1) {
1141 nm.mde = mdnullerror;
1142 nm.setno = sp->setno;
1143 nm.side = MD_SIDEWILD;
1144 if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0)
1145 return (mdstealerror(ep, &nm.mde));
1146
1147 if (nm.key == MD_KEYWILD)
1148 return (0);
1149
1150 /*
1151 * The device reference count can be greater than 1 if
1152 * more than one softpart is configured on top of the
1153 * same device. If this is the case then we want to
1154 * decrement the count to zero so the entry can be
1155 * actually removed.
1156 */
1157 for (i = 0; i < nm.ref_count; i++) {
1158 if (del_name(sp, sideno, nm.key, ep) == -1)
1159 return (-1);
1160 }
1161 }
1162 }
1163 return (0);
1164 }
1165
1166 static void
recreate_set(mdsetname_t * sp,md_set_desc * sd)1167 recreate_set(
1168 mdsetname_t *sp,
1169 md_set_desc *sd
1170 )
1171 {
1172 int i;
1173 int has_set;
1174 md_error_t xep = mdnullerror;
1175 md_mnnode_desc *nd;
1176
1177 if (MD_MNSET_DESC(sd)) {
1178 nd = sd->sd_nodelist;
1179 while (nd) {
1180 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1181 nd = nd->nd_next;
1182 continue;
1183 }
1184 has_set = nodehasset(sp, nd->nd_nodename,
1185 NHS_NST_EQ, &xep);
1186
1187 if (has_set >= 0) {
1188 nd = nd->nd_next;
1189 continue;
1190 }
1191
1192 mdclrerror(&xep);
1193
1194 if (clnt_mncreateset(nd->nd_nodename, sp,
1195 sd->sd_nodelist,
1196 sd->sd_ctime, sd->sd_genid,
1197 sd->sd_mn_master_nodenm,
1198 sd->sd_mn_master_nodeid, &xep) == -1)
1199 mdclrerror(&xep);
1200 nd = nd->nd_next;
1201 }
1202 } else {
1203 for (i = 0; i < MD_MAXSIDES; i++) {
1204 /* Skip empty slots */
1205 if (sd->sd_nodes[i][0] == '\0')
1206 continue;
1207
1208 has_set = nodehasset(sp, sd->sd_nodes[i],
1209 NHS_NST_EQ, &xep);
1210
1211 if (has_set >= 0)
1212 continue;
1213
1214 mdclrerror(&xep);
1215
1216 if (clnt_createset(sd->sd_nodes[i], sp, sd->sd_nodes,
1217 sd->sd_ctime, sd->sd_genid, &xep) == -1)
1218 mdclrerror(&xep);
1219 }
1220 }
1221 }
1222
1223 /*
1224 * If a MN diskset, set is already locked on all nodes via clnt_lock_set.
1225 */
1226 static int
del_set_nodrives(mdsetname_t * sp,int node_c,char ** node_v,int oha,md_error_t * ep)1227 del_set_nodrives(
1228 mdsetname_t *sp,
1229 int node_c,
1230 char **node_v,
1231 int oha,
1232 md_error_t *ep
1233 )
1234 {
1235 md_set_desc *sd;
1236 int i;
1237 sigset_t oldsigs;
1238 md_setkey_t *cl_sk;
1239 int rb_level = 0;
1240 ulong_t max_genid = 0;
1241 int rval = 0;
1242 md_error_t xep = mdnullerror;
1243 md_mnnode_desc *nd;
1244 int delete_end = 1;
1245
1246 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1247 return (-1);
1248
1249 if (MD_MNSET_DESC(sd)) {
1250 /* Make sure we are blocking all signals */
1251 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1252 mdclrerror(&xep);
1253 } else {
1254 md_rb_sig_handling_on();
1255 }
1256
1257 /*
1258 * Lock the set on current set members for traditional disksets.
1259 */
1260 if (!(MD_MNSET_DESC(sd))) {
1261 for (i = 0; i < node_c; i++) {
1262 /*
1263 * For traditional diskset, issue the RPC and
1264 * ignore RPC failure if in OHA mode.
1265 */
1266 if (clnt_lock_set(node_v[i], sp, ep)) {
1267 if (oha == TRUE && mdanyrpcerror(ep)) {
1268 mdclrerror(ep);
1269 continue;
1270 }
1271 rval = -1;
1272 goto out;
1273 }
1274 }
1275 }
1276
1277
1278 RB_TEST(1, "deletehosts", ep)
1279
1280 RB_PREEMPT;
1281 rb_level = 1; /* level 1 */
1282
1283 RB_TEST(2, "deletehosts", ep)
1284
1285 /*
1286 * Mark the set record MD_SR_DEL
1287 */
1288 for (i = 0; i < node_c; i++) {
1289
1290 RB_TEST(3, "deletehosts", ep)
1291
1292 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1293 /*
1294 * During OHA mode, don't issue RPCs to
1295 * non-alive nodes since there is no reason to
1296 * wait for RPC timeouts.
1297 */
1298 nd = sd->sd_nodelist;
1299 while (nd) {
1300 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1301 break;
1302 nd = nd->nd_next;
1303 }
1304 if (nd == NULL) {
1305 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1306 sp->setno, nd->nd_nodename,
1307 NULL, sp->setname);
1308 goto rollback;
1309 }
1310
1311 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1312 continue;
1313 }
1314
1315 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1316 goto rollback;
1317 }
1318 } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1319 /*
1320 * All nodes should be alive in non-oha mode.
1321 */
1322 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1323 goto rollback;
1324 }
1325 } else {
1326 /*
1327 * For traditional diskset, issue the RPC and
1328 * ignore RPC failure if in OHA mode.
1329 */
1330 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1331 if (oha == TRUE && mdanyrpcerror(ep)) {
1332 mdclrerror(ep);
1333 continue;
1334 }
1335 goto rollback;
1336 }
1337 }
1338
1339 RB_TEST(4, "deletehosts", ep)
1340 }
1341
1342 RB_TEST(5, "deletehosts", ep)
1343
1344 RB_PREEMPT;
1345 rb_level = 2; /* level 2 */
1346
1347 RB_TEST(6, "deletehosts", ep)
1348
1349 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR)
1350 if (metad_isautotakebyname(sp->setname))
1351 delete_end = 0;
1352 else
1353 goto rollback;
1354
1355 /* The set is OK to delete, make it so. */
1356 for (i = 0; i < node_c; i++) {
1357
1358 RB_TEST(7, "deletehosts", ep)
1359
1360 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1361 /*
1362 * During OHA mode, don't issue RPCs to
1363 * non-alive nodes since there is no reason to
1364 * wait for RPC timeouts.
1365 */
1366 nd = sd->sd_nodelist;
1367 while (nd) {
1368 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1369 break;
1370 nd = nd->nd_next;
1371 }
1372 if (nd == NULL) {
1373 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1374 sp->setno, nd->nd_nodename,
1375 NULL, sp->setname);
1376 goto rollback;
1377 }
1378
1379 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1380 continue;
1381 }
1382
1383 if (clnt_delset(node_v[i], sp, ep) == -1) {
1384 goto rollback;
1385 }
1386 } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1387 /*
1388 * All nodes should be alive in non-oha mode.
1389 */
1390 if (clnt_delset(node_v[i], sp, ep) == -1) {
1391 goto rollback;
1392 }
1393 } else {
1394 /*
1395 * For traditional diskset, issue the RPC and
1396 * ignore RPC failure if in OHA mode.
1397 */
1398 if (clnt_delset(node_v[i], sp, ep) == -1) {
1399 if (oha == TRUE && mdanyrpcerror(ep)) {
1400 mdclrerror(ep);
1401 continue;
1402 }
1403 goto rollback;
1404 }
1405 }
1406
1407 RB_TEST(8, "deletehosts", ep)
1408 }
1409
1410 RB_TEST(9, "deletehosts", ep)
1411
1412 out:
1413 /*
1414 * Unlock the set on current set members
1415 * for traditional disksets.
1416 */
1417 if (!(MD_MNSET_DESC(sd))) {
1418 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1419 for (i = 0; i < node_c; i++) {
1420 /*
1421 * For traditional diskset, issue the RPC and
1422 * ignore RPC failure if in OHA mode.
1423 */
1424 if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
1425 if (oha == TRUE && mdanyrpcerror(&xep)) {
1426 mdclrerror(&xep);
1427 continue;
1428 }
1429 if (rval == 0)
1430 (void) mdstealerror(ep, &xep);
1431 rval = -1;
1432 }
1433 }
1434 cl_set_setkey(NULL);
1435 }
1436
1437 /*
1438 * A MN diskset has the clnt_locks held by meta_set_deletehosts so
1439 * don't flush that data until meta_set_deletehosts has finished
1440 * with it. meta_set_deletehosts will handle the flush of the
1441 * setname.
1442 */
1443 if (!(MD_MNSET_DESC(sd))) {
1444 metaflushsetname(sp);
1445 }
1446
1447 if (delete_end &&
1448 sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1449 rval = -1;
1450
1451 if (MD_MNSET_DESC(sd)) {
1452 /* release signals back to what they were on entry */
1453 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1454 mdclrerror(&xep);
1455 } else {
1456 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1457 }
1458
1459 return (rval);
1460
1461 rollback:
1462 /* all signals already blocked for MN disket */
1463 if (!(MD_MNSET_DESC(sd))) {
1464 /* Make sure we are blocking all signals */
1465 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1466 mdclrerror(&xep);
1467 }
1468
1469 rval = -1;
1470
1471 max_genid = sd->sd_genid;
1472
1473 /* level 2 */
1474 if (rb_level > 1) {
1475 recreate_set(sp, sd);
1476 max_genid++;
1477
1478 if (delete_end)
1479 (void) sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
1480 }
1481
1482 /* level 1 */
1483 if (rb_level > 0) {
1484 max_genid++;
1485 resync_genid(sp, sd, max_genid, node_c, node_v);
1486 }
1487
1488 /* level 0 */
1489 /*
1490 * Unlock the set on current set members
1491 * for traditional disksets.
1492 */
1493 if (!(MD_MNSET_DESC(sd))) {
1494 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1495 for (i = 0; i < node_c; i++) {
1496 /*
1497 * For traditional diskset, issue the RPC and
1498 * ignore RPC failure if in OHA mode.
1499 */
1500 if (clnt_unlock_set(node_v[i], cl_sk, &xep))
1501 mdclrerror(&xep);
1502 }
1503 cl_set_setkey(NULL);
1504 }
1505
1506 /* release signals back to what they were on entry */
1507 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1508 mdclrerror(&xep);
1509
1510 /*
1511 * A MN diskset has the clnt_locks held by meta_set_deletehosts so
1512 * don't flush that data until meta_set_deletehosts has finished
1513 * with it. meta_set_deletehosts will handle the flush of the
1514 * setname.
1515 */
1516 if (!(MD_MNSET_DESC(sd))) {
1517 metaflushsetname(sp);
1518 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1519 }
1520
1521 return (rval);
1522 }
1523
1524 /*
1525 * On entry:
1526 * procsigs already called for MN diskset.
1527 * md_rb_sig_handling already called for traditional diskset.
1528 */
1529 static int
del_set_on_hosts(mdsetname_t * sp,md_set_desc * sd,md_drive_desc * dd,int node_c,char ** node_v,int oha,md_error_t * ep)1530 del_set_on_hosts(
1531 mdsetname_t *sp,
1532 md_set_desc *sd,
1533 md_drive_desc *dd,
1534 int node_c, /* Number of nodes */
1535 char **node_v, /* Nodes being deleted */
1536 int oha,
1537 md_error_t *ep
1538 )
1539 {
1540 int i;
1541 int j;
1542 side_t sideno;
1543 md_replicalist_t *rlp = NULL;
1544 sigset_t oldsigs;
1545 md_setkey_t *cl_sk;
1546 ulong_t max_genid = 0;
1547 int rb_level = 1; /* This is a special case */
1548 md_error_t xep = mdnullerror;
1549 md_mnnode_desc *nd;
1550
1551 RB_PREEMPT;
1552
1553 RB_TEST(7, "deletehosts", ep)
1554
1555 if (dd != NULL) {
1556 /*
1557 * May need this to re-add sidenames on roll back.
1558 */
1559 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
1560 ep) < 0)
1561 goto rollback;
1562
1563 RB_TEST(8, "deletehosts", ep)
1564
1565 RB_PREEMPT;
1566 rb_level = 2; /* level 2 */
1567
1568 RB_TEST(9, "deletehosts", ep)
1569
1570 if (del_drvs_from_hosts(sp, sd, dd, node_c, node_v, oha, ep))
1571 goto rollback;
1572
1573 RB_TEST(10, "deletehosts", ep)
1574
1575 RB_PREEMPT;
1576 rb_level = 3; /* level 3 */
1577
1578 RB_TEST(11, "deletehosts", ep)
1579
1580 /*
1581 * Delete the db replica sides
1582 * This is done before the next loop, so that
1583 * the db does not get unloaded before we are finished
1584 * deleting the sides.
1585 */
1586 if (MD_MNSET_DESC(sd)) {
1587 nd = sd->sd_nodelist;
1588 while (nd) {
1589 /* Skip hosts not being deleted */
1590 if (! strinlst(nd->nd_nodename, node_c,
1591 node_v)) {
1592 nd = nd->nd_next;
1593 continue;
1594 }
1595
1596 if (del_db_sidenms(sp, nd->nd_nodeid, ep))
1597 goto rollback;
1598
1599 RB_TEST(12, "deletehosts", ep)
1600 nd = nd->nd_next;
1601 }
1602 } else {
1603 for (sideno = 0; sideno < MD_MAXSIDES; sideno++) {
1604 /* Skip empty slots */
1605 if (sd->sd_nodes[sideno][0] == '\0')
1606 continue;
1607
1608 /* Skip hosts not being deleted */
1609 if (! strinlst(sd->sd_nodes[sideno], node_c,
1610 node_v))
1611 continue;
1612
1613 if (del_db_sidenms(sp, sideno, ep))
1614 goto rollback;
1615
1616 RB_TEST(12, "deletehosts", ep)
1617 }
1618 }
1619
1620 RB_TEST(13, "deletehosts", ep)
1621
1622 RB_PREEMPT;
1623 rb_level = 4; /* level 4 */
1624
1625 RB_TEST(14, "deletehosts", ep)
1626
1627 /* Delete the names from the namespace */
1628 if (MD_MNSET_DESC(sd)) {
1629 nd = sd->sd_nodelist;
1630 while (nd) {
1631 /* Skip hosts not being deleted */
1632 if (! strinlst(nd->nd_nodename, node_c,
1633 node_v)) {
1634 nd = nd->nd_next;
1635 continue;
1636 }
1637
1638 if (del_md_sidenms(sp, nd->nd_nodeid, ep))
1639 goto rollback;
1640
1641 RB_TEST(15, "deletehosts", ep)
1642 nd = nd->nd_next;
1643 }
1644 } else {
1645 for (sideno = 0; sideno < MD_MAXSIDES; sideno++) {
1646 /* Skip empty slots */
1647 if (sd->sd_nodes[sideno][0] == '\0')
1648 continue;
1649
1650 /* Skip hosts not being deleted */
1651 if (! strinlst(sd->sd_nodes[sideno], node_c,
1652 node_v))
1653 continue;
1654
1655 if (del_md_sidenms(sp, sideno, ep))
1656 goto rollback;
1657
1658 RB_TEST(15, "deletehosts", ep)
1659 }
1660 }
1661 }
1662
1663 RB_TEST(16, "deletehosts", ep)
1664
1665 RB_PREEMPT;
1666 rb_level = 5; /* level 6 */
1667
1668 RB_TEST(17, "deletehosts", ep)
1669
1670 for (i = 0; i < node_c; i++) {
1671 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1672 /*
1673 * During OHA mode, don't issue RPCs to
1674 * non-alive nodes since there is no reason to
1675 * wait for RPC timeouts.
1676 */
1677 nd = sd->sd_nodelist;
1678 while (nd) {
1679 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1680 break;
1681 nd = nd->nd_next;
1682 }
1683 if (nd == NULL) {
1684 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1685 sp->setno, nd->nd_nodename,
1686 NULL, sp->setname);
1687 goto rollback;
1688 }
1689
1690 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1691 continue;
1692 }
1693
1694 if (clnt_delset(node_v[i], sp, ep) == -1) {
1695 goto rollback;
1696 }
1697 } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1698 /*
1699 * All nodes should be alive in non-oha mode.
1700 */
1701 if (clnt_delset(node_v[i], sp, ep) == -1) {
1702 goto rollback;
1703 }
1704 } else {
1705 /*
1706 * For traditional diskset, issue the RPC and
1707 * ignore RPC failure if in OHA mode.
1708 */
1709 if (clnt_delset(node_v[i], sp, ep) == -1) {
1710 if (oha == TRUE && mdanyrpcerror(ep)) {
1711 mdclrerror(ep);
1712 continue;
1713 }
1714 goto rollback;
1715 }
1716 }
1717
1718 RB_TEST(18, "deletehosts", ep)
1719 }
1720
1721 metafreereplicalist(rlp);
1722
1723 if (MD_MNSET_DESC(sd)) {
1724 /* release signals back to what they were on entry */
1725 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1726 mdclrerror(&xep);
1727 } else {
1728 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1729 }
1730
1731 return (0);
1732
1733 rollback:
1734 /* all signals already blocked for MN disket */
1735 if (!(MD_MNSET_DESC(sd))) {
1736 /* Make sure we are blocking all signals */
1737 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1738 mdclrerror(&xep);
1739 }
1740
1741 max_genid = sd->sd_genid;
1742
1743 /* level 5 */
1744 if (rb_level > 4) {
1745 recreate_set(sp, sd);
1746 max_genid++;
1747 }
1748
1749 /* level 2 */
1750 if (rb_level > 1 && dd != NULL) {
1751 /*
1752 * See if we have to re-add the drives specified.
1753 */
1754 for (i = 0; i < node_c; i++) {
1755 md_set_record *sr;
1756
1757 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1758 /*
1759 * During OHA mode, don't issue RPCs to
1760 * non-alive nodes since there is no reason to
1761 * wait for RPC timeouts.
1762 */
1763 nd = sd->sd_nodelist;
1764 while (nd) {
1765 if (strcmp(nd->nd_nodename, node_v[i])
1766 == 0)
1767 break;
1768 nd = nd->nd_next;
1769 }
1770 if (nd == NULL)
1771 continue;
1772
1773 if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
1774 continue;
1775 }
1776
1777 /* Don't care if set record is MN or not */
1778 if (clnt_getset(node_v[i], sp->setname,
1779 MD_SET_BAD, &sr, &xep) == -1) {
1780 mdclrerror(&xep);
1781 continue;
1782 }
1783
1784 /* Drive already added, skip to next node */
1785 if (sr->sr_drivechain != NULL) {
1786 /*
1787 * Set record structure was allocated from RPC
1788 * routine getset so this structure is only of
1789 * size md_set_record even if the MN flag is
1790 * set. So, clear the flag so that the free
1791 * code doesn't attempt to free a structure
1792 * the size of md_mnset_record.
1793 */
1794 sr->sr_flags &= ~MD_SR_MN;
1795 free_sr(sr);
1796 continue;
1797 }
1798
1799 if (clnt_adddrvs(node_v[i], sp, dd,
1800 sr->sr_ctime, sr->sr_genid, &xep) == -1)
1801 mdclrerror(&xep);
1802
1803 if (clnt_upd_dr_flags(node_v[i], sp, dd,
1804 MD_DR_OK, &xep) == -1)
1805 mdclrerror(&xep);
1806
1807 /*
1808 * Set record structure was allocated from RPC routine
1809 * getset so this structure is only of size
1810 * md_set_record even if the MN flag is set. So,
1811 * clear the flag so that the free code doesn't
1812 * attempt to free a structure the size of
1813 * md_mnset_record.
1814 */
1815 sr->sr_flags &= ~MD_SR_MN;
1816 free_sr(sr);
1817 }
1818 max_genid += 3;
1819 }
1820
1821 /* level 3 */
1822 if (rb_level > 2 && dd != NULL) {
1823 md_replicalist_t *rl;
1824
1825 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
1826 md_replica_t *r = rl->rl_repp;
1827
1828 /*
1829 * This is not the first replica being added to the
1830 * diskset so call with ADDSIDENMS_BCAST. If this
1831 * is a traditional diskset, the bcast flag is ignored
1832 * since traditional disksets don't use the rpc.mdcommd.
1833 */
1834 if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
1835 DB_ADDSIDENMS_BCAST, &xep))
1836 mdclrerror(&xep);
1837 }
1838 }
1839
1840 /* level 4 */
1841 if (rb_level > 3 && dd != NULL) {
1842 int nodeid_addsides = 0;
1843 /*
1844 * Add the device names for the new sides into the namespace,
1845 * on all hosts not being deleted.
1846 */
1847 if (MD_MNSET_DESC(sd)) {
1848 nd = sd->sd_nodelist;
1849 while (nd) {
1850 /* Find a node that is not being deleted */
1851 if (! strinlst(nd->nd_nodename, node_c,
1852 node_v)) {
1853 nodeid_addsides = nd->nd_nodeid;
1854 break;
1855 }
1856 nd = nd->nd_next;
1857 }
1858 } else {
1859 for (j = 0; j < MD_MAXSIDES; j++) {
1860 /* Skip empty slots */
1861 if (sd->sd_nodes[j][0] == '\0')
1862 continue;
1863
1864 /* Find a node that is not being deleted */
1865 if (! strinlst(sd->sd_nodes[j], node_c,
1866 node_v))
1867 break;
1868 }
1869 nodeid_addsides = j;
1870 }
1871
1872 if (MD_MNSET_DESC(sd)) {
1873 nd = sd->sd_nodelist;
1874 while (nd) {
1875 /* Skip nodes not being deleted */
1876 if (!strinlst(nd->nd_nodename, node_c,
1877 node_v)) {
1878 nd = nd->nd_next;
1879 continue;
1880 }
1881
1882 /* this side was just created, add the names */
1883 if (add_md_sidenms(sp, nd->nd_nodeid,
1884 nodeid_addsides, &xep))
1885 mdclrerror(&xep);
1886 nd = nd->nd_next;
1887 }
1888 } else {
1889 for (i = 0; i < MD_MAXSIDES; i++) {
1890 /* Skip empty slots */
1891 if (sd->sd_nodes[i][0] == '\0')
1892 continue;
1893
1894 /* Skip nodes not being deleted */
1895 if (!strinlst(sd->sd_nodes[i], node_c, node_v))
1896 continue;
1897
1898 /* this side was just created, add the names */
1899 if (add_md_sidenms(sp, i, nodeid_addsides,
1900 &xep))
1901 mdclrerror(&xep);
1902 }
1903 }
1904 }
1905
1906 /* level 1 */
1907 if (rb_level > 0) {
1908 max_genid++;
1909 resync_genid(sp, sd, max_genid, node_c, node_v);
1910 }
1911
1912 /* level 0 */
1913 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1914 if (MD_MNSET_DESC(sd)) {
1915 nd = sd->sd_nodelist;
1916 while (nd) {
1917 if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
1918 continue;
1919 /* To balance lock/unlock; can send to dead node */
1920 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
1921 mdclrerror(&xep);
1922 nd = nd->nd_next;
1923 }
1924 } else {
1925 for (i = 0; i < MD_MAXSIDES; i++) {
1926 /* Skip empty slots */
1927 if (sd->sd_nodes[i][0] == '\0')
1928 continue;
1929
1930 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
1931 mdclrerror(&xep);
1932 }
1933 }
1934 cl_set_setkey(NULL);
1935
1936 /* release signals back to what they were on entry */
1937 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1938 mdclrerror(&xep);
1939
1940 metafreereplicalist(rlp);
1941
1942 if (!(MD_MNSET_DESC(sd))) {
1943 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1944 }
1945
1946 return (-1);
1947 }
1948
1949 static int
make_sideno_sidenm(mdsetname_t * sp,mddrivename_t * dnp,side_t sideno,md_error_t * ep)1950 make_sideno_sidenm(
1951 mdsetname_t *sp,
1952 mddrivename_t *dnp,
1953 side_t sideno,
1954 md_error_t *ep
1955 )
1956 {
1957 mdsidenames_t *sn, **sn_next;
1958 md_set_desc *sd;
1959 mdname_t *np;
1960 uint_t rep_slice;
1961 int err = 0;
1962
1963 assert(dnp->side_names_key != MD_KEYWILD);
1964
1965 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1966 return (-1);
1967
1968 /* find the end of the link list */
1969 for (sn = dnp->side_names; sn->next != NULL; sn = sn->next)
1970 ;
1971 sn_next = &sn->next;
1972
1973 if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
1974 return (-1);
1975
1976 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
1977 return (-1);
1978
1979 sn = Zalloc(sizeof (*sn));
1980 sn->sideno = sideno;
1981
1982 if (MD_MNSET_DESC(sd)) {
1983 /*
1984 * For MO diskset the sideno is not an index into
1985 * the array of nodes. Hence getside_devinfo is
1986 * used instead of meta_getnextside_devinfo.
1987 */
1988 if (meta_getside_devinfo(sp, np->bname, sideno, &sn->cname,
1989 &sn->dname, &sn->mnum, ep) == -1)
1990 err = -1;
1991 } else {
1992 /* decrement sideno, to look like the previous sideno */
1993 sideno--;
1994 if (meta_getnextside_devinfo(sp, np->bname, &sideno,
1995 &sn->cname, &sn->dname, &sn->mnum, ep) == -1)
1996 err = -1;
1997 }
1998
1999 if (err) {
2000 Free(sn);
2001 return (err);
2002 }
2003 assert(sn->sideno == sideno);
2004
2005 /* Add to the end of the linked list */
2006 *sn_next = sn;
2007 return (0);
2008 }
2009
2010 static int
validate_nodes(mdsetname_t * sp,int node_c,char ** node_v,md_error_t * ep)2011 validate_nodes(
2012 mdsetname_t *sp,
2013 int node_c,
2014 char **node_v,
2015 md_error_t *ep
2016 )
2017 {
2018 char *hostname;
2019 int i;
2020
2021
2022 for (i = 0; i < node_c; i++) {
2023 if (strlen(node_v[i]) > (size_t)MD_MAX_NODENAME)
2024 return (mddserror(ep, MDE_DS_NODENAMETOOLONG,
2025 sp->setno, node_v[i], NULL, sp->setname));
2026 if (clnt_hostname(node_v[i], &hostname, ep))
2027 return (-1);
2028 if (strcmp(node_v[i], hostname) != 0) {
2029 Free(hostname);
2030 return (mddserror(ep, MDE_DS_NOTNODENAME, sp->setno,
2031 node_v[i], NULL, sp->setname));
2032 }
2033 Free(hostname);
2034 }
2035 return (0);
2036 }
2037
2038 /*
2039 * Exported Entry Points
2040 */
2041
2042 /*
2043 * Check the given disk set name for syntactic correctness.
2044 */
2045 int
meta_set_checkname(char * setname,md_error_t * ep)2046 meta_set_checkname(char *setname, md_error_t *ep)
2047 {
2048 char *cp;
2049
2050 if (strlen(setname) > (size_t)MD_MAX_SETNAME)
2051 return (mddserror(ep, MDE_DS_SETNAMETOOLONG,
2052 MD_SET_BAD, NULL, NULL, setname));
2053
2054 for (cp = setname; *cp; cp++)
2055 if (!isprint(*cp) || strchr(INVALID_IN_NAMES, *cp) != NULL)
2056 return (mddserror(ep, MDE_DS_INVALIDSETNAME,
2057 MD_SET_BAD, NULL, NULL, setname));
2058 return (0);
2059 }
2060
2061 /*
2062 * Add host(s) to the multi-node diskset provided in sp.
2063 * - create set if non-existent.
2064 */
2065 static int
meta_multinode_set_addhosts(mdsetname_t * sp,int multi_node,int node_c,char ** node_v,int auto_take,md_error_t * ep)2066 meta_multinode_set_addhosts(
2067 mdsetname_t *sp,
2068 int multi_node,
2069 int node_c,
2070 char **node_v,
2071 int auto_take,
2072 md_error_t *ep
2073 )
2074 {
2075 md_set_desc *sd;
2076 md_drive_desc *dd, *p;
2077 int rval = 0;
2078 int bool;
2079 int nodeindex;
2080 int i;
2081 int has_set;
2082 sigset_t oldsigs;
2083 md_setkey_t *cl_sk;
2084 int rb_level = 0;
2085 md_error_t xep = mdnullerror;
2086 md_mnnode_desc *nd, *nd_curr, *nd_prev;
2087 md_timeval32_t now;
2088 int nodecnt;
2089 mndiskset_membershiplist_t *nl, *nl2;
2090 int suspendall_flag = 0;
2091 int suspend1_flag = 0;
2092 int lock_flag = 0;
2093 int stale_flag = 0;
2094 md_mnnode_desc *saved_nd_next;
2095 int remote_sets_created = 0;
2096
2097 /*
2098 * Check membershiplist first. If there's
2099 * an error, fail to create set and pass back error.
2100 */
2101 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
2102 return (-1);
2103 }
2104 /* Verify that all nodes are in member list */
2105 for (i = 0; i < node_c; i++) {
2106 /*
2107 * If node in list isn't a member of the membership,
2108 * just return error.
2109 */
2110 if (meta_is_member(node_v[i], NULL, nl) == 0) {
2111 meta_free_nodelist(nl);
2112 return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
2113 sp->setno, node_v[i], NULL, sp->setname));
2114 }
2115 }
2116 /*
2117 * Node list is needed later, but there is a lot of error
2118 * checking and possible failures between here and there, so
2119 * just re-get the list later if there are no errors.
2120 */
2121 meta_free_nodelist(nl);
2122 nl = NULL;
2123
2124 /*
2125 * Verify that list of nodes being added contains no
2126 * duplicates.
2127 */
2128 if (nodesuniq(sp, node_c, node_v, ep))
2129 return (-1);
2130
2131 /*
2132 * Verify that each node being added thinks that its nodename
2133 * is the same as the nodename given.
2134 */
2135 if (validate_nodes(sp, node_c, node_v, ep))
2136 return (-1);
2137
2138 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2139 if (! mdiserror(ep, MDE_NO_SET))
2140 return (-1);
2141 mdclrerror(ep);
2142 return (create_set(sp, multi_node, node_c, node_v, auto_take,
2143 ep));
2144 } else {
2145 /*
2146 * If this node and another node were both attempting to
2147 * create the same setname at the same time, and the other
2148 * node has just created the set on this node then sd would
2149 * be non-NULL, but sp->setno would be null (setno is filled
2150 * in by the create_set). If this is true, then fail since
2151 * the other node has already won this race.
2152 */
2153 if (sp->setno == NULL) {
2154 return (mddserror(ep, MDE_DS_NODEINSET,
2155 NULL, mynode(), NULL, sp->setname));
2156 }
2157 }
2158
2159 /* The auto_take behavior is inconsistent with multiple hosts. */
2160 if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) {
2161 (void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
2162 sp->setname);
2163 return (-1);
2164 }
2165
2166 /*
2167 * We already have the set.
2168 */
2169
2170 /* Make sure we own the set */
2171 if (meta_check_ownership(sp, ep) != 0)
2172 return (-1);
2173
2174 /*
2175 * The drive and node records are stored in the local mddbs of each
2176 * node in the diskset. Each node's rpc.metad daemon reads in the set,
2177 * drive and node records from that node's local mddb and caches them
2178 * internally. Any process needing diskset information contacts its
2179 * local rpc.metad to get this information. Since each node in the
2180 * diskset is independently reading the set information from its local
2181 * mddb, the set, drive and node records in the local mddbs must stay
2182 * in-sync, so that all nodes have a consistent view of the diskset.
2183 *
2184 * For a multinode diskset, explicitly verify that all nodes in the
2185 * diskset are ALIVE (i.e. are in the API membership list). Otherwise,
2186 * fail this operation since all nodes must be ALIVE in order to add
2187 * the new node record to their local mddb. If a panic of this node
2188 * leaves the local mddbs set, node and drive records out-of-sync, the
2189 * reconfig cycle will fix the local mddbs and force them back into
2190 * synchronization.
2191 */
2192 nd = sd->sd_nodelist;
2193 while (nd) {
2194 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2195 return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
2196 sp->setno, nd->nd_nodename, NULL,
2197 sp->setname));
2198 }
2199 nd = nd->nd_next;
2200 }
2201
2202 /*
2203 * Check if node is already in set.
2204 */
2205 for (i = 0; i < node_c; i++) {
2206 /* Is node already in set? */
2207 nd = sd->sd_nodelist;
2208 while (nd) {
2209 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2210 break;
2211 nd = nd->nd_next;
2212 }
2213 if (nd) {
2214 return (mddserror(ep, MDE_DS_NODEINSET,
2215 sp->setno, node_v[i], NULL,
2216 sp->setname));
2217 }
2218 }
2219
2220 /*
2221 * Lock the set on current set members.
2222 * Set locking done much earlier for MN diskset than for traditional
2223 * diskset since lock_set and SUSPEND are used to protect against
2224 * other meta* commands running on the other nodes.
2225 */
2226 /* Make sure we are blocking all signals */
2227 if (procsigs(TRUE, &oldsigs, &xep) < 0)
2228 mdclrerror(&xep);
2229
2230 nd = sd->sd_nodelist;
2231 /* All nodes are guaranteed to be ALIVE */
2232 while (nd) {
2233 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2234 rval = -1;
2235 goto out;
2236 }
2237 lock_flag = 1;
2238 nd = nd->nd_next;
2239 }
2240 /*
2241 * Lock out other meta* commands by suspending
2242 * class 1 messages across the diskset.
2243 */
2244 nd = sd->sd_nodelist;
2245 /* Send suspend to nodes in nodelist before addhosts call */
2246 /* All nodes are guaranteed to be ALIVE */
2247 while (nd) {
2248 if (clnt_mdcommdctl(nd->nd_nodename,
2249 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
2250 MD_MSCF_NO_FLAGS, ep)) {
2251 rval = -1;
2252 goto out;
2253 }
2254 suspend1_flag = 1;
2255 nd = nd->nd_next;
2256 }
2257
2258 /* Lock the set on new set members */
2259 for (i = 0; i < node_c; i++) {
2260 /* Already verified to be alive */
2261 if (clnt_lock_set(node_v[i], sp, ep)) {
2262 rval = -1;
2263 goto out;
2264 }
2265 lock_flag = 1;
2266 }
2267
2268 /*
2269 * Perform the required checks for new hosts
2270 */
2271 for (i = 0; i < node_c; i++) {
2272 /* Make sure this set name is not used on the other hosts */
2273 has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
2274 if (has_set < 0) {
2275 if (! mdiserror(ep, MDE_NO_SET)) {
2276 rval = -1;
2277 goto out;
2278 }
2279 /* Keep on truck'n */
2280 mdclrerror(ep);
2281 } else if (has_set) {
2282 (void) mddserror(ep, MDE_DS_NODEHASSET, sp->setno,
2283 node_v[i], NULL, sp->setname);
2284 rval = -1;
2285 goto out;
2286 }
2287
2288 if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1) {
2289 rval = -1;
2290 goto out;
2291 }
2292
2293 if (bool == TRUE) {
2294 (void) mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno,
2295 node_v[i], NULL, sp->setname);
2296 rval = -1;
2297 goto out;
2298 }
2299
2300 if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) {
2301 rval = -1;
2302 goto out;
2303 }
2304
2305 if (bool == FALSE) {
2306 (void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
2307 node_v[i], NULL, sp->setname);
2308 rval = -1;
2309 goto out;
2310 }
2311
2312 if (check_setdrvs_againstnode(sp, node_v[i], ep)) {
2313 rval = -1;
2314 goto out;
2315 }
2316 }
2317
2318 /* Get drive descriptors for the set */
2319 if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) {
2320 if (! mdisok(ep)) {
2321 rval = -1;
2322 goto out;
2323 }
2324 }
2325
2326 /* END CHECK CODE */
2327
2328 RB_TEST(1, "addhosts", ep)
2329
2330 RB_PREEMPT;
2331 rb_level = 1; /* level 1 */
2332
2333 RB_TEST(2, "addhosts", ep)
2334
2335 /*
2336 * Create the set where needed
2337 */
2338 if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) {
2339 goto rollback;
2340 }
2341
2342 /*
2343 * Send suspend to rpc.mdcommd on nodes where a set has been
2344 * created since rpc.mdcommd must now be running on the remote nodes.
2345 */
2346 remote_sets_created = 1;
2347 for (i = 0; i < node_c; i++) {
2348 /*
2349 * Lock out other meta* commands by suspending
2350 * class 1 messages across the diskset.
2351 */
2352 if (clnt_mdcommdctl(node_v[i],
2353 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
2354 MD_MSCF_NO_FLAGS, ep)) {
2355 rval = -1;
2356 goto rollback;
2357 }
2358 }
2359
2360 /*
2361 * Merge the new entries into the set with the existing sides.
2362 * Get membershiplist from API routine. If there's
2363 * an error, fail to create set and pass back error.
2364 */
2365 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
2366 goto rollback;
2367 }
2368 if (meta_gettimeofday(&now) == -1) {
2369 meta_free_nodelist(nl);
2370 (void) mdsyserror(ep, errno,
2371 dgettext(TEXT_DOMAIN, "meta_gettimeofday()"));
2372 goto rollback;
2373 }
2374 for (nodeindex = 0; nodeindex < node_c; nodeindex++) {
2375 nd = Zalloc(sizeof (*nd));
2376 (void) strcpy(nd->nd_nodename, node_v[nodeindex]);
2377 nd->nd_ctime = now;
2378 nl2 = nl;
2379 while (nl2) {
2380 if (strcmp(nl2->msl_node_name,
2381 node_v[nodeindex]) == 0) {
2382 nd->nd_nodeid = nl2->msl_node_id;
2383 (void) strcpy(nd->nd_priv_ic,
2384 nl2->msl_node_addr);
2385 break;
2386 }
2387 nl2 = nl2->next;
2388 }
2389
2390 /*
2391 * Nodelist must be kept in ascending nodeid order.
2392 */
2393 if (sd->sd_nodelist == NULL) {
2394 /* Nothing in list, just add it */
2395 sd->sd_nodelist = nd;
2396 } else if (nd->nd_nodeid <
2397 sd->sd_nodelist->nd_nodeid) {
2398 /* Add to head of list */
2399 nd->nd_next = sd->sd_nodelist;
2400 sd->sd_nodelist = nd;
2401 } else {
2402 nd_curr = sd->sd_nodelist->nd_next;
2403 nd_prev = sd->sd_nodelist;
2404 /* Search for place to add it */
2405 while (nd_curr) {
2406 if (nd->nd_nodeid < nd_curr->nd_nodeid) {
2407 /* Add before nd_curr */
2408 nd->nd_next = nd_curr;
2409 nd_prev->nd_next = nd;
2410 break;
2411 }
2412 nd_prev = nd_curr;
2413 nd_curr = nd_curr->nd_next;
2414 }
2415 /* Add to end of list */
2416 if (nd_curr == NULL) {
2417 nd_prev->nd_next = nd;
2418 }
2419
2420 }
2421 /* Node already verified to be in membership */
2422 nd->nd_flags |= MD_MN_NODE_ALIVE;
2423 }
2424 meta_free_nodelist(nl);
2425
2426 /* If we have drives */
2427 if (dd != NULL) {
2428 /*
2429 * For all the hosts being added, create a sidename structure
2430 */
2431 nd = sd->sd_nodelist;
2432 while (nd) {
2433 /* Skip nodes not being added */
2434 if (!strinlst(nd->nd_nodename, node_c, node_v)) {
2435 nd = nd->nd_next;
2436 continue;
2437 }
2438 for (p = dd; p != NULL; p = p->dd_next) {
2439 if (make_sideno_sidenm(sp, p->dd_dnp,
2440 nd->nd_nodeid, ep) != 0)
2441 goto rollback;
2442 }
2443 nd = nd->nd_next;
2444 }
2445
2446 RB_PREEMPT;
2447 rb_level = 2; /* level 2 */
2448
2449 RB_TEST(4, "addhosts", ep)
2450
2451 /*
2452 * Add the new sidename for each drive to all the hosts
2453 *
2454 * If a multi-node diskset, each host only stores
2455 * the side information for itself. So, only send
2456 * side information to the new hosts where each host
2457 * will add the appropriate side information to its
2458 * local mddb.
2459 */
2460 nd = sd->sd_nodelist;
2461 while (nd) {
2462 /* Skip nodes not being added */
2463 if (!strinlst(nd->nd_nodename, node_c,
2464 node_v)) {
2465 nd = nd->nd_next;
2466 continue;
2467 }
2468
2469 /* Add side info to new hosts */
2470 if (clnt_add_drv_sidenms(nd->nd_nodename,
2471 mynode(), sp, sd, node_c, node_v, ep))
2472 goto rollback;
2473
2474 nd = nd->nd_next;
2475 }
2476
2477 RB_TEST(5, "addhosts", ep)
2478
2479 RB_PREEMPT;
2480 rb_level = 3; /* level 3 */
2481
2482 RB_TEST(6, "addhosts", ep)
2483
2484 /*
2485 * Add the device names for the new sides into the namespace
2486 * for all hosts being added. This is adding the side
2487 * names to the diskset's mddb so add sidenames for all
2488 * of the new hosts.
2489 */
2490 nd = sd->sd_nodelist;
2491 while (nd) {
2492 /* Skip nodes not being added */
2493 if (!strinlst(nd->nd_nodename, node_c, node_v)) {
2494 nd = nd->nd_next;
2495 continue;
2496 }
2497
2498 /* this side was just created, add the names */
2499 if (add_md_sidenms(sp, nd->nd_nodeid,
2500 MD_SIDEWILD, ep))
2501 goto rollback;
2502
2503 nd = nd->nd_next;
2504 }
2505
2506 RB_TEST(7, "addhosts", ep)
2507
2508 RB_PREEMPT;
2509 rb_level = 4; /* level 4 */
2510
2511 RB_TEST(8, "addhosts", ep)
2512
2513 if (add_db_sidenms(sp, ep))
2514 goto rollback;
2515
2516 } else {
2517 RB_PREEMPT;
2518 rb_level = 4;
2519 }
2520
2521 RB_TEST(9, "addhosts", ep)
2522
2523 RB_PREEMPT;
2524 rb_level = 5; /* level 5 */
2525
2526 RB_TEST(10, "addhosts", ep)
2527
2528 if (dd != NULL) {
2529 /*
2530 * Notify rpc.mdcommd on all nodes of a nodelist change.
2531 * Start by suspending rpc.mdcommd (which drains it of all
2532 * messages), then change the nodelist followed by a reinit
2533 * and resume.
2534 */
2535 nd = sd->sd_nodelist;
2536 /* Send suspend_all to nodes in nodelist (existing + new) */
2537 /* All nodes are guaranteed to be ALIVE */
2538 while (nd) {
2539 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2540 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2541 rval = -1;
2542 goto rollback;
2543 }
2544 suspendall_flag = 1;
2545 nd = nd->nd_next;
2546 }
2547 }
2548
2549 /* Add the node(s) to the each host that is currently in the set */
2550 nd = sd->sd_nodelist;
2551 /* All nodes are guaranteed to be ALIVE */
2552 while (nd) {
2553 if (clnt_addhosts(nd->nd_nodename, sp, node_c, node_v, ep)) {
2554 goto rollback;
2555 }
2556 nd = nd->nd_next;
2557 }
2558
2559 RB_TEST(11, "addhosts", ep)
2560
2561 if (dd != NULL) {
2562 /*
2563 * Mark the drives MD_DR_OK.
2564 */
2565 nd = sd->sd_nodelist;
2566 /* All nodes are guaranteed to be ALIVE */
2567 while (nd) {
2568 if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
2569 MD_DR_OK, ep) == -1)
2570 goto rollback;
2571 nd = nd->nd_next;
2572 }
2573 }
2574
2575 RB_TEST(12, "addhosts", ep)
2576
2577 RB_PREEMPT;
2578 rb_level = 6; /* level 6 */
2579
2580 RB_TEST(13, "addhosts", ep)
2581
2582
2583 /* Add the mediator information to all hosts in the set. */
2584 nd = sd->sd_nodelist;
2585 /* All nodes are guaranteed to be ALIVE */
2586 while (nd) {
2587 if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, ep))
2588 goto rollback;
2589 nd = nd->nd_next;
2590 }
2591
2592 RB_TEST(14, "addhosts", ep)
2593
2594 /*
2595 * If a MN diskset and there are drives in the set,
2596 * set the master on the new nodes and
2597 * automatically join the new nodes into the set.
2598 */
2599 if (dd != NULL) {
2600 mddb_config_t c;
2601 /*
2602 * Is current set STALE?
2603 */
2604 (void) memset(&c, 0, sizeof (c));
2605 c.c_id = 0;
2606 c.c_setno = sp->setno;
2607 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2608 (void) mdstealerror(ep, &c.c_mde);
2609 rval = -1;
2610 goto out;
2611 }
2612 if (c.c_flags & MDDB_C_STALE) {
2613 stale_flag = MNSET_IS_STALE;
2614 }
2615
2616 /* Set master on newly added nodes */
2617 for (i = 0; i < node_c; i++) {
2618 if (clnt_mnsetmaster(node_v[i], sp,
2619 sd->sd_mn_master_nodenm,
2620 sd->sd_mn_master_nodeid, ep)) {
2621 goto rollback;
2622 }
2623 }
2624 /* Join newly added nodes to diskset and set OWN flag */
2625 for (i = 0; i < node_c; i++) {
2626 if (clnt_joinset(node_v[i], sp, stale_flag, ep))
2627 goto rollback;
2628 nd = sd->sd_nodelist;
2629 while (nd) {
2630 if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
2631 nd->nd_flags |= MD_MN_NODE_OWN;
2632 /*
2633 * Also set ADD flag since this flag
2634 * is already set in rpc.metad - it's
2635 * just not in the local copy.
2636 * Could flush local cache and call
2637 * metaget_setdesc, but this just
2638 * adds time. Since this node knows
2639 * the state of the node flags in
2640 * rpc.metad, just set the ADD
2641 * flag and save time.
2642 */
2643 nd->nd_flags |= MD_MN_NODE_ADD;
2644 break;
2645 }
2646 nd = nd->nd_next;
2647 }
2648 }
2649
2650 /* Send new node flag list to all Owner nodes */
2651 nd = sd->sd_nodelist;
2652 while (nd) {
2653 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2654 nd = nd->nd_next;
2655 continue;
2656 }
2657 /*
2658 * Will effectively set OWN flag in records kept
2659 * cached in rpc.metad. The ADD flag would have
2660 * already been set by the call to clnt_addhosts.
2661 */
2662 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2663 sd->sd_nodelist, MD_NR_SET, NULL, ep)) {
2664 goto rollback;
2665 }
2666 nd = nd->nd_next;
2667 }
2668 }
2669
2670 /*
2671 * Mark the set record MD_SR_OK
2672 */
2673 nd = sd->sd_nodelist;
2674 /* All nodes are guaranteed to be ALIVE */
2675 while (nd) {
2676 if (clnt_upd_sr_flags(nd->nd_nodename, sp, MD_SR_OK,
2677 ep)) {
2678 goto rollback;
2679 }
2680 nd = nd->nd_next;
2681 }
2682
2683 /*
2684 * For MN diskset:
2685 * On each newly added node, set the node record for that node
2686 * to OK. Then set all node records for the newly added
2687 * nodes on all nodes to ok.
2688 *
2689 * By setting a node's own node record to ok first, even if
2690 * the node adding the hosts panics, the rest of the nodes can
2691 * determine the same node list during the choosing of the master
2692 * during reconfig. So, only nodes considered for mastership
2693 * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set
2694 * on that node's rpc.metad. If all nodes have MD_SR_OK set,
2695 * but no node has its own MD_MN_NODE_OK set, then the set will
2696 * be removed during reconfig since a panic occurred during the
2697 * creation of the initial diskset.
2698 */
2699
2700 for (i = 0; i < node_c; i++) {
2701 nd = sd->sd_nodelist;
2702 /* All nodes are guaranteed to be ALIVE */
2703 while (nd) {
2704 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2705 break;
2706 nd = nd->nd_next;
2707 }
2708 /* Something wrong, will pick this up in next loop */
2709 if (nd == NULL)
2710 continue;
2711
2712 /* Only changing my local cache of node list */
2713 saved_nd_next = nd->nd_next;
2714 nd->nd_next = NULL;
2715
2716 /* Set node record for added host to ok on that host */
2717 if (clnt_upd_nr_flags(node_v[i], sp,
2718 nd, MD_NR_OK, NULL, ep)) {
2719 nd->nd_next = saved_nd_next;
2720 goto rollback;
2721 }
2722 nd->nd_next = saved_nd_next;
2723 }
2724
2725 /* Now set all node records on all nodes to be ok */
2726 nd = sd->sd_nodelist;
2727 /* All nodes are guaranteed to be ALIVE */
2728 while (nd) {
2729 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2730 sd->sd_nodelist, MD_NR_OK, NULL, ep)) {
2731 goto rollback;
2732 }
2733 nd = nd->nd_next;
2734 }
2735
2736 RB_TEST(15, "addhosts", ep)
2737 out:
2738 /*
2739 * Notify rpc.mdcommd on all nodes of a nodelist change.
2740 * Send reinit command to mdcommd which forces it to get
2741 * fresh set description. Then send resume.
2742 * Resume on class 0 will resume all classes, so can skip
2743 * doing an explicit resume of class1 (ignore suspend1_flag).
2744 */
2745 if (suspendall_flag) {
2746 /*
2747 * Don't know if nodelist contains the nodes being added
2748 * or not, so do reinit to nodes not being added (by skipping
2749 * any nodes in the nodelist being added) and then do
2750 * reinit to nodes being added if remote_sets_created is 1.
2751 */
2752 nd = sd->sd_nodelist;
2753 /* All nodes are guaranteed to be ALIVE */
2754 while (nd) {
2755 /* Skip nodes being added - handled later */
2756 if (strinlst(nd->nd_nodename, node_c, node_v)) {
2757 nd = nd->nd_next;
2758 continue;
2759 }
2760 /* Class is ignored for REINIT */
2761 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2762 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2763 if (rval == 0)
2764 (void) mdstealerror(ep, &xep);
2765 rval = -1;
2766 mde_perror(ep, dgettext(TEXT_DOMAIN,
2767 "Unable to reinit rpc.mdcommd.\n"));
2768 }
2769 nd = nd->nd_next;
2770 }
2771 /*
2772 * Send reinit to added nodes that had a set created since
2773 * rpc.mdcommd is running on the nodes with a set.
2774 */
2775 if (remote_sets_created == 1) {
2776 for (i = 0; i < node_c; i++) {
2777 if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
2778 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2779 if (rval == 0)
2780 (void) mdstealerror(ep, &xep);
2781 rval = -1;
2782 mde_perror(ep, dgettext(TEXT_DOMAIN,
2783 "Unable to reinit rpc.mdcommd.\n"));
2784 }
2785 }
2786 }
2787 }
2788 if ((suspend1_flag) || (suspendall_flag)) {
2789 /*
2790 * Unlock diskset by resuming messages across the diskset.
2791 * Just resume all classes so that resume is the same whether
2792 * just one class was locked or all classes were locked.
2793 *
2794 * Don't know if nodelist contains the nodes being added
2795 * or not, so do resume_all to nodes not being added (by
2796 * skipping any nodes in the nodelist being added) and then do
2797 * resume_all to nodes being added if remote_sets_created is 1.
2798 */
2799 nd = sd->sd_nodelist;
2800 /* All nodes are guaranteed to be ALIVE */
2801 while (nd) {
2802 /* Skip nodes being added - handled later */
2803 if (strinlst(nd->nd_nodename, node_c, node_v)) {
2804 nd = nd->nd_next;
2805 continue;
2806 }
2807 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2808 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2809 if (rval == 0)
2810 (void) mdstealerror(ep, &xep);
2811 rval = -1;
2812 mde_perror(ep, dgettext(TEXT_DOMAIN,
2813 "Unable to resume rpc.mdcommd.\n"));
2814 }
2815 nd = nd->nd_next;
2816 }
2817 /*
2818 * Send resume to added nodes that had a set created since
2819 * rpc.mdcommd is be running on the nodes with a set.
2820 */
2821 if (remote_sets_created == 1) {
2822 for (i = 0; i < node_c; i++) {
2823 /* Already verified to be alive */
2824 if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
2825 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS,
2826 &xep)) {
2827 if (rval == 0)
2828 (void) mdstealerror(ep, &xep);
2829 rval = -1;
2830 mde_perror(ep, dgettext(TEXT_DOMAIN,
2831 "Unable to resume rpc.mdcommd.\n"));
2832 }
2833 }
2834 }
2835 meta_ping_mnset(sp->setno);
2836 /*
2837 * Start a resync thread on the newly added nodes
2838 * if set is not stale. Also start a thread to update the
2839 * abr state of all soft partitions
2840 */
2841 if (stale_flag != MNSET_IS_STALE) {
2842 for (i = 0; i < node_c; i++) {
2843 if (clnt_mn_mirror_resync_all(node_v[i],
2844 sp->setno, &xep)) {
2845 if (rval == 0)
2846 (void) mdstealerror(ep, &xep);
2847 rval = -1;
2848 mde_perror(ep, dgettext(TEXT_DOMAIN,
2849 "Unable to start resync "
2850 "thread.\n"));
2851 }
2852 if (clnt_mn_sp_update_abr(node_v[i],
2853 sp->setno, &xep)) {
2854 if (rval == 0)
2855 (void) mdstealerror(ep, &xep);
2856 rval = -1;
2857 mde_perror(ep, dgettext(TEXT_DOMAIN,
2858 "Unable to start sp update "
2859 "thread.\n"));
2860 }
2861 }
2862 }
2863 }
2864 cl_sk = cl_get_setkey(sp->setno, sp->setname);
2865 /*
2866 * Don't know if nodelist contains the nodes being added
2867 * or not, so do clnt_unlock_set to nodes not being added (by
2868 * skipping any nodes in the nodelist being added) and then do
2869 * clnt_unlock_set to nodes being added.
2870 */
2871 if (lock_flag) {
2872 nd = sd->sd_nodelist;
2873 /* All nodes are guaranteed to be ALIVE */
2874 while (nd) {
2875 /* Skip hosts we get in the next loop */
2876 if (strinlst(nd->nd_nodename, node_c, node_v)) {
2877 nd = nd->nd_next;
2878 continue;
2879 }
2880 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2881 if (rval == 0)
2882 (void) mdstealerror(ep, &xep);
2883 rval = -1;
2884 }
2885 nd = nd->nd_next;
2886 }
2887 for (i = 0; i < node_c; i++) {
2888 /* Already verified to be alive */
2889 if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
2890 if (rval == 0)
2891 (void) mdstealerror(ep, &xep);
2892 rval = -1;
2893 }
2894 }
2895 }
2896 cl_set_setkey(NULL);
2897
2898 metaflushsetname(sp);
2899
2900 /* release signals back to what they were on entry */
2901 if (procsigs(FALSE, &oldsigs, &xep) < 0)
2902 mdclrerror(&xep);
2903
2904 return (rval);
2905
2906 rollback:
2907 rval = -1;
2908
2909 /* level 6 */
2910 if (rb_level > 5) {
2911 /*
2912 * For each node being deleted, set DEL flag and
2913 * reset OK flag on that node first.
2914 * Until a node has turned off its own
2915 * rpc.metad's NODE_OK flag, that node could be
2916 * considered for master during a reconfig.
2917 */
2918 for (i = 0; i < node_c; i++) {
2919 nd = sd->sd_nodelist;
2920 /* All nodes are guaranteed to be ALIVE */
2921 while (nd) {
2922 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2923 break;
2924 nd = nd->nd_next;
2925 }
2926 /* Something wrong, handle this in next loop */
2927 if (nd == NULL)
2928 continue;
2929
2930 /* Only changing my local cache of node list */
2931 saved_nd_next = nd->nd_next;
2932 nd->nd_next = NULL;
2933
2934 /* Set flags for del host to DEL on that host */
2935 if (clnt_upd_nr_flags(node_v[i], sp,
2936 nd, MD_NR_DEL, NULL, &xep)) {
2937 mdclrerror(&xep);
2938 }
2939 nd->nd_next = saved_nd_next;
2940 }
2941
2942 for (i = 0; i < node_c; i++) {
2943 if (dd != NULL) {
2944 /* Reset master on newly added node */
2945 if (clnt_mnsetmaster(node_v[i], sp, "",
2946 MD_MN_INVALID_NID, &xep))
2947 mdclrerror(&xep);
2948 /* Withdraw set on newly added node */
2949 if (clnt_withdrawset(node_v[i], sp, &xep))
2950 mdclrerror(&xep);
2951 }
2952 /*
2953 * Turn off owner flag in nodes to be deleted
2954 * if there are drives in the set.
2955 * Also, turn off NODE_OK and turn on NODE_DEL
2956 * for nodes to be deleted.
2957 * These flags are used to set the node
2958 * record flags in all nodes in the set.
2959 */
2960 nd = sd->sd_nodelist;
2961 while (nd) {
2962 if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
2963 if (dd != NULL) {
2964 nd->nd_flags &= ~MD_MN_NODE_OWN;
2965 }
2966 nd->nd_flags |= MD_MN_NODE_DEL;
2967 nd->nd_flags &= ~MD_MN_NODE_OK;
2968 break;
2969 }
2970 nd = nd->nd_next;
2971 }
2972 }
2973
2974 /*
2975 * Now, reset owner and set delete flags for the deleted
2976 * nodes on all nodes.
2977 */
2978 nd = sd->sd_nodelist;
2979 while (nd) {
2980 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2981 sd->sd_nodelist, MD_NR_SET, NULL, &xep)) {
2982 mdclrerror(&xep);
2983 }
2984 nd = nd->nd_next;
2985 }
2986
2987 /*
2988 * On each node being deleted, set the set record
2989 * to be in DEL state.
2990 */
2991 for (i = 0; i < node_c; i++) {
2992 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) {
2993 mdclrerror(&xep);
2994 }
2995 }
2996 }
2997
2998 /* level 5 */
2999 if (rb_level > 4) {
3000 nd = sd->sd_nodelist;
3001 /* All nodes are guaranteed to be ALIVE */
3002 while (nd) {
3003 if (clnt_delhosts(nd->nd_nodename, sp, node_c,
3004 node_v, &xep) == -1)
3005 mdclrerror(&xep);
3006 nd = nd->nd_next;
3007 }
3008 }
3009
3010 /*
3011 * Notify rpc.mdcommd on all nodes of a nodelist change.
3012 * Send reinit command to mdcommd which forces it to get
3013 * fresh set description. Then send resume.
3014 * Nodelist contains all nodes (existing + added).
3015 */
3016 if (suspendall_flag) {
3017 /* Send reinit */
3018 nd = sd->sd_nodelist;
3019 /* All nodes are guaranteed to be ALIVE */
3020 /* Send reinit to nodes in nodelist before addhosts call */
3021 while (nd) {
3022 /*
3023 * Skip nodes being added if remote sets were not
3024 * created since rpc.mdcommd may not be running
3025 * on the remote nodes.
3026 */
3027 if ((remote_sets_created == 0) &&
3028 (strinlst(nd->nd_nodename, node_c, node_v))) {
3029 nd = nd->nd_next;
3030 continue;
3031 }
3032 /* Class is ignored for REINIT */
3033 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3034 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3035 mde_perror(&xep, dgettext(TEXT_DOMAIN,
3036 "Unable to reinit rpc.mdcommd.\n"));
3037 mdclrerror(&xep);
3038 }
3039 nd = nd->nd_next;
3040 }
3041
3042 /* Send resume */
3043 nd = sd->sd_nodelist;
3044 /* All nodes are guaranteed to be ALIVE */
3045 while (nd) {
3046 /*
3047 * Skip nodes being added if remote sets were not
3048 * created since rpc.mdcommd may not be running
3049 * on the remote nodes.
3050 */
3051 if ((remote_sets_created == 0) &&
3052 (strinlst(nd->nd_nodename, node_c, node_v))) {
3053 nd = nd->nd_next;
3054 continue;
3055 }
3056 /*
3057 * Resume all classes but class 1 so that lock is held
3058 * against meta* commands.
3059 * Send resume_all_but_1 to nodes in nodelist
3060 * before addhosts call.
3061 */
3062 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3063 sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1,
3064 &xep)) {
3065 mde_perror(&xep, dgettext(TEXT_DOMAIN,
3066 "Unable to resume rpc.mdcommd.\n"));
3067 mdclrerror(&xep);
3068 }
3069 nd = nd->nd_next;
3070 }
3071 meta_ping_mnset(sp->setno);
3072 }
3073
3074 /* level 4 */
3075 /* Nodelist may or may not contain nodes being added. */
3076 if (rb_level > 3 && dd != NULL) {
3077 nd = sd->sd_nodelist;
3078 while (nd) {
3079 /* Skip nodes not being added */
3080 if (!strinlst(nd->nd_nodename, node_c, node_v)) {
3081 nd = nd->nd_next;
3082 continue;
3083 }
3084
3085 if (del_db_sidenms(sp, nd->nd_nodeid, &xep))
3086 mdclrerror(&xep);
3087 nd = nd->nd_next;
3088 }
3089 }
3090
3091 /* level 3 */
3092 /* Nodelist may or may not contain nodes being added. */
3093 if (rb_level > 2 && dd != NULL) {
3094 nd = sd->sd_nodelist;
3095 while (nd) {
3096 /* Skip nodes not being added */
3097 if (!strinlst(nd->nd_nodename, node_c, node_v)) {
3098 nd = nd->nd_next;
3099 continue;
3100 }
3101
3102 if (del_md_sidenms(sp, nd->nd_nodeid, &xep))
3103 mdclrerror(&xep);
3104 nd = nd->nd_next;
3105 }
3106 }
3107
3108 /* level 1 */
3109 if (rb_level > 0) {
3110 if (dd != NULL) {
3111 /* delete the drive records */
3112 for (i = 0; i < node_c; i++) {
3113 if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1)
3114 mdclrerror(&xep);
3115 }
3116 }
3117
3118 /* delete the set record */
3119 for (i = 0; i < node_c; i++) {
3120 if (clnt_delset(node_v[i], sp, &xep) == -1)
3121 mdclrerror(&xep);
3122 }
3123 }
3124
3125 /* level 0 */
3126 cl_sk = cl_get_setkey(sp->setno, sp->setname);
3127 /* Don't test lock flag since guaranteed to be set if in rollback */
3128 /* Nodelist may or may not contain nodes being added. */
3129 /*
3130 * Unlock diskset by resuming messages across the diskset.
3131 * Just resume all classes so that resume is the same whether
3132 * just one class was locked or all classes were locked.
3133 */
3134 if ((suspend1_flag) || (suspendall_flag)) {
3135 /* All nodes are guaranteed to be ALIVE */
3136 nd = sd->sd_nodelist;
3137 while (nd) {
3138 /*
3139 * Skip nodes being added since remote sets
3140 * were either created and then deleted or
3141 * were never created. Either way - rpc.mdcommd
3142 * may not be running on the remote node.
3143 */
3144 if (strinlst(nd->nd_nodename, node_c, node_v)) {
3145 nd = nd->nd_next;
3146 continue;
3147 }
3148 if (clnt_mdcommdctl(nd->nd_nodename,
3149 COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
3150 MD_MSCF_NO_FLAGS, &xep)) {
3151 mde_perror(&xep, dgettext(TEXT_DOMAIN,
3152 "Unable to resume rpc.mdcommd.\n"));
3153 mdclrerror(&xep);
3154 }
3155 nd = nd->nd_next;
3156 }
3157 meta_ping_mnset(sp->setno);
3158 }
3159 nd = sd->sd_nodelist;
3160 /* All nodes are guaranteed to be ALIVE */
3161 while (nd) {
3162 /* Skip hosts we get in the next loop */
3163 if (strinlst(nd->nd_nodename, node_c, node_v)) {
3164 nd = nd->nd_next;
3165 continue;
3166 }
3167
3168 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
3169 mdclrerror(&xep);
3170 nd = nd->nd_next;
3171 }
3172
3173 for (i = 0; i < node_c; i++)
3174 if (clnt_unlock_set(node_v[i], cl_sk, &xep))
3175 mdclrerror(&xep);
3176 cl_set_setkey(NULL);
3177
3178 /* release signals back to what they were on entry */
3179 if (procsigs(FALSE, &oldsigs, &xep) < 0)
3180 mdclrerror(&xep);
3181
3182 metaflushsetname(sp);
3183
3184 return (rval);
3185 }
3186
3187 /*
3188 * Add host(s) to the traditional diskset provided in sp.
3189 * - create set if non-existent.
3190 */
3191 static int
meta_traditional_set_addhosts(mdsetname_t * sp,int multi_node,int node_c,char ** node_v,int auto_take,md_error_t * ep)3192 meta_traditional_set_addhosts(
3193 mdsetname_t *sp,
3194 int multi_node,
3195 int node_c,
3196 char **node_v,
3197 int auto_take,
3198 md_error_t *ep
3199 )
3200 {
3201 md_set_desc *sd;
3202 md_drive_desc *dd, *p;
3203 med_rec_t medr;
3204 med_rec_t rb_medr;
3205 int rval = 0;
3206 int bool;
3207 int nodeindex;
3208 int i;
3209 int has_set;
3210 int numsides;
3211 sigset_t oldsigs;
3212 md_setkey_t *cl_sk;
3213 int rb_level = 0;
3214 md_error_t xep = mdnullerror;
3215 int max_meds;
3216
3217 if (nodesuniq(sp, node_c, node_v, ep))
3218 return (-1);
3219
3220 if (validate_nodes(sp, node_c, node_v, ep))
3221 return (-1);
3222
3223 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
3224 if (! mdiserror(ep, MDE_NO_SET))
3225 return (-1);
3226 mdclrerror(ep);
3227 return (create_set(sp, multi_node, node_c, node_v, auto_take,
3228 ep));
3229 }
3230
3231 /* The auto_take behavior is inconsistent with multiple hosts. */
3232 if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) {
3233 (void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
3234 sp->setname);
3235 return (-1);
3236 }
3237
3238 /*
3239 * We already have the set.
3240 */
3241
3242 /* Make sure we own the set */
3243 if (meta_check_ownership(sp, ep) != 0)
3244 return (-1);
3245
3246 /*
3247 * Perform the required checks for new hosts
3248 */
3249 for (i = 0; i < node_c; i++) {
3250 if (getnodeside(node_v[i], sd) != MD_SIDEWILD)
3251 return (mddserror(ep, MDE_DS_NODEINSET, sp->setno,
3252 node_v[i], NULL, sp->setname));
3253
3254 /* Make sure this set name is not used on the other hosts */
3255 has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
3256 if (has_set < 0) {
3257 if (! mdiserror(ep, MDE_NO_SET))
3258 return (-1);
3259 /* Keep on truck'n */
3260 mdclrerror(ep);
3261 } else if (has_set)
3262 return (mddserror(ep, MDE_DS_NODEHASSET, sp->setno,
3263 node_v[i], NULL, sp->setname));
3264
3265 if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1)
3266 return (-1);
3267
3268 if (bool == TRUE)
3269 return (mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno,
3270 node_v[i], NULL, sp->setname));
3271
3272 if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1)
3273 return (-1);
3274
3275 if (bool == FALSE)
3276 return (mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
3277 node_v[i], NULL, sp->setname));
3278
3279 if (check_setdrvs_againstnode(sp, node_v[i], ep))
3280 return (-1);
3281 }
3282
3283 /* Count the number of occupied slots */
3284 numsides = 0;
3285 for (i = 0; i < MD_MAXSIDES; i++) {
3286 /* Count occupied slots */
3287 if (sd->sd_nodes[i][0] != '\0')
3288 numsides++;
3289 }
3290
3291 /* Make sure the we have space to add the new sides */
3292 if ((numsides + node_c) > MD_MAXSIDES) {
3293 (void) mddserror(ep, MDE_DS_SIDENUMNOTAVAIL, sp->setno, NULL,
3294 NULL, sp->setname);
3295 return (-1);
3296 }
3297
3298 /* Get drive descriptors for the set */
3299 if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL)
3300 if (! mdisok(ep))
3301 return (-1);
3302
3303 /* Setup the mediator record roll-back structure */
3304 (void) memset(&rb_medr, '\0', sizeof (med_rec_t));
3305 rb_medr.med_rec_mag = MED_REC_MAGIC;
3306 rb_medr.med_rec_rev = MED_REC_REV;
3307 rb_medr.med_rec_fl = 0;
3308 rb_medr.med_rec_sn = sp->setno;
3309 (void) strcpy(rb_medr.med_rec_snm, sp->setname);
3310 for (i = 0; i < MD_MAXSIDES; i++)
3311 (void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]);
3312 rb_medr.med_rec_meds = sd->sd_med; /* structure assigment */
3313 (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
3314 rb_medr.med_rec_foff = 0;
3315 crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);
3316
3317 if ((max_meds = get_max_meds(ep)) == 0)
3318 return (-1);
3319
3320 /* END CHECK CODE */
3321
3322 md_rb_sig_handling_on();
3323
3324 /* Lock the set on current set members */
3325 for (i = 0; i < MD_MAXSIDES; i++) {
3326 /* Skip empty slots */
3327 if (sd->sd_nodes[i][0] == '\0')
3328 continue;
3329
3330 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
3331 rval = -1;
3332 goto out;
3333 }
3334 }
3335
3336 /* Lock the set on new set members */
3337 for (i = 0; i < node_c; i++) {
3338 if (clnt_lock_set(node_v[i], sp, ep)) {
3339 rval = -1;
3340 goto out;
3341 }
3342 }
3343
3344 RB_TEST(1, "addhosts", ep)
3345
3346 RB_PREEMPT;
3347 rb_level = 1; /* level 1 */
3348
3349 RB_TEST(2, "addhosts", ep)
3350
3351 /*
3352 * Add the new hosts to the existing set record on the existing hosts
3353 */
3354 for (i = 0; i < MD_MAXSIDES; i++) {
3355 /* skip empty slots */
3356 if (sd->sd_nodes[i][0] == '\0')
3357 continue;
3358
3359 if (clnt_addhosts(sd->sd_nodes[i], sp, node_c, node_v, ep))
3360 goto rollback;
3361 }
3362
3363 RB_PREEMPT;
3364 rb_level = 2; /* level 2 */
3365
3366 RB_TEST(3, "addhosts", ep);
3367
3368 /* Merge the new entries into the set with the existing sides */
3369 nodeindex = 0;
3370 for (i = 0; i < MD_MAXSIDES; i++) {
3371 /* Skip full slots */
3372 if (sd->sd_nodes[i][0] != '\0')
3373 continue;
3374
3375 (void) strcpy(sd->sd_nodes[i], node_v[nodeindex++]);
3376 if (nodeindex == node_c)
3377 break;
3378 }
3379
3380 /* If we have drives */
3381 if (dd != NULL) {
3382 /*
3383 * For all the hosts being added, create a sidename structure
3384 */
3385 for (i = 0; i < MD_MAXSIDES; i++) {
3386 /* Skip empty slots */
3387 if (sd->sd_nodes[i][0] == '\0')
3388 continue;
3389
3390 /* Skip nodes not being added */
3391 if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3392 continue;
3393
3394 for (p = dd; p != NULL; p = p->dd_next) {
3395 if (make_sideno_sidenm(sp, p->dd_dnp, i,
3396 ep) != 0)
3397 goto rollback;
3398 }
3399 }
3400
3401 /*
3402 * Add the new sidename for each drive to the existing hosts
3403 */
3404 for (i = 0; i < MD_MAXSIDES; i++) {
3405 /* Skip empty slots */
3406 if (sd->sd_nodes[i][0] == '\0')
3407 continue;
3408
3409 /* Skip nodes being added */
3410 if (strinlst(sd->sd_nodes[i], node_c, node_v))
3411 continue;
3412
3413 if (clnt_add_drv_sidenms(sd->sd_nodes[i], mynode(), sp,
3414 sd, node_c, node_v, ep)) {
3415 goto rollback;
3416 }
3417 }
3418
3419 RB_TEST(4, "addhosts", ep)
3420
3421 RB_PREEMPT;
3422 rb_level = 3; /* level 3 */
3423
3424 RB_TEST(5, "addhosts", ep)
3425
3426 if (add_db_sidenms(sp, ep)) {
3427 goto rollback;
3428 }
3429
3430 } else {
3431 RB_PREEMPT;
3432 rb_level = 3;
3433 }
3434
3435 RB_TEST(6, "addhosts", ep)
3436
3437 RB_PREEMPT;
3438 rb_level = 4; /* level 4 */
3439
3440 RB_TEST(7, "addhosts", ep)
3441
3442
3443 /* create the set on the new nodes, this adds the drives as well */
3444 if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) {
3445 goto rollback;
3446 }
3447
3448 RB_TEST(8, "addhosts", ep)
3449
3450 RB_PREEMPT;
3451 rb_level = 5; /* level 5 */
3452
3453 RB_TEST(9, "addhosts", ep)
3454
3455 if (dd != NULL) {
3456
3457 /*
3458 * Add the device entries for the new sides into the namespace.
3459 */
3460 for (i = 0; i < MD_MAXSIDES; i++) {
3461 /* Skip empty slots */
3462 if (sd->sd_nodes[i][0] == '\0')
3463 continue;
3464
3465 /* Skip nodes not being added */
3466 if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3467 continue;
3468
3469 if (add_md_sidenms(sp, i, MD_SIDEWILD, ep))
3470 goto rollback;
3471 }
3472 }
3473
3474 RB_TEST(10, "addhosts", ep)
3475
3476 RB_PREEMPT;
3477 rb_level = 6; /* level 6 */
3478
3479 RB_TEST(11, "addhosts", ep);
3480
3481 if (dd != NULL) {
3482 /*
3483 * Mark the drives MD_DR_OK.
3484 */
3485 for (i = 0; i < MD_MAXSIDES; i++) {
3486 /* Skip empty slots */
3487 if (sd->sd_nodes[i][0] == '\0')
3488 continue;
3489
3490 if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
3491 MD_DR_OK, ep) == -1) {
3492 goto rollback;
3493 }
3494 }
3495 }
3496
3497 RB_TEST(12, "addhosts", ep)
3498
3499 /* Bring the mediator record up to date with the set record */
3500 medr = rb_medr; /* structure assignment */
3501 for (i = 0; i < MD_MAXSIDES; i++)
3502 (void) strcpy(medr.med_rec_nodes[i], sd->sd_nodes[i]);
3503 crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
3504
3505 /* Inform the mediator hosts of the new node list */
3506 for (i = 0; i < max_meds; i++) {
3507 if (sd->sd_med.n_lst[i].a_cnt == 0)
3508 continue;
3509
3510 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
3511 goto rollback;
3512 }
3513
3514 /* Add the mediator information to all hosts in the set */
3515 for (i = 0; i < MD_MAXSIDES; i++) {
3516 /* Skip empty slots */
3517 if (sd->sd_nodes[i][0] == '\0')
3518 continue;
3519
3520 if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, ep))
3521 goto rollback;
3522 }
3523
3524 RB_TEST(13, "addhosts", ep)
3525
3526 /*
3527 * Mark the set record MD_SR_OK
3528 */
3529 for (i = 0; i < MD_MAXSIDES; i++) {
3530 /* Skip empty slots */
3531 if (sd->sd_nodes[i][0] == '\0')
3532 continue;
3533
3534 if (clnt_upd_sr_flags(sd->sd_nodes[i], sp, MD_SR_OK, ep))
3535 goto rollback;
3536 }
3537
3538 RB_TEST(14, "addhosts", ep)
3539
3540 out:
3541 cl_sk = cl_get_setkey(sp->setno, sp->setname);
3542 for (i = 0; i < MD_MAXSIDES; i++) {
3543 /* Skip empty slots */
3544 if (sd->sd_nodes[i][0] == '\0')
3545 continue;
3546
3547 /* Skip hosts we get in the next loop */
3548 if (strinlst(sd->sd_nodes[i], node_c, node_v))
3549 continue;
3550
3551 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
3552 if (rval == 0)
3553 (void) mdstealerror(ep, &xep);
3554 rval = -1;
3555 }
3556 }
3557
3558 if (rval == 0) {
3559 for (i = 0; i < node_c; i++)
3560 if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
3561 if (rval == 0)
3562 (void) mdstealerror(ep, &xep);
3563 rval = -1;
3564 }
3565 }
3566 cl_set_setkey(NULL);
3567
3568 metaflushsetname(sp);
3569
3570 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
3571
3572 return (rval);
3573
3574 rollback:
3575 /* Make sure we are blocking all signals */
3576 if (procsigs(TRUE, &oldsigs, &xep) < 0)
3577 mdclrerror(&xep);
3578
3579 rval = -1;
3580
3581 /* level 6 */
3582 if (rb_level > 5) {
3583 for (i = 0; i < max_meds; i++) {
3584 if (sd->sd_med.n_lst[i].a_cnt == 0)
3585 continue;
3586
3587 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
3588 &rb_medr, &xep))
3589 mdclrerror(&xep);
3590 }
3591 if (dd != NULL) {
3592 for (i = 0; i < MD_MAXSIDES; i++) {
3593 /* Skip empty slots */
3594 if (sd->sd_nodes[i][0] == '\0')
3595 continue;
3596
3597 /* Skip nodes not being added */
3598 if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3599 continue;
3600
3601 if (del_md_sidenms(sp, i, &xep))
3602 mdclrerror(&xep);
3603 }
3604 }
3605 }
3606
3607 /* level 5 */
3608 if (rb_level > 4) {
3609 if (dd != NULL) {
3610 /* delete the drive records */
3611 for (i = 0; i < node_c; i++) {
3612 if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1)
3613 mdclrerror(&xep);
3614 }
3615 }
3616 /* delete the set record on the 'new' hosts */
3617 for (i = 0; i < node_c; i++) {
3618 if (clnt_delset(node_v[i], sp, &xep) == -1)
3619 mdclrerror(&xep);
3620 }
3621 }
3622
3623 /* level 4 */
3624 if (rb_level > 3 && dd != NULL) {
3625 for (i = 0; i < MD_MAXSIDES; i++) {
3626 /* Skip empty slots */
3627 if (sd->sd_nodes[i][0] == '\0')
3628 continue;
3629
3630 /* Skip nodes not being added */
3631 if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3632 continue;
3633
3634 if (del_db_sidenms(sp, i, &xep))
3635 mdclrerror(&xep);
3636 }
3637 }
3638
3639 /* level 3 */
3640 if (rb_level > 2 && dd != NULL) {
3641 for (i = 0; i < MD_MAXSIDES; i++) {
3642 /* Skip empty slots */
3643 if (sd->sd_nodes[i][0] == '\0')
3644 continue;
3645
3646 /* Skip nodes not being added */
3647 if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3648 continue;
3649
3650 if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp,
3651 &xep) == -1)
3652 mdclrerror(&xep);
3653 }
3654 }
3655
3656 /* level 2 */
3657 if (rb_level > 1) {
3658 for (i = 0; i < MD_MAXSIDES; i++) {
3659 /* Skip empty slots */
3660 if (sd->sd_nodes[i][0] == '\0')
3661 continue;
3662
3663 if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v,
3664 &xep) == -1)
3665 mdclrerror(&xep);
3666 }
3667 }
3668
3669 /* level 1 */
3670 if (rb_level > 0) {
3671 cl_sk = cl_get_setkey(sp->setno, sp->setname);
3672 for (i = 0; i < MD_MAXSIDES; i++) {
3673 /* Skip empty slots */
3674 if (sd->sd_nodes[i][0] == '\0')
3675 continue;
3676
3677 /* Skip hosts we get in the next loop */
3678 if (strinlst(sd->sd_nodes[i], node_c, node_v))
3679 continue;
3680
3681 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
3682 mdclrerror(&xep);
3683 }
3684
3685 for (i = 0; i < node_c; i++)
3686 if (clnt_unlock_set(node_v[i], cl_sk, &xep))
3687 mdclrerror(&xep);
3688 cl_set_setkey(NULL);
3689 }
3690
3691 /* release signals back to what they were on entry */
3692 if (procsigs(FALSE, &oldsigs, &xep) < 0)
3693 mdclrerror(&xep);
3694
3695 metaflushsetname(sp);
3696
3697 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
3698
3699 return (rval);
3700 }
3701
3702 /*
3703 * Add host(s) to the diskset provided in sp.
3704 * - create set if non-existent.
3705 */
3706 int
meta_set_addhosts(mdsetname_t * sp,int multi_node,int node_c,char ** node_v,int auto_take,md_error_t * ep)3707 meta_set_addhosts(
3708 mdsetname_t *sp,
3709 int multi_node,
3710 int node_c,
3711 char **node_v,
3712 int auto_take,
3713 md_error_t *ep
3714 )
3715 {
3716 if (multi_node)
3717 return (meta_multinode_set_addhosts(sp, multi_node, node_c,
3718 node_v, auto_take, ep));
3719 else
3720 return (meta_traditional_set_addhosts(sp, multi_node, node_c,
3721 node_v, auto_take, ep));
3722 }
3723
3724 /*
3725 * Delete host(s) from the diskset provided in sp.
3726 * - destroy set if last host in set is removed.
3727 */
3728 int
meta_set_deletehosts(mdsetname_t * sp,int node_c,char ** node_v,int forceflg,md_error_t * ep)3729 meta_set_deletehosts(
3730 mdsetname_t *sp,
3731 int node_c,
3732 char **node_v,
3733 int forceflg,
3734 md_error_t *ep
3735 )
3736 {
3737 md_set_desc *sd;
3738 md_drive_desc *dd;
3739 med_rec_t medr;
3740 med_rec_t rb_medr;
3741 int i, j;
3742 int has_set;
3743 int numsides = 0;
3744 int oha = FALSE;
3745 sigset_t oldsigs;
3746 mhd_mhiargs_t mhiargs;
3747 md_replicalist_t *rlp = NULL;
3748 md_setkey_t *cl_sk;
3749 ulong_t max_genid = 0;
3750 int rval = 0;
3751 int rb_level = 0;
3752 int max_meds = 0;
3753 md_error_t xep = mdnullerror;
3754 md_mnnode_desc *nd;
3755 md_mnnode_record *nr;
3756 int delete_master = 0;
3757 int suspendall_flag = 0, suspendall_flag_rb = 0;
3758 int suspend1_flag = 0;
3759 int lock_flag = 0;
3760 int stale_flag = 0;
3761 int *node_id_list = NULL;
3762 int remote_sets_deleted = 0;
3763
3764 if ((sd = metaget_setdesc(sp, ep)) == NULL)
3765 return (-1);
3766
3767 /*
3768 * Verify that list of nodes being deleted contains no
3769 * duplicates.
3770 */
3771 if (nodesuniq(sp, node_c, node_v, ep))
3772 return (-1);
3773
3774 /* Make sure we own the set */
3775 if (meta_check_ownership(sp, ep) != 0)
3776 return (-1);
3777
3778 /*
3779 * The drive and node records are stored in the local mddbs of each
3780 * node in the diskset. Each node's rpc.metad daemon reads in the set,
3781 * drive and node records from that node's local mddb and caches them
3782 * internally. Any process needing diskset information contacts its
3783 * local rpc.metad to get this information. Since each node in the
3784 * diskset is independently reading the set information from its local
3785 * mddb, the set, drive and node records in the local mddbs must stay
3786 * in-sync, so that all nodes have a consistent view of the diskset.
3787 *
3788 * For a multinode diskset, explicitly verify that all nodes in the
3789 * diskset are ALIVE (i.e. are in the API membership list) if the
3790 * forceflag is FALSE. (The case of forceflag being TRUE is handled
3791 * in the OHA check below.)
3792 *
3793 * If forceflag is FALSE and a node in the diskset is not in
3794 * the membership list, then fail this operation since all nodes must
3795 * be ALIVE in order to delete the node record from their local mddb.
3796 * If a panic of this node leaves the local mddbs set, node and drive
3797 * records out-of-sync, the reconfig cycle will fix the local mddbs
3798 * and force them back into synchronization.
3799 */
3800 if ((forceflg == FALSE) && (MD_MNSET_DESC(sd))) {
3801 nd = sd->sd_nodelist;
3802 while (nd) {
3803 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3804 return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
3805 sp->setno, nd->nd_nodename,
3806 NULL, sp->setname));
3807 }
3808 nd = nd->nd_next;
3809 }
3810 }
3811
3812
3813 /*
3814 * Lock the set on current set members.
3815 * Set locking done much earlier for MN diskset than for traditional
3816 * diskset since lock_set and SUSPEND are used to protect against
3817 * other meta* commands running on the other nodes.
3818 */
3819 if (MD_MNSET_DESC(sd)) {
3820 /* Make sure we are blocking all signals */
3821 if (procsigs(TRUE, &oldsigs, &xep) < 0)
3822 mdclrerror(&xep);
3823
3824 nd = sd->sd_nodelist;
3825 while (nd) {
3826 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3827 nd = nd->nd_next;
3828 continue;
3829 }
3830
3831 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
3832 rval = -1;
3833 goto out2;
3834 }
3835 lock_flag = 1;
3836 nd = nd->nd_next;
3837 }
3838 /*
3839 * Lock out other meta* commands by suspending
3840 * class 1 messages across the diskset.
3841 */
3842 nd = sd->sd_nodelist;
3843 while (nd) {
3844 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3845 nd = nd->nd_next;
3846 continue;
3847 }
3848 if (clnt_mdcommdctl(nd->nd_nodename,
3849 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
3850 MD_MSCF_NO_FLAGS, ep)) {
3851 rval = -1;
3852 goto out2;
3853 }
3854 suspend1_flag = 1;
3855 nd = nd->nd_next;
3856 }
3857 }
3858
3859 for (i = 0; i < node_c; i++)
3860 if (getnodeside(node_v[i], sd) == MD_SIDEWILD) {
3861 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
3862 node_v[i], NULL, sp->setname);
3863 rval = -1;
3864 goto out2;
3865 }
3866
3867 /*
3868 * Count the number of nodes currently in the set.
3869 */
3870 if (MD_MNSET_DESC(sd)) {
3871 nd = sd->sd_nodelist;
3872 while (nd) {
3873 numsides++;
3874 nd = nd->nd_next;
3875 }
3876 } else {
3877 for (i = 0; i < MD_MAXSIDES; i++)
3878 /* Count full slots */
3879 if (sd->sd_nodes[i][0] != '\0')
3880 numsides++;
3881 }
3882
3883 /*
3884 * OHA mode == -f -h <hostname>
3885 * OHA is One Host Administration that occurs when the forceflag (-f)
3886 * is set and at least one host in the diskset isn't responding
3887 * to RPC requests.
3888 *
3889 * When in OHA mode, a node cannot delete itself from a diskset.
3890 * When in OHA mode, a node can delete a list of nodes from a diskset
3891 * even if some of the nodes in the diskset are unresponsive.
3892 *
3893 * For multinode diskset, only allow OHA mode when the nodes that
3894 * aren't responding in the diskset are not in the membership list
3895 * (i.e. nodes that aren't responding are not marked ALIVE).
3896 * Nodes that aren't in the membership list will be rejoining
3897 * the diskset through a reconfig cycle and the local mddb set
3898 * and node records can be reconciled during the reconfig cycle.
3899 *
3900 * If a node isn't responding, but is still in the membership list,
3901 * fail the request since the node may not be responding because
3902 * rpc.metad died and is restarting. In this case, no reconfig
3903 * cycle will be started, so there's no way to recover if
3904 * the host delete operation was allowed.
3905 *
3906 * NOTE: if nodes that weren't in the membership when the OHA host
3907 * delete occurred are now the only nodes in membership list,
3908 * those nodes will see the old view of the diskset. As soon as
3909 * a node re-enters the cluster that was present in the cluster
3910 * during the host deletion, the diskset will reflect the host
3911 * deletion on all nodes presently in the cluster.
3912 */
3913 if (forceflg == TRUE) {
3914 if (MD_MNSET_DESC(sd)) {
3915 nd = sd->sd_nodelist;
3916 while (nd) {
3917 /*
3918 * If a node isn't ALIVE (in member list),
3919 * then allow a force-able delete in OHA mode.
3920 */
3921 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3922 oha = TRUE;
3923 break;
3924 }
3925 /*
3926 * Don't test for clnt_nullproc since already
3927 * tested the RPC connections by clnt_lock_set.
3928 */
3929 nd = nd->nd_next;
3930 }
3931 } else {
3932 for (i = 0; i < MD_MAXSIDES; i++) {
3933 /* Skip empty slots */
3934 if (sd->sd_nodes[i][0] == '\0')
3935 continue;
3936
3937 if (clnt_nullproc(sd->sd_nodes[i], ep) == -1) {
3938 /*
3939 * If we timeout to at least one
3940 * client, then we can allow OHA mode,
3941 * otherwise, we are in normal mode.
3942 */
3943 if (mdanyrpcerror(ep)) {
3944 mdclrerror(ep);
3945 if (strinlst(sd->sd_nodes[i],
3946 node_c, node_v)) {
3947 oha = TRUE;
3948 break;
3949 }
3950 }
3951 }
3952 }
3953 }
3954 }
3955
3956 /*
3957 * Don't allow this for MN diskset since meta_set_destroy of 1 node
3958 * does NOT remove this node's node record from the other node's set
3959 * records in their local mddb. This leaves a MN diskset in a very
3960 * messed up state.
3961 */
3962 if (!(MD_MNSET_DESC(sd))) {
3963 /* Destroy set */
3964 if (forceflg == TRUE && node_c == 1 &&
3965 strcmp(mynode(), node_v[0]) == 0) {
3966 /* Can return since !MN diskset so nothing to unlock */
3967 return (meta_set_destroy(sp, TRUE, ep));
3968 }
3969 }
3970
3971
3972 /*
3973 * In multinode diskset, can only delete self if this
3974 * is the last node in the set or if all nodes in
3975 * the set are being deleted. The traditional diskset code
3976 * allows a node to delete itself (when there are other nodes
3977 * in the diskset) when using the force flag, but that code
3978 * path doesn't have the node remove itself from
3979 * the set node list on the other nodes. Since this isn't
3980 * satisfactory for the multinode diskset, just don't
3981 * allow this operation.
3982 */
3983 if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) &&
3984 strinlst(mynode(), node_c, node_v)) {
3985 (void) mddserror(ep, MDE_DS_MNCANTDELSELF, sp->setno,
3986 mynode(), NULL, sp->setname);
3987 rval = -1;
3988 goto out2;
3989 }
3990
3991 /*
3992 * In multinode diskset, don't allow deletion of master node unless
3993 * this is the only node left or unless all nodes are being
3994 * deleted since there is no way to switch
3995 * master ownership (unless via a cluster reconfig cycle).
3996 */
3997 delete_master = strinlst(sd->sd_mn_master_nodenm, node_c, node_v);
3998 if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) &&
3999 delete_master) {
4000 (void) mddserror(ep, MDE_DS_CANTDELMASTER, sp->setno,
4001 sd->sd_mn_master_nodenm, NULL, sp->setname);
4002 rval = -1;
4003 goto out2;
4004 }
4005
4006
4007 /* Deleting self w/o forceflg */
4008 if (forceflg == FALSE && numsides > 1 &&
4009 strinlst(mynode(), node_c, node_v)) {
4010 (void) mddserror(ep, MDE_DS_CANTDELSELF, sp->setno,
4011 mynode(), NULL, sp->setname);
4012 rval = -1;
4013 goto out2;
4014 }
4015
4016 /*
4017 * Setup the mediator record roll-back structure for a trad diskset.
4018 *
4019 * For a MN diskset, the deletion of a host in the diskset
4020 * does not cause an update of the mediator record. If the
4021 * host deletion will cause the diskset to be removed (this is
4022 * the last host being removed or all hosts are being removed)
4023 * then the mediator record must have already been removed by the
4024 * user or this delete host operation will fail (a check for
4025 * this is done later in this routine).
4026 */
4027 if (!(MD_MNSET_DESC(sd))) {
4028 (void) memset(&rb_medr, '\0', sizeof (med_rec_t));
4029 rb_medr.med_rec_mag = MED_REC_MAGIC;
4030 rb_medr.med_rec_rev = MED_REC_REV;
4031 rb_medr.med_rec_fl = 0;
4032 rb_medr.med_rec_sn = sp->setno;
4033 (void) strcpy(rb_medr.med_rec_snm, sp->setname);
4034 for (i = 0; i < MD_MAXSIDES; i++)
4035 (void) strcpy(rb_medr.med_rec_nodes[i],
4036 sd->sd_nodes[i]);
4037 rb_medr.med_rec_meds = sd->sd_med; /* structure assigment */
4038 (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
4039 rb_medr.med_rec_foff = 0;
4040 crcgen(&rb_medr, &rb_medr.med_rec_cks,
4041 sizeof (med_rec_t), NULL);
4042
4043 /* Bring the mediator record up to date with the set record */
4044 medr = rb_medr; /* structure assignment */
4045
4046 if ((max_meds = get_max_meds(ep)) == 0) {
4047 rval = -1;
4048 goto out2;
4049 }
4050 }
4051
4052 /*
4053 * For traditional diskset:
4054 * Check to see if all the hosts we are trying to delete the set from
4055 * have a set "setname" that is the same as ours, i.e. - same name,
4056 * same time stamp, same genid. We only do this if forceflg is not
4057 * specified or we are in OHA mode.
4058 */
4059 if (!(MD_MNSET_DESC(sd)) && (forceflg == FALSE || oha == TRUE)) {
4060 int fix_node_v = FALSE;
4061 int j;
4062
4063 for (i = 0; i < node_c; i++) {
4064 /* We skip this side */
4065 if (strcmp(mynode(), node_v[i]) == 0)
4066 continue;
4067
4068 has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep);
4069
4070 if (has_set < 0) {
4071 char *anode[1];
4072
4073 /*
4074 * Can't talk to the host only allowed in OHA
4075 * mode.
4076 */
4077 if (oha == TRUE && mdanyrpcerror(ep)) {
4078 mdclrerror(ep);
4079 continue;
4080 }
4081
4082 /*
4083 * We got an error we do not, or are not,
4084 * prepared to handle.
4085 */
4086 if (! mdiserror(ep, MDE_NO_SET) &&
4087 ! mdismddberror(ep, MDE_DB_NODB)) {
4088 rval = -1;
4089 goto out2;
4090 }
4091 mdclrerror(ep);
4092
4093 /*
4094 * If we got here: both hosts are up; a host in
4095 * our set record does not have the set. So we
4096 * delete the host from our set and invalidate
4097 * the node.
4098 */
4099 anode[0] = Strdup(node_v[i]);
4100
4101 rval = del_host_noset(sp, anode, ep);
4102
4103 /*
4104 * If we delete a host, make sure the mediator
4105 * hosts are made aware of this.
4106 */
4107 for (j = 0; j < MD_MAXSIDES; j++) {
4108 if (strcmp(medr.med_rec_nodes[j],
4109 node_v[i]) != 0)
4110 continue;
4111 (void) memset(&medr.med_rec_nodes[j],
4112 '\0', sizeof (md_node_nm_t));
4113 }
4114 crcgen(&medr, &medr.med_rec_cks,
4115 sizeof (med_rec_t), NULL);
4116
4117 rb_medr = medr; /* struct assignment */
4118
4119 Free(anode[0]);
4120
4121 if (rval == -1)
4122 goto out2;
4123
4124 node_v[i][0] = '\0';
4125 fix_node_v = TRUE;
4126 continue;
4127 }
4128
4129 /*
4130 * If we can talk to the host, and they do not have the
4131 * exact set, then we disallow the operation.
4132 */
4133 if (has_set == FALSE) {
4134 (void) mddserror(ep, MDE_DS_NODENOSET,
4135 sp->setno, node_v[i], NULL, sp->setname);
4136 rval = -1;
4137 goto out2;
4138 }
4139 }
4140
4141 /*
4142 * Here we prune the node_v's that were invalidated above.
4143 */
4144 if (fix_node_v == TRUE) {
4145 i = 0;
4146 while (i < node_c) {
4147 if (node_v[i][0] == '\0') {
4148 for (j = i; (j + 1) < node_c; j++)
4149 node_v[j] = node_v[j + 1];
4150 node_c--;
4151 }
4152 i++;
4153 }
4154 /*
4155 * If we are left with no nodes, then we have
4156 * completed the operation.
4157 */
4158 if (node_c == 0) {
4159 /*
4160 * Inform the mediator hosts of the new node
4161 * list
4162 */
4163 for (i = 0; i < max_meds; i++) {
4164 if (sd->sd_med.n_lst[i].a_cnt == 0)
4165 continue;
4166
4167 if (clnt_med_upd_rec(
4168 &sd->sd_med.n_lst[i], sp, &medr,
4169 ep))
4170 mdclrerror(ep);
4171 }
4172 rval = 0;
4173 goto out2;
4174 }
4175 }
4176 }
4177
4178 /*
4179 * For multinode diskset:
4180 * If forceflag is FALSE then check to see if all the hosts we
4181 * are trying to delete the set from have a set "setname" that
4182 * is the same as ours, i.e. - same name, same time stamp, same genid.
4183 * If forceflag is TRUE, then we don't care if the hosts being
4184 * deleted have the same set information or not since user is forcing
4185 * those hosts to be deleted.
4186 */
4187 if ((MD_MNSET_DESC(sd)) && (forceflg == FALSE)) {
4188 for (i = 0; i < node_c; i++) {
4189 /* We skip this node since comparing against it */
4190 if (strcmp(mynode(), node_v[i]) == 0)
4191 continue;
4192
4193 has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep);
4194
4195 if (has_set < 0) {
4196 rval = -1;
4197 goto out2;
4198 }
4199
4200 /*
4201 * If we can talk to the host, and they do not have the
4202 * exact set, then we disallow the operation.
4203 */
4204 if (has_set == FALSE) {
4205 (void) mddserror(ep, MDE_DS_NODENOSET,
4206 sp->setno, node_v[i], NULL, sp->setname);
4207 rval = -1;
4208 goto out2;
4209 }
4210 }
4211 }
4212
4213 /*
4214 * For traditional diskset:
4215 * Can't allow user to delete their node (without deleting all nodes)
4216 * out of a set in OHA mode, would leave a real mess.
4217 * This action was already failed above for a MN diskset.
4218 */
4219 if (!(MD_MNSET_DESC(sd)) && (oha == TRUE) &&
4220 strinlst(mynode(), node_c, node_v)) {
4221 /* Can directly return since !MN diskset; nothing to unlock */
4222 return (mddserror(ep, MDE_DS_OHACANTDELSELF, sp->setno,
4223 mynode(), NULL, sp->setname));
4224 }
4225
4226
4227 /* Get the drive descriptors for this set */
4228 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4229 ep)) == NULL) {
4230 if (! mdisok(ep)) {
4231 rval = -1;
4232 goto out2;
4233 }
4234 }
4235
4236 /*
4237 * We have been asked to delete all the hosts in the set, i.e. - delete
4238 * the whole set.
4239 */
4240 if (node_c == numsides) {
4241 /*
4242 * This is only a valid operation if all drives have been
4243 * removed first.
4244 */
4245
4246 if (dd != NULL) {
4247 (void) mddserror(ep, MDE_DS_HASDRIVES, sp->setno,
4248 NULL, NULL, sp->setname);
4249 rval = -1;
4250 goto out2;
4251 }
4252
4253 /*
4254 * If a mediator is currently associated with this set,
4255 * fail the deletion of the last host(s).
4256 */
4257 if (sd->sd_med.n_cnt != 0) {
4258 (void) mddserror(ep, MDE_DS_HASMED, sp->setno,
4259 NULL, NULL, sp->setname);
4260 rval = -1;
4261 goto out2;
4262 }
4263
4264 if (! mdisok(ep)) {
4265 rval = -1;
4266 goto out2;
4267 }
4268
4269 rval = del_set_nodrives(sp, node_c, node_v, oha, ep);
4270 remote_sets_deleted = 1;
4271 goto out2;
4272 }
4273
4274 /*
4275 * Get timeout values in case we need to roll back
4276 */
4277 (void) memset(&mhiargs, '\0', sizeof (mhiargs));
4278 if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) != 0) {
4279 rval = -1;
4280 goto out2;
4281 }
4282
4283 if (dd != NULL) {
4284 /*
4285 * We need this around for re-adding DB side names later.
4286 */
4287 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
4288 rval = -1;
4289 goto out2;
4290 }
4291
4292 /*
4293 * Alloc nodeid list if drives are present in diskset.
4294 * nodeid list is used to reset mirror owners if the
4295 * owner is a deleted node.
4296 */
4297 if (MD_MNSET_DESC(sd)) {
4298 node_id_list = Zalloc(sizeof (int) * node_c);
4299 }
4300 }
4301
4302 /* Lock the set on current set members */
4303 if (!(MD_MNSET_DESC(sd))) {
4304 md_rb_sig_handling_on();
4305 for (i = 0; i < MD_MAXSIDES; i++) {
4306 /* Skip empty slots */
4307 if (sd->sd_nodes[i][0] == '\0')
4308 continue;
4309
4310 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
4311 if (oha == TRUE && mdanyrpcerror(ep)) {
4312 mdclrerror(ep);
4313 continue;
4314 }
4315 rval = -1;
4316 goto out2;
4317 }
4318 lock_flag = 1;
4319 }
4320 }
4321
4322 RB_TEST(1, "deletehosts", ep)
4323
4324 RB_PREEMPT;
4325 rb_level = 1; /* level 1 */
4326
4327 RB_TEST(2, "deletehosts", ep)
4328
4329 if (MD_MNSET_DESC(sd)) {
4330 md_mnnode_desc *saved_nd_next;
4331 mddb_config_t c;
4332
4333 if (dd != NULL) {
4334 /*
4335 * Notify rpc.mdcommd on all nodes of a nodelist change.
4336 * Start by suspending rpc.mdcommd (which drains it of
4337 * all messages), then change the nodelist followed
4338 * by a reinit and resume.
4339 */
4340 nd = sd->sd_nodelist;
4341 while (nd) {
4342 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4343 nd = nd->nd_next;
4344 continue;
4345 }
4346 if (clnt_mdcommdctl(nd->nd_nodename,
4347 COMMDCTL_SUSPEND, sp,
4348 MD_MSG_CLASS0,
4349 MD_MSCF_NO_FLAGS, ep)) {
4350 rval = -1;
4351 goto out2;
4352 }
4353 suspendall_flag = 1;
4354 nd = nd->nd_next;
4355 }
4356 /*
4357 * Is current set STALE?
4358 * Need to know this if delete host fails and node
4359 * is re-joined to diskset.
4360 */
4361 (void) memset(&c, 0, sizeof (c));
4362 c.c_id = 0;
4363 c.c_setno = sp->setno;
4364 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
4365 (void) mdstealerror(ep, &c.c_mde);
4366 rval = -1;
4367 goto out2;
4368 }
4369 if (c.c_flags & MDDB_C_STALE) {
4370 stale_flag = MNSET_IS_STALE;
4371 }
4372 }
4373
4374 /*
4375 * For each node being deleted, set DEL flag and
4376 * reset OK flag on that node first.
4377 * Until a node has turned off its own
4378 * rpc.metad's NODE_OK flag, that node could be
4379 * considered for master during a reconfig.
4380 */
4381 for (i = 0; i < node_c; i++) {
4382 /*
4383 * During OHA mode, don't issue RPCs to
4384 * non-alive nodes since there is no reason to
4385 * wait for RPC timeouts.
4386 */
4387 nd = sd->sd_nodelist;
4388 while (nd) {
4389 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
4390 break;
4391 nd = nd->nd_next;
4392 }
4393 /* Something wrong, handle this in next loop */
4394 if (nd == NULL)
4395 continue;
4396
4397 /* If node_id_list is alloc'd, fill in for later use */
4398 if (node_id_list)
4399 node_id_list[i] = nd->nd_nodeid;
4400
4401 /* All nodes are guaranteed to be ALIVE unless OHA */
4402 if ((oha == TRUE) &&
4403 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4404 continue;
4405 }
4406
4407 /* Only changing my local cache of node list */
4408 saved_nd_next = nd->nd_next;
4409 nd->nd_next = NULL;
4410
4411 /* Set flags for del host to DEL on that host */
4412 if (clnt_upd_nr_flags(node_v[i], sp,
4413 nd, MD_NR_DEL, NULL, ep)) {
4414 nd->nd_next = saved_nd_next;
4415 goto rollback;
4416 }
4417 nd->nd_next = saved_nd_next;
4418 }
4419 for (i = 0; i < node_c; i++) {
4420 /*
4421 * Turn off owner flag in nodes to be deleted
4422 * if this node has been joined.
4423 * Also, turn off NODE_OK and turn on NODE_DEL
4424 * for nodes to be deleted.
4425 * These flags are used to set the node
4426 * record flags in all nodes in the set.
4427 * Only withdraw nodes that are joined.
4428 */
4429 nd = sd->sd_nodelist;
4430 while (nd) {
4431 /*
4432 * Don't communicate with non-ALIVE node if
4433 * in OHA - but set flags in master list so
4434 * alive nodes are updated correctly.
4435 */
4436 if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
4437 if ((oha == TRUE) && (!(nd->nd_flags &
4438 MD_MN_NODE_ALIVE))) {
4439 nd->nd_flags |= MD_MN_NODE_DEL;
4440 nd->nd_flags &= ~MD_MN_NODE_OK;
4441 nd = nd->nd_next;
4442 continue;
4443 }
4444 if (nd->nd_flags & MD_MN_NODE_OWN) {
4445 /*
4446 * Going to set locally cached
4447 * node flags to rollback join
4448 * so in case of error, the
4449 * rollback code knows which
4450 * nodes to re-join. rpc.metad
4451 * ignores the RB_JOIN flag.
4452 */
4453 nd->nd_flags |=
4454 MD_MN_NODE_RB_JOIN;
4455 nd->nd_flags &= ~MD_MN_NODE_OWN;
4456
4457 /*
4458 * Be careful in ordering of
4459 * following steps so that
4460 * recovery from a panic
4461 * between the steps is viable.
4462 * Only reset master info in
4463 * rpc.metad - don't reset
4464 * local cached info which will
4465 * be used to set master info
4466 * back if failure (rollback).
4467 */
4468 if (clnt_withdrawset(
4469 nd->nd_nodename, sp, ep))
4470 goto rollback;
4471
4472 /*
4473 * Reset master on deleted node
4474 */
4475 if (clnt_mnsetmaster(node_v[i],
4476 sp, "", MD_MN_INVALID_NID,
4477 ep))
4478 goto rollback;
4479 }
4480
4481 nd->nd_flags |= MD_MN_NODE_DEL;
4482 nd->nd_flags &= ~MD_MN_NODE_OK;
4483 }
4484 nd = nd->nd_next;
4485 }
4486 }
4487
4488 /*
4489 * Now, reset owner and set delete flags for the
4490 * deleted nodes on all nodes.
4491 */
4492 nd = sd->sd_nodelist;
4493 while (nd) {
4494 /* Skip non-ALIVE node if in OHA */
4495 if ((oha == TRUE) &&
4496 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4497 nd = nd->nd_next;
4498 continue;
4499 }
4500 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4501 sd->sd_nodelist, MD_NR_SET, NULL, ep)) {
4502 goto rollback;
4503 }
4504 nd = nd->nd_next;
4505 }
4506 /*
4507 * Notify rpc.mdcommd on all nodes of a nodelist change.
4508 * Send reinit command to mdcommd which forces it to get
4509 * fresh set description.
4510 */
4511 if (suspendall_flag) {
4512 /* Send reinit */
4513 nd = sd->sd_nodelist;
4514 while (nd) {
4515 if ((oha == TRUE) &&
4516 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4517 nd = nd->nd_next;
4518 continue;
4519 }
4520 /* Class is ignored for REINIT */
4521 if (clnt_mdcommdctl(nd->nd_nodename,
4522 COMMDCTL_REINIT, sp, NULL,
4523 MD_MSCF_NO_FLAGS, ep)) {
4524 mde_perror(ep, dgettext(TEXT_DOMAIN,
4525 "Unable to reinit rpc.mdcommd.\n"));
4526 goto rollback;
4527 }
4528 nd = nd->nd_next;
4529 }
4530 /* Send resume */
4531 nd = sd->sd_nodelist;
4532 while (nd) {
4533 if ((oha == TRUE) &&
4534 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4535 nd = nd->nd_next;
4536 continue;
4537 }
4538 if (clnt_mdcommdctl(nd->nd_nodename,
4539 COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
4540 MD_MSCF_DONT_RESUME_CLASS1, ep)) {
4541 mde_perror(ep, dgettext(TEXT_DOMAIN,
4542 "Unable to resume rpc.mdcommd.\n"));
4543 goto rollback;
4544 }
4545 nd = nd->nd_next;
4546 }
4547 meta_ping_mnset(sp->setno);
4548 }
4549 }
4550
4551
4552 /*
4553 * Mark the set record MD_SR_DEL on the hosts we are deleting
4554 * If a MN diskset and OHA mode, don't issue RPC to nodes that
4555 * are not ALIVE.
4556 * If a MN diskset and not in OHA mode, then all nodes must respond
4557 * to RPC (be alive) or this routine will return failure.
4558 * If a traditional diskset, ignore all RPC failures if in OHA mode.
4559 */
4560 for (i = 0; i < node_c; i++) {
4561
4562 RB_TEST(3, "deletehosts", ep)
4563
4564 if ((MD_MNSET_DESC(sd)) && (oha == TRUE)) {
4565 /*
4566 * During OHA mode, don't issue RPCs to
4567 * non-alive nodes since there is no reason to
4568 * wait for RPC timeouts.
4569 */
4570 nd = sd->sd_nodelist;
4571 while (nd) {
4572 if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
4573 break;
4574 }
4575 nd = nd->nd_next;
4576 }
4577 if (nd == NULL) {
4578 (void) mddserror(ep, MDE_DS_NODENOTINSET,
4579 sp->setno, node_v[i], NULL, sp->setname);
4580 goto rollback;
4581 } else if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4582 /* Skip non-ALIVE node if in OHA mode */
4583 continue;
4584 } else {
4585 if (clnt_upd_sr_flags(node_v[i], sp,
4586 MD_SR_DEL, ep)) {
4587 goto rollback;
4588 }
4589 }
4590 } else if ((MD_MNSET_DESC(sd)) && (oha == FALSE)) {
4591 /*
4592 * All nodes should be alive in non-oha mode.
4593 */
4594 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
4595 goto rollback;
4596 }
4597 } else {
4598 /*
4599 * For traditional diskset, issue the RPC and
4600 * ignore RPC failure if in OHA mode.
4601 */
4602 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
4603 if (oha == TRUE && mdanyrpcerror(ep)) {
4604 mdclrerror(ep);
4605 continue;
4606 }
4607 goto rollback;
4608 }
4609 }
4610
4611 RB_TEST(4, "deletehosts", ep)
4612 }
4613
4614 RB_TEST(5, "deletehosts", ep)
4615
4616 RB_PREEMPT;
4617 rb_level = 2; /* level 2 */
4618
4619 RB_TEST(6, "deletehosts", ep)
4620
4621 /* Delete the set on the hosts we are deleting */
4622 if (del_set_on_hosts(sp, sd, dd, node_c, node_v, oha, ep)) {
4623 if (node_id_list)
4624 Free(node_id_list);
4625 /*
4626 * Failure during del_set_on_hosts would have recreated
4627 * the diskset on the remote hosts, but for multi-owner
4628 * disksets need to set node flags properly and REINIT and
4629 * RESUME rpc.mdcommd, so just let the rollback code
4630 * do this.
4631 */
4632 if (MD_MNSET_DESC(sd))
4633 goto rollback;
4634 return (-1);
4635 }
4636 remote_sets_deleted = 1;
4637
4638 RB_TEST(19, "deletehosts", ep)
4639
4640 RB_PREEMPT;
4641 rb_level = 3; /* level 3 */
4642
4643 RB_TEST(20, "deletehosts", ep)
4644
4645 /* Delete the host from sets on hosts not being deleted */
4646 if (MD_MNSET_DESC(sd)) {
4647 nd = sd->sd_nodelist;
4648 /* All nodes are guaranteed to be ALIVE unless in oha mode */
4649 while (nd) {
4650 /*
4651 * During OHA mode, don't issue RPCs to
4652 * non-alive nodes since there is no reason to
4653 * wait for RPC timeouts.
4654 */
4655 if ((oha == TRUE) &&
4656 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4657 nd = nd->nd_next;
4658 continue;
4659 }
4660
4661 /* Skip nodes being deleted */
4662 if (strinlst(nd->nd_nodename, node_c, node_v)) {
4663 nd = nd->nd_next;
4664 continue;
4665 }
4666 if (clnt_delhosts(nd->nd_nodename, sp, node_c, node_v,
4667 ep) == -1) {
4668 goto rollback;
4669 }
4670
4671 RB_TEST(21, "deletehosts", ep)
4672 nd = nd->nd_next;
4673 }
4674 } else {
4675 for (i = 0; i < MD_MAXSIDES; i++) {
4676 /* Skip empty slots */
4677 if (sd->sd_nodes[i][0] == '\0')
4678 continue;
4679
4680 /* Skip nodes being deleted */
4681 if (strinlst(sd->sd_nodes[i], node_c, node_v))
4682 continue;
4683
4684 if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v,
4685 ep) == -1) {
4686 if (oha == TRUE && mdanyrpcerror(ep)) {
4687 mdclrerror(ep);
4688 continue;
4689 }
4690 goto rollback;
4691 }
4692
4693 RB_TEST(21, "deletehosts", ep)
4694 }
4695 }
4696
4697 /* We have drives */
4698 if (dd != NULL) {
4699 RB_TEST(22, "deletehosts", ep)
4700
4701 RB_PREEMPT;
4702 rb_level = 4; /* level 4 */
4703
4704 RB_TEST(23, "deletehosts", ep)
4705
4706 /*
4707 * Delete the old sidename for each drive on all the hosts.
4708 * If a multi-node diskset, each host only stores
4709 * the side information for itself. So, a multi-node
4710 * diskset doesn't delete the old sidename for
4711 * an old host.
4712 *
4713 * If a MN diskset, reset owners of mirrors that are
4714 * owned by the deleted nodes.
4715 */
4716 if (!(MD_MNSET_DESC(sd))) {
4717 for (i = 0; i < MD_MAXSIDES; i++) {
4718 /* Skip empty slots */
4719 if (sd->sd_nodes[i][0] == '\0')
4720 continue;
4721
4722 /* Skip nodes being deleted */
4723 if (strinlst(sd->sd_nodes[i], node_c, node_v))
4724 continue;
4725
4726 if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp,
4727 ep)) {
4728 if (oha == TRUE && mdanyrpcerror(ep)) {
4729 mdclrerror(ep);
4730 continue;
4731 }
4732 metaflushsetname(sp);
4733 goto rollback;
4734 }
4735
4736 RB_TEST(24, "deletehosts", ep)
4737 }
4738 } else {
4739 nd = sd->sd_nodelist;
4740 /* All nodes guaranteed ALIVE unless in oha mode */
4741 while (nd) {
4742 /*
4743 * If mirror owner was set to a deleted node,
4744 * then each existing node resets mirror owner
4745 * to NULL.
4746 *
4747 * During OHA mode, don't issue RPCs to
4748 * non-alive nodes since there is no reason to
4749 * wait for RPC timeouts.
4750 */
4751 if ((oha == TRUE) &&
4752 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4753 nd = nd->nd_next;
4754 continue;
4755 }
4756
4757 /* Skip nodes being deleted */
4758 if (strinlst(nd->nd_nodename, node_c, node_v)) {
4759 nd = nd->nd_next;
4760 continue;
4761 }
4762
4763 /*
4764 * If mirror owner is a deleted node, reset
4765 * mirror owners to NULL. If an error occurs,
4766 * print a warning and continue. Don't fail
4767 * metaset because of mirror owner reset
4768 * problem since next node to grab mirror
4769 * will resolve this issue. Before next node
4770 * grabs mirrors, metaset will show the deleted
4771 * node as owner which is why an attempt to
4772 * reset the mirror owner is made.
4773 */
4774 if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
4775 node_c, &node_id_list[0], &xep) == -1) {
4776 mde_perror(&xep, dgettext(TEXT_DOMAIN,
4777 "Unable to reset mirror owner on"
4778 " node %s\n"), nd->nd_nodename);
4779 mdclrerror(&xep);
4780 }
4781
4782 RB_TEST(21, "deletehosts", ep)
4783 nd = nd->nd_next;
4784 }
4785 }
4786 }
4787
4788 RB_TEST(25, "deletehosts", ep)
4789
4790 RB_PREEMPT;
4791 rb_level = 4; /* level 4 */
4792
4793 RB_TEST(26, "deletehosts", ep)
4794
4795 /*
4796 * Bring the mediator record up to date with the set record for
4797 * traditional diskset.
4798 */
4799 if (!(MD_MNSET_DESC(sd))) {
4800 medr = rb_medr; /* structure assignment */
4801 for (i = 0; i < MD_MAXSIDES; i++) {
4802 if (strinlst(sd->sd_nodes[i], node_c, node_v))
4803 (void) memset(&medr.med_rec_nodes[i],
4804 '\0', sizeof (md_node_nm_t));
4805 else
4806 (void) strcpy(medr.med_rec_nodes[i],
4807 sd->sd_nodes[i]);
4808 }
4809 crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
4810
4811 /* Inform the mediator hosts of the new node list */
4812 for (i = 0; i < max_meds; i++) {
4813 if (sd->sd_med.n_lst[i].a_cnt == 0)
4814 continue;
4815
4816 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
4817 &medr, ep)) {
4818 if (oha == TRUE && mdanyrpcerror(ep)) {
4819 mdclrerror(ep);
4820 continue;
4821 }
4822 goto rollback;
4823 }
4824 }
4825 }
4826
4827 RB_TEST(27, "deletehosts", ep)
4828
4829 /*
4830 * For traditional diskset:
4831 * We are deleting ourselves out of the set and we have drives to
4832 * consider; so we need to halt the set, release the drives and
4833 * reset the timeout. **** THIS IS A ONE WAY TICKET, NO ROLL BACK
4834 * IS POSSIBLE AS SOON AS THE HALT SET COMPLETES, SO THIS IS DONE
4835 * WITH ALL SIGNALS BLOCKED AND LAST ****
4836 *
4837 * This situation cannot occur in a MN diskset since a node can't
4838 * delete itself unless all nodes are being deleted and a diskset
4839 * cannot contain any drives if all nodes are being deleted.
4840 * So, don't even test for this if a MN diskset.
4841 */
4842 if (!(MD_MNSET_DESC(sd)) && (dd != NULL) &&
4843 strinlst(mynode(), node_c, node_v)) {
4844 /* Make sure we are blocking all signals */
4845 if (procsigs(TRUE, &oldsigs, ep) < 0) {
4846 rval = -1;
4847 goto out1;
4848 }
4849
4850 if (halt_set(sp, ep)) {
4851 rval = -1;
4852 goto out1;
4853 }
4854
4855 if (rel_own_bydd(sp, dd, FALSE, ep))
4856 rval = -1;
4857
4858 out1:
4859 /* release signals back to what they were on entry */
4860 if (procsigs(FALSE, &oldsigs, &xep) < 0) {
4861 if (rval == 0)
4862 (void) mdstealerror(ep, &xep);
4863 rval = -1;
4864 }
4865 }
4866
4867 out2:
4868 /*
4869 * Unlock diskset by resuming messages across the diskset.
4870 * Just resume all classes so that resume is the same whether
4871 * just one class was locked or all classes were locked.
4872 */
4873 if ((suspend1_flag) || (suspendall_flag)) {
4874 /* Send resume */
4875 nd = sd->sd_nodelist;
4876 while (nd) {
4877 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4878 nd = nd->nd_next;
4879 continue;
4880 }
4881 /*
4882 * Skip nodes being deleted if remote set
4883 * was deleted since rpc.mdcommd may no longer
4884 * be running on remote node.
4885 */
4886 if ((remote_sets_deleted == 1) &&
4887 (strinlst(nd->nd_nodename, node_c, node_v))) {
4888 nd = nd->nd_next;
4889 continue;
4890 }
4891 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
4892 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
4893 if (rval == 0)
4894 (void) mdstealerror(ep, &xep);
4895 rval = -1;
4896 mde_perror(ep, dgettext(TEXT_DOMAIN,
4897 "Unable to resume rpc.mdcommd.\n"));
4898 }
4899 nd = nd->nd_next;
4900 }
4901 meta_ping_mnset(sp->setno);
4902 }
4903
4904 cl_sk = cl_get_setkey(sp->setno, sp->setname);
4905 if (lock_flag) {
4906 if (MD_MNSET_DESC(sd)) {
4907 nd = sd->sd_nodelist;
4908 while (nd) {
4909 /*
4910 * During OHA mode, don't issue RPCs to
4911 * non-alive nodes since there is no reason to
4912 * wait for RPC timeouts.
4913 */
4914 if ((oha == TRUE) &&
4915 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4916 nd = nd->nd_next;
4917 continue;
4918 }
4919 if (clnt_unlock_set(nd->nd_nodename,
4920 cl_sk, &xep)) {
4921 if (rval == 0)
4922 (void) mdstealerror(ep, &xep);
4923 rval = -1;
4924 }
4925 nd = nd->nd_next;
4926 }
4927 } else {
4928 for (i = 0; i < MD_MAXSIDES; i++) {
4929 /* Skip empty slots */
4930 if (sd->sd_nodes[i][0] == '\0')
4931 continue;
4932
4933 if (clnt_unlock_set(sd->sd_nodes[i],
4934 cl_sk, &xep)) {
4935 if (oha == TRUE &&
4936 mdanyrpcerror(&xep)) {
4937 mdclrerror(&xep);
4938 continue;
4939 }
4940 if (rval == 0)
4941 (void) mdstealerror(ep, &xep);
4942 rval = -1;
4943 }
4944 }
4945 }
4946 }
4947 cl_set_setkey(NULL);
4948
4949 out3:
4950 metafreereplicalist(rlp);
4951 if (node_id_list)
4952 Free(node_id_list);
4953
4954 metaflushsetname(sp);
4955
4956 if (MD_MNSET_DESC(sd)) {
4957 /* release signals back to what they were on entry */
4958 if (procsigs(FALSE, &oldsigs, &xep) < 0)
4959 mdclrerror(&xep);
4960 } else {
4961 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
4962 }
4963
4964
4965 return (rval);
4966
4967 rollback:
4968 /* all signals already blocked for MN disket */
4969 if (!(MD_MNSET_DESC(sd))) {
4970 if (procsigs(TRUE, &oldsigs, &xep) < 0)
4971 mdclrerror(&xep);
4972 }
4973
4974 rval = -1;
4975
4976 max_genid = sd->sd_genid;
4977
4978
4979 /*
4980 * Send reinit command to rpc.mdcommd which forces it to get
4981 * fresh set description and resume all classes but class 0.
4982 * Don't send any commands to rpc.mdcommd if set on that node
4983 * has been removed.
4984 */
4985 if (suspendall_flag) {
4986 /* Send reinit */
4987 nd = sd->sd_nodelist;
4988 while (nd) {
4989 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4990 nd = nd->nd_next;
4991 continue;
4992 }
4993 /*
4994 * If the remote set was deleted, rpc.mdcommd
4995 * may no longer be running so send nothing to it.
4996 */
4997 if ((remote_sets_deleted == 1) &&
4998 (strinlst(nd->nd_nodename, node_c, node_v))) {
4999 nd = nd->nd_next;
5000 continue;
5001 }
5002 /* Class is ignored for REINIT */
5003 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
5004 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
5005 mde_perror(&xep, dgettext(TEXT_DOMAIN,
5006 "Unable to reinit rpc.mdcommd.\n"));
5007 mdclrerror(&xep);
5008 }
5009 nd = nd->nd_next;
5010 }
5011 /* Send resume */
5012 nd = sd->sd_nodelist;
5013 while (nd) {
5014 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5015 nd = nd->nd_next;
5016 continue;
5017 }
5018 /*
5019 * If the remote set was deleted, rpc.mdcommd
5020 * may no longer be running so send nothing to it.
5021 */
5022 if ((remote_sets_deleted == 1) &&
5023 (strinlst(nd->nd_nodename, node_c, node_v))) {
5024 nd = nd->nd_next;
5025 continue;
5026 }
5027 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
5028 sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1,
5029 &xep)) {
5030 mde_perror(&xep, dgettext(TEXT_DOMAIN,
5031 "Unable to resume rpc.mdcommd.\n"));
5032 mdclrerror(&xep);
5033 }
5034 nd = nd->nd_next;
5035 }
5036 meta_ping_mnset(sp->setno);
5037 }
5038
5039 /* level 2 */
5040 if (rb_level > 1) {
5041 md_set_record *sr;
5042 md_replicalist_t *rl;
5043
5044 recreate_set(sp, sd);
5045
5046 /*
5047 * Lock out other meta* commands on nodes with the newly
5048 * re-created sets by suspending class 1 messages
5049 * across the diskset.
5050 */
5051 nd = sd->sd_nodelist;
5052 while (nd) {
5053 /* Skip nodes not being deleted */
5054 if (!(strinlst(nd->nd_nodename, node_c, node_v))) {
5055 nd = nd->nd_next;
5056 continue;
5057 }
5058 /* Suspend commd on nodes with re-created sets */
5059 if (clnt_mdcommdctl(nd->nd_nodename,
5060 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
5061 MD_MSCF_NO_FLAGS, &xep)) {
5062 mde_perror(&xep, dgettext(TEXT_DOMAIN,
5063 "Unable to suspend rpc.mdcommd.\n"));
5064 mdclrerror(&xep);
5065 }
5066 nd = nd->nd_next;
5067 }
5068
5069 max_genid++;
5070
5071 /*
5072 * See if we have to re-add the drives specified.
5073 */
5074 for (i = 0; i < node_c; i++) {
5075 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
5076 /*
5077 * During OHA mode, don't issue RPCs to
5078 * non-alive nodes since there is no reason to
5079 * wait for RPC timeouts.
5080 */
5081 nd = sd->sd_nodelist;
5082 while (nd) {
5083 if (strcmp(nd->nd_nodename, node_v[i])
5084 == 0) {
5085 break;
5086 }
5087 nd = nd->nd_next;
5088 }
5089 if (nd == 0)
5090 continue;
5091 if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
5092 continue;
5093 }
5094
5095 /* Don't care if set record is MN or not */
5096 if (clnt_getset(node_v[i], sp->setname, MD_SET_BAD, &sr,
5097 &xep) == -1) {
5098 mdclrerror(&xep);
5099 continue;
5100 }
5101
5102 /* Drive already added, skip to next node */
5103 if (sr->sr_drivechain != NULL) {
5104 /*
5105 * Set record structure was allocated from RPC
5106 * routine getset so this structure is only of
5107 * size md_set_record even if the MN flag is
5108 * set. So, clear the flag so that the free
5109 * code doesn't attempt to free a structure
5110 * the size of md_mnset_record.
5111 */
5112 sr->sr_flags &= ~MD_SR_MN;
5113 free_sr(sr);
5114 continue;
5115 }
5116
5117 if (clnt_adddrvs(node_v[i], sp, dd, sr->sr_ctime,
5118 sr->sr_genid, &xep) == -1)
5119 mdclrerror(&xep);
5120
5121 if (clnt_upd_dr_flags(node_v[i], sp, dd, MD_DR_OK,
5122 &xep) == -1)
5123 mdclrerror(&xep);
5124
5125 /*
5126 * Set record structure was allocated from RPC routine
5127 * getset so this structure is only of size
5128 * md_set_record even if the MN flag is set. So,
5129 * clear the flag so that the free code doesn't
5130 * attempt to free a structure the size of
5131 * md_mnset_record.
5132 */
5133 sr->sr_flags &= ~MD_SR_MN;
5134 free_sr(sr);
5135 }
5136 max_genid += 3;
5137
5138 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
5139 md_replica_t *r = rl->rl_repp;
5140 /*
5141 * This is not the first replica being added to the
5142 * diskset so call with ADDSIDENMS_BCAST. If this
5143 * is a traditional diskset, the bcast flag is ignored
5144 * since traditional disksets don't use the rpc.mdcommd.
5145 */
5146 if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
5147 DB_ADDSIDENMS_BCAST, &xep))
5148 mdclrerror(&xep);
5149 }
5150
5151 /*
5152 * Add the device names for the new sides into the namespace,
5153 * on all hosts not being deleted.
5154 */
5155 if (MD_MNSET_DESC(sd)) {
5156 nd = sd->sd_nodelist;
5157 while (nd) {
5158 /* Find a node that is not being deleted */
5159 if (!strinlst(nd->nd_nodename, node_c,
5160 node_v)) {
5161 j = nd->nd_nodeid;
5162 break;
5163 }
5164 nd = nd->nd_next;
5165 }
5166 } else {
5167 for (j = 0; j < MD_MAXSIDES; j++) {
5168 /* Skip empty slots */
5169 if (sd->sd_nodes[j][0] == '\0')
5170 continue;
5171
5172 /* Find a node that is not being deleted */
5173 if (!strinlst(sd->sd_nodes[j], node_c, node_v))
5174 break;
5175 }
5176 }
5177
5178 if (MD_MNSET_DESC(sd)) {
5179 nd = sd->sd_nodelist;
5180 while (nd) {
5181 /* Skip nodes not being deleted */
5182 if (!strinlst(nd->nd_nodename, node_c,
5183 node_v)) {
5184 nd = nd->nd_next;
5185 continue;
5186 }
5187
5188 /* this side was just created, add the names */
5189 if (add_md_sidenms(sp, nd->nd_nodeid, j, &xep))
5190 mdclrerror(&xep);
5191 nd = nd->nd_next;
5192 }
5193 } else {
5194 for (i = 0; i < MD_MAXSIDES; i++) {
5195 /* Skip empty slots */
5196 if (sd->sd_nodes[i][0] == '\0')
5197 continue;
5198
5199 /* Skip nodes not being deleted */
5200 if (!strinlst(sd->sd_nodes[i], node_c, node_v))
5201 continue;
5202
5203 /* this side was just created, add the names */
5204 if (add_md_sidenms(sp, i, j, &xep))
5205 mdclrerror(&xep);
5206 }
5207 }
5208 }
5209
5210 /* level 4 */
5211 if (rb_level > 3 && dd != NULL) {
5212 /*
5213 * Add the new sidename for each drive to all the hosts
5214 * Multi-node disksets only store the sidename for
5215 * that host, so there is nothing to re-add.
5216 */
5217 if (!(MD_MNSET_DESC(sd))) {
5218 for (j = 0; j < MD_MAXSIDES; j++) {
5219 /* Skip empty slots */
5220 if (sd->sd_nodes[j][0] == '\0')
5221 continue;
5222
5223 /* Skip nodes not being deleted */
5224 if (!strinlst(sd->sd_nodes[j], node_c, node_v))
5225 break;
5226 }
5227 for (i = 0; i < MD_MAXSIDES; i++) {
5228 /* Skip empty slots */
5229 if (sd->sd_nodes[i][0] == '\0')
5230 continue;
5231
5232 if (clnt_add_drv_sidenms(sd->sd_nodes[i],
5233 sd->sd_nodes[j], sp, sd, node_c, node_v,
5234 &xep))
5235 mdclrerror(&xep);
5236 }
5237 }
5238
5239 }
5240
5241 /* level 5 */
5242 if ((rb_level > 4) && (!(MD_MNSET_DESC(sd)))) {
5243 /* rollback the mediator record */
5244 for (i = 0; i < max_meds; i++) {
5245 if (sd->sd_med.n_lst[i].a_cnt == 0)
5246 continue;
5247
5248 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
5249 &rb_medr, &xep))
5250 mdclrerror(&xep);
5251 }
5252 }
5253
5254 /* level 3 */
5255 if (rb_level > 2) {
5256 md_set_record *sr;
5257 md_mnset_record *mnsr;
5258
5259 if (MD_MNSET_DESC(sd)) {
5260 nd = sd->sd_nodelist;
5261 /*
5262 * During OHA mode, don't issue RPCs to
5263 * non-alive nodes since there is no reason to
5264 * wait for RPC timeouts.
5265 */
5266 while (nd) {
5267 if ((oha == TRUE) &&
5268 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5269 nd = nd->nd_next;
5270 continue;
5271 }
5272 /* Record should be for a multi-node diskset */
5273 if (clnt_mngetset(nd->nd_nodename, sp->setname,
5274 MD_SET_BAD, &mnsr, &xep) == -1) {
5275 mdclrerror(&xep);
5276 nd = nd->nd_next;
5277 continue;
5278 }
5279
5280 has_set = 1;
5281
5282 nr = mnsr->sr_nodechain;
5283 while (nr) {
5284 if (nd->nd_nodeid == nr->nr_nodeid) {
5285 break;
5286 }
5287 nr = nr->nr_next;
5288 }
5289 if (nr == NULL)
5290 has_set = 0;
5291
5292 free_sr((struct md_set_record *)mnsr);
5293 if (has_set) {
5294 nd = nd->nd_next;
5295 continue;
5296 }
5297
5298 if (clnt_addhosts(nd->nd_nodename, sp, node_c,
5299 node_v, &xep) == -1)
5300 mdclrerror(&xep);
5301
5302 nd = nd->nd_next;
5303 }
5304 } else {
5305 for (i = 0; i < MD_MAXSIDES; i++) {
5306 /* Skip empty slots */
5307 if (sd->sd_nodes[i][0] == '\0')
5308 continue;
5309
5310 /* Record should be for a non-multi-node set */
5311 if (clnt_getset(sd->sd_nodes[i], sp->setname,
5312 MD_SET_BAD, &sr, &xep) == -1) {
5313 mdclrerror(&xep);
5314 continue;
5315 }
5316
5317 /*
5318 * Set record structure was allocated from RPC
5319 * routine getset so this structure is only of
5320 * size md_set_record even if the MN flag is
5321 * set. So, clear the flag so that the free
5322 * code doesn't attempt to free a structure
5323 * the size of md_mnset_record.
5324 */
5325 if (MD_MNSET_REC(sr)) {
5326 sr->sr_flags &= ~MD_SR_MN;
5327 free_sr(sr);
5328 continue;
5329 }
5330
5331 has_set = 1;
5332 for (j = 0; j < MD_MAXSIDES; j++) {
5333 /* Skip empty slots */
5334 if (sd->sd_nodes[j][0] == '\0')
5335 continue;
5336
5337 if (sr->sr_nodes[j][0] == '\0') {
5338 has_set = 0;
5339 break;
5340 }
5341 }
5342
5343 free_sr(sr);
5344 if (has_set)
5345 continue;
5346
5347 if (clnt_addhosts(sd->sd_nodes[i], sp, node_c,
5348 node_v, &xep) == -1)
5349 mdclrerror(&xep);
5350 }
5351 }
5352 max_genid++;
5353 }
5354
5355 /* level 1 */
5356 if (rb_level > 0) {
5357 max_genid++;
5358 /* Sets MD_SR_OK on given nodes. */
5359 resync_genid(sp, sd, max_genid, node_c, node_v);
5360
5361 /*
5362 * For MN diskset:
5363 * On each newly re-added node, set the node record for that
5364 * node to OK. Then set all node records for the newly added
5365 * nodes on all nodes to ok.
5366 *
5367 * By setting a node's own node record to ok first, even if
5368 * the node re-adding the hosts panics, the rest of the nodes
5369 * can determine the same node list during the choosing of the
5370 * master during reconfig. So, only nodes considered for
5371 * mastership are nodes that have both MD_MN_NODE_OK and
5372 * MD_SR_OK set on that node's rpc.metad. If all nodes have
5373 * MD_SR_OK set, but no node has its own MD_MN_NODE_OK set,
5374 * then the set will be removed during reconfig since a panic
5375 * occurred during the re-creation of the deletion of
5376 * the initial diskset.
5377 */
5378 if (MD_MNSET_DESC(sd)) {
5379 md_mnnode_desc *saved_nd_next;
5380 if (dd != NULL) {
5381 /*
5382 * Notify rpc.mdcommd on all nodes of a
5383 * nodelist change. Start by suspending
5384 * rpc.mdcommd (which drains it of all
5385 * messages), then change the nodelist
5386 * followed by a reinit and resume.
5387 */
5388 nd = sd->sd_nodelist;
5389 while (nd) {
5390 if (!(nd->nd_flags &
5391 MD_MN_NODE_ALIVE)) {
5392 nd = nd->nd_next;
5393 continue;
5394 }
5395 if (clnt_mdcommdctl(nd->nd_nodename,
5396 COMMDCTL_SUSPEND, sp,
5397 MD_MSG_CLASS0,
5398 MD_MSCF_NO_FLAGS, &xep)) {
5399 mde_perror(&xep,
5400 dgettext(TEXT_DOMAIN,
5401 "Unable to suspend "
5402 "rpc.mdcommd.\n"));
5403 mdclrerror(&xep);
5404 }
5405 suspendall_flag_rb = 1;
5406 nd = nd->nd_next;
5407 }
5408 }
5409 for (i = 0; i < node_c; i++) {
5410 /*
5411 * During OHA mode, don't issue RPCs to
5412 * non-alive nodes since there is no reason to
5413 * wait for RPC timeouts.
5414 */
5415 nd = sd->sd_nodelist;
5416 while (nd) {
5417 if (strcmp(nd->nd_nodename, node_v[i])
5418 == 0)
5419 break;
5420 nd = nd->nd_next;
5421 }
5422 /* Something wrong, finish this in next loop */
5423 if (nd == NULL)
5424 continue;
5425
5426 if ((oha == TRUE) &&
5427 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5428 continue;
5429 }
5430
5431 if (dd != NULL) {
5432 /* Set master on re-joining node. */
5433 if (clnt_mnsetmaster(node_v[i], sp,
5434 sd->sd_mn_master_nodenm,
5435 sd->sd_mn_master_nodeid, &xep)) {
5436 mdclrerror(&xep);
5437 }
5438
5439 /*
5440 * Re-join set to same state as
5441 * before - stale or non-stale.
5442 */
5443 if (clnt_joinset(node_v[i], sp,
5444 stale_flag, &xep)) {
5445 mdclrerror(&xep);
5446 }
5447 }
5448
5449 /* Only changing my local cache of node list */
5450 saved_nd_next = nd->nd_next;
5451 nd->nd_next = NULL;
5452
5453 /* Set record for host to ok on that host */
5454 if (clnt_upd_nr_flags(node_v[i], sp,
5455 nd, MD_NR_OK, NULL, &xep)) {
5456 mdclrerror(&xep);
5457 }
5458 nd->nd_next = saved_nd_next;
5459 }
5460
5461 /* Now set all node records on all nodes to be ok */
5462 nd = sd->sd_nodelist;
5463 while (nd) {
5464 /*
5465 * During OHA mode, don't issue RPCs to
5466 * non-alive nodes since there is no reason to
5467 * wait for RPC timeouts.
5468 */
5469 if ((oha == TRUE) &&
5470 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5471 nd = nd->nd_next;
5472 continue;
5473 }
5474 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5475 sd->sd_nodelist, MD_NR_OK, NULL, &xep)) {
5476 mdclrerror(&xep);
5477 }
5478 nd = nd->nd_next;
5479 }
5480 }
5481 }
5482
5483 /*
5484 * Notify rpc.mdcommd on all nodes of a nodelist change.
5485 * Send reinit command to mdcommd which forces it to get
5486 * fresh set description.
5487 */
5488 if (suspendall_flag_rb) {
5489 /* Send reinit */
5490 nd = sd->sd_nodelist;
5491 while (nd) {
5492 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5493 nd = nd->nd_next;
5494 continue;
5495 }
5496
5497 /* Class is ignored for REINIT */
5498 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
5499 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
5500 mde_perror(&xep, dgettext(TEXT_DOMAIN,
5501 "Unable to reinit rpc.mdcommd.\n"));
5502 mdclrerror(&xep);
5503 }
5504 nd = nd->nd_next;
5505 }
5506 }
5507
5508 /*
5509 * Unlock diskset by resuming messages across the diskset.
5510 * Just resume all classes so that resume is the same whether
5511 * just one class was locked or all classes were locked.
5512 */
5513 if ((suspend1_flag) || (suspendall_flag) || (suspendall_flag_rb)) {
5514 /* Send resume */
5515 nd = sd->sd_nodelist;
5516 while (nd) {
5517 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5518 nd = nd->nd_next;
5519 continue;
5520 }
5521 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
5522 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
5523 mde_perror(&xep, dgettext(TEXT_DOMAIN,
5524 "Unable to resume rpc.mdcommd.\n"));
5525 }
5526 nd = nd->nd_next;
5527 }
5528 meta_ping_mnset(sp->setno);
5529 }
5530
5531 /*
5532 * Start a resync thread on the re-added nodes
5533 * if set is not stale. Also start a thread to update the
5534 * abr state of all soft partitions
5535 */
5536 if (stale_flag != MNSET_IS_STALE) {
5537 for (i = 0; i < node_c; i++) {
5538 /*
5539 * During OHA mode, don't issue RPCs to
5540 * non-alive nodes since there is no reason to
5541 * wait for RPC timeouts.
5542 */
5543 nd = sd->sd_nodelist;
5544 while (nd) {
5545 if (strcmp(nd->nd_nodename, node_v[i])
5546 == 0)
5547 break;
5548 nd = nd->nd_next;
5549 }
5550 if (nd == NULL)
5551 continue;
5552
5553 if ((oha == TRUE) &&
5554 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5555 continue;
5556 }
5557
5558 if (dd != 0) {
5559 if (clnt_mn_mirror_resync_all(node_v[i],
5560 sp->setno, &xep)) {
5561 mde_perror(ep, dgettext(TEXT_DOMAIN,
5562 "Unable to start resync "
5563 "thread.\n"));
5564 }
5565 if (clnt_mn_sp_update_abr(node_v[i],
5566 sp->setno, &xep)) {
5567 mde_perror(ep, dgettext(TEXT_DOMAIN,
5568 "Unable to start sp update "
5569 "thread.\n"));
5570 }
5571 }
5572 }
5573 }
5574
5575 /* level 0 */
5576 cl_sk = cl_get_setkey(sp->setno, sp->setname);
5577 /* Don't test lock flag since guaranteed to be set if in rollback */
5578 if (MD_MNSET_DESC(sd)) {
5579 nd = sd->sd_nodelist;
5580 while (nd) {
5581 /*
5582 * During OHA mode, don't issue RPCs to
5583 * non-alive nodes since there is no reason to
5584 * wait for RPC timeouts.
5585 */
5586 if ((oha == TRUE) &&
5587 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5588 nd = nd->nd_next;
5589 continue;
5590 }
5591 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
5592 mdclrerror(&xep);
5593 nd = nd->nd_next;
5594 }
5595 } else {
5596 for (i = 0; i < MD_MAXSIDES; i++) {
5597 /* Skip empty slots */
5598 if (sd->sd_nodes[i][0] == '\0')
5599 continue;
5600
5601 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
5602 mdclrerror(&xep);
5603 }
5604 }
5605 cl_set_setkey(NULL);
5606
5607 /* release signals back to what they were on entry */
5608 if (procsigs(FALSE, &oldsigs, &xep) < 0)
5609 mdclrerror(&xep);
5610
5611 metafreereplicalist(rlp);
5612 if (node_id_list)
5613 Free(node_id_list);
5614
5615 metaflushsetname(sp);
5616
5617 if (!(MD_MNSET_DESC(sd))) {
5618 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
5619 }
5620
5621 return (rval);
5622 }
5623
5624 int
meta_set_auto_take(mdsetname_t * sp,int take_val,md_error_t * ep)5625 meta_set_auto_take(
5626 mdsetname_t *sp,
5627 int take_val,
5628 md_error_t *ep
5629 )
5630 {
5631 int i;
5632 md_set_desc *sd;
5633 int rval = 0;
5634 md_setkey_t *cl_sk;
5635 md_error_t xep = mdnullerror;
5636 char *hostname;
5637 md_drive_desc *dd;
5638
5639 if ((sd = metaget_setdesc(sp, ep)) == NULL)
5640 return (-1);
5641
5642 /* Make sure we own the set */
5643 if (meta_check_ownership(sp, ep) != 0)
5644 return (-1);
5645
5646 hostname = mynode();
5647
5648 /* Lock the set on our side */
5649 if (clnt_lock_set(hostname, sp, ep)) {
5650 rval = -1;
5651 goto out;
5652 }
5653
5654 if (take_val) {
5655 /* enable auto_take but only if it is not already set */
5656 if (! (sd->sd_flags & MD_SR_AUTO_TAKE)) {
5657 /* verify that we're the only host in the set */
5658 for (i = 0; i < MD_MAXSIDES; i++) {
5659 if (sd->sd_nodes[i] == NULL ||
5660 sd->sd_nodes[i][0] == '\0')
5661 continue;
5662
5663 if (strcmp(sd->sd_nodes[i], hostname) != 0) {
5664 (void) mddserror(ep, MDE_DS_SINGLEHOST,
5665 sp->setno, NULL, NULL, sp->setname);
5666 rval = -1;
5667 goto out;
5668 }
5669 }
5670
5671 if (clnt_enable_sr_flags(hostname, sp,
5672 MD_SR_AUTO_TAKE, ep))
5673 rval = -1;
5674
5675 /* Disable SCSI reservations */
5676 if (sd->sd_flags & MD_SR_MB_DEVID)
5677 dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
5678 PRINT_FAST, &xep);
5679 else
5680 dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
5681 &xep);
5682
5683 if (! mdisok(&xep))
5684 mdclrerror(&xep);
5685
5686 if (dd != NULL) {
5687 if (rel_own_bydd(sp, dd, TRUE, &xep))
5688 mdclrerror(&xep);
5689 }
5690 }
5691
5692 } else {
5693 /* disable auto_take, if set, or error */
5694 if (sd->sd_flags & MD_SR_AUTO_TAKE) {
5695 if (clnt_disable_sr_flags(hostname, sp,
5696 MD_SR_AUTO_TAKE, ep))
5697 rval = -1;
5698
5699 /* Enable SCSI reservations */
5700 if (sd->sd_flags & MD_SR_MB_DEVID)
5701 dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
5702 PRINT_FAST, &xep);
5703 else
5704 dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
5705 &xep);
5706
5707 if (! mdisok(&xep))
5708 mdclrerror(&xep);
5709
5710 if (dd != NULL) {
5711 mhd_mhiargs_t mhiargs = defmhiargs;
5712
5713 if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
5714 mdclrerror(&xep);
5715 }
5716 } else {
5717 (void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno,
5718 NULL, NULL, sp->setname);
5719 rval = -1;
5720 }
5721 }
5722
5723 out:
5724 cl_sk = cl_get_setkey(sp->setno, sp->setname);
5725 if (clnt_unlock_set(hostname, cl_sk, &xep)) {
5726 if (rval == 0)
5727 (void) mdstealerror(ep, &xep);
5728 rval = -1;
5729 }
5730 cl_set_setkey(NULL);
5731
5732 return (rval);
5733 }
5734