1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Just in case we're not in a build environment, make sure that
28 * TEXT_DOMAIN gets set to something.
29 */
30 #if !defined(TEXT_DOMAIN)
31 #define TEXT_DOMAIN "SYS_TEST"
32 #endif
33
34 /*
35 * Metadevice diskset interfaces
36 */
37
38 #include "meta_set_prv.h"
39 #include <meta.h>
40 #include <metad.h>
41 #include <mdmn_changelog.h>
42 #include <sys/lvm/md_crc.h>
43 #include <sys/utsname.h>
44 #include <sdssc.h>
45
46 #include <sys/sysevent/eventdefs.h>
47 #include <sys/sysevent/svm.h>
48 extern char *blkname(char *);
49
50 static md_drive_desc *
dr2drivedesc(mdsetname_t * sp,side_t sideno,int flags,md_error_t * ep)51 dr2drivedesc(
52 mdsetname_t *sp,
53 side_t sideno,
54 int flags,
55 md_error_t *ep
56 )
57 {
58 md_set_record *sr;
59 md_drive_record *dr;
60 mddrivename_t *dnp;
61 md_drive_desc *dd_head = NULL;
62 md_set_desc *sd;
63
64 if (flags & MD_BYPASS_DAEMON) {
65 if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
66 return (NULL);
67 sd = metaget_setdesc(sp, ep);
68 sideno = getnodeside(mynode(), sd);
69 sp = metafakesetname(sp->setno, sr->sr_setname);
70 } else {
71 if ((sr = getsetbyname(sp->setname, ep)) == NULL)
72 return (NULL);
73 }
74
75 assert(sideno != MD_SIDEWILD);
76
77 /*
78 * WARNING:
79 * The act of getting the dnp from the namespace means that we
80 * will get the devid of the disk as recorded in the namespace.
81 * This devid has the potential to be stale if the disk is being
82 * replaced via a rebind, this means that any code that relies
83 * on any of the dnp information should take the appropriate action
84 * to preserve that information. For example in the rebind code the
85 * devid of the new disk is saved off and then copied back in once
86 * the code that has called this function has completed.
87 */
88 for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
89 if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
90 flags, ep)) == NULL) {
91 if (!(flags & MD_BYPASS_DAEMON))
92 free_sr(sr);
93 metafreedrivedesc(&dd_head);
94 return (NULL);
95 }
96
97 (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
98 dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
99 }
100
101 if (!(flags & MD_BYPASS_DAEMON)) {
102 free_sr(sr);
103 }
104 return (dd_head);
105 }
106
107 static int
get_sidenmlist(mdsetname_t * sp,mddrivename_t * dnp,md_error_t * ep)108 get_sidenmlist(
109 mdsetname_t *sp,
110 mddrivename_t *dnp,
111 md_error_t *ep
112 )
113 {
114 md_set_desc *sd;
115 mdsidenames_t *sn, **sn_next;
116 int i;
117
118 if ((sd = metaget_setdesc(sp, ep)) == NULL)
119 return (-1);
120
121 metaflushsidenames(dnp);
122 sn_next = &dnp->side_names;
123 if (MD_MNSET_DESC(sd)) {
124 /*
125 * Only get sidenames for this node since
126 * that is the only side information stored in
127 * the local mddb for a multi-node diskset.
128 */
129 if (sd->sd_mn_mynode) {
130 sn = Zalloc(sizeof (*sn));
131 sn->sideno = sd->sd_mn_mynode->nd_nodeid;
132 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
133 sn->sideno, dnp->side_names_key, &sn->dname,
134 &sn->mnum, NULL, ep)) == NULL) {
135 if (sn->dname != NULL)
136 Free(sn->dname);
137 Free(sn);
138 return (-1);
139 }
140
141 /* Add to the end of the linked list */
142 assert(*sn_next == NULL);
143 *sn_next = sn;
144 sn_next = &sn->next;
145 }
146 } else {
147 for (i = 0; i < MD_MAXSIDES; i++) {
148 /* Skip empty slots */
149 if (sd->sd_nodes[i][0] == '\0')
150 continue;
151
152 sn = Zalloc(sizeof (*sn));
153 sn->sideno = i;
154 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
155 i+SKEW, dnp->side_names_key, &sn->dname,
156 &sn->mnum, NULL, ep)) == NULL) {
157 /*
158 * It is possible that during the add of a
159 * host to have a 'missing' side as the side
160 * for this disk will be added later. So ignore
161 * the error. The 'missing' side will be added
162 * once the addhosts process has completed.
163 */
164 if (mdissyserror(ep, ENOENT)) {
165 mdclrerror(ep);
166 Free(sn);
167 continue;
168 }
169
170 if (sn->dname != NULL)
171 Free(sn->dname);
172 Free(sn);
173 return (-1);
174 }
175
176 /* Add to the end of the linked list */
177 assert(*sn_next == NULL);
178 *sn_next = sn;
179 sn_next = &sn->next;
180 }
181 }
182
183 return (0);
184 }
185
186 static md_drive_desc *
rl_to_dd(mdsetname_t * sp,md_replicalist_t * rlp,md_error_t * ep)187 rl_to_dd(
188 mdsetname_t *sp,
189 md_replicalist_t *rlp,
190 md_error_t *ep
191 )
192 {
193 md_replicalist_t *rl;
194 md_replica_t *r;
195 md_drive_desc *dd = NULL;
196 md_drive_desc *d;
197 int found;
198 md_set_desc *sd;
199 daddr_t nblks = 0;
200
201 if ((sd = metaget_setdesc(sp, ep)) == NULL)
202 return (NULL);
203
204 /* find the smallest existing replica */
205 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
206 r = rl->rl_repp;
207 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
208 }
209
210 if (nblks <= 0)
211 nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
212
213 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
214 r = rl->rl_repp;
215
216 found = 0;
217 for (d = dd; d != NULL; d = d->dd_next) {
218 if (strcmp(r->r_namep->drivenamep->cname,
219 d->dd_dnp->cname) == 0) {
220 found = 1;
221 dd->dd_dbcnt++;
222 break;
223 }
224 }
225
226 if (! found)
227 (void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
228 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
229 }
230
231 return (dd);
232 }
233
234 /*
235 * Exported Entry Points
236 */
237
238 set_t
get_max_sets(md_error_t * ep)239 get_max_sets(md_error_t *ep)
240 {
241
242 static set_t max_sets = 0;
243
244 if (max_sets == 0)
245 if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
246 return (0);
247
248 return (max_sets);
249 }
250
251 int
get_max_meds(md_error_t * ep)252 get_max_meds(md_error_t *ep)
253 {
254 static int max_meds = 0;
255
256 if (max_meds == 0)
257 if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
258 return (0);
259
260 return (max_meds);
261 }
262
263 side_t
getmyside(mdsetname_t * sp,md_error_t * ep)264 getmyside(mdsetname_t *sp, md_error_t *ep)
265 {
266 md_set_desc *sd;
267 char *node = NULL;
268 side_t sideno;
269
270 if (sp->setno == 0)
271 return (0);
272
273 if ((sd = metaget_setdesc(sp, ep)) == NULL)
274 return (MD_SIDEWILD);
275
276 node = mynode();
277
278 assert(node != NULL);
279
280 sideno = getnodeside(node, sd);
281
282 if (sideno != MD_SIDEWILD)
283 return (sideno);
284
285 return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
286 }
287
288 /*
289 * get set info from name
290 */
291 md_set_record *
getsetbyname(char * setname,md_error_t * ep)292 getsetbyname(char *setname, md_error_t *ep)
293 {
294 md_set_record *sr = NULL;
295 md_mnset_record *mnsr = NULL;
296 char *p;
297 size_t len;
298
299 /* get set info from daemon */
300 if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
301 return (NULL);
302 if (sr != NULL) {
303 /*
304 * Returned record could be for a multi-node set or a
305 * non-multi-node set.
306 */
307 if (MD_MNSET_REC(sr)) {
308 /*
309 * Record is for a multi-node set. Reissue call
310 * to get mnset information. Need to free
311 * record as if a non-multi-node set record since
312 * that is what clnt_getset gave us. If in
313 * the daemon, don't free since this is a pointer
314 * into the setrecords array.
315 */
316 if (! md_in_daemon) {
317 sr->sr_flags &= ~MD_SR_MN;
318 free_sr(sr);
319 }
320 if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
321 ep) == -1)
322 return (NULL);
323 if (mnsr != NULL)
324 return ((struct md_set_record *)mnsr);
325 } else {
326 return (sr);
327 }
328 }
329
330 /* no such set */
331 len = strlen(setname) + 30;
332 p = Malloc(len);
333 (void) snprintf(p, len, "setname \"%s\"", setname);
334 (void) mderror(ep, MDE_NO_SET, p);
335 Free(p);
336 return (NULL);
337 }
338
339 /*
340 * get set info from number
341 */
342 md_set_record *
getsetbynum(set_t setno,md_error_t * ep)343 getsetbynum(set_t setno, md_error_t *ep)
344 {
345 md_set_record *sr;
346 md_mnset_record *mnsr = NULL;
347 char buf[100];
348
349 if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
350 return (NULL);
351
352 if (sr != NULL) {
353 /*
354 * Record is for a multi-node set. Reissue call
355 * to get mnset information. Need to free
356 * record as if a non-multi-node set record since
357 * that is what clnt_getset gave us. If in
358 * the daemon, don't free since this is a pointer
359 * into the setrecords array.
360 */
361 if (MD_MNSET_REC(sr)) {
362 /*
363 * Record is for a multi-node set. Reissue call
364 * to get mnset information.
365 */
366 if (! md_in_daemon) {
367 sr->sr_flags &= ~MD_SR_MN;
368 free_sr(sr);
369 }
370 if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
371 ep) == -1)
372 return (NULL);
373 if (mnsr != NULL)
374 return ((struct md_set_record *)mnsr);
375 } else {
376 return (sr);
377 }
378 }
379
380 (void) sprintf(buf, "setno %u", setno);
381 (void) mderror(ep, MDE_NO_SET, buf);
382 return (NULL);
383 }
384
385 int
meta_check_drive_inuse(mdsetname_t * sp,mddrivename_t * dnp,int check_db,md_error_t * ep)386 meta_check_drive_inuse(
387 mdsetname_t *sp,
388 mddrivename_t *dnp,
389 int check_db,
390 md_error_t *ep
391 )
392 {
393 mdnamelist_t *nlp = NULL;
394 mdnamelist_t *p;
395 int rval = 0;
396
397 /* get all underlying partitions */
398 if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
399 return (-1);
400
401 /* search for drive */
402 for (p = nlp; (p != NULL); p = p->next) {
403 mdname_t *np = p->namep;
404
405 if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
406 rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
407 NULL, dnp->cname, sp->setname));
408 break;
409 }
410 }
411
412 /* cleanup, return success */
413 metafreenamelist(nlp);
414 return (rval);
415 }
416
417 /*
418 * simple check for ownership
419 */
420 int
meta_check_ownership(mdsetname_t * sp,md_error_t * ep)421 meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
422 {
423 int ownset;
424 md_set_desc *sd;
425 md_drive_desc *dd;
426 md_replicalist_t *rlp = NULL;
427 md_error_t xep = mdnullerror;
428
429 if (metaislocalset(sp))
430 return (0);
431
432 ownset = own_set(sp, NULL, TRUE, ep);
433 if (! mdisok(ep))
434 return (-1);
435
436 if ((sd = metaget_setdesc(sp, ep)) == NULL)
437 return (-1);
438
439 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
440 if (! mdisok(ep))
441 return (-1);
442
443 /* If we have no drive descriptors, check for no ownership */
444 if (dd == NULL) {
445 if (ownset == MD_SETOWNER_NONE)
446 return (0);
447
448 /* If ownership somehow has come to exist, we must clean up */
449
450 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
451 &xep) < 0)
452 mdclrerror(&xep);
453
454 if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
455 if (! mdisok(&xep))
456 mdclrerror(&xep);
457
458 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
459 if (rel_own_bydd(sp, dd, TRUE, &xep))
460 mdclrerror(&xep);
461 }
462
463 if (halt_set(sp, &xep))
464 mdclrerror(&xep);
465
466 metafreereplicalist(rlp);
467
468 metafreedrivedesc(&dd);
469
470 return (0);
471 }
472
473 metafreedrivedesc(&sd->sd_drvs);
474
475 if (ownset == MD_SETOWNER_YES)
476 return (0);
477
478 return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
479 sp->setname));
480 }
481
482 /*
483 * simple check for ownership
484 */
485 int
meta_check_ownership_on_host(mdsetname_t * sp,char * hostname,md_error_t * ep)486 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
487 {
488 md_set_desc *sd;
489 md_drive_desc *dd;
490 int bool;
491
492 if (metaislocalset(sp))
493 return (0);
494
495 if ((sd = metaget_setdesc(sp, ep)) == NULL)
496 return (-1);
497
498 if (getnodeside(hostname, sd) == MD_SIDEWILD)
499 return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
500 hostname, NULL, sp->setname));
501
502 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
503 if (! mdisok(ep))
504 return (-1);
505
506 if (clnt_ownset(hostname, sp, &bool, ep) == -1)
507 return (-1);
508
509 if (dd == NULL)
510 return (0);
511
512 metafreedrivedesc(&sd->sd_drvs);
513
514 if (bool == TRUE)
515 return (0);
516
517 return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
518 sp->setname));
519 }
520
521 /*
522 * Function that determines if a node is in the multinode diskset
523 * membership list. Calling node passes in node to be checked and
524 * the nodelist as returned from meta_read_nodelist. This routine
525 * anticipates being called many times using the same diskset membership
526 * list which is why the alloc and free of the diskset membership list
527 * is left to the calling routine.
528 * Returns:
529 * 1 - if a member
530 * 0 - not a member
531 */
532 int
meta_is_member(char * node_name,md_mn_nodeid_t node_id,mndiskset_membershiplist_t * nl)533 meta_is_member(
534 char *node_name,
535 md_mn_nodeid_t node_id,
536 mndiskset_membershiplist_t *nl
537 )
538 {
539 mndiskset_membershiplist_t *nl2;
540 int flag_check_name;
541
542 if (node_id != 0)
543 flag_check_name = 0;
544 else if (node_name != NULL)
545 flag_check_name = 1;
546 else
547 return (0);
548
549 nl2 = nl;
550 while (nl2) {
551 if (flag_check_name) {
552 /* Compare given name against name in member list */
553 if (strcmp(nl2->msl_node_name, node_name) == 0)
554 break;
555 } else {
556 /* Compare given nodeid against nodeid in member list */
557 if (nl2->msl_node_id == node_id)
558 break;
559 }
560 nl2 = nl2->next;
561 }
562 /* No match found in member list */
563 if (nl2 == NULL) {
564 return (0);
565 }
566 /* Return 1 if node is in member list */
567 return (1);
568 }
569
570 /*
571 * meta_getnext_devinfo should go to the host that
572 * has the device, to return the device name, driver name, minor num.
573 * We can take the big cheat for now, since it is a requirement
574 * that the device names and device numbers are the same, and
575 * just get the info locally.
576 *
577 * This routine is very similar to meta_getnextside_devinfo except
578 * that the specific side to be used is being passed in.
579 *
580 * Exit status:
581 * 0 - No more side info to return
582 * 1 - More side info's to return
583 * -1 - An error has been detected
584 */
585 /*ARGSUSED*/
586 int
meta_getside_devinfo(mdsetname_t * sp,char * bname,side_t sideno,char ** ret_bname,char ** ret_dname,minor_t * ret_mnum,md_error_t * ep)587 meta_getside_devinfo(
588 mdsetname_t *sp, /* for this set */
589 char *bname, /* local block name (myside) */
590 side_t sideno, /* sideno */
591 char **ret_bname, /* block device name of returned side */
592 char **ret_dname, /* driver name of returned side */
593 minor_t *ret_mnum, /* minor number of returned side */
594 md_error_t *ep
595 )
596 {
597 mdname_t *np;
598
599 if (ret_bname != NULL)
600 *ret_bname = NULL;
601 if (ret_dname != NULL)
602 *ret_dname = NULL;
603 if (ret_mnum != NULL)
604 *ret_mnum = NODEV32;
605
606
607 if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL)
608 return (-1);
609
610 /*
611 * NOTE (future) - There will be more work here once devids are integrated
612 * into disksets. Then the side should be used to find the correct
613 * host and the b/d names should be gotten from that host.
614 */
615
616 /*
617 * Return the side info.
618 */
619 if (ret_bname != NULL)
620 *ret_bname = Strdup(np->bname);
621
622 if (ret_dname != NULL) {
623 mdcinfo_t *cinfo;
624
625 if ((cinfo = metagetcinfo(np, ep)) == NULL)
626 return (-1);
627
628 *ret_dname = Strdup(cinfo->dname);
629 }
630
631 if (ret_mnum != NULL)
632 *ret_mnum = meta_getminor(np->dev);
633
634 return (1);
635 }
636
637 /*
638 * Get the information on the device from the remote node using the devid
639 * of the disk.
640 *
641 * Exit status:
642 * 0 - No more side info to return
643 * 1 - More side info's to return
644 * -1 - An error has been detected
645 */
646 int
meta_getnextside_devinfo(mdsetname_t * sp,char * bname,side_t * sideno,char ** ret_bname,char ** ret_dname,minor_t * ret_mnum,md_error_t * ep)647 meta_getnextside_devinfo(
648 mdsetname_t *sp, /* for this set */
649 char *bname, /* local block name (myside) */
650 side_t *sideno, /* previous sideno & returned sideno */
651 char **ret_bname, /* block device name of returned side */
652 char **ret_dname, /* driver name of returned side */
653 minor_t *ret_mnum, /* minor number of returned side */
654 md_error_t *ep
655 )
656 {
657 md_set_desc *sd;
658 int i;
659 mdname_t *np;
660 mddrivename_t *dnp;
661 char *devidstr = NULL;
662 int devidstrlen;
663 md_dev64_t retdev = NODEV64;
664 char *ret_devname = NULL;
665 char *ret_blkdevname = NULL;
666 char *ret_driver = NULL;
667 char *nodename;
668 int fd;
669 int ret = -1;
670 char *minor_name = NULL;
671 md_mnnode_desc *nd;
672
673
674 if (ret_bname != NULL)
675 *ret_bname = NULL;
676 if (ret_dname != NULL)
677 *ret_dname = NULL;
678 if (ret_mnum != NULL)
679 *ret_mnum = NODEV32;
680
681 if (metaislocalset(sp)) {
682 /* no more sides - we are done */
683 if (*sideno != MD_SIDEWILD)
684 return (0);
685
686 /* First time through - set up return sideno */
687 *sideno = 0;
688 } else {
689
690 /*
691 * Find the next sideno, starting after the one given.
692 */
693 if ((sd = metaget_setdesc(sp, ep)) == NULL)
694 return (-1);
695
696 if (MD_MNSET_DESC(sd)) {
697 nd = sd->sd_nodelist;
698 if ((*sideno == MD_SIDEWILD) &&
699 (nd != (struct md_mnnode_desc *)NULL)) {
700 *sideno = nd->nd_nodeid;
701 } else {
702 while (nd) {
703 /*
704 * Found given sideno, now find
705 * next sideno, if there is one.
706 */
707 if ((*sideno == nd->nd_nodeid) &&
708 (nd->nd_next !=
709 (struct md_mnnode_desc *)NULL)) {
710 *sideno =
711 nd->nd_next->nd_nodeid;
712 break;
713 }
714 nd = nd->nd_next;
715 }
716 if (nd == NULL) {
717 return (0);
718 }
719 }
720 if (*sideno == MD_SIDEWILD)
721 return (0);
722 } else {
723 for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
724 /* Find next full slot */
725 if (sd->sd_nodes[i][0] != '\0')
726 break;
727
728 /* No more sides - we are done */
729 if (i == MD_MAXSIDES)
730 return (0);
731
732 /* Set up the return sideno */
733 *sideno = i;
734 nodename = (char *)sd->sd_nodes[i];
735 }
736 }
737
738 /*
739 * Need to pass the node the devid of the disk and get it to
740 * send back the details of the disk from that side.
741 */
742 if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL)
743 return (-1);
744
745 dnp = np->drivenamep;
746
747 /*
748 * By default, set up the parameters so that they are copied out.
749 */
750 if (ret_bname != NULL)
751 *ret_bname = Strdup(np->bname);
752
753 if (ret_dname != NULL) {
754 mdcinfo_t *cinfo;
755
756 if ((cinfo = metagetcinfo(np, ep)) == NULL)
757 return (-1);
758
759 *ret_dname = Strdup(cinfo->dname);
760 }
761
762 if (ret_mnum != NULL)
763 *ret_mnum = meta_getminor(np->dev);
764
765 /*
766 * Try some optimization. If this is the local set or the device
767 * is a metadevice then just copy the information. If the device
768 * does not have a devid (due to not having a minor name) then
769 * fall back to the pre-devid behaviour of copying the information
770 * on the device: this is okay because the sanity checks before this
771 * call would have found any issues with the device. If it's a
772 * multi-node diskset also just return ie. copy.
773 */
774 if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
775 (MD_MNSET_DESC(sd)))
776 return (1);
777
778 if (np->minor_name == (char *)NULL) {
779 /*
780 * Have to get the minor name then. The slice should exist
781 * on the disk because it will have already been repartitioned
782 * up prior to getting to this point.
783 */
784 if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
785 (void) mdsyserror(ep, errno, np->bname);
786 return (-1);
787 }
788 (void) devid_get_minor_name(fd, &minor_name);
789 np->minor_name = Strdup(minor_name);
790 devid_str_free(minor_name);
791 (void) close(fd);
792 }
793
794 /* allocate extra space for "/" and NULL hence +2 */
795 devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
796 devidstr = (char *)Malloc(devidstrlen);
797
798 /*
799 * As a minor name is supplied then the ret_devname will be
800 * appropriate to that minor_name and in this case it will be
801 * a block device ie /dev/dsk.
802 */
803 (void) snprintf(devidstr, devidstrlen,
804 "%s/%s", dnp->devid, np->minor_name);
805
806 ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
807 np->bname, &ret_devname, &ret_driver, ep);
808
809 Free(devidstr);
810
811 /*
812 * If the other side is not running device id in disksets,
813 * 'ret' is set to ENOTSUP in which case we fallback to
814 * the existing behaviour
815 */
816 if (ret == ENOTSUP)
817 return (1);
818 else if (ret == -1)
819 return (-1);
820
821 /*
822 * ret_devname comes from the rpc call and is a
823 * raw device name. We need to make this into a
824 * block device via blkname for further processing.
825 * Unfortunately, when our device id isn't found in
826 * the system, the rpc call will return a " " in
827 * ret_devname in which case we need to fill that in
828 * as ret_blkname because blkname of " " returns NULL.
829 */
830 if (ret_bname != NULL && ret_devname != NULL) {
831 ret_blkdevname = blkname(ret_devname);
832 if (ret_blkdevname == NULL)
833 *ret_bname = Strdup(ret_devname);
834 else
835 *ret_bname = Strdup(ret_blkdevname);
836 }
837
838 if (ret_dname != NULL && ret_driver != NULL)
839 *ret_dname = Strdup(ret_driver);
840
841 if (ret_mnum != NULL)
842 *ret_mnum = meta_getminor(retdev);
843
844 return (1);
845 }
846
847 int
meta_is_drive_in_anyset(mddrivename_t * dnp,mdsetname_t ** spp,int bypass_daemon,md_error_t * ep)848 meta_is_drive_in_anyset(
849 mddrivename_t *dnp,
850 mdsetname_t **spp,
851 int bypass_daemon,
852 md_error_t *ep
853 )
854 {
855 set_t setno;
856 mdsetname_t *this_sp;
857 int is_it;
858 set_t max_sets;
859
860 if ((max_sets = get_max_sets(ep)) == 0)
861 return (-1);
862
863 assert(spp != NULL);
864 *spp = NULL;
865
866 for (setno = 1; setno < max_sets; setno++) {
867 if (!bypass_daemon) {
868 if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
869 if (mdismddberror(ep, MDE_DB_NODB)) {
870 mdclrerror(ep);
871 return (0);
872 }
873 if (mdiserror(ep, MDE_NO_SET)) {
874 mdclrerror(ep);
875 continue;
876 }
877 return (-1);
878 }
879 } else
880 this_sp = metafakesetname(setno, NULL);
881
882 if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
883 bypass_daemon, ep)) == -1) {
884 if (mdiserror(ep, MDE_NO_SET)) {
885 mdclrerror(ep);
886 continue;
887 }
888 return (-1);
889 }
890 if (is_it) {
891 *spp = this_sp;
892 return (0);
893 }
894 }
895 return (0);
896 }
897
898 int
meta_is_drive_in_thisset(mdsetname_t * sp,mddrivename_t * dnp,int bypass_daemon,md_error_t * ep)899 meta_is_drive_in_thisset(
900 mdsetname_t *sp,
901 mddrivename_t *dnp,
902 int bypass_daemon,
903 md_error_t *ep
904 )
905 {
906 md_drive_desc *dd, *p;
907
908 if (bypass_daemon)
909 dd = dr2drivedesc(sp, MD_SIDEWILD,
910 (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
911 else
912 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
913
914 if (dd == NULL) {
915 if (! mdisok(ep))
916 return (-1);
917 return (0);
918 }
919
920
921 for (p = dd; p != NULL; p = p->dd_next)
922 if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
923 return (1);
924 return (0);
925 }
926
927 /*
928 * Check to see if devid is in use in any diskset.
929 * This is used in the case when a partial diskset is being imported
930 * to make sure that the unvailable drive isn't already in use in an
931 * already imported partial diskset. Can't check on the cname since the
932 * unavailable disk's cname is from the previous system and may collide
933 * with a cname on this system.
934 * Return values:
935 * 1: devid has been found in a diskset
936 * 0: devid not found in any diskset
937 */
938 int
meta_is_devid_in_anyset(void * devid,mdsetname_t ** spp,md_error_t * ep)939 meta_is_devid_in_anyset(
940 void *devid,
941 mdsetname_t **spp,
942 md_error_t *ep
943 )
944 {
945 set_t setno;
946 mdsetname_t *this_sp;
947 int is_it;
948 set_t max_sets;
949
950 if ((max_sets = get_max_sets(ep)) == 0)
951 return (-1);
952
953 assert(spp != NULL);
954 *spp = NULL;
955
956 for (setno = 1; setno < max_sets; setno++) {
957 if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
958 if (mdismddberror(ep, MDE_DB_NODB)) {
959 mdclrerror(ep);
960 return (0);
961 }
962 if (mdiserror(ep, MDE_NO_SET)) {
963 mdclrerror(ep);
964 continue;
965 }
966 return (-1);
967 }
968
969 if ((is_it = meta_is_devid_in_thisset(this_sp,
970 devid, ep)) == -1) {
971 if (mdiserror(ep, MDE_NO_SET)) {
972 mdclrerror(ep);
973 continue;
974 }
975 return (-1);
976 }
977 if (is_it) {
978 *spp = this_sp;
979 return (0);
980 }
981 }
982 return (0);
983 }
984
985 int
meta_is_devid_in_thisset(mdsetname_t * sp,void * devid,md_error_t * ep)986 meta_is_devid_in_thisset(
987 mdsetname_t *sp,
988 void *devid,
989 md_error_t *ep
990 )
991 {
992 md_drive_desc *dd, *p;
993 ddi_devid_t dd_devid;
994
995 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
996 if (dd == NULL) {
997 if (! mdisok(ep))
998 return (-1);
999 return (0);
1000 }
1001
1002 for (p = dd; p != NULL; p = p->dd_next) {
1003 if (p->dd_dnp->devid == NULL)
1004 continue;
1005 (void) devid_str_decode(p->dd_dnp->devid,
1006 &dd_devid, NULL);
1007 if (dd_devid == NULL)
1008 continue;
1009 if (devid_compare(devid, dd_devid) == 0) {
1010 devid_free(dd_devid);
1011 return (1);
1012 }
1013 devid_free(dd_devid);
1014 }
1015 return (0);
1016 }
1017
1018 int
meta_set_balance(mdsetname_t * sp,md_error_t * ep)1019 meta_set_balance(
1020 mdsetname_t *sp,
1021 md_error_t *ep
1022 )
1023 {
1024 md_set_desc *sd;
1025 md_drive_desc *dd, *curdd;
1026 daddr_t dbsize;
1027 daddr_t nblks;
1028 int i;
1029 int rval = 0;
1030 sigset_t oldsigs;
1031 md_setkey_t *cl_sk;
1032 md_error_t xep = mdnullerror;
1033 md_mnnode_desc *nd;
1034 int suspend1_flag = 0;
1035
1036 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1037 return (-1);
1038
1039 dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
1040
1041 /* Make sure we own the set */
1042 if (meta_check_ownership(sp, ep) != 0)
1043 return (-1);
1044
1045 /* END CHECK CODE */
1046
1047 /*
1048 * Get drive descriptors for the drives that are currently in the set.
1049 */
1050 curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
1051
1052 if (! mdisok(ep))
1053 return (-1);
1054
1055 /* Find the minimum replica size in use is or use the default */
1056 if ((nblks = meta_db_minreplica(sp, ep)) < 0)
1057 mdclrerror(ep);
1058 else
1059 dbsize = nblks; /* adjust replica size */
1060
1061 /* Make sure we are blocking all signals */
1062 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1063 mdclrerror(&xep);
1064
1065 /*
1066 * Lock the set on current set members.
1067 * For MN diskset lock_set and SUSPEND are used to protect against
1068 * other meta* commands running on the other nodes.
1069 */
1070 if (MD_MNSET_DESC(sd)) {
1071 nd = sd->sd_nodelist;
1072 while (nd) {
1073 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1074 nd = nd->nd_next;
1075 continue;
1076 }
1077 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1078 rval = -1;
1079 goto out;
1080 }
1081 nd = nd->nd_next;
1082 }
1083 /*
1084 * Lock out other meta* commands by suspending
1085 * class 1 messages across the diskset.
1086 */
1087 nd = sd->sd_nodelist;
1088 while (nd) {
1089 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1090 nd = nd->nd_next;
1091 continue;
1092 }
1093 if (clnt_mdcommdctl(nd->nd_nodename,
1094 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1095 MD_MSCF_NO_FLAGS, ep)) {
1096 rval = -1;
1097 goto out;
1098 }
1099 suspend1_flag = 1;
1100 nd = nd->nd_next;
1101 }
1102 } else {
1103 for (i = 0; i < MD_MAXSIDES; i++) {
1104 /* Skip empty slots */
1105 if (sd->sd_nodes[i][0] == '\0') continue;
1106
1107 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1108 rval = -1;
1109 goto out;
1110 }
1111 }
1112 }
1113
1114 /* We are not adding or deleting any drives, just balancing */
1115 dd = NULL;
1116
1117 /*
1118 * Balance the DB's according to the list of existing drives and the
1119 * list of added drives.
1120 */
1121 if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
1122 goto out;
1123
1124 out:
1125 /*
1126 * Unlock diskset by resuming class 1 messages across the diskset.
1127 * Just resume all classes so that resume is the same whether
1128 * just one class was locked or all classes were locked.
1129 */
1130 if (suspend1_flag) {
1131 nd = sd->sd_nodelist;
1132 while (nd) {
1133 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1134 nd = nd->nd_next;
1135 continue;
1136 }
1137 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1138 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1139 /*
1140 * We are here because we failed to resume
1141 * rpc.mdcommd. However we potentially have
1142 * an error from the previous call
1143 * (meta_db_balance). If the previous call
1144 * did fail, we capture that error and
1145 * generate a perror withthe string,
1146 * "Unable to resume...".
1147 * Setting rval to -1 ensures that in the
1148 * next iteration of the loop, ep is not
1149 * clobbered.
1150 */
1151 if (rval == 0)
1152 (void) mdstealerror(ep, &xep);
1153 else
1154 mdclrerror(&xep);
1155 rval = -1;
1156 mde_perror(ep, dgettext(TEXT_DOMAIN,
1157 "Unable to resume rpc.mdcommd."));
1158 }
1159 nd = nd->nd_next;
1160 }
1161 }
1162
1163 /* Unlock the set */
1164 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1165 if (MD_MNSET_DESC(sd)) {
1166 nd = sd->sd_nodelist;
1167 while (nd) {
1168 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1169 nd = nd->nd_next;
1170 continue;
1171 }
1172 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1173 if (rval == 0)
1174 (void) mdstealerror(ep, &xep);
1175 else
1176 mdclrerror(&xep);
1177 rval = -1;
1178 }
1179 nd = nd->nd_next;
1180 }
1181 } else {
1182 for (i = 0; i < MD_MAXSIDES; i++) {
1183 /* Skip empty slots */
1184 if (sd->sd_nodes[i][0] == '\0')
1185 continue;
1186
1187 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1188 if (rval == 0)
1189 (void) mdstealerror(ep, &xep);
1190 rval = -1;
1191 }
1192 }
1193 }
1194
1195 /* release signals back to what they were on entry */
1196 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1197 mdclrerror(&xep);
1198
1199 cl_set_setkey(NULL);
1200
1201 metaflushsetname(sp);
1202
1203 return (rval);
1204 }
1205
/*
 * meta_set_destroy
 *	Remove a traditional (non multi-node) diskset as seen from the
 *	local node, without coordinating with the other hosts in the set.
 *
 * Parameters:
 *	sp		set to destroy
 *	lock_set	if TRUE, the set is locked on the local node
 *			(clnt_lock_set) before the work starts and
 *			unlocked again on the way out
 *	ep		error return
 *
 * Outline:
 *	- all signals are blocked for the duration (procsigs)
 *	- a multi-node set is rejected with -1
 *	- num_users counts the non-empty node slots for which nodehasset()
 *	  did not return a negative value; replicas and mediator hosts
 *	  are only removed when that count is exactly 1
 *	- unless META_DESTROY_SET_OK is present in the environment, a drive
 *	  that meta_check_drive_inuse() rejects aborts the destroy
 *	- drives are deleted on the local node, ownership is released
 *	  drive by drive and (last user only) the replicas are detached;
 *	  errors from these steps are cleared and ignored
 *	- the set is halted, the mediator record is rebuilt and pushed to
 *	  the mediator hosts, and the set is deleted on the local node
 *
 * Returns 0 on success and -1 on failure.
 *
 * NOTE(review): the "out" path calls clnt_unlock_set() whenever lock_set
 * is TRUE, including for failures that happen before clnt_lock_set() was
 * reached - confirm this is harmless.
 * NOTE(review): a failing sdssc_delete_begin() (non auto-take set) jumps
 * to "out" with rval still 0 and ep cleared - confirm that reporting
 * success there is intended.
 * NOTE(review): the multi-node rejection returns -1 without this routine
 * placing anything in ep.
 */
int
meta_set_destroy(
	mdsetname_t	*sp,
	int		lock_set,
	md_error_t	*ep
)
{
	int		i;
	med_rec_t	medr;
	md_set_desc	*sd;
	md_drive_desc	*dd, *p, *p1;
	mddrivename_t	*dnp;
	mdname_t	*np;
	mdnamelist_t	*nlp = NULL;
	int		num_users = 0;
	int		has_set;
	side_t		mysideno;
	sigset_t	oldsigs;
	md_error_t	xep = mdnullerror;
	md_setkey_t	*cl_sk;
	int		rval = 0;
	int		delete_end = 1;

	/* Make sure we are blocking all signals */
	if (procsigs(TRUE, &oldsigs, ep) < 0)
		return (-1);

	/*
	 * No set descriptor: only treated as a failure when an error was
	 * actually recorded in ep.
	 */
	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
		if (! mdisok(ep))
			rval = -1;
		goto out;
	}

	/*
	 * meta_set_destroy should not be called for a MN diskset.
	 * This routine destroys a set without communicating this information
	 * to the other nodes which would lead to an inconsistency in
	 * the MN diskset.
	 */
	if (MD_MNSET_DESC(sd)) {
		rval = -1;
		goto out;
	}

	/* Continue if a traditional diskset */

	/*
	 * Check to see who has the set.  If we are not the last user of the
	 * set, we will not touch the replicas.
	 *
	 * Only a negative nodehasset() result excludes a node from the
	 * count; a zero result is counted as a user as well.
	 */
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Skip empty slots */
		if (sd->sd_nodes[i][0] == '\0')
			continue;

		has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
		    ep);

		if (has_set < 0) {
			mdclrerror(ep);
		} else
			num_users++;
	}

	/* A NULL drive list is acceptable as long as no error was reported */
	if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
		if (! mdisok(ep)) {
			rval = -1;
			goto out;
		}
	}

	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
		rval = -1;
		goto out;
	}

	if (lock_set == TRUE) {
		/* Lock the set on our side */
		if (clnt_lock_set(mynode(), sp, ep)) {
			rval = -1;
			goto out;
		}
	}

	/*
	 * A traditional diskset has no diskset stale information to send
	 * since there can only be one owner node at a time.
	 * A snarf failure is ignored.
	 */
	if (snarf_set(sp, FALSE, ep))
		mdclrerror(ep);

	if (dd != NULL) {
		/*
		 * Make sure that no drives are in use as parts of metadrives
		 * or hot spare pools, this is one of the few error conditions
		 * that will stop this routine, unless the environment has
		 * META_DESTROY_SET_OK set, in which case, the operation will
		 * proceed.
		 */
		if (getenv("META_DESTROY_SET_OK") == NULL) {
			for (p = dd; p != NULL; p = p->dd_next) {
				dnp = p->dd_dnp;

				i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
				if (i == -1) {
					/*
					 * Every path below ends in failure;
					 * the only question is whether the
					 * set is halted first.
					 */
					/* need xep - wire calls clear error */
					i = metaget_setownership(sp, &xep);
					if (i == -1) {
						rval = -1;
						goto out;
					}

					mysideno = getmyside(sp, &xep);

					if (mysideno == MD_SIDEWILD) {
						rval = -1;
						goto out;
					}

					/* Not the owner: halt the set */
					if (sd->sd_isown[mysideno] == FALSE)
						if (halt_set(sp, &xep)) {
							rval = -1;
							goto out;
						}

					rval = -1;
					goto out;
				}
			}
		}

		/* Delete the drives, but only on the local node */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Skip non local nodes */
			if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
				continue;

			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
				mdclrerror(ep);
		}

		/*
		 * Go thru each drive and individually delete the replicas.
		 * This way we can ignore individual errors.
		 */
		for (p = dd; p != NULL; p = p->dd_next) {
			uint_t	rep_slice;

			dnp = p->dd_dnp;
			/*
			 * Fail if the replica slice cannot be determined or
			 * if neither the replica slice nor slice 0 can be
			 * named.
			 */
			if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
			    (((np = metaslicename(dnp, rep_slice, ep))
			    == NULL) &&
			    ((np = metaslicename(dnp, MD_SLICE0, ep))
			    == NULL))) {
				rval = -1;
				goto out;
			}

			/*
			 * Same lookup again (replica slice, then slice 0);
			 * this time the error left behind by a failed first
			 * attempt is cleared when the fallback succeeds.
			 */
			if ((np = metaslicename(dnp,
			    rep_slice, ep)) == NULL) {
				if ((np = metaslicename(dnp,
				    MD_SLICE0, ep)) == NULL) {
					rval = -1;
					goto out;
				}
				mdclrerror(ep);
			}

			/*
			 * Yes this is UGLY!!!
			 * Temporarily cut the list after this element so
			 * that rel_own_bydd() is handed exactly one drive.
			 */
			p1 = p->dd_next;
			p->dd_next = NULL;
			if (rel_own_bydd(sp, p, FALSE, ep))
				mdclrerror(ep);
			p->dd_next = p1;

			/* No replicas recorded on this drive */
			if (p->dd_dbcnt == 0)
				continue;

			/*
			 * Skip the replica removal if we are not the last user
			 */
			if (num_users != 1)
				continue;

			nlp = NULL;
			(void) metanamelist_append(&nlp, np);
			if (meta_db_detach(sp, nlp,
			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
				mdclrerror(ep);
			metafreenamelist(nlp);
		}
	}

	if (halt_set(sp, ep)) {
		rval = -1;
		goto out;
	}

	/* Setup the mediator record */
	(void) memset(&medr, '\0', sizeof (med_rec_t));
	medr.med_rec_mag = MED_REC_MAGIC;
	medr.med_rec_rev = MED_REC_REV;
	medr.med_rec_fl = 0;
	medr.med_rec_sn = sp->setno;
	(void) strcpy(medr.med_rec_snm, sp->setname);
	medr.med_rec_meds = sd->sd_med;	/* structure assignment */
	(void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
	medr.med_rec_foff = 0;

	/*
	 * If we are the last remaining user, then remove the mediator hosts
	 */
	if (num_users == 1) {
		for (i = 0; i < MED_MAX_HOSTS; i++) {
			/* Notify only for slots that held a mediator */
			if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
				SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
				    SVM_TAG_MEDIATOR, sp->setno, i);
			(void) memset(&medr.med_rec_meds.n_lst[i], '\0',
			    sizeof (md_h_t));
		}
		medr.med_rec_meds.n_cnt = 0;
	} else { 	/* Remove this host from the mediator node list. */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Copy non local node */
			if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
				(void) strcpy(medr.med_rec_nodes[i],
				    sd->sd_nodes[i]);
				continue;
			}

			/* Clear local node */
			(void) memset(&medr.med_rec_nodes[i], '\0',
			    sizeof (md_node_nm_t));
		}
	}

	/* Checksum the finished record */
	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);

	/*
	 * If the client is part of a cluster put the DCS service
	 * into a deleting state.
	 * For an auto-take set a failure here is tolerated; the matching
	 * sdssc_delete_end() call is then skipped.
	 */
	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
		if (metad_isautotakebyname(sp->setname)) {
			delete_end = 0;
		} else {
			mdclrerror(ep);
			goto out;
		}
	}

	/* Inform the mediator hosts of the new information */
	for (i = 0; i < MED_MAX_HOSTS; i++) {
		if (sd->sd_med.n_lst[i].a_cnt == 0)
			continue;

		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
			mdclrerror(ep);
	}

	/* Delete the set locally */
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Skip empty slots */
		if (sd->sd_nodes[i][0] == '\0')
			continue;

		/* Skip non local nodes */
		if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
			continue;

		if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
			mdclrerror(ep);
	}
	if (delete_end &&
	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
		rval = -1;

out:
	/*
	 * release signals back to what they were on entry
	 * (the first failure seen on the way out is the one reported)
	 */
	if (procsigs(FALSE, &oldsigs, &xep) < 0) {
		if (rval == 0)
			(void) mdstealerror(ep, &xep);
		rval = -1;
	}

	if (lock_set == TRUE) {
		cl_sk = cl_get_setkey(sp->setno, sp->setname);
		if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
			if (rval == 0)
				(void) mdstealerror(ep, &xep);
			rval = -1;
		}
		cl_set_setkey(NULL);
	}

	metaflushsetname(sp);
	return (rval);
}
1511
/*
 * meta_set_purge
 *	Get rid of a set on the local node and ask the other hosts that
 *	know about the set to drop this host from it.  Handles both
 *	multi-node and traditional disksets.
 *
 * Parameters:
 *	sp		set to purge
 *	bypass_cluster	when non-zero the sdssc_delete_begin() /
 *			sdssc_delete_end() cluster calls are skipped
 *	forceflg	when non-zero, RPC failures (mdanyrpcerror) while
 *			locking or unlocking the set are cleared and ignored
 *	ep		error return
 *
 * Return values, as assigned below:
 *	0 - no failure was recorded
 *	1 - the set descriptor could not be obtained
 *	2 - locking the set on one of the hosts failed
 *	3 - sdssc_delete_begin() failed and the set is not auto-take
 *	4 - sdssc_delete_end(SDSSC_COMMIT) failed
 *	5 - unlocking the set on one of the hosts failed
 *
 * NOTE(review): a failing clnt_delset() on the local node prints the
 * error, clears ep and jumps to the unlock code without changing rval,
 * so that failure by itself is not reflected in the return value -
 * confirm this is intended.
 */
int
meta_set_purge(
	mdsetname_t	*sp,
	int		bypass_cluster,
	int		forceflg,
	md_error_t	*ep
)
{
	char		*thishost = mynode();
	md_set_desc	*sd;
	md_setkey_t	*cl_sk;
	md_error_t	xep = mdnullerror;
	int		rval = 0;
	int		i, num_hosts = 0;
	int		has_set = 0;
	int		max_node = 0;
	int		delete_end = 1;
	md_mnnode_desc	*nd;

	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
		/* unable to find set description */
		rval = 1;
		return (rval);
	}

	if (MD_MNSET_DESC(sd)) {
		/*
		 * Get a count of the hosts in the set and also lock the set
		 * on those hosts that know about it.
		 */
		nd = sd->sd_nodelist;
		while (nd) {
			/*
			 * Only deal with those nodes that are members of
			 * the set (MD_MN_NODE_ALIVE) or the node on which
			 * the purge is being run.  We must lock the set
			 * on the purging node because the delset call
			 * requires the lock to be set.
			 */
			if (!(nd->nd_flags & MD_MN_NODE_ALIVE) &&
			    nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
				nd = nd->nd_next;
				continue;
			}
			has_set = nodehasset(sp, nd->nd_nodename,
			    NHS_NST_EQ, ep);

			/*
			 * The host is not aware of this set (has_set < 0) or
			 * the set does not match (has_set == 0). This check
			 * prevents the code getting confused by an apparent
			 * inconsistency in the set's state, this is in the
			 * purge code so something is broken in any case and
			 * this is just trying to fix the brokenness.
			 */
			if (has_set <= 0) {
				mdclrerror(ep);
				nd->nd_flags |= MD_MN_NODE_NOSET;
			} else {
				num_hosts++;
				if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
					/*
					 * If the force flag is set then
					 * ignore any RPC failures because we
					 * are only really interested with
					 * the set on local node.
					 */
					if (forceflg && mdanyrpcerror(ep)) {
						mdclrerror(ep);
					} else {
						/*
						 * set max_node so that in the
						 * unlock code nodes in the
						 * set that have not been
						 * locked are not unlocked.
						 */
						max_node = nd->nd_nodeid;
						rval = 2;
						goto out1;
					}
				}

			}
			nd = nd->nd_next;
		}
		/*
		 * Every node was processed: 0 is used as "no node failed".
		 * NOTE(review): this relies on 0 never being a real node id.
		 */
		max_node = 0;
	} else {
		/*
		 * Get a count of the hosts in the set and also lock the set
		 * on those hosts that know about it.
		 */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			has_set = nodehasset(sp, sd->sd_nodes[i],
			    NHS_NST_EQ, ep);

			/*
			 * The host is not aware of this set (has_set < 0) or
			 * the set does not match (has_set == 0). This check
			 * prevents the code getting confused by an apparent
			 * inconsistency in the set's state, this is in the
			 * purge code so something is broken in any case and
			 * this is just trying to fix the brokenness.
			 */
			if (has_set <= 0) {
				mdclrerror(ep);
				/*
				 * Blank the node name in the set descriptor
				 * so that the loops below treat the slot as
				 * empty and send no further requests to this
				 * node.
				 */
				sd->sd_nodes[i][0] = '\0';
			} else {
				num_hosts++;
				if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
					/*
					 * If the force flag is set then
					 * ignore any RPC failures because we
					 * are only really interested with
					 * the set on local node.
					 */
					if (forceflg && mdanyrpcerror(ep)) {
						mdclrerror(ep);
					} else {
						rval = 2;
						/*
						 * set max_node so that in the
						 * unlock code nodes in the
						 * set that have not been
						 * locked are not unlocked.
						 */
						max_node = i;
						goto out1;
					}
				}
			}
		}
		max_node = i;	/* now MD_MAXSIDES */
	}
	if (!bypass_cluster) {
		/*
		 * If there is only one host associated with the
		 * set then remove the set from the cluster.
		 * For an auto-take set a failing delete_begin is tolerated
		 * and the later SDSSC_COMMIT call is skipped.
		 */
		if (num_hosts == 1) {
			if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
				if (metad_isautotakebyname(sp->setname)) {
					delete_end = 0;
				} else {
					mdclrerror(ep);
					rval = 3;
					goto out1;
				}
			}
		}
	}

	if (MD_MNSET_DESC(sd)) {
		nd = sd->sd_nodelist;
		while (nd) {
			if (nd->nd_nodeid == sd->sd_mn_mynode->nd_nodeid) {
				/*
				 * This is the node on which the purge is
				 * being run.  We do not care if it is
				 * alive or not, just want to get rid of
				 * the set.
				 */
				if (clnt_delset(nd->nd_nodename, sp,
				    ep) == -1) {
					md_perror(dgettext(TEXT_DOMAIN,
					    "delset"));
					/* back out the cluster delete */
					if (!bypass_cluster && num_hosts == 1)
						(void) sdssc_delete_end(
						    sp->setname, SDSSC_CLEANUP);
					mdclrerror(ep);
					goto out1;
				}
				nd = nd->nd_next;
				continue;
			}

			/*
			 * Only contact those nodes that are members of
			 * the set.
			 */
			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
				nd = nd->nd_next;
				continue;
			}

			/*
			 * Tell the remote node to remove this node
			 */
			if (clnt_delhosts(nd->nd_nodename, sp, 1, &thishost,
			    ep) == -1) {
				/*
				 * If we fail to delete ourselves
				 * from the remote host it does not
				 * really matter because the set is
				 * being "purged" from this node. The
				 * set can be purged from the other
				 * node at a later time.
				 */
				mdclrerror(ep);
			}
			nd = nd->nd_next;
		}
	} else {
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;
			if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
				/*
				 * Tell the remote node to remove this node
				 */
				if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
				    &thishost, ep) == -1) {
					/*
					 * If we fail to delete ourselves
					 * from the remote host it does not
					 * really matter because the set is
					 * being "purged" from this node. The
					 * set can be purged from the other
					 * node at a later time.
					 */
					mdclrerror(ep);
				}
				continue;
			}

			/* remove the set from this host */
			if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
				md_perror(dgettext(TEXT_DOMAIN, "delset"));
				/* back out the cluster delete */
				if (!bypass_cluster && num_hosts == 1)
					(void) sdssc_delete_end(sp->setname,
					    SDSSC_CLEANUP);
				mdclrerror(ep);
				goto out1;
			}
		}
	}

	/* Commit the cluster delete started above, unless it was skipped */
	if (!bypass_cluster && num_hosts == 1) {
		if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
		    SDSSC_ERROR) {
			rval = 4;
		}
	}

out1:

	cl_sk = cl_get_setkey(sp->setno, sp->setname);

	/*
	 * Remove the set lock on those nodes that had the set locked
	 * max_node will either be MD_MAXSIDES or array index of the last
	 * node contacted (or rather failed to contact) for traditional
	 * diskset.  For a MN diskset, max_node is the node_id of the node
	 * that failed the lock.
	 *
	 * NOTE(review): the MN loop below skips every node that is not
	 * MD_MN_NODE_ALIVE, whereas the lock loop above also locks the
	 * purging node when it is not alive - confirm that lock cannot be
	 * left behind.
	 */
	if (MD_MNSET_DESC(sd)) {
		nd = sd->sd_nodelist;
		while (nd) {
			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
				nd = nd->nd_next;
				continue;
			}
			/* Stop at the node whose lock attempt failed */
			if (nd->nd_nodeid == max_node)
				break;
			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
				if (forceflg && mdanyrpcerror(&xep)) {
					mdclrerror(&xep);
					nd = nd->nd_next;
					continue;
				}
				/* only the first failure is handed to ep */
				if (rval == 0)
					(void) mdstealerror(ep, &xep);
				rval = 5;
			}
			nd = nd->nd_next;
		}
	} else {
		for (i = 0; i < max_node; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
				if (forceflg && mdanyrpcerror(&xep)) {
					mdclrerror(&xep);
					continue;
				}
				/* only the first failure is handed to ep */
				if (rval == 0)
					(void) mdstealerror(ep, &xep);
				rval = 5;
			}
		}
	}

	cl_set_setkey(NULL);

	return (rval);
}
1818
1819 int
meta_set_query(mdsetname_t * sp,mddb_dtag_lst_t ** dtlpp,md_error_t * ep)1820 meta_set_query(
1821 mdsetname_t *sp,
1822 mddb_dtag_lst_t **dtlpp,
1823 md_error_t *ep
1824 )
1825 {
1826 mddb_dtag_get_parm_t dtgp;
1827
1828 (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
1829 dtgp.dtgp_setno = sp->setno;
1830
1831 /*CONSTCOND*/
1832 while (1) {
1833 if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
1834 if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
1835 *dtlpp == NULL)
1836 return (mdstealerror(ep, &dtgp.dtgp_mde));
1837 else
1838 break;
1839
1840 /*
1841 * Run to the end of the list
1842 */
1843 for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
1844 /* void */;
1845
1846 *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
1847
1848 (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
1849 sizeof (mddb_dtag_t));
1850
1851 dtgp.dtgp_dt.dt_id++;
1852 }
1853 return (0);
1854 }
1855
1856 /*
1857 * return drivename get by key
1858 */
1859 mddrivename_t *
metadrivename_withdrkey(mdsetname_t * sp,side_t sideno,mdkey_t key,int flags,md_error_t * ep)1860 metadrivename_withdrkey(
1861 mdsetname_t *sp,
1862 side_t sideno,
1863 mdkey_t key,
1864 int flags,
1865 md_error_t *ep
1866 )
1867 {
1868 char *nm;
1869 mdname_t *np;
1870 mddrivename_t *dnp;
1871 ddi_devid_t devidp;
1872 md_set_desc *sd;
1873
1874 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1875 return (NULL);
1876 }
1877
1878 /*
1879 * Get the devid associated with the key.
1880 *
1881 * If a devid was returned, it MUST be valid even in
1882 * the case where a device id has been "updated". The
1883 * "update" of the device id may have occured due to
1884 * a firmware upgrade.
1885 */
1886 if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
1887 != NULL) {
1888 /*
1889 * Look for the correct dnp using the devid for comparison.
1890 */
1891 dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
1892 free(devidp);
1893
1894 /* dnp could be NULL if the devid could not be decoded. */
1895 if (dnp == NULL) {
1896 return (NULL);
1897 }
1898 dnp->side_names_key = key;
1899 } else {
1900 /*
1901 * We didn't get a devid. We'll try for a dnp using the
1902 * name. If we have a MN diskset or if the dnp is a did
1903 * device, we're done because then we don't have devids.
1904 * Otherwise we'll try to set the devid
1905 * and get the dnp via devid again.
1906 * We also need to clear the ep structure. When the
1907 * above call to meta_getdidbykey returned a null, it
1908 * also put an error code into ep. In this case, the null
1909 * return is actually OK and any errors can be ignored. The
1910 * reason it is OK is because this could be a MN set or
1911 * we could be running without devids (ex cluster).
1912 */
1913 mdclrerror(ep);
1914
1915 if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key,
1916 ep)) == NULL)
1917 return (NULL);
1918 /* get device name */
1919 if (flags & PRINT_FAST) {
1920 if ((np = metaname_fast(&sp, nm,
1921 LOGICAL_DEVICE, ep)) == NULL) {
1922 Free(nm);
1923 return (NULL);
1924 }
1925 } else {
1926 if ((np = metaname(&sp, nm, LOGICAL_DEVICE,
1927 ep)) == NULL) {
1928 Free(nm);
1929 return (NULL);
1930 }
1931 }
1932 Free(nm);
1933 /* make sure it's OK */
1934 if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np,
1935 ep) != 0))
1936 return (NULL);
1937
1938 /* get drivename */
1939 dnp = np->drivenamep;
1940 dnp->side_names_key = key;
1941 /*
1942 * Skip the devid set/check for the following cases:
1943 * 1) If MN diskset, there are no devid's
1944 * 2) if dnp is did device
1945 * The device id is disabled for did device due to the
1946 * lack of minor name support in the did driver. The following
1947 * devid code path can set and propagate the error and
1948 * eventually prevent did disks from being added to the
1949 * diskset under SunCluster systems
1950 *
1951 * Note that this code can be called through rpc.mdcommd.
1952 * sdssc_version cannot be used because the library won't
1953 * be bound.
1954 */
1955 if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/"))
1956 == 0) || (MD_MNSET_DESC(sd)))
1957 goto out;
1958
1959 /*
1960 * It is okay if replica is not in devid mode
1961 */
1962 if (mdissyserror(ep, MDDB_F_NODEVID)) {
1963 mdclrerror(ep);
1964 goto out;
1965 }
1966
1967 /*
1968 * We're not MN or did devices but
1969 * devid is missing so this means that we have
1970 * just upgraded from a configuration where
1971 * devid's were not used so try to add in
1972 * the devid and requery. If the devid still isn't there,
1973 * that's OK. dnp->devid will be null as it is in any
1974 * configuration with no devids.
1975 */
1976 if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, ep) < 0)
1977 return (NULL);
1978 if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
1979 sideno+SKEW, key, ep)) != NULL) {
1980 /*
1981 * Found a devid so look for the dnp using the
1982 * devid as the search mechanism.
1983 */
1984 dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
1985 free(devidp);
1986 if (dnp == NULL) {
1987 return (NULL);
1988 }
1989 dnp->side_names_key = key;
1990 }
1991 }
1992
1993
1994
1995 out:
1996 if (flags & MD_BYPASS_DAEMON)
1997 return (dnp);
1998
1999 if (get_sidenmlist(sp, dnp, ep))
2000 return (NULL);
2001
2002 /* return success */
2003 return (dnp);
2004 }
2005
2006 void
metafreedrivedesc(md_drive_desc ** dd)2007 metafreedrivedesc(md_drive_desc **dd)
2008 {
2009 md_drive_desc *p, *next = NULL;
2010
2011 for (p = *dd; p != NULL; p = next) {
2012 next = p->dd_next;
2013 Free(p);
2014 }
2015 *dd = NULL;
2016 }
2017
2018 md_drive_desc *
metaget_drivedesc(mdsetname_t * sp,int flags,md_error_t * ep)2019 metaget_drivedesc(
2020 mdsetname_t *sp,
2021 int flags,
2022 md_error_t *ep
2023 )
2024 {
2025 side_t sideno = MD_SIDEWILD;
2026
2027 assert(! (flags & MD_BYPASS_DAEMON));
2028
2029 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
2030 return (NULL);
2031
2032 return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
2033 }
2034
2035 md_drive_desc *
metaget_drivedesc_fromnamelist(mdsetname_t * sp,mdnamelist_t * nlp,md_error_t * ep)2036 metaget_drivedesc_fromnamelist(
2037 mdsetname_t *sp,
2038 mdnamelist_t *nlp,
2039 md_error_t *ep
2040 )
2041 {
2042 md_set_desc *sd;
2043 mdnamelist_t *p;
2044 md_drive_desc *dd = NULL;
2045
2046 if ((sd = metaget_setdesc(sp, ep)) == NULL)
2047 return (NULL);
2048
2049 for (p = nlp; p != NULL; p = p->next)
2050 (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
2051 sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
2052
2053 return (dd);
2054 }
2055
2056 md_drive_desc *
metaget_drivedesc_sideno(mdsetname_t * sp,side_t sideno,int flags,md_error_t * ep)2057 metaget_drivedesc_sideno(
2058 mdsetname_t *sp,
2059 side_t sideno,
2060 int flags,
2061 md_error_t *ep
2062 )
2063 {
2064 md_set_desc *sd = NULL;
2065
2066 assert(! (flags & MD_BYPASS_DAEMON));
2067
2068 if ((sd = metaget_setdesc(sp, ep)) == NULL)
2069 return (NULL);
2070
2071 if (sd->sd_drvs)
2072 return (sd->sd_drvs);
2073
2074 if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
2075 return (NULL);
2076
2077 return (sd->sd_drvs);
2078 }
2079
2080 int
metaget_setownership(mdsetname_t * sp,md_error_t * ep)2081 metaget_setownership(
2082 mdsetname_t *sp,
2083 md_error_t *ep
2084 )
2085 {
2086 md_set_desc *sd;
2087 int bool;
2088 int i;
2089 md_mnnode_desc *nd;
2090
2091 if ((sd = metaget_setdesc(sp, ep)) == NULL)
2092 return (-1);
2093
2094 if (MD_MNSET_DESC(sd)) {
2095 nd = sd->sd_nodelist;
2096 while (nd) {
2097 /* If node isn't alive, can't own diskset */
2098 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2099 nd->nd_flags &= ~MD_MN_NODE_OWN;
2100 nd = nd->nd_next;
2101 continue;
2102 }
2103 /*
2104 * If can't communicate with rpc.metad, then mark
2105 * this node as not an owner. That node may
2106 * in fact, be an owner, but without rpc.metad running
2107 * that node can't do much.
2108 */
2109 if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
2110 nd->nd_flags &= ~MD_MN_NODE_OWN;
2111 } else if (bool == TRUE) {
2112 nd->nd_flags |= MD_MN_NODE_OWN;
2113 } else {
2114 nd->nd_flags &= ~MD_MN_NODE_OWN;
2115 }
2116 nd = nd->nd_next;
2117 }
2118 return (0);
2119 }
2120
2121 /* Rest of code handles traditional disksets */
2122
2123 for (i = 0; i < MD_MAXSIDES; i++)
2124 sd->sd_isown[i] = 0;
2125
2126 if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
2127 return (-1);
2128
2129 if (bool == TRUE)
2130 sd->sd_isown[getmyside(sp, ep)] = 1;
2131
2132 return (0);
2133 }
2134
2135 char *
mynode(void)2136 mynode(void)
2137 {
2138 static struct utsname myuname;
2139 static int done = 0;
2140
2141 if (! done) {
2142 if (uname(&myuname) == -1) {
2143 md_perror(dgettext(TEXT_DOMAIN, "uname"));
2144 assert(0);
2145 }
2146 done = 1;
2147 }
2148 return (myuname.nodename);
2149 }
2150
/*
 * strinlst
 *	Report whether str is equal (strcmp) to one of the first cnt
 *	strings of lst.
 *
 * Returns TRUE on a match and FALSE otherwise; a cnt of zero or less
 * never matches.
 */
int
strinlst(char *str, int cnt, char **lst)
{
	char	**entry = lst;
	int	remaining = cnt;

	while (remaining > 0) {
		if (strcmp(*entry, str) == 0)
			return (TRUE);
		entry++;
		remaining--;
	}

	return (FALSE);
}
2162
2163 /*
2164 * meta_get_reserved_names
2165 * returns an mdnamelist_t of reserved slices
2166 * reserved slices are those that are used but don't necessarily
2167 * show up as metadevices (ex. reserved slice for db in sets, logs)
2168 */
2169
2170 /*ARGSUSED*/
2171 int
meta_get_reserved_names(mdsetname_t * sp,mdnamelist_t ** nlpp,int options,md_error_t * ep)2172 meta_get_reserved_names(
2173 mdsetname_t *sp,
2174 mdnamelist_t **nlpp,
2175 int options,
2176 md_error_t *ep)
2177 {
2178 int count = 0;
2179 mdname_t *np = NULL;
2180 mdnamelist_t *transnlp = NULL;
2181 mdnamelist_t **tailpp = nlpp;
2182 mdnamelist_t *nlp;
2183 md_drive_desc *dd, *di;
2184
2185 if (metaislocalset(sp))
2186 goto out;
2187
2188 if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
2189 count = -1;
2190 goto out;
2191 }
2192
2193 /* db in for sets on reserved slice */
2194 for (di = dd; di && count >= 0; di = di->dd_next) {
2195 uint_t rep_slice;
2196
2197 /*
2198 * Add the name struct to the end of the
2199 * namelist but keep a pointer to the last
2200 * element so that we don't incur the overhead
2201 * of traversing the list each time
2202 */
2203 if (di->dd_dnp &&
2204 (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
2205 (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
2206 (tailpp = meta_namelist_append_wrapper(tailpp, np)))
2207 count++;
2208 else
2209 count = -1;
2210 }
2211
2212 /* now find logs */
2213 if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
2214 count = -1;
2215 goto out;
2216 }
2217
2218 for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
2219 mdname_t *transnp = nlp->namep;
2220 md_trans_t *transp;
2221
2222 if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
2223 count = -1;
2224 goto out;
2225 }
2226 if (transp->lognamep) {
2227 /*
2228 * Add the name struct to the end of the
2229 * namelist but keep a pointer to the last
2230 * element so that we don't incur the overhead
2231 * of traversing the list each time
2232 */
2233 tailpp = meta_namelist_append_wrapper(
2234 tailpp, transp->lognamep);
2235 }
2236 }
2237 out:
2238 metafreenamelist(transnlp);
2239 return (count);
2240 }
2241
2242 /*
2243 * Entry point to join a node to MultiNode diskset.
2244 *
2245 * Validate host in diskset.
2246 * - Should be in membership list from API
2247 * - Should not already be joined into diskset.
2248 * - Set must have drives
2249 * Assume valid configuration is stored in the set/drive/node records
2250 * in the local mddb since no node or drive can be added to the MNset
2251 * unless all drives and nodes are available. Reconfig steps will
2252 * resync all ALIVE nodes in case of panic in critical areas.
2253 *
2254 * Lock down the set.
2255 * Verify host is a member of this diskset.
2256 * If drives exist in the configuration, load the mddbs.
2257 * Set this node to active by notifying master if one exists.
2258 * If this is the first node active in the diskset, this node
2259 * becomes the master.
2260 * Unlock the set.
2261 *
2262 * Mirror Resync:
2263 * If this node is the last node to join the set and clustering
2264 * isn't running, then start the 'metasync -r' type resync
2265 * on all mirrors in this diskset.
2266 * If clustering is running, this resync operation will
2267 * be handled by the reconfig steps and should NOT
2268 * be handled during a join operation.
2269 *
2270 * There are multiple return values in order to assist
2271 * the join operation of all sets in the metaset command.
2272 *
2273 * Return values:
2274 * 0 - Node successfully joined to set.
2275 * -1 - Join attempted but failed
2276 * - any failure from libmeta calls
2277 * - node not in the member list
2278 * -2 - Join not attempted since
2279 * - this set had no drives in set
2280 * - this node already joined to set
2281 * - set is not a multinode set
2282 * -3 - Node joined to STALE set.
2283 */
2284 extern int
meta_set_join(mdsetname_t * sp,md_error_t * ep)2285 meta_set_join(
2286 mdsetname_t *sp,
2287 md_error_t *ep
2288 )
2289 {
2290 md_set_desc *sd;
2291 md_drive_desc *dd;
2292 md_mnnode_desc *nd, *nd2, my_nd;
2293 int rval = 0;
2294 md_setkey_t *cl_sk;
2295 md_error_t xep = mdnullerror;
2296 md_error_t ep_snarf = mdnullerror;
2297 int master_flag = 0;
2298 md_mnset_record *mas_mnsr = NULL;
2299 int clear_nr_flags = 0;
2300 md_mnnode_record *nr;
2301 int stale_set = 0;
2302 int rb_flags = 0;
2303 int stale_bool = FALSE;
2304 int suspendall_flag = 0;
2305 int suspend1_flag = 0;
2306 sigset_t oldsigs;
2307 int send_reinit = 0;
2308
2309 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2310 return (-1);
2311 }
2312
2313 /* Must be a multinode diskset */
2314 if (!MD_MNSET_DESC(sd)) {
2315 (void) mderror(ep, MDE_NOT_MN, sp->setname);
2316 return (-2);
2317 }
2318
2319 /* Verify that the node is ALIVE (i.e. is in the API membership list) */
2320 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
2321 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
2322 sd->sd_mn_mynode->nd_nodename, NULL, sp->setname);
2323 return (-1);
2324 }
2325
2326 /* Make sure we are blocking all signals */
2327 if (procsigs(TRUE, &oldsigs, &xep) < 0)
2328 mdclrerror(&xep);
2329
2330 /*
2331 * Lock the set on current set members.
2332 * For MN diskset lock_set and SUSPEND are used to protect against
2333 * other meta* commands running on the other nodes.
2334 */
2335 nd = sd->sd_nodelist;
2336 while (nd) {
2337 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2338 nd = nd->nd_next;
2339 continue;
2340 }
2341 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2342 rval = -1;
2343 goto out;
2344 }
2345 nd = nd->nd_next;
2346 }
2347
2348 /*
2349 * Lock out other meta* commands by suspending
2350 * class 1 messages across the diskset.
2351 */
2352 nd = sd->sd_nodelist;
2353 while (nd) {
2354 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2355 nd = nd->nd_next;
2356 continue;
2357 }
2358 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2359 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2360 rval = -1;
2361 goto out;
2362 }
2363 suspend1_flag = 1;
2364 nd = nd->nd_next;
2365 }
2366
2367 /*
2368 * Verify that this host is a member (in the host list) of the set.
2369 */
2370 nd = sd->sd_nodelist;
2371 while (nd) {
2372 if (strcmp(mynode(), nd->nd_nodename) == 0) {
2373 break;
2374 }
2375 nd = nd->nd_next;
2376 }
2377 if (!nd) {
2378 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2379 sd->sd_mn_mynode->nd_nodename, NULL,
2380 sp->setname);
2381 rval = -1;
2382 goto out;
2383 }
2384
2385 /*
2386 * Need to return failure if host is already 'joined'
2387 * into the set. This is done so that if later the user
2388 * issues a command to join all sets and a failure is
2389 * encountered - that the resulting cleanup effort
2390 * (withdrawing from all sets that were joined
2391 * during that command) won't withdraw from this set.
2392 */
2393 if (nd->nd_flags & MD_MN_NODE_OWN) {
2394 rval = -2;
2395 goto out2;
2396 }
2397
2398 /*
2399 * Call metaget_setownership that calls each node in diskset and
2400 * marks in set descriptor if node is an owner of the set or not.
2401 * metaget_setownership checks to see if a node is an owner by
2402 * checking to see if that node's kernel has the mddb loaded.
2403 * If a node had panic'd during a reconfig or an
2404 * add/delete/join/withdraw operation, the other nodes' node
2405 * records may not reflect the current state of the diskset,
2406 * so calling metaget_setownership is the safest thing to do.
2407 */
2408 if (metaget_setownership(sp, ep) == -1) {
2409 rval = -1;
2410 goto out;
2411 }
2412
2413 /* If first active member of diskset, become the master. */
2414 nd = sd->sd_nodelist;
2415 while (nd) {
2416 if (nd->nd_flags & MD_MN_NODE_OWN)
2417 break;
2418 nd = nd->nd_next;
2419 }
2420 if (nd == NULL)
2421 master_flag = 1;
2422
2423 /*
2424 * If not first active member of diskset, then get the
2425 * master information from a node that is already joined
2426 * and set the master information for this node. Be sure
2427 * that this node (the already joined node) has its own
2428 * join flag set. If not, then this diskset isn't currently
2429 * consistent and shouldn't allow a node to join. This diskset
2430 * inconsistency should only occur when a node has panic'd in
2431 * the set while doing a metaset operation and the sysadmin is
2432 * attempting to join a node into the set. This inconsistency
2433 * will be fixed during a reconfig cycle which should be occurring
2434 * soon since a node panic'd.
2435 *
2436 * If unable to get this information from an owning node, then
2437 * this diskset isn't currently consistent and shouldn't
2438 * allow a node to join.
2439 */
2440 if (!master_flag) {
2441 /* get master information from an owner (joined) node */
2442 if (clnt_mngetset(nd->nd_nodename, sp->setname,
2443 sp->setno, &mas_mnsr, ep) == -1) {
2444 rval = -1;
2445 goto out;
2446 }
2447
2448 /* Verify that owner (joined) node has its own JOIN flag set */
2449 nr = mas_mnsr->sr_nodechain;
2450 while (nr) {
2451 if ((nd->nd_nodeid == nr->nr_nodeid) &&
2452 ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
2453 (void) mddserror(ep, MDE_DS_NODENOSET,
2454 sp->setno, nd->nd_nodename, NULL,
2455 nd->nd_nodename);
2456 free_sr((md_set_record *)mas_mnsr);
2457 rval = -1;
2458 goto out;
2459 }
2460 nr = nr->nr_next;
2461 }
2462
2463 /*
2464 * Does master have set marked as STALE?
2465 * If so, need to pass this down to kernel when
2466 * this node snarfs the set.
2467 */
2468 if (clnt_mn_is_stale(nd->nd_nodename, sp,
2469 &stale_bool, ep) == -1) {
2470 rval = -1;
2471 goto out;
2472 }
2473
2474 /* set master information in my rpc.metad's set record */
2475 if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
2476 mas_mnsr->sr_master_nodeid, ep)) {
2477 free_sr((md_set_record *)mas_mnsr);
2478 rval = -1;
2479 goto out;
2480 }
2481
2482 /* set master information in my cached set desc */
2483 (void) strcpy(sd->sd_mn_master_nodenm,
2484 mas_mnsr->sr_master_nodenm);
2485 sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
2486 nd2 = sd->sd_nodelist;
2487 while (nd2) {
2488 if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
2489 sd->sd_mn_masternode = nd2;
2490 break;
2491 }
2492 nd2 = nd2->nd_next;
2493 }
2494 free_sr((md_set_record *)mas_mnsr);
2495
2496 /*
2497 * Set the node flags in mynode's rpc.metad node records for
2498 * the nodes that are in the diskset. Can use my sd
2499 * since earlier call to metaget_setownership set the
2500 * owner flags based on whether that node had snarfed
2501 * the MN diskset mddb. Reconfig steps guarantee that
2502 * return of metaget_setownership will match the owning
2503 * node's owner list except in the case where a node
2504 * has just panic'd and in this case, a reconfig will
2505 * be starting immediately and the owner lists will
2506 * be sync'd up by the reconfig.
2507 *
2508 * Flag of SET means to take no action except to
2509 * set the node flags as given in the nodelist linked list.
2510 */
2511 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
2512 MD_NR_SET, NULL, ep)) {
2513 rval = -1;
2514 goto out;
2515 }
2516 }
2517
2518 /*
2519 * Read in the mddb if there are drives in the set.
2520 */
2521 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2522 ep)) == NULL) {
2523 /* No drives in list */
2524 if (! mdisok(ep)) {
2525 rval = -1;
2526 goto out;
2527 }
2528 rval = -2;
2529 goto out;
2530 }
2531
2532 /*
2533 * Notify rpc.mdcommd on all nodes of a nodelist change.
2534 * Start by suspending rpc.mdcommd (which drains it of all messages),
2535 * then change the nodelist followed by a reinit and resume.
2536 */
2537 nd = sd->sd_nodelist;
2538 while (nd) {
2539 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2540 nd = nd->nd_next;
2541 continue;
2542 }
2543
2544 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
2545 MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2546 rval = -1;
2547 goto out;
2548 }
2549 suspendall_flag = 1;
2550 nd = nd->nd_next;
2551 }
2552
2553 /* Set master in my set record in rpc.metad */
2554 if (master_flag) {
2555 if (clnt_mnsetmaster(mynode(), sp,
2556 sd->sd_mn_mynode->nd_nodename,
2557 sd->sd_mn_mynode->nd_nodeid, ep)) {
2558 rval = -1;
2559 goto out;
2560 }
2561 }
2562 /*
2563 * Causes mddbs to be loaded into the kernel.
2564 * Set the force flag so that replica locations can be
2565 * loaded into the kernel even if a mediator node was
2566 * unavailable. This allows a node to join an MO
2567 * diskset when there are sufficient replicas available,
2568 * but a mediator node in unavailable.
2569 */
2570 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
2571 mde_perror(ep, dgettext(TEXT_DOMAIN,
2572 "Host not able to start diskset."));
2573 rval = -1;
2574 goto out;
2575 }
2576
2577 if (! mdisok(ep)) {
2578 rval = -1;
2579 goto out;
2580 }
2581
2582 /*
2583 * Set rollback flags to 1 so that halt_set is called if a failure
2584 * is seen after this point. If snarf_set fails, still need to
2585 * call halt_set to cleanup the diskset.
2586 */
2587 rb_flags = 1;
2588
2589 /* Starts the set */
2590 if (snarf_set(sp, stale_bool, ep) != 0) {
2591 if (mdismddberror(ep, MDE_DB_STALE)) {
2592 /*
2593 * Don't fail join, STALE means that set has
2594 * < 50% mddbs.
2595 */
2596 (void) mdstealerror(&ep_snarf, ep);
2597 stale_set = 1;
2598 } else if (mdisok(ep)) {
2599 /* If snarf failed, but no error was set - set it */
2600 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2601 sp->setno, 0, NULL);
2602 rval = -1;
2603 goto out;
2604 } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
2605 /*
2606 * Don't fail join if ACCOK; ACCOK means that mediator
2607 * provided extra vote.
2608 */
2609 rval = -1;
2610 goto out;
2611 }
2612 }
2613
2614 /* Did set really get snarfed? */
2615 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
2616 if (mdisok(ep)) {
2617 /* If snarf failed, but no error was set - set it */
2618 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2619 sp->setno, 0, NULL);
2620 }
2621 mde_perror(ep, dgettext(TEXT_DOMAIN,
2622 "Host not able to start diskset."));
2623 rval = -1;
2624 goto out;
2625 }
2626
2627 /* Change to nodelist so need to send reinit to rpc.mdcommd */
2628 send_reinit = 1;
2629
2630 /* If first node to enter set, setup master and clear change log */
2631 if (master_flag) {
2632 /* Set master in my locally cached set descriptor */
2633 (void) strcpy(sd->sd_mn_master_nodenm,
2634 sd->sd_mn_mynode->nd_nodename);
2635 sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
2636 sd->sd_mn_am_i_master = 1;
2637
2638 /*
2639 * If first node to join set, then clear out change log
2640 * entries. Change log entries are only needed when a
2641 * change of master is occurring in a diskset that has
2642 * multiple owners. Since this node is the first owner
2643 * of the diskset, clear the entries.
2644 *
2645 * Only do this if we are in a single node non-SC3.x
2646 * situation.
2647 */
2648 if (meta_mn_singlenode() &&
2649 mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) {
2650 mde_perror(ep, dgettext(TEXT_DOMAIN,
2651 "Unable to reset changelog."));
2652 rval = -1;
2653 goto out;
2654 }
2655 }
2656
2657 /* Set my locally cached flag */
2658 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
2659
2660 /*
2661 * Set this node's own flag on all joined nodes in the set
2662 * (including my node).
2663 */
2664 clear_nr_flags = 1;
2665
2666 my_nd = *(sd->sd_mn_mynode);
2667 my_nd.nd_next = NULL;
2668 nd = sd->sd_nodelist;
2669 while (nd) {
2670 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2671 nd = nd->nd_next;
2672 continue;
2673 }
2674 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2675 MD_NR_JOIN, NULL, ep)) {
2676 rval = -1;
2677 goto out;
2678 }
2679 nd = nd->nd_next;
2680 }
2681
2682 out:
2683 if (rval != NULL) {
2684 /*
2685 * If rollback flag is 1, then node was joined to set.
2686 * Since an error occurred, withdraw node from set in
2687 * order to rollback to before command was run.
2688 * Need to preserve ep so that calling function can
2689 * get error information.
2690 */
2691 if (rb_flags == 1) {
2692 if (halt_set(sp, &xep)) {
2693 mdclrerror(&xep);
2694 }
2695 }
2696
2697 /*
2698 * If error, reset master to INVALID.
2699 * Ignore error since (next) first node to successfully join
2700 * will set master on all nodes.
2701 */
2702 (void) clnt_mnsetmaster(mynode(), sp, "",
2703 MD_MN_INVALID_NID, &xep);
2704 mdclrerror(&xep);
2705 /* Reset master in my locally cached set descriptor */
2706 sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
2707 sd->sd_mn_am_i_master = 0;
2708
2709 /*
2710 * If nr flags set on other nodes, reset them.
2711 */
2712 if (clear_nr_flags) {
2713 nd = sd->sd_nodelist;
2714 while (nd) {
2715 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2716 nd = nd->nd_next;
2717 continue;
2718 }
2719 (void) clnt_upd_nr_flags(nd->nd_nodename, sp,
2720 &my_nd, MD_NR_WITHDRAW, NULL, &xep);
2721 mdclrerror(&xep);
2722 nd = nd->nd_next;
2723 }
2724 /* Reset my locally cached flag */
2725 sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
2726 }
2727 }
2728
2729 /*
2730 * Notify rpc.mdcommd on all nodes of a nodelist change.
2731 * Send reinit command to mdcommd which forces it to get
2732 * fresh set description.
2733 */
2734 if (send_reinit) {
2735 /* Send reinit */
2736 nd = sd->sd_nodelist;
2737 while (nd) {
2738 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2739 nd = nd->nd_next;
2740 continue;
2741 }
2742
2743 /* Class is ignored for REINIT */
2744 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2745 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2746 /*
2747 * We are here because we failed to resume
2748 * rpc.mdcommd. However we potentially have
2749 * an error from the previous call
2750 * If the previous call did fail, we capture
2751 * that error and generate a perror with
2752 * the string, "Unable to resume...".
2753 * Setting rval to -1 ensures that in the
2754 * next iteration of the loop, ep is not
2755 * clobbered.
2756 */
2757 if (rval == 0)
2758 (void) mdstealerror(ep, &xep);
2759 else
2760 mdclrerror(&xep);
2761 rval = -1;
2762 mde_perror(ep, dgettext(TEXT_DOMAIN,
2763 "Unable to reinit rpc.mdcommd."));
2764 }
2765 nd = nd->nd_next;
2766 }
2767
2768 }
2769
2770 out2:
2771 /*
2772 * Unlock diskset by resuming messages across the diskset.
2773 * Just resume all classes so that resume is the same whether
2774 * just one class was locked or all classes were locked.
2775 */
2776 if ((suspend1_flag) || (suspendall_flag)) {
2777 nd = sd->sd_nodelist;
2778 while (nd) {
2779 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2780 nd = nd->nd_next;
2781 continue;
2782 }
2783 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2784 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2785 /*
2786 * We are here because we failed to resume
2787 * rpc.mdcommd. However we potentially have
2788 * an error from the previous call
2789 * If the previous call did fail, we capture
2790 * that error and generate a perror with
2791 * the string, "Unable to resume...".
2792 * Setting rval to -1 ensures that in the
2793 * next iteration of the loop, ep is not
2794 * clobbered.
2795 */
2796 if (rval == 0)
2797 (void) mdstealerror(ep, &xep);
2798 else
2799 mdclrerror(&xep);
2800 rval = -1;
2801 mde_perror(ep, dgettext(TEXT_DOMAIN,
2802 "Unable to resume rpc.mdcommd."));
2803 }
2804 nd = nd->nd_next;
2805 }
2806 meta_ping_mnset(sp->setno);
2807 }
2808
2809 /*
2810 * Unlock set. This flushes the caches on the servers.
2811 */
2812 cl_sk = cl_get_setkey(sp->setno, sp->setname);
2813 nd = sd->sd_nodelist;
2814 while (nd) {
2815 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2816 nd = nd->nd_next;
2817 continue;
2818 }
2819 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2820 if (rval == 0)
2821 (void) mdstealerror(ep, &xep);
2822 else
2823 mdclrerror(&xep);
2824 rval = -1;
2825 }
2826 nd = nd->nd_next;
2827 }
2828
2829 /*
2830 * If this node is the last to join the diskset and clustering isn't
2831 * running, then resync the mirrors in the diskset. We have to wait
2832 * until all nodes are joined so that the status gets propagated to
2833 * all of the members of the set.
2834 * Ignore any error from the resync as the join function shouldn't fail
2835 * because the mirror resync had a problem.
2836 *
2837 * Don't start resync if set is stale.
2838 */
2839 if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
2840 (stale_set != 1)) {
2841 nd = sd->sd_nodelist;
2842 while (nd) {
2843 if (!(nd->nd_flags & MD_MN_NODE_OWN))
2844 break;
2845 nd = nd->nd_next;
2846 }
2847 /*
2848 * nd set to NULL means that we have no nodes in the set that
2849 * haven't joined. In this case we start the resync.
2850 */
2851 if (nd == NULL) {
2852 (void) meta_mirror_resync_all(sp, 0, &xep);
2853 mdclrerror(&xep);
2854 }
2855 }
2856
2857 /* Update ABR state for all soft partitions */
2858 (void) meta_sp_update_abr(sp, &xep);
2859 mdclrerror(&xep);
2860
2861 /*
2862 * call metaflushsetnames to reset local cache for master and
2863 * node information.
2864 */
2865 metaflushsetname(sp);
2866
2867 /* release signals back to what they were on entry */
2868 if (procsigs(FALSE, &oldsigs, &xep) < 0)
2869 mdclrerror(&xep);
2870
2871 /*
2872 * If no error and stale_set is set, then set ep back
2873 * to ep from snarf_set call and return -3. If another error
2874 * occurred and rval is not 0, then that error would have
2875 * caused the node to be withdrawn from the set and would
2876 * have set ep to that error information.
2877 */
2878 if ((rval == 0) && (stale_set)) {
2879 (void) mdstealerror(ep, &ep_snarf);
2880 return (-3);
2881 }
2882
2883 return (rval);
2884 }
2885
2886 /*
2887 * Entry point to withdraw a node from MultiNode diskset.
2888 *
2889 * Validate host in diskset.
2890 * - Should be joined into diskset.
2891 * Assume valid configuration is stored in the set/drive/node records
2892 * in the local mddb since no node or drive can be added to the MNset
2893 * unless all drives and nodes are available. Reconfig steps will
2894 * resync all ALIVE nodes in case of panic in critical areas.
2895 *
2896 * Lock down the set.
2897 * Verify that drives exist in configuration.
2898 * Verify host is a member of this diskset.
2899 * Verify host is an owner of the diskset (host is joined to diskset).
2900 * Only allow withdrawal of master node if master node is the only joined
2901 * in the diskset.
2902 * Halt the diskset on this node.
2903 * Reset Master on this node.
2904 * Updated node flags that this node with withdrawn.
2905 * Unlock the set.
2906 *
2907 * Return values:
2908 * 0 - Node successfully withdrew from set.
2909 * -1 - Withdrawal attempted but failed
2910 * - any failure from libmeta calls
2911 * - node not in the member list
2912 * -2 - Withdrawal not attempted since
2913 * - this set had no drives in set
2914 * - this node not joined to set
2915 * - set is not a multinode set
2916 */
2917 extern int
meta_set_withdraw(mdsetname_t * sp,md_error_t * ep)2918 meta_set_withdraw(
2919 mdsetname_t *sp,
2920 md_error_t *ep
2921 )
2922 {
2923 md_set_desc *sd;
2924 md_drive_desc *dd = 0;
2925 md_mnnode_desc *nd, my_nd;
2926 int rval = 0;
2927 md_setkey_t *cl_sk;
2928 md_error_t xep = mdnullerror;
2929 int set_halted = 0;
2930 int suspendall_flag = 0;
2931 int suspend1_flag = 0;
2932 bool_t stale_bool = FALSE;
2933 mddb_config_t c;
2934 int node_id_list[1];
2935 sigset_t oldsigs;
2936 int send_reinit = 0;
2937
2938 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2939 return (-1);
2940 }
2941
2942 /* Must be a multinode diskset */
2943 if (!MD_MNSET_DESC(sd)) {
2944 (void) mderror(ep, MDE_NOT_MN, sp->setname);
2945 return (-1);
2946 }
2947
2948 /* Make sure we are blocking all signals */
2949 if (procsigs(TRUE, &oldsigs, &xep) < 0)
2950 mdclrerror(&xep);
2951
2952 /*
2953 * Lock the set on current set members.
2954 * For MN diskset lock_set and SUSPEND are used to protect against
2955 * other meta* commands running on the other nodes.
2956 */
2957 nd = sd->sd_nodelist;
2958 while (nd) {
2959 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2960 nd = nd->nd_next;
2961 continue;
2962 }
2963 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2964 rval = -1;
2965 goto out;
2966 }
2967 nd = nd->nd_next;
2968 }
2969 /*
2970 * Lock out other meta* commands by suspending
2971 * class 1 messages across the diskset.
2972 */
2973 nd = sd->sd_nodelist;
2974 while (nd) {
2975 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2976 nd = nd->nd_next;
2977 continue;
2978 }
2979 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2980 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2981 rval = -1;
2982 goto out;
2983 }
2984 suspend1_flag = 1;
2985 nd = nd->nd_next;
2986 }
2987
2988 /* Get list of drives - needed in case of failure */
2989 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2990 ep)) == NULL) {
2991 /* Error getting drives in list */
2992 if (! mdisok(ep)) {
2993 rval = -1;
2994 goto out2;
2995 }
2996 /* no drives in list */
2997 rval = -2;
2998 goto out2;
2999 }
3000
3001 /*
3002 * Verify that this host is a member (in the host list) of the set.
3003 */
3004 nd = sd->sd_nodelist;
3005 while (nd) {
3006 if (strcmp(mynode(), nd->nd_nodename) == 0) {
3007 break;
3008 }
3009 nd = nd->nd_next;
3010 }
3011 if (!nd) {
3012 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
3013 sd->sd_mn_mynode->nd_nodename, NULL,
3014 sp->setname);
3015 rval = -1;
3016 goto out2;
3017 }
3018
3019 /*
3020 * Call metaget_setownership that calls each node in diskset and
3021 * marks in set descriptor if node is an owner of the set or not.
3022 * metaget_setownership checks to see if a node is an owner by
3023 * checking to see if that node's kernel has the mddb loaded.
3024 * If a node had panic'd during a reconfig or an
3025 * add/delete/join/withdraw operation, the other nodes' node
3026 * records may not reflect the current state of the diskset,
3027 * so calling metaget_setownership is the safest thing to do.
3028 */
3029 if (metaget_setownership(sp, ep) == -1) {
3030 rval = -1;
3031 goto out2;
3032 }
3033
3034 /*
3035 * Verify that this node is joined
3036 * to diskset (i.e. is an owner of the diskset).
3037 */
3038 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
3039 rval = -2;
3040 goto out2;
3041 }
3042
3043 /*
3044 * For a MN diskset, only withdraw master if it is
3045 * the only joined node.
3046 */
3047 if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
3048 nd = sd->sd_nodelist;
3049 while (nd) {
3050 /* Skip my node since checking for other owners */
3051 if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
3052 nd = nd->nd_next;
3053 continue;
3054 }
3055 /* If another owner node if found, error */
3056 if (nd->nd_flags & MD_MN_NODE_OWN) {
3057 (void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
3058 sp->setno,
3059 sd->sd_mn_mynode->nd_nodename, NULL,
3060 sp->setname);
3061 rval = -1;
3062 goto out2;
3063 }
3064 nd = nd->nd_next;
3065 }
3066 }
3067
3068 /*
3069 * Is current set STALE?
3070 */
3071 (void) memset(&c, 0, sizeof (c));
3072 c.c_id = 0;
3073 c.c_setno = sp->setno;
3074 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
3075 (void) mdstealerror(ep, &c.c_mde);
3076 rval = -1;
3077 goto out;
3078 }
3079 if (c.c_flags & MDDB_C_STALE) {
3080 stale_bool = TRUE;
3081 }
3082
3083 /*
3084 * Notify rpc.mdcommd on all nodes of a nodelist change.
3085 * Start by suspending rpc.mdcommd (which drains it of all messages),
3086 * then change the nodelist followed by a reinit and resume.
3087 */
3088 nd = sd->sd_nodelist;
3089 while (nd) {
3090 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3091 nd = nd->nd_next;
3092 continue;
3093 }
3094
3095 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
3096 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
3097 rval = -1;
3098 goto out;
3099 }
3100 suspendall_flag = 1;
3101 nd = nd->nd_next;
3102 }
3103
3104 /*
3105 * Withdraw the set - halt set.
3106 * This will fail if any I/O is occuring to any metadevice which
3107 * includes a resync to a mirror metadevice.
3108 */
3109 set_halted = 1;
3110 if (halt_set(sp, ep)) {
3111 /* Was set actually halted? */
3112 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
3113 set_halted = 0;
3114 }
3115 rval = -1;
3116 goto out;
3117 }
3118
3119 /* Change to nodelist so need to send reinit to rpc.mdcommd */
3120 send_reinit = 1;
3121
3122 /* Reset master on withdrawn node */
3123 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
3124 MD_MN_INVALID_NID, ep)) {
3125 rval = -1;
3126 goto out;
3127 }
3128
3129 /* Mark my node as withdrawn and send to other nodes */
3130 nd = sd->sd_nodelist;
3131 my_nd = *(sd->sd_mn_mynode); /* structure copy */
3132 my_nd.nd_next = NULL;
3133 while (nd) {
3134 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3135 nd = nd->nd_next;
3136 continue;
3137 }
3138 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3139 MD_NR_WITHDRAW, NULL, ep)) {
3140 rval = -1;
3141 goto out;
3142 }
3143 nd = nd->nd_next;
3144 }
3145
3146 /*
3147 * If withdrawn node is a mirror owner, reset mirror owner
3148 * to NULL. If an error occurs, print a warning and continue.
3149 * Don't fail metaset because of mirror owner reset problem since
3150 * next node to grab mirror will resolve this issue.
3151 * Before next node grabs mirrors, metaset will show the withdrawn
3152 * node as owner which is why an attempt to reset the mirror owner
3153 * is made.
3154 */
3155 node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */
3156 nd = sd->sd_nodelist;
3157 while (nd) {
3158 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3159 nd = nd->nd_next;
3160 continue;
3161 }
3162 if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
3163 1, &node_id_list[0], &xep) == 01) {
3164 mde_perror(&xep, dgettext(TEXT_DOMAIN,
3165 "Unable to reset mirror owner on node %s"),
3166 nd->nd_nodename);
3167 mdclrerror(&xep);
3168 }
3169 nd = nd->nd_next;
3170 }
3171
3172 out:
3173 if (rval == -1) {
3174 /* Rejoin node - Mark node as joined and send to other nodes */
3175 nd = sd->sd_nodelist;
3176 my_nd = *(sd->sd_mn_mynode); /* structure copy */
3177 my_nd.nd_next = NULL;
3178 while (nd) {
3179 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3180 nd = nd->nd_next;
3181 continue;
3182 }
3183 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3184 MD_NR_JOIN, NULL, &xep)) {
3185 mdclrerror(&xep);
3186 }
3187 nd = nd->nd_next;
3188 }
3189
3190 /* Set master on withdrawn node */
3191 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
3192 sd->sd_mn_master_nodenm,
3193 sd->sd_mn_master_nodeid, &xep)) {
3194 mdclrerror(&xep);
3195 }
3196
3197 /* Join set if halt_set had succeeded */
3198 if (set_halted) {
3199 /*
3200 * Causes mddbs to be loaded into the kernel.
3201 * Set the force flag so that replica locations can be
3202 * loaded into the kernel even if a mediator node was
3203 * unavailable. This allows a node to join an MO
3204 * diskset when there are sufficient replicas available,
3205 * but a mediator node in unavailable.
3206 */
3207 if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) {
3208 mdclrerror(&xep);
3209 }
3210 /* If set previously stale - make it so at re-join */
3211 if (snarf_set(sp, stale_bool, &xep) != 0) {
3212 mdclrerror(&xep);
3213 (void) halt_set(sp, &xep);
3214 mdclrerror(&xep);
3215 }
3216 }
3217 }
3218
3219 /*
3220 * Notify rpc.mdcommd on all nodes of a nodelist change.
3221 * Send reinit command to mdcommd which forces it to get
3222 * fresh set description.
3223 */
3224 if (send_reinit) {
3225 /* Send reinit */
3226 nd = sd->sd_nodelist;
3227 while (nd) {
3228 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3229 nd = nd->nd_next;
3230 continue;
3231 }
3232
3233 /* Class is ignored for REINIT */
3234 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3235 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3236 /*
3237 * We are here because we failed to resume
3238 * rpc.mdcommd. However we potentially have
3239 * an error from the previous call.
3240 * If the previous call did fail, we
3241 * capture that error and generate a perror
3242 * withthe string, "Unable to resume...".
3243 * Setting rval to -1 ensures that in the
3244 * next iteration of the loop, ep is not
3245 * clobbered.
3246 */
3247 if (rval == 0)
3248 (void) mdstealerror(ep, &xep);
3249 else
3250 mdclrerror(&xep);
3251 rval = -1;
3252 mde_perror(ep, dgettext(TEXT_DOMAIN,
3253 "Unable to reinit rpc.mdcommd."));
3254 }
3255 nd = nd->nd_next;
3256 }
3257 }
3258
3259 out2:
3260 /*
3261 * Unlock diskset by resuming messages across the diskset.
3262 * Just resume all classes so that resume is the same whether
3263 * just one class was locked or all classes were locked.
3264 */
3265 if ((suspend1_flag) || (suspendall_flag)) {
3266 nd = sd->sd_nodelist;
3267 while (nd) {
3268 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3269 nd = nd->nd_next;
3270 continue;
3271 }
3272 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3273 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
3274 /*
3275 * We are here because we failed to resume
3276 * rpc.mdcommd. However we potentially have
3277 * an error from the previous call
3278 * If the previous call did fail, we capture
3279 * that error and generate a perror with
3280 * the string, "Unable to resume...".
3281 * Setting rval to -1 ensures that in the
3282 * next iteration of the loop, ep is not
3283 * clobbered.
3284 */
3285 if (rval == 0)
3286 (void) mdstealerror(ep, &xep);
3287 else
3288 mdclrerror(&xep);
3289 rval = -1;
3290 mde_perror(ep, dgettext(TEXT_DOMAIN,
3291 "Unable to resume rpc.mdcommd."));
3292 }
3293 nd = nd->nd_next;
3294 }
3295 meta_ping_mnset(sp->setno);
3296 }
3297
3298 /*
3299 * Unlock set. This flushes the caches on the servers.
3300 */
3301 cl_sk = cl_get_setkey(sp->setno, sp->setname);
3302 nd = sd->sd_nodelist;
3303 while (nd) {
3304 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3305 nd = nd->nd_next;
3306 continue;
3307 }
3308 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
3309 if (rval == 0)
3310 (void) mdstealerror(ep, &xep);
3311 else
3312 mdclrerror(&xep);
3313 rval = -1;
3314 }
3315 nd = nd->nd_next;
3316 }
3317
3318 /*
3319 * call metaflushsetnames to reset local cache for master and
3320 * node information.
3321 */
3322 metaflushsetname(sp);
3323
3324 /* release signals back to what they were on entry */
3325 if (procsigs(FALSE, &oldsigs, &xep) < 0)
3326 mdclrerror(&xep);
3327
3328 return (rval);
3329
3330 }
3331
3332 /*
3333 * Update nodelist with cluster member information.
3334 * A node not in the member list will be marked
3335 * as not ALIVE and not OWN.
3336 * A node in the member list will be marked ALIVE, but
3337 * the OWN bit will not be changed.
3338 *
3339 * If mynode isn't in the membership list, fail causing
3340 * another reconfig cycle to be started since a non-member
3341 * node shouldn't be taking part in the reconfig cycle.
3342 *
3343 * Return values:
3344 * 0 - No problem.
3345 * 1 - Any failure including RPC failure to my node.
3346 */
3347 int
meta_reconfig_update_nodelist(mdsetname_t * sp,mndiskset_membershiplist_t * nl,md_set_desc * sd,md_error_t * ep)3348 meta_reconfig_update_nodelist(
3349 mdsetname_t *sp,
3350 mndiskset_membershiplist_t *nl,
3351 md_set_desc *sd,
3352 md_error_t *ep
3353 )
3354 {
3355 mndiskset_membershiplist_t *nl2;
3356 md_mnnode_desc *nd;
3357 md_error_t xep = mdnullerror;
3358 int rval = 0;
3359
3360 /*
3361 * Walk through nodelist, checking to see if each
3362 * node is in the member list.
3363 * If node is not a member, reset ALIVE and OWN node flag.
3364 * If node is a member, set ALIVE.
3365 * If mynode's OWN flag gets reset, then halt the diskset on this node.
3366 */
3367 nd = sd->sd_nodelist;
3368 while (nd) {
3369 nl2 = nl;
3370 while (nl2) {
3371 /* If node is in member list, set ALIVE */
3372 if (nl2->msl_node_id == nd->nd_nodeid) {
3373 nd->nd_flags |= MD_MN_NODE_ALIVE;
3374 break;
3375 } else {
3376 nl2 = nl2->next;
3377 }
3378 /* node is not in member list, mark !ALIVE and !OWN */
3379 if (nl2 == NULL) {
3380 /* If node is mynode, then halt set if needed */
3381 if (strcmp(mynode(), nd->nd_nodename) == 0) {
3382 /*
3383 * This shouldn't happen, but just
3384 * in case... Any node not in the
3385 * membership list should be dead and
3386 * not running reconfig step1.
3387 */
3388 if (nd->nd_flags & MD_MN_NODE_OWN) {
3389 if (halt_set(sp, &xep)) {
3390 mde_perror(&xep, "");
3391 mdclrerror(&xep);
3392 }
3393 }
3394 /*
3395 * Return failure since this node
3396 * (mynode) is not in the membership
3397 * list, but process the rest of the
3398 * nodelist first so that rpc.metad
3399 * can be updated with the latest
3400 * membership information.
3401 */
3402 (void) mddserror(ep,
3403 MDE_DS_NOTINMEMBERLIST,
3404 sp->setno, nd->nd_nodename, NULL,
3405 sp->setname);
3406 rval = 1;
3407 }
3408 nd->nd_flags &= ~MD_MN_NODE_ALIVE;
3409 nd->nd_flags &= ~MD_MN_NODE_OWN;
3410 }
3411 }
3412 nd = nd->nd_next;
3413 }
3414
3415 /* Send this information to rpc.metad */
3416 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
3417 MD_NR_SET, MNSET_IN_RECONFIG, &xep)) {
3418 /* Return failure if can't send node flags to rpc.metad */
3419 if (rval == 0) {
3420 (void) mdstealerror(ep, &xep);
3421 rval = 1;
3422 }
3423 }
3424 return (rval);
3425 }
3426
3427 /*
3428 * Choose master determines the master for a diskset.
3429 * Each node determines the master on its own and
3430 * adds this information to its local rpc.metad nodelist
3431 * and also sends it to the kernel.
3432 *
3433 * Nodelist in set descriptor (sd) is sorted in
3434 * monotonically increasing sequence of nodeid.
3435 *
3436 * Return values:
3437 * 0 - No problem.
3438 * 205 - There was an RPC problem to another node.
3439 * -1 - There was an error. This could be an RPC error to my node.
3440 * This is a catastrophic failure causing node to panic.
3441 */
3442 int
meta_reconfig_choose_master_for_set(mdsetname_t * sp,md_set_desc * sd,md_error_t * ep)3443 meta_reconfig_choose_master_for_set(
3444 mdsetname_t *sp,
3445 md_set_desc *sd,
3446 md_error_t *ep
3447 )
3448 {
3449 int is_owner;
3450 md_mnset_record *mnsr = NULL;
3451 int lowest_alive_nodeid = 0;
3452 uint_t master_nodeid;
3453 md_mnnode_desc *nd, *nd2;
3454 md_mnnode_record *nr;
3455 md_drive_desc *dd;
3456 md_setkey_t *cl_sk;
3457 int rval = 0;
3458 md_error_t xep = mdnullerror;
3459 mddb_setflags_config_t sf;
3460
3461 /*
3462 * Is current node joined to diskset?
3463 * Don't trust flags, really check to see if mddb is snarfed.
3464 */
3465 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
3466 /*
3467 * If a node is joined to the diskset, this node checks
3468 * to see if the current master of the diskset is valid and
3469 * is still in the membership list (ALIVE) and is
3470 * still joined (OWN). Need to verify if master is
3471 * really joined - don't trust the flags. (Can trust
3472 * ALIVE since set during earlier part of reconfig cycle.)
3473 * If the current master is valid, still in the membership
3474 * list and joined, then master is not changed on this node.
3475 * Just return.
3476 *
3477 * Verify that nodeid is valid before accessing masternode.
3478 */
3479 if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
3480 (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
3481 if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
3482 &is_owner, ep) == -1) {
3483 /* If RPC failure to another node return 205 */
3484 if ((mdanyrpcerror(ep)) &&
3485 (sd->sd_mn_mynode->nd_nodeid !=
3486 sd->sd_mn_master_nodeid)) {
3487 return (205);
3488 } else {
3489 /* Any other failure */
3490 return (-1);
3491 }
3492 } else {
3493 if (is_owner == TRUE) {
3494
3495 meta_mc_log(MC_LOG5, dgettext(
3496 TEXT_DOMAIN, "Set %s previous "
3497 "master chosen %s (%d): %s"),
3498 sp->setname,
3499 sd->sd_mn_master_nodenm,
3500 sd->sd_mn_master_nodeid,
3501 meta_print_hrtime(gethrtime() -
3502 start_time));
3503
3504 /* Previous master is ok - done */
3505 return (0);
3506 }
3507 }
3508 }
3509
3510 /*
3511 * If current master is no longer in the membership list or
3512 * is no longer joined, then this node uses the following
3513 * algorithm:
3514 * - node calls RPC routine clnt_ownset to get latest
3515 * information on which nodes are owners of diskset.
3516 * clnt_ownset checks on each node to see if its kernel
3517 * has that diskset snarfed.
3518 */
3519 nd = sd->sd_nodelist;
3520 while (nd) {
3521 /* Don't consider node that isn't in member list */
3522 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3523 nd = nd->nd_next;
3524 continue;
3525 }
3526
3527 if (clnt_ownset(nd->nd_nodename, sp,
3528 &is_owner, ep) == -1) {
3529 /* If RPC failure to another node return 205 */
3530 if ((mdanyrpcerror(ep)) &&
3531 (sd->sd_mn_mynode->nd_nodeid !=
3532 nd->nd_nodeid)) {
3533 return (205);
3534 } else {
3535 /* Any other failure */
3536 return (-1);
3537 }
3538 }
3539
3540 /*
3541 * Set owner flag for each node based on whether
3542 * that node really has a diskset mddb snarfed in
3543 * or not.
3544 */
3545 if (is_owner == TRUE)
3546 nd->nd_flags |= MD_MN_NODE_OWN;
3547 else
3548 nd->nd_flags &= ~MD_MN_NODE_OWN;
3549
3550 nd = nd->nd_next;
3551 }
3552
3553 /*
3554 * - node walks through nodelist looking for nodes that are
3555 * owners of the diskset that are in the membership list.
3556 * - for each owner, node calls RPC routine clnt_getset to
3557 * see if that node has its node record set to OK.
3558 * - If so, master is chosen to be this owner node.
3559 */
3560 nd = sd->sd_nodelist;
3561 while (nd) {
3562 /* Don't consider node that isn't in member list */
3563 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3564 nd = nd->nd_next;
3565 continue;
3566 }
3567
3568 /* Don't consider a node that isn't an owner */
3569 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3570 nd = nd->nd_next;
3571 continue;
3572 }
3573
3574 /* Does node has its own node record set to OK? */
3575 if (clnt_mngetset(nd->nd_nodename, sp->setname,
3576 MD_SET_BAD, &mnsr, ep) == -1) {
3577 /* If RPC failure to another node return 205 */
3578 if ((mdanyrpcerror(ep)) &&
3579 (sd->sd_mn_mynode->nd_nodeid !=
3580 nd->nd_nodeid)) {
3581 return (205);
3582 } else {
3583 /* Any other failure */
3584 return (-1);
3585 }
3586 }
3587 nr = mnsr->sr_nodechain;
3588 while (nr) {
3589 if (nd->nd_nodeid == nr->nr_nodeid) {
3590 if (nr->nr_flags & MD_MN_NODE_OK) {
3591 /* Found a master */
3592 free_sr(
3593 (md_set_record *)mnsr);
3594 goto found_master;
3595 }
3596 }
3597 nr = nr->nr_next;
3598 }
3599 free_sr((md_set_record *)mnsr);
3600 nd = nd->nd_next;
3601 }
3602
3603 /*
3604 * - If no owner node has its own node record on its own node
3605 * set to OK, then this node checks all of the non-owner
3606 * nodes that are in the membership list.
3607 * - for each non-owner, node calls RPC routine clnt_getset to
3608 * see if that node has its node record set to OK.
3609 * - If set doesn't exist, don't choose node for master.
3610 * - If so, master is chosen to be this non-owner node.
3611 *
3612 */
3613 nd = sd->sd_nodelist;
3614 while (nd) {
3615 /* Don't consider node that isn't in member list */
3616 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3617 nd = nd->nd_next;
3618 continue;
3619 }
3620
3621 /* Only checking non-owner nodes this time around */
3622 if (nd->nd_flags & MD_MN_NODE_OWN) {
3623 nd = nd->nd_next;
3624 continue;
3625 }
3626
3627 /* Does node has its own node record set to OK? */
3628 if (clnt_mngetset(nd->nd_nodename, sp->setname,
3629 MD_SET_BAD, &mnsr, ep) == -1) {
3630 /*
3631 * If set doesn't exist on non-owner node,
3632 * don't consider this node for master.
3633 */
3634 if (mdiserror(ep, MDE_NO_SET)) {
3635 nd = nd->nd_next;
3636 continue;
3637 } else if ((mdanyrpcerror(ep)) &&
3638 (sd->sd_mn_mynode->nd_nodeid !=
3639 nd->nd_nodeid)) {
3640 /* RPC failure to another node */
3641 return (205);
3642 } else {
3643 /* Any other failure */
3644 return (-1);
3645 }
3646 }
3647 nr = mnsr->sr_nodechain;
3648 while (nr) {
3649 if (nd->nd_nodeid == nr->nr_nodeid) {
3650 if (nr->nr_flags & MD_MN_NODE_OK) {
3651 /* Found a master */
3652 free_sr(
3653 (md_set_record *)mnsr);
3654 goto found_master;
3655 }
3656 }
3657 nr = nr->nr_next;
3658 }
3659 free_sr((md_set_record *)mnsr);
3660 nd = nd->nd_next;
3661 }
3662
3663 /*
3664 * - If no node can be found that has its own node record on
3665 * its node to be set to OK, then all alive nodes
3666 * were in the process of being added to or deleted
3667 * from set. Each alive node will remove all
3668 * information pertaining to this set from its node.
3669 *
3670 * If all nodes in set are ALIVE, then call sdssc end routines
3671 * since set was truly being initially created or destroyed.
3672 */
3673 goto delete_set;
3674 } else {
3675
3676 /*
3677 * If node is not joined to diskset, then this
3678 * node uses the following algorithm:
3679 * - If unjoined node doesn't have a node record for itself,
3680 * just delete the diskset since diskset was in the
3681 * process of being created.
3682 * - node needs to find master of diskset before
3683 * reconfig cycle, if a master existed.
3684 * - node calls RPC routine clnt_ownset to get latest
3685 * information on which nodes are owners of diskset.
3686 * clnt_ownset checks on each node to see if its
3687 * kernel has that diskset snarfed.
3688 */
3689
3690 /*
3691 * Is my node in the set description?
3692 * If not, delete the set from this node.
3693 * sr2setdesc sets sd_mn_mynode pointer to the node
3694 * descriptor for this node if there was a node
3695 * record for this node.
3696 *
3697 */
3698 if (sd->sd_mn_mynode == NULL) {
3699 goto delete_set;
3700 }
3701
3702 nd = sd->sd_nodelist;
3703 while (nd) {
3704 /* Don't consider node that isn't in member list */
3705 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3706 nd = nd->nd_next;
3707 continue;
3708 }
3709
3710 if (clnt_ownset(nd->nd_nodename, sp,
3711 &is_owner, ep) == -1) {
3712 /* If RPC failure to another node return 205 */
3713 if ((mdanyrpcerror(ep)) &&
3714 (sd->sd_mn_mynode->nd_nodeid !=
3715 nd->nd_nodeid)) {
3716 return (205);
3717 } else {
3718 /* Any other failure */
3719 return (-1);
3720 }
3721 }
3722
3723 /*
3724 * Set owner flag for each node based on whether
3725 * that node really has a diskset mddb snarfed in
3726 * or not.
3727 */
3728 if (is_owner == TRUE)
3729 nd->nd_flags |= MD_MN_NODE_OWN;
3730 else
3731 nd->nd_flags &= ~MD_MN_NODE_OWN;
3732
3733 nd = nd->nd_next;
3734 }
3735
3736 /*
3737 * - node walks through nodelist looking for nodes that
3738 * are owners of the diskset that are in
3739 * the membership list.
3740 * - for each owner, node calls RPC routine clnt_getset to
3741 * see if that node has a master set and to get the
3742 * diskset description.
3743 * - If the owner node has a set description that doesn't
3744 * include the non-joined node in the nodelist, this node
3745 * removes its set description of that diskset
3746 * (i.e. removes the set from its local mddbs). This is
3747 * handling the case of when a node was removed from a
3748 * diskset while it was not in the cluster membership
3749 * list.
3750 * - If that node has a master set and the master is in the
3751 * membership list and is an owner, then either this was
3752 * the master from before the reconfig cycle or this
3753 * node has already chosen a new master - either way,
3754 * the master value is valid as long as it is in the
3755 * membership list and is an owner
3756 * - master is chosen to be owner node's master
3757 */
3758 nd = sd->sd_nodelist;
3759 while (nd) {
3760 /* Don't consider node that isn't in member list */
3761 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3762 nd = nd->nd_next;
3763 continue;
3764 }
3765
3766 /* Don't consider a node that isn't an owner */
3767 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3768 nd = nd->nd_next;
3769 continue;
3770 }
3771
3772 /* Get owner node's set record */
3773 if (clnt_mngetset(nd->nd_nodename, sp->setname,
3774 MD_SET_BAD, &mnsr, ep) == -1) {
3775 /* If RPC failure to another node return 205 */
3776 if ((mdanyrpcerror(ep)) &&
3777 (sd->sd_mn_mynode->nd_nodeid !=
3778 nd->nd_nodeid)) {
3779 return (205);
3780 } else {
3781 /* Any other failure */
3782 return (-1);
3783 }
3784 }
3785
3786 /* Is this node in the owner node's set record */
3787 nr = mnsr->sr_nodechain;
3788 while (nr) {
3789 if (sd->sd_mn_mynode->nd_nodeid ==
3790 nr->nr_nodeid) {
3791 break;
3792 }
3793 nr = nr->nr_next;
3794 }
3795 if (nr == NULL) {
3796 /* my node not found - delete set */
3797 free_sr((md_set_record *)mnsr);
3798 goto delete_set;
3799 }
3800
3801 /* Is owner's node's master valid? */
3802 master_nodeid = mnsr->sr_master_nodeid;
3803 free_sr((md_set_record *)mnsr);
3804 if (master_nodeid == MD_MN_INVALID_NID) {
3805 nd = nd->nd_next;
3806 continue;
3807 }
3808
3809 nd2 = sd->sd_nodelist;
3810 while (nd2) {
3811 if ((nd2->nd_nodeid == master_nodeid) &&
3812 (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
3813 (nd2->nd_flags & MD_MN_NODE_OWN)) {
3814 nd = nd2;
3815 goto found_master;
3816 }
3817 nd2 = nd2->nd_next;
3818 }
3819 nd = nd->nd_next;
3820 }
3821
3822 /*
3823 * - If no owner node has a valid master, then follow
3824 * algorithm of when a node is joined to the diskset.
3825 * - node walks through nodelist looking for nodes that are
3826 * owners of the diskset that are in the membership list.
3827 * - for each owner, node calls RPC routine clnt_getset to
3828 * see if that node has its node record set to OK.
3829 * - If so, master is chosen to be this owner node.
3830 */
3831 nd = sd->sd_nodelist;
3832 while (nd) {
3833 /* Don't consider node that isn't in member list */
3834 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3835 nd = nd->nd_next;
3836 continue;
3837 }
3838
3839 /* Don't consider a node that isn't an owner */
3840 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3841 nd = nd->nd_next;
3842 continue;
3843 }
3844
3845 /* Does node has its own node record set to OK? */
3846 if (clnt_mngetset(nd->nd_nodename, sp->setname,
3847 MD_SET_BAD, &mnsr, ep) == -1) {
3848 /* If RPC failure to another node return 205 */
3849 if ((mdanyrpcerror(ep)) &&
3850 (sd->sd_mn_mynode->nd_nodeid !=
3851 nd->nd_nodeid)) {
3852 return (205);
3853 } else {
3854 /* Any other failure */
3855 return (-1);
3856 }
3857 }
3858 nr = mnsr->sr_nodechain;
3859 while (nr) {
3860 if (nd->nd_nodeid == nr->nr_nodeid) {
3861 if (nr->nr_flags & MD_MN_NODE_OK) {
3862 /* Found a master */
3863 free_sr(
3864 (md_set_record *)mnsr);
3865 goto found_master;
3866 }
3867 }
3868 nr = nr->nr_next;
3869 }
3870 free_sr((md_set_record *)mnsr);
3871 nd = nd->nd_next;
3872 }
3873
3874 /*
3875 * - If no owner node has its own node record on its own node
3876 * set to OK, then this node checks all of the non-owner
3877 * nodes that are in the membership list.
3878 * - for each non-owner, node calls RPC routine clnt_getset to
3879 * see if that node has its node record set to OK.
3880 * - If set doesn't exist, don't choose node for master.
3881 * - If this node doesn't exist in the nodelist on any of the
3882 * non-owner nodes, this node removes its set description
3883 * of that diskset (i.e. removes the set from its local
3884 * mddbs). This is handling the case of when a node was
3885 * removed from a diskset while it was not in the
3886 * cluster membership list.
3887 * - If non-owner node has its node record set to OK and if
3888 * this node hasn't removed this diskset (step directly
3889 * before this one), then the master is chosen to be this
3890 * non-owner node.
3891 */
3892 nd = sd->sd_nodelist;
3893 while (nd) {
3894 /* Don't consider node that isn't in member list */
3895 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3896 nd->nd_flags |= MD_MN_NODE_DEL;
3897 nd = nd->nd_next;
3898 continue;
3899 }
3900
3901 /* Don't consider owner nodes since none are OK */
3902 if (nd->nd_flags & MD_MN_NODE_OWN) {
3903 nd->nd_flags |= MD_MN_NODE_DEL;
3904 nd = nd->nd_next;
3905 continue;
3906 }
3907
3908 /*
3909 * Don't need to get nodelist from my node since
3910 * this is where sd_nodelist was obtained.
3911 */
3912 if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3913 nd = nd->nd_next;
3914 continue;
3915 }
3916
3917 /*
3918 * If node has already been decided against for
3919 * master, then skip it.
3920 */
3921 if (nd->nd_flags & MD_MN_NODE_DEL) {
3922 nd = nd->nd_next;
3923 continue;
3924 }
3925
3926 /*
3927 * Does node in my nodelist have its own node
3928 * record marked OK on its node? And does node
3929 * in my nodelist exist on all other nodes?
3930 * Don't want to choose a node for master unless
3931 * that node is marked OK on its own node and that
3932 * node exists on all other alive nodes.
3933 *
3934 * This is guarding against the case when several
3935 * nodes are down and one of the downed nodes is
3936 * deleted from the diskset. When the down nodes
3937 * are rebooted into the cluster, you don't want
3938 * any node to pick the deleted node as the master.
3939 */
3940 if (clnt_mngetset(nd->nd_nodename, sp->setname,
3941 MD_SET_BAD, &mnsr, ep) == -1) {
3942 /*
3943 * If set doesn't exist on non-owner node,
3944 * don't consider this node for master.
3945 */
3946 if (mdiserror(ep, MDE_NO_SET)) {
3947 nd->nd_flags |= MD_MN_NODE_DEL;
3948 nd = nd->nd_next;
3949 continue;
3950 } else if (mdanyrpcerror(ep)) {
3951 /* RPC failure to another node */
3952 return (205);
3953 } else {
3954 /* Any other failure */
3955 return (-1);
3956 }
3957 }
3958 /*
3959 * Is my node in the nodelist gotten from the other
3960 * node? If not, then remove the set from my node
3961 * since set was deleted from my node while my node
3962 * was out of the cluster.
3963 */
3964 nr = mnsr->sr_nodechain;
3965 while (nr) {
3966 if (sd->sd_mn_mynode->nd_nodeid ==
3967 nr->nr_nodeid) {
3968 break;
3969 }
3970 nr = nr->nr_next;
3971 }
3972 if (nr == NULL) {
3973 /* my node not found - delete set */
3974 free_sr((md_set_record *)mnsr);
3975 goto delete_set;
3976 }
3977
3978 /* Is node being checked marked OK on its own node? */
3979 nr = mnsr->sr_nodechain;
3980 while (nr) {
3981 if (nd->nd_nodeid == nr->nr_nodeid) {
3982 if (!(nr->nr_flags & MD_MN_NODE_OK)) {
3983 nd->nd_flags |= MD_MN_NODE_DEL;
3984 }
3985 break;
3986 }
3987 nr = nr->nr_next;
3988 }
3989 /*
3990 * If node being checked doesn't exist on its
3991 * own node - don't choose it as master.
3992 */
3993 if (nr == NULL) {
3994 nd->nd_flags |= MD_MN_NODE_DEL;
3995 }
3996
3997 /*
3998 * Check every node in my node's nodelist against
3999 * the nodelist gotten from the other node.
4000 * If a node in my node's nodelist is not found in the
4001 * other node's nodelist, then set the DEL flag.
4002 */
4003 nd2 = sd->sd_nodelist;
4004 while (nd2) {
4005 nr = mnsr->sr_nodechain;
4006 while (nr) {
4007 if (nd2->nd_nodeid == nr->nr_nodeid) {
4008 break;
4009 }
4010 nr = nr->nr_next;
4011 }
4012 /* nd2 not found in other node's nodelist */
4013 if (nr == NULL) {
4014 nd2->nd_flags |= MD_MN_NODE_DEL;
4015 }
4016 nd2 = nd2->nd_next;
4017 }
4018
4019 free_sr((md_set_record *)mnsr);
4020 nd = nd->nd_next;
4021 }
4022
4023 /*
4024 * Rescan list look for node that has not been marked DEL.
4025 * First node found is the master.
4026 */
4027 nd = sd->sd_nodelist;
4028 while (nd) {
4029 if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4030 break;
4031 }
4032 nd = nd->nd_next;
4033 continue;
4034 }
4035 if (nd) {
4036 /* Found a master */
4037 goto found_master;
4038 }
4039
4040 /*
4041 * - If no node can be found that has its own node record on
4042 * its node to be set to OK, then all alive nodes
4043 * were in the process of being added to or deleted
4044 * from set. Each alive node will remove all
4045 * information pertaining to this set from its node.
4046 *
4047 * If all nodes in set are ALIVE, then call sdssc end routines
4048 * since set was truly being initially created or destroyed.
4049 */
4050 goto delete_set;
4051 }
4052
4053 found_master:
4054 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4055 "Set %s master chosen %s (%d): %s"),
4056 sp->setname, nd->nd_nodename, nd->nd_nodeid,
4057 meta_print_hrtime(gethrtime() - start_time));
4058
4059 if (clnt_lock_set(mynode(), sp, ep) == -1) {
4060 return (-1);
4061 }
4062
4063 cl_sk = cl_get_setkey(sp->setno, sp->setname);
4064
4065 if (clnt_mnsetmaster(mynode(), sp,
4066 nd->nd_nodename, nd->nd_nodeid, ep)) {
4067 rval = -1;
4068 } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
4069 /* If this node is new master, set flag in this node's kernel */
4070 (void) memset(&sf, 0, sizeof (sf));
4071 sf.sf_setno = sp->setno;
4072 sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
4073 /* Use magic to help protect ioctl against attack. */
4074 sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4075 sf.sf_flags = MDDB_NM_SET;
4076
4077 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4078 "Setting new master flag for set %s: %s"),
4079 sp->setname, meta_print_hrtime(gethrtime() - start_time));
4080
4081 /*
4082 * Fail reconfig cycle if ioctl fails since it is critical
4083 * to set new master flag.
4084 */
4085 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
4086 NULL) != NULL) {
4087 (void) mdstealerror(ep, &sf.sf_mde);
4088 rval = -1;
4089 }
4090 }
4091
4092 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4093 if (rval == 0) {
4094 (void) mdstealerror(ep, &xep);
4095 rval = -1;
4096 }
4097 }
4098
4099 cl_set_setkey(NULL);
4100
4101 metaflushsetname(sp);
4102
4103 return (rval);
4104
4105 delete_set:
4106 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4107 "Master not chosen, deleting set %s: %s"),
4108 sp->setname, meta_print_hrtime(gethrtime() - start_time));
4109
4110 /*
4111 * Remove all set information from this node:
4112 * - node records for this set
4113 * - drive records for this set
4114 * - set record for this set
4115 * (Only do this on this node since each node
4116 * will do it for its own local mddb.)
4117 *
4118 * If all nodes in set are ALIVE, then
4119 * the lowest numbered ALIVE nodeid in set
4120 * (irregardless of whether an owner node or not) will
4121 * call the DCS service to cleanup for create/delete of set.
4122 * sdssc_create_end(cleanup) if set was being created or
4123 * sdssc_delete_end(cleanup) if set was being deleted.
4124 * A node record with flag ADD denotes a set being
4125 * created. A node record with flag DEL denotes a
4126 * set being deleted.
4127 */
4128 nd = sd->sd_nodelist;
4129 while (nd) {
4130 /* Found a node that isn't alive */
4131 if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
4132 break;
4133
4134 /* Is my node the lowest numbered ALIVE node? */
4135 if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
4136 break;
4137 }
4138 nd = nd->nd_next;
4139 }
4140 if (nd == NULL) {
4141 /* All nodes ALIVE and this is the lowest nodeid */
4142 lowest_alive_nodeid = 1;
4143 }
4144
4145 if (clnt_lock_set(mynode(), sp, ep) == -1) {
4146 return (-1);
4147 }
4148
4149
4150 /*
4151 * If this node had been joined, withdraw and reset master.
4152 *
4153 * This could happen if a node was being added to or removed
4154 * from a diskset and the node doing the add/delete operation and
4155 * all other nodes in the diskset have left the cluster.
4156 */
4157 if (sd->sd_mn_mynode) {
4158 nd = sd->sd_mn_mynode;
4159 if (nd->nd_flags & MD_MN_NODE_OWN) {
4160 if (clnt_withdrawset(mynode(), sp, ep)) {
4161 rval = -1;
4162 goto out;
4163 }
4164 if (clnt_mnsetmaster(mynode(), sp, "",
4165 MD_MN_INVALID_NID, ep)) {
4166 rval = -1;
4167 goto out;
4168 }
4169 }
4170 }
4171
4172 /*
4173 * Remove side records for this node (side) from local mddb
4174 * (clnt_deldrvs does this) if there are drives in the set.
4175 *
4176 * Don't need to mark this node as DEL since already marked as
4177 * ADD or DEL (or this node would have been chosen as master).
4178 * Don't need to mark other node records, drive records or
4179 * set records as DEL. If a panic occurs during clnt_delset,
4180 * these records will be deleted the next time this node
4181 * becomes a member and goes through the reconfig cycle.
4182 */
4183 /* Get the drive descriptors for this set */
4184 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4185 ep)) == NULL) {
4186 if (! mdisok(ep)) {
4187 /*
4188 * Ignore and clear out any failures from
4189 * metaget_drivedesc since a panic could have
4190 * occurred when a node was partially added to a set.
4191 */
4192 mdclrerror(ep);
4193 }
4194 } else {
4195 if (clnt_deldrvs(mynode(), sp, dd, ep)) {
4196 rval = -1;
4197 goto out;
4198 }
4199 }
4200
4201 /*
4202 * Now, delete the set - this removes the node, drive
4203 * and set records from the local mddb.
4204 */
4205 if (clnt_delset(mynode(), sp, ep)) {
4206 rval = -1;
4207 goto out;
4208 }
4209
4210 out:
4211 cl_sk = cl_get_setkey(sp->setno, sp->setname);
4212
4213 /*
4214 * Ignore errors from unlock of set since set is no longer
4215 * known (if clnt_delset worked).
4216 */
4217 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4218 mdclrerror(&xep);
4219 }
4220
4221 cl_set_setkey(NULL);
4222
4223 metaflushsetname(sp);
4224
4225 /*
4226 * If this node is the lowest numbered nodeid then
4227 * call sdssc_create/delete_end depending on whether
4228 * this node is marked as ADD or DEL in the node record.
4229 */
4230 if (lowest_alive_nodeid) {
4231 if (nd->nd_flags & MD_MN_NODE_ADD)
4232 sdssc_create_end(sp->setname, SDSSC_CLEANUP);
4233 else if (nd->nd_flags & MD_MN_NODE_DEL)
4234 sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
4235 }
4236
4237 /* Finished with this set -- return */
4238 return (rval);
4239 }
4240
4241 /*
4242 * Reconfig step to choose a new master for all MN disksets.
4243 * Return values:
4244 * 0 - Everything is great.
4245 * 1 - This node failed to reconfig.
4246 * 205 - Cause another reconfig due to a nodelist problem
4247 * or RPC failure to another node
4248 */
4249 int
meta_reconfig_choose_master(long timeout,md_error_t * ep)4250 meta_reconfig_choose_master(
4251 long timeout,
4252 md_error_t *ep
4253 )
4254 {
4255 set_t max_sets, setno;
4256 int nodecnt;
4257 mndiskset_membershiplist_t *nl;
4258 md_set_desc *sd;
4259 mdsetname_t *sp;
4260 int rval = 0;
4261 mddb_setflags_config_t sf;
4262 int start_node_delayed = 0;
4263
4264 if ((max_sets = get_max_sets(ep)) == 0) {
4265 mde_perror(ep, dgettext(TEXT_DOMAIN,
4266 "Unable to get number of sets"));
4267 return (1);
4268 }
4269
4270 /*
4271 * Get membershiplist from API routine. If there's
4272 * an error, return a 205 to cause another reconfig.
4273 */
4274 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
4275 mde_perror(ep, "");
4276 return (205);
4277 }
4278
4279 for (setno = 1; setno < max_sets; setno++) {
4280 if ((sp = metasetnosetname(setno, ep)) == NULL) {
4281 if (mdiserror(ep, MDE_NO_SET)) {
4282 /* No set for this setno - continue */
4283 mdclrerror(ep);
4284 continue;
4285 } else {
4286 /*
4287 * If encountered an RPC error from my node,
4288 * then immediately fail.
4289 */
4290 if (mdanyrpcerror(ep)) {
4291 mde_perror(ep, "");
4292 return (1);
4293 }
4294 /* Can't get set information */
4295 mde_perror(ep, dgettext(TEXT_DOMAIN,
4296 "Unable to get information for "
4297 "set number %d"), setno);
4298 mdclrerror(ep);
4299 continue;
4300 }
4301 }
4302
4303 /* If setname is there, set desc should exist. */
4304 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4305 /*
4306 * If encountered an RPC error from my node,
4307 * then immediately fail.
4308 */
4309 if (mdanyrpcerror(ep)) {
4310 mde_perror(ep, "");
4311 return (1);
4312 }
4313 mde_perror(ep, dgettext(TEXT_DOMAIN,
4314 "Unable to get set %s desc information"),
4315 sp->setname);
4316 mdclrerror(ep);
4317 continue;
4318 }
4319
4320 /* Only reconfig MN disksets */
4321 if (!MD_MNSET_DESC(sd)) {
4322 continue;
4323 }
4324
4325 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4326 "Begin choose master for set %s: %s"),
4327 sp->setname, meta_print_hrtime(gethrtime() - start_time));
4328
4329 /* Update nodelist with member information. */
4330 if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
4331 /*
4332 * If encountered an RPC error from my node,
4333 * then immediately fail.
4334 */
4335 if (mdanyrpcerror(ep)) {
4336 mde_perror(ep, "");
4337 return (1);
4338 }
4339 mde_perror(ep, "");
4340 mdclrerror(ep);
4341 continue;
4342 }
4343
4344 /*
4345 * If all nodes in a cluster are starting, then
4346 * all nodes will attempt to contact all other nodes
4347 * to determine a master node. This can lead to a
4348 * problem where node 1 is trying to contact the rpc.metad
4349 * node 2 and node 2 is trying to contact the rpc.metad
4350 * on node 1 -- and this causes the rpc call to fail
4351 * on both nodes and causes a new reconfig cycle.
4352 *
4353 * In order to break this problem, a newly starting node
4354 * will delay a small amount of time (nodeid mod 4 seconds)
4355 * and will then run the code to choose a master for the
4356 * first set. Delay will only be done once regardless of the
4357 * number of sets.
4358 */
4359 if (start_node_delayed == 0) {
4360 (void) memset(&sf, 0, sizeof (sf));
4361 sf.sf_setno = sp->setno;
4362 sf.sf_flags = MDDB_NM_GET;
4363 /* Use magic to help protect ioctl against attack. */
4364 sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4365 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
4366 &sf.sf_mde, NULL) == 0) &&
4367 ((sf.sf_setflags & MD_SET_MN_START_RC) ==
4368 MD_SET_MN_START_RC)) {
4369 (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
4370 }
4371 start_node_delayed = 1;
4372 }
4373
4374 /* Choose master for this set */
4375 rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
4376 if (rval == -1) {
4377 mde_perror(ep, "");
4378 return (1);
4379 } else if (rval == 205) {
4380 mde_perror(ep, "");
4381 return (205);
4382 }
4383
4384 /* reinit rpc.mdcommd with new nodelist */
4385 if (mdmn_reinit_set(sp->setno, timeout)) {
4386 md_eprintf(dgettext(TEXT_DOMAIN,
4387 "Could not re-initialise rpc.mdcommd for "
4388 "set %s\n"), sp->setname);
4389 return (1);
4390 }
4391
4392 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4393 "Choose master for set %s completed: %s"),
4394 sp->setname, meta_print_hrtime(gethrtime() - start_time));
4395 }
4396
4397 /*
4398 * Each node turns on I/Os for all MN disksets.
4399 * This is to recover from the situation where the master died
4400 * during a previous reconfig cycle when I/Os were suspended
4401 * for a MN diskset.
4402 * If a failure occurs return a 1 which will force this node to
4403 * panic. Cannot leave node in the situation where I/Os are
4404 * not resumed.
4405 */
4406 setno = 0; /* 0 means all MN sets */
4407 if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
4408 mde_perror(ep, "");
4409 return (1);
4410 }
4411
4412 /* Free the nodelist */
4413 if (nodecnt)
4414 meta_free_nodelist(nl);
4415
4416 return (0);
4417 }
4418
4419 /*
4420 * meta_mnsync_user_records will synchronize the diskset user records across
4421 * all nodes in the diskset. The diskset user records are stored in
4422 * each node's local set mddb.
4423 *
4424 * This needs to be done even if there is no master change during the
4425 * reconfig cycle since this routine should clean up any mess left by
4426 * the untimely termination of a metaset or metadb command (due to a
4427 * node panic or to user intervention).
4428 *
4429 * Caller is the Master node.
4430 *
4431 * Returns 0 - Success
4432 * 205 - Failure during RPC to another node
4433 * -1 - Any other failure and ep is filled in.
4434 */
4435 int
meta_mnsync_user_records(mdsetname_t * sp,md_error_t * ep)4436 meta_mnsync_user_records(
4437 mdsetname_t *sp,
4438 md_error_t *ep
4439 )
4440 {
4441 md_set_desc *sd;
4442 md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail;
4443 md_mnset_record *mnsr;
4444 md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL;
4445 md_mnnode_record *nr;
4446 md_drive_record *dr;
4447 int dr_cnt, dd_cnt;
4448 int found_my_nr;
4449 md_drive_desc *dd, *dd_prev, *master_dd, *other_dd;
4450 int all_drives_ok;
4451 int rval = 0;
4452 int max_genid = 0;
4453 int num_alive_nodes, num_alive_nodes_del = 0;
4454 int set_locked = 0;
4455 md_setkey_t *cl_sk;
4456 md_error_t xep = mdnullerror;
4457 char *anode[1];
4458 mddb_setflags_config_t sf;
4459
4460 /*
4461 * Sync up node records first.
4462 * Construct a master nodelist using the nodelist from this
4463 * node's rpc.metad node records and then setting the state of each
4464 * node following these rules:
4465 * - If a node record is marked OK on its node, mark it OK
4466 * in the master nodelist (and later OK on all nodes)
4467 * If a node record is also marked OWN on its node,
4468 * mark it OWN in the master nodelist.
4469 * - If a node record is not marked OK on its node, then mark
4470 * it as DEL in the master list (later deleting it)
4471 * - If node record doesn't exist on that node, then mark it DEL
4472 * (later deleting it)
4473 * - If set record doesn't exist on that node, mark node as DEL
4474 * - If a node record doesn't exist on all nodes, then mark it DEL
4475 * - If a node is not ALIVE, then
4476 * - If that node marked DEL on any node - mark it DEL
4477 * in master list but leave in nodelist
4478 * - If that node is marked as ADD on any node, mark it
4479 * ADD in the master list but leave in nodelist
4480 * - When that node returns to the living, the DEL
4481 * node record will be removed and the ADD node
4482 * record may be removed if marked ADD on that
4483 * node.
4484 * The key rule is to not remove a node from the nodelist until
4485 * that node record is removed from its own node. Do not want to
4486 * remove a node's record from all other nodes and then have
4487 * that node have its own record marked OK so that a node will pick
4488 * a different master than the other nodes.
4489 *
4490 * Next,
4491 * If node is ALIVE and node record is marked DEL in master nodelist,
4492 * remove node from set.
4493 * If node is ALIVE and node record is marked OK in master nodelist,
4494 * mark it OK on all other nodes.
4495 * If node is not ALIVE and node record is marked DEL in master
4496 * nodelist, mark it DEL on all other nodes.
4497 * If node is not ALIVE and node record is marked ADD in master,
4498 * nodelist, mark it ADD on all other nodes.
4499 */
4500 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4501 return (-1);
4502 }
4503 master_nodelist = sd->sd_nodelist;
4504
4505 /*
4506 * Walk through nodelist creating a master nodelist.
4507 */
4508 num_alive_nodes = 0;
4509 nd = master_nodelist;
4510 while (nd) {
4511 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4512 nd = nd->nd_next;
4513 continue;
4514 }
4515 num_alive_nodes++;
4516 if (clnt_mngetset(nd->nd_nodename, sp->setname,
4517 MD_SET_BAD, &mnsr, ep) == -1) {
4518 if (mdiserror(ep, MDE_NO_SET)) {
4519 /* set doesn't exist, mark node as DEL */
4520 nd->nd_flags &= ~MD_MN_NODE_OK;
4521 nd->nd_flags &= ~MD_MN_NODE_ADD;
4522 nd->nd_flags |= MD_MN_NODE_DEL;
4523 nd->nd_flags |= MD_MN_NODE_NOSET;
4524 nd = nd->nd_next;
4525 continue;
4526 } else {
4527 /* If RPC failure to another node return 205 */
4528 if ((mdanyrpcerror(ep)) &&
4529 (sd->sd_mn_mynode->nd_nodeid !=
4530 nd->nd_nodeid)) {
4531 rval = 205;
4532 } else {
4533 /* Any other failure */
4534 rval = -1;
4535 }
4536 goto out;
4537 }
4538 }
4539 /* Find biggest genid in records for this diskset */
4540 if (mnsr->sr_genid > max_genid)
4541 max_genid = mnsr->sr_genid;
4542
4543 dr = mnsr->sr_drivechain;
4544 while (dr) {
4545 /* Find biggest genid in records for this diskset */
4546 if (dr->dr_genid > max_genid) {
4547 max_genid = dr->dr_genid;
4548 }
4549 dr = dr->dr_next;
4550 }
4551
4552 found_my_nr = 0;
4553 nr = mnsr->sr_nodechain;
4554 /* nr is the list of node recs from nd_nodename node */
4555 while (nr) {
4556 /* Find biggest genid in records for this diskset */
4557 if (nr->nr_genid > max_genid)
4558 max_genid = nr->nr_genid;
4559 nd2 = master_nodelist;
4560 ndtail = NULL;
4561 /* For each node record, is it in master list? */
4562 while (nd2) {
4563 if (nd2->nd_nodeid == nr->nr_nodeid)
4564 break;
4565 if (nd2->nd_next == NULL)
4566 ndtail = nd2;
4567 nd2 = nd2->nd_next;
4568 }
4569 /*
4570 * Found node record not in master list -- add it
4571 * to list marking it as DEL since node record
4572 * should exist on all nodes unless a panic occurred
4573 * during addition or deletion of host to diskset.
4574 */
4575 if (nd2 == NULL) {
4576 nd2 = Zalloc(sizeof (*nd2));
4577 (void) strcpy(nd2->nd_nodename,
4578 nr->nr_nodename);
4579 nd2->nd_flags = nr->nr_flags;
4580 nd2->nd_flags |= MD_MN_NODE_DEL;
4581 nd2->nd_nodeid = nr->nr_nodeid;
4582 nd2->nd_next = NULL;
4583 ndtail->nd_next = nd2;
4584 nd2 = NULL;
4585 nr = nr->nr_next;
4586 continue;
4587 }
4588 /*
4589 * Is this the node record for the node that
4590 * we requested the set desc from?
4591 * If so, check if node has its own node record
4592 * marked OK. If marked OK, check for the OWN bit.
4593 */
4594 if (nr->nr_nodeid == nd->nd_nodeid) {
4595 found_my_nr = 1;
4596 if (nr->nr_flags & MD_MN_NODE_OK) {
4597 /*
4598 * If node record is marked OK
4599 * on its own node, then mark it OK
4600 * in the master list. Node record
4601 * would have to exist on all nodes
4602 * in the ADD state before it could
4603 * be put into the OK state.
4604 */
4605 nd->nd_flags |= MD_MN_NODE_OK;
4606 nd->nd_flags &=
4607 ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
4608 /*
4609 * Mark own in master list as marked
4610 * on own node.
4611 */
4612 if (nr->nr_flags & MD_MN_NODE_OWN)
4613 nd->nd_flags |= MD_MN_NODE_OWN;
4614 else
4615 nd->nd_flags &= ~MD_MN_NODE_OWN;
4616 } else {
4617 /* Otherwise, mark node as DEL */
4618 nd->nd_flags &= ~MD_MN_NODE_OK;
4619 nd->nd_flags &= ~MD_MN_NODE_ADD;
4620 nd->nd_flags |= MD_MN_NODE_DEL;
4621 }
4622 }
4623 /*
4624 * If node is not ALIVE and marked DEL
4625 * on any node, make it DEL in master list.
4626 * If node is not ALIVE and marked ADD
4627 * on any node, make it ADD in master list
4628 * unless node record has already been marked DEL.
4629 */
4630 if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
4631 if (nr->nr_flags & MD_MN_NODE_ADD) {
4632 if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4633 /* If not DEL - mark it ADD */
4634 nd->nd_flags |= MD_MN_NODE_ADD;
4635 nd->nd_flags &= ~MD_MN_NODE_OK;
4636 }
4637 }
4638 if (nr->nr_flags & MD_MN_NODE_DEL) {
4639 nd->nd_flags |= MD_MN_NODE_DEL;
4640 nd->nd_flags &= ~MD_MN_NODE_OK;
4641 /* Could already be ADD - make it DEL */
4642 nd->nd_flags &= ~MD_MN_NODE_ADD;
4643 }
4644 }
4645 nr = nr->nr_next;
4646 }
4647 /*
4648 * If a node record doesn't exist on its own node,
4649 * then mark node as DEL.
4650 */
4651 if (found_my_nr == 0) {
4652 nd->nd_flags &= ~MD_MN_NODE_OK;
4653 nd->nd_flags |= MD_MN_NODE_DEL;
4654 }
4655
4656 /*
4657 * If node is OK - put mnsr onto master_mnsr_node list for
4658 * later use when syncing up the drive records in the set.
4659 */
4660 if (nd->nd_flags & MD_MN_NODE_OK) {
4661 mnsr_node = Zalloc(sizeof (*mnsr_node));
4662 mnsr_node->mmn_mnsr = mnsr;
4663 (void) strncpy(mnsr_node->mmn_nodename,
4664 nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
4665 mnsr_node->mmn_next = master_mnsr_node;
4666 master_mnsr_node = mnsr_node;
4667 } else {
4668 free_sr((struct md_set_record *)mnsr);
4669 }
4670
4671 nd = nd->nd_next;
4672 }
4673
4674 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4675 "Master nodelist created for set %s: %s"),
4676 sp->setname, meta_print_hrtime(gethrtime() - start_time));
4677
4678 /*
4679 * Send master nodelist to the rpc.metad on all nodes (including
4680 * myself) and each node will update itself. This will set the
4681 * ADD and DEL flags on each node as setup in the master nodelist.
4682 * Don't send nodelist to node where set doesn't exist.
4683 */
4684 nd = master_nodelist;
4685 while (nd) {
4686 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4687 (nd->nd_flags & MD_MN_NODE_NOSET)) {
4688 nd = nd->nd_next;
4689 continue;
4690 }
4691 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4692 master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
4693 /* If RPC failure to another node return 205 */
4694 if ((mdanyrpcerror(ep)) &&
4695 (sd->sd_mn_mynode->nd_nodeid !=
4696 nd->nd_nodeid)) {
4697 rval = 205;
4698 } else {
4699 /* Any other failure */
4700 rval = -1;
4701 }
4702 goto out;
4703 }
4704 nd = nd->nd_next;
4705 }
4706
4707 /*
4708 * Now, delete nodes that need to be deleted.
4709 */
4710 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4711 ep)) == NULL) {
4712 if (! mdisok(ep)) {
4713 rval = -1;
4714 goto out;
4715 }
4716 }
4717
4718 /*
4719 * May be doing lots of RPC commands to the nodes, so lock the
4720 * ALIVE members of the set since most of the rpc.metad routines
4721 * require this for security reasons.
4722 */
4723 nd = master_nodelist;
4724 while (nd) {
4725 /* Skip non-alive nodes and node without set */
4726 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4727 (nd->nd_flags & MD_MN_NODE_NOSET)) {
4728 nd = nd->nd_next;
4729 continue;
4730 }
4731 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
4732 /* If RPC failure to another node return 205 */
4733 if ((mdanyrpcerror(ep)) &&
4734 (sd->sd_mn_mynode->nd_nodeid !=
4735 nd->nd_nodeid)) {
4736 rval = 205;
4737 } else {
4738 /* Any other failure */
4739 rval = -1;
4740 }
4741 goto out;
4742 }
4743 set_locked = 1;
4744 nd = nd->nd_next;
4745 }
4746
4747 nd = master_nodelist;
4748 while (nd) {
4749 /* Skip non-alive nodes */
4750 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4751 nd = nd->nd_next;
4752 continue;
4753 }
4754 if (nd->nd_flags & MD_MN_NODE_DEL) {
4755 num_alive_nodes_del++;
4756 /*
4757 * Delete this node rec from all ALIVE nodes in diskset.
4758 */
4759 nd2 = master_nodelist;
4760 while (nd2) {
4761 /* Skip non-alive nodes and node without set */
4762 if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
4763 (nd2->nd_flags & MD_MN_NODE_NOSET)) {
4764 nd2 = nd2->nd_next;
4765 continue;
4766 }
4767
4768 /* This is a node being deleted from set */
4769 if (nd2->nd_nodeid == nd->nd_nodeid) {
4770 /* Mark set record as DEL */
4771 if (clnt_upd_sr_flags(nd->nd_nodename,
4772 sp, MD_SR_DEL, ep)) {
4773 /* RPC failure to !my node */
4774 if ((mdanyrpcerror(ep)) &&
4775 (sd->sd_mn_mynode->
4776 nd_nodeid
4777 != nd->nd_nodeid)) {
4778 rval = 205;
4779 } else {
4780 /* Any other failure */
4781 rval = -1;
4782 }
4783 goto out;
4784 }
4785 if (clnt_deldrvs(nd->nd_nodename, sp,
4786 dd, ep)) {
4787 /* RPC failure to !my node */
4788 if ((mdanyrpcerror(ep)) &&
4789 (sd->sd_mn_mynode->
4790 nd_nodeid
4791 != nd->nd_nodeid)) {
4792 rval = 205;
4793 } else {
4794 /* Any other failure */
4795 rval = -1;
4796 }
4797 goto out;
4798 }
4799 if (clnt_delset(nd->nd_nodename, sp,
4800 ep) == -1) {
4801 /* RPC failure to !my node */
4802 if ((mdanyrpcerror(ep)) &&
4803 (sd->sd_mn_mynode->
4804 nd_nodeid
4805 != nd->nd_nodeid)) {
4806 rval = 205;
4807 } else {
4808 /* Any other failure */
4809 rval = -1;
4810 }
4811 goto out;
4812 }
4813 } else {
4814 /*
4815 * Delete host from sets on hosts
4816 * not being deleted.
4817 */
4818 anode[0] = Strdup(nd->nd_nodename);
4819 if (clnt_delhosts(nd2->nd_nodename, sp,
4820 1, anode, ep) == -1) {
4821 Free(anode[0]);
4822 /* RPC failure to !my node */
4823 if ((mdanyrpcerror(ep)) &&
4824 (sd->sd_mn_mynode->
4825 nd_nodeid
4826 != nd2->nd_nodeid)) {
4827 rval = 205;
4828 } else {
4829 /* Any other failure */
4830 rval = -1;
4831 }
4832 goto out;
4833 }
4834
4835 meta_mc_log(MC_LOG5,
4836 dgettext(TEXT_DOMAIN,
4837 "Deleted node %s (%d) on node %s "
4838 "from set %s: %s"),
4839 nd->nd_nodename, nd->nd_nodeid,
4840 nd2->nd_nodename,
4841 sp->setname,
4842 meta_print_hrtime(
4843 gethrtime() - start_time));
4844
4845 Free(anode[0]);
4846 }
4847 nd2 = nd2->nd_next;
4848 }
4849 }
4850 nd = nd->nd_next;
4851 }
4852
4853 nd = master_nodelist;
4854 cl_sk = cl_get_setkey(sp->setno, sp->setname);
4855 while (nd) {
4856 /* Skip non-alive nodes and node without set */
4857 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4858 (nd->nd_flags & MD_MN_NODE_NOSET)) {
4859 nd = nd->nd_next;
4860 continue;
4861 }
4862 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
4863 /* If RPC failure to another node return 205 */
4864 if ((mdanyrpcerror(ep)) &&
4865 (sd->sd_mn_mynode->nd_nodeid !=
4866 nd->nd_nodeid)) {
4867 rval = 205;
4868 } else {
4869 /* Any other failure */
4870 rval = -1;
4871 }
4872 goto out;
4873 }
4874 nd = nd->nd_next;
4875 }
4876 cl_set_setkey(NULL);
4877 set_locked = 0;
4878
4879 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4880 "Nodelist syncronization complete for set %s: %s"),
4881 sp->setname, meta_print_hrtime(gethrtime() - start_time));
4882
4883 metaflushsetname(sp);
4884
4885 /*
4886 * If all alive nodes have been deleted from set, just
4887 * return since nothing else can be done until non-alive
4888 * nodes (if there are any) rejoin the cluster.
4889 */
4890 if (num_alive_nodes == num_alive_nodes_del) {
4891 rval = 0;
4892 goto out;
4893 }
4894
4895 /*
4896 * Sync up drive records.
4897 *
4898 * If a node panic'd (or metaset command was killed) during the
4899 * addition or deletion of a drive to the diskset, the nodes
4900 * may have a different view of the drive list. During cleanup
4901 * of the drive list during reconfig, a drive will be deleted
4902 * from the list if the master node sees that the drive has been
4903 * marked in the ADD state on any node or is marked in the DEL state
4904 * on all nodes.
4905 * This cleanup must occur even if all nodes in the cluster are
4906 * not part of the cluster so that all nodes have the same view
4907 * of the drivelist.
4908 * Then if the entire cluster goes down and comes back up, the
4909 * new master node could be a node that wasn't in the cluster when
4910 * the node was deleted. This could lead to a situation where the
4911 * master node thinks that a drive is OK, but this drive isn't
4912 * known to the other nodes.
4913 * This situation can also occur during the addition of a drive
4914 * where a node has the drive marked OK, but the node executing the
4915 * metaset command encountered a failure before marking that drive OK
4916 * on the rest of the nodes. If the node with the OK drive then
4917 * panics, then rest of the nodes will remove that drive marked ADD
4918 * and when the node with the OK drive rejoins the cluster, it will
4919 * have a drive marked OK that is unknown by the other nodes.
4920 *
4921 * There are 2 situations to consider:
4922 * A) Master knows about a drive that other nodes don't know about.
4923 * B) At least one slave node knows about a drive that the master
4924 * node doesn't know about.
4925 *
4926 * To handle these situations the following steps are followed:
4927 * 1) Count number of drives known by this master node and the
4928 * other slave nodes.
4929 * If all nodes have the same number of drives and the master has
4930 * all drives marked OK, then skip to step4.
4931 *
4932 * 2) If a node has less drives listed than the master, the master
4933 * must get the drive descriptor list from that node so that
4934 * master can determine which drive it needs to delete from that
4935 * node. Master must get the drive descriptor list since the
4936 * drive record list does not contain the name of the drive, but
4937 * only a key and the key can only be interpreted on that other
4938 * node.
4939 *
4940 * 3) The master will then create the master drive list by doing:
4941 * - Master starts with drive list known by master.
4942 * - Any drive marked ADD will be removed from the list.
4943 * - Any drive not known by another node (from step2) will be
4944 * removed from the drive list.
4945 * - If a drive is marked DEL on the master, the master must
4946 * verify that the drive record is marked DEL on all nodes.
4947 * If any node has the drive record marked OK, mark it OK
4948 * on the master. (The reason why is described below).
4949 *
4950 * 4) The master sends out the master drive list and the slave
4951 * nodes will force their drive lists to match the master
4952 * drive list by deleting drives, if necessary and by changing
4953 * the drive record states from ADD->OK if master has drive
4954 * marked OK and slave has drive marked ADD.
4955 *
4956 * Interesting scenarios:
4957 *
4958 * 1) System has 4 nodes with node 1 as the master. Node 3 starts
4959 * to delete a drive record (drive record on node 1 is marked DEL),
4960 * but is stopped when node 3 panics. Node 1 also panics.
4961 * During reconfig cycle, node 2 is picked as master and the drive
4962 * record is left alone since all nodes in the cluster have it
4963 * marked OK. User now sees drive as part of diskset.
4964 * Now, entire cluster is rebooted and node 1 rejoins the cluster.
4965 * Node 1 is picked as the master and node 1 has drive record
4966 * marked DEL. Node 1 contacts all other nodes in the cluster
4967 * and since at least one node has the drive record marked OK,
4968 * the master marks the drive record OK.
4969 * User continues to see the drive as part of the diskset.
4970 */
4971
4972 /* Reget set descriptor since flushed above */
4973 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4974 rval = -1;
4975 goto out;
4976 }
4977
4978 /* Has side effect of setting sd->sd_drvs to same as master_dd */
4979 if ((master_dd = metaget_drivedesc_sideno(sp,
4980 sd->sd_mn_mynode->nd_nodeid,
4981 (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
4982 /* No drives in list */
4983 if (!mdisok(ep)) {
4984 /*
4985 * Can't get drive list for this node, so
4986 * return -1 causing this node to be removed
4987 * from cluster config and fixed.
4988 */
4989 rval = -1;
4990 goto out;
4991 }
4992 }
4993
4994 /* Count the number of drives for all nodes */
4995 mnsr_node = master_mnsr_node;
4996 while (mnsr_node) {
4997 dr_cnt = 0;
4998 dr = mnsr_node->mmn_mnsr->sr_drivechain;
4999 while (dr) {
5000 dr_cnt++;
5001 dr = dr->dr_next;
5002 }
5003 mnsr_node->mmn_numdrives = dr_cnt;
5004 mnsr_node = mnsr_node->mmn_next;
5005 }
5006
5007 /* Count the number of drives for the master; also check flags */
5008 all_drives_ok = 1;
5009 dd_cnt = 0;
5010 dd = master_dd;
5011 while (dd) {
5012 dd_cnt++;
5013 if (!(dd->dd_flags & MD_DR_OK))
5014 all_drives_ok = 0;
5015 dd = dd->dd_next;
5016 }
5017
5018 /* If all drives are ok, do quick check against number of drives */
5019 if (all_drives_ok) {
5020 /* If all nodes have same number of drives, almost done */
5021 mnsr_node = master_mnsr_node;
5022 while (mnsr_node) {
5023 if (mnsr_node->mmn_numdrives != dd_cnt)
5024 break;
5025 mnsr_node = mnsr_node->mmn_next;
5026 }
5027 /* All nodes have same number of drives, just send flags */
5028 if (mnsr_node == NULL) {
5029 goto send_drive_list;
5030 }
5031 }
5032
5033 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5034 "Begin detailed drive synchronization for set %s: %s"),
5035 sp->setname, meta_print_hrtime(gethrtime() - start_time));
5036
5037 /* Detailed check required */
5038 mnsr_node = master_mnsr_node;
5039 while (mnsr_node) {
5040 /* Does slave node have less drives than master? */
5041 if (mnsr_node->mmn_numdrives < dd_cnt) {
5042 /* Yes - must determine which drive is missing */
5043 if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
5044 &other_dd, ep)) {
5045 /* RPC failure to !my node */
5046 if ((mdanyrpcerror(ep)) &&
5047 (strcmp(mynode(), mnsr_node->mmn_nodename)
5048 != 0)) {
5049 rval = 205;
5050 } else {
5051 /* Any other failure */
5052 rval = -1;
5053 }
5054 mde_perror(ep, dgettext(TEXT_DOMAIN,
5055 "Master node %s unable to "
5056 "retrieve drive list from node %s"),
5057 mynode(), mnsr_node->mmn_nodename);
5058 goto out;
5059 }
5060 mnsr_node->mmn_dd = other_dd;
5061 dd = master_dd;
5062 while (dd) {
5063 if (!(dd->dd_flags & MD_DR_OK)) {
5064 dd = dd->dd_next;
5065 continue;
5066 }
5067 other_dd = mnsr_node->mmn_dd;
5068 while (other_dd) {
5069 /* Convert to devids, when available */
5070 if (strcmp(other_dd->dd_dnp->cname,
5071 dd->dd_dnp->cname) == 0) {
5072 break;
5073 }
5074 other_dd = other_dd->dd_next;
5075 }
5076 /*
5077 * dd not found on slave so mark it
5078 * ADD for later deletion (drives in ADD
5079 * state are deleted later in this routine).
5080 */
5081 if (other_dd == NULL) {
5082 dd->dd_flags = MD_DR_ADD;
5083 }
5084 dd = dd->dd_next;
5085 }
5086
5087 }
5088 mnsr_node = mnsr_node->mmn_next;
5089 }
5090
5091 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5092 "Drive check completed for set %s: %s"),
5093 sp->setname, meta_print_hrtime(gethrtime() - start_time));
5094
5095 dd = master_dd;
5096 dd_prev = 0;
5097 while (dd) {
5098 /* Remove any ADD drives from list */
5099 if (dd->dd_flags & MD_DR_ADD) {
5100 if (dd_prev) {
5101 dd_prev->dd_next = dd->dd_next;
5102 dd->dd_next = NULL;
5103 metafreedrivedesc(&dd);
5104 dd = dd_prev->dd_next;
5105 } else {
5106 /*
5107 * If removing drive descriptor from head
5108 * of linked list, also change sd->sd_drvs.
5109 */
5110 master_dd = sd->sd_drvs = dd->dd_next;
5111 dd->dd_next = NULL;
5112 metafreedrivedesc(&dd);
5113 dd = master_dd;
5114 }
5115 /* dd setup in if/else above */
5116 continue;
5117 }
5118 /*
5119 * If drive is marked DEL, check all other nodes.
5120 * If drive on another node is marked OK, mark drive OK
5121 * in master list. If drive is marked DEL or doesn't exist
5122 * on all nodes, remove drive from list.
5123 */
5124 if (dd->dd_flags & MD_DR_DEL) {
5125 mnsr_node = master_mnsr_node;
5126 while (mnsr_node) {
5127 if (mnsr_node->mmn_dd == NULL) {
5128 if (clnt_getdrivedesc(
5129 mnsr_node->mmn_nodename, sp,
5130 &other_dd, ep)) {
5131 /* RPC failure to !my node */
5132 if ((mdanyrpcerror(ep)) &&
5133 (strcmp(mynode(),
5134 mnsr_node->mmn_nodename)
5135 != 0)) {
5136 rval = 205;
5137 } else {
5138 /* Any other failure */
5139 rval = -1;
5140 }
5141 mde_perror(ep,
5142 dgettext(TEXT_DOMAIN,
5143 "Master node %s unable "
5144 "to retrieve drive list "
5145 "from node %s"), mynode(),
5146 mnsr_node->mmn_nodename);
5147 goto out;
5148 }
5149 mnsr_node->mmn_dd = other_dd;
5150 }
5151 other_dd = mnsr_node->mmn_dd;
5152 while (other_dd) {
5153 /* Found drive (OK) from other node */
5154 if (strcmp(dd->dd_dnp->cname,
5155 other_dd->dd_dnp->cname)
5156 == 0) {
5157 /* Drive marked OK */
5158 if (other_dd->dd_flags &
5159 MD_DR_OK) {
5160 dd->dd_flags = MD_DR_OK;
5161 }
5162 break;
5163 }
5164 other_dd = other_dd->dd_next;
5165 }
5166 if (dd->dd_flags == MD_DR_OK)
5167 break;
5168
5169 mnsr_node = mnsr_node->mmn_next;
5170 }
5171 /*
5172 * If no node had this drive marked OK, delete it.
5173 */
5174 if (dd->dd_flags & MD_DR_DEL) {
5175 if (dd_prev) {
5176 dd_prev->dd_next = dd->dd_next;
5177 dd->dd_next = NULL;
5178 metafreedrivedesc(&dd);
5179 dd = dd_prev->dd_next;
5180 } else {
5181 /*
5182 * If removing drive descriptor from
5183 * head of linked list, also change
5184 * sd->sd_drvs.
5185 */
5186 master_dd = sd->sd_drvs = dd->dd_next;
5187 dd->dd_next = NULL;
5188 metafreedrivedesc(&dd);
5189 dd = master_dd;
5190 }
5191 /* dd setup in if/else above */
5192 continue;
5193 }
5194 }
5195 dd_prev = dd;
5196 dd = dd->dd_next;
5197 }
5198
5199 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5200 "Setting drive states completed for set %s: %s"),
5201 sp->setname, meta_print_hrtime(gethrtime() - start_time));
5202
5203 send_drive_list:
5204 /*
5205 * Set genid on all drives to be the highest value seen.
5206 */
5207 dd = master_dd;
5208 while (dd) {
5209 dd->dd_genid = max_genid;
5210 dd = dd->dd_next;
5211 }
5212 /*
5213 * Send updated drive list to all alive nodes.
5214 * Will also set genid on set and node records to have same
5215 * as the drive records.
5216 */
5217 nd = sd->sd_nodelist;
5218 while (nd) {
5219 /* Skip non-alive nodes */
5220 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5221 nd = nd->nd_next;
5222 continue;
5223 }
5224 if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
5225 /* RPC failure to another node */
5226 if ((mdanyrpcerror(ep)) &&
5227 (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
5228 rval = 205;
5229 } else {
5230 /* Any other failure */
5231 rval = -1;
5232 }
5233 goto out;
5234 }
5235 nd = nd->nd_next;
5236 }
5237
5238 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5239 "Sent drive list to all nodes for set %s: %s"),
5240 sp->setname, meta_print_hrtime(gethrtime() - start_time));
5241
5242 /*
5243 * If no drive records left in set and nodes had been joined,
5244 * withdraw the nodes. Always reset the master and mark
5245 * all nodes as withdrawn on all nodes.
5246 */
5247 if (master_dd == NULL) {
5248 /* Reset new master flag since no longer master */
5249 (void) memset(&sf, 0, sizeof (sf));
5250 sf.sf_setno = sp->setno;
5251 sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5252 sf.sf_flags = MDDB_NM_RESET;
5253 /* Use magic to help protect ioctl against attack. */
5254 sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5255 /* Ignore failure, failure to reset flag isn't catastrophic */
5256 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5257 &sf.sf_mde, NULL);
5258
5259 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5260 "Reset new master flag for " "set %s: %s"),
5261 sp->setname, meta_print_hrtime(gethrtime() - start_time));
5262
5263 nd = sd->sd_nodelist;
5264 while (nd) {
5265 /* Skip non-alive nodes */
5266 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5267 nd = nd->nd_next;
5268 continue;
5269 }
5270
5271 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
5272 /* RPC failure to another node */
5273 if ((mdanyrpcerror(ep)) &&
5274 (sd->sd_mn_mynode->nd_nodeid !=
5275 nd->nd_nodeid)) {
5276 rval = 205;
5277 } else {
5278 /* Any other failure */
5279 rval = -1;
5280 }
5281 goto out;
5282 }
5283 set_locked = 1;
5284
5285 /* Withdraw node from set if owner */
5286 if ((nd->nd_flags & MD_MN_NODE_OWN) &&
5287 (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
5288 /* RPC failure to another node */
5289 if ((mdanyrpcerror(ep)) &&
5290 (sd->sd_mn_mynode->nd_nodeid !=
5291 nd->nd_nodeid)) {
5292 rval = 205;
5293 } else {
5294 /* Any other failure */
5295 rval = -1;
5296 }
5297 goto out;
5298 }
5299
5300 /* Mark all nodes as withdrawn on this node */
5301 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5302 sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
5303 /* RPC failure to another node */
5304 if ((mdanyrpcerror(ep)) &&
5305 (sd->sd_mn_mynode->nd_nodeid !=
5306 nd->nd_nodeid)) {
5307 rval = 205;
5308 } else {
5309 /* Any other failure */
5310 rval = -1;
5311 }
5312 goto out;
5313 }
5314
5315 /* Resets master to no-master on this node */
5316 if (clnt_mnsetmaster(nd->nd_nodename, sp,
5317 "", MD_MN_INVALID_NID, ep)) {
5318 /* RPC failure to another node */
5319 if ((mdanyrpcerror(ep)) &&
5320 (sd->sd_mn_mynode->nd_nodeid !=
5321 nd->nd_nodeid)) {
5322 rval = 205;
5323 } else {
5324 /* Any other failure */
5325 rval = -1;
5326 }
5327 goto out;
5328 }
5329
5330 cl_sk = cl_get_setkey(sp->setno, sp->setname);
5331 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
5332 /* RPC failure to another node */
5333 if ((mdanyrpcerror(ep)) &&
5334 (sd->sd_mn_mynode->nd_nodeid !=
5335 nd->nd_nodeid)) {
5336 rval = 205;
5337 } else {
5338 /* Any other failure */
5339 rval = -1;
5340 }
5341 goto out;
5342 }
5343 set_locked = 0;
5344 nd = nd->nd_next;
5345 }
5346 }
5347
5348 out:
5349 /*
5350 * If got here and set is still locked, then an error has
5351 * occurred and master_nodelist is still valid.
5352 * If error is not an RPC error, then unlock.
5353 * If error is an RPC error, skip unlocks since this could cause
5354 * yet another RPC timeout if a node has failed.
5355 * Ignore failures in unlock since unlock is just trying to
5356 * clean things up.
5357 */
5358 if ((set_locked) && !(mdanyrpcerror(ep))) {
5359 nd = master_nodelist;
5360 cl_sk = cl_get_setkey(sp->setno, sp->setname);
5361 while (nd) {
5362 /* Skip non-alive nodes */
5363 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5364 nd = nd->nd_next;
5365 continue;
5366 }
5367 /*
5368 * If clnt_unlock fails, just break out since next
5369 * reconfig cycle will reset the locks anyway.
5370 */
5371 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
5372 break;
5373 }
5374 nd = nd->nd_next;
5375 }
5376 cl_set_setkey(NULL);
5377 }
5378 /* Free master_mnsr and drive descs */
5379 mnsr_node = master_mnsr_node;
5380 while (mnsr_node) {
5381 master_mnsr_node = mnsr_node->mmn_next;
5382 free_sr((md_set_record *)mnsr_node->mmn_mnsr);
5383 free_rem_dd(mnsr_node->mmn_dd);
5384 Free(mnsr_node);
5385 mnsr_node = master_mnsr_node;
5386 }
5387
5388 /* Frees sd->sd_drvs (which is also master_dd) */
5389 metaflushsetname(sp);
5390 return (rval);
5391 }
5392
5393 /*
5394 * meta_mnsync_diskset_mddbs
5395 * Calling node is guaranteed to be an owner node.
5396 * Calling node is the master node.
5397 *
5398 * Master node verifies that ondisk mddb format matches its incore format.
5399 * If no nodes are joined to set, remove the change log entries.
5400 * If a node is joined to set, play the change log.
5401 *
5402 * Returns 0 - Success
5403 * 1 - Master unable to join to set.
5404 * 205 - Failure during RPC to another node
5405 * -1 - Any other failure and ep is filled in.
5406 * -1 return will eventually cause node to panic
5407 * in a SunCluster environment.
5408 */
5409 int
meta_mnsync_diskset_mddbs(mdsetname_t * sp,md_error_t * ep)5410 meta_mnsync_diskset_mddbs(
5411 mdsetname_t *sp,
5412 md_error_t *ep
5413 )
5414 {
5415 md_set_desc *sd;
5416 mddb_config_t c;
5417 md_mn_msgclass_t class;
5418 mddb_setflags_config_t sf;
5419 md_mnnode_desc *nd, *nd2;
5420 md_error_t xep = mdnullerror;
5421 int stale_set = 0;
5422
5423 /* If setname is there, set desc should exist. */
5424 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5425 mde_perror(ep, dgettext(TEXT_DOMAIN,
5426 "Unable to get set %s desc information"), sp->setname);
5427 return (-1);
5428 }
5429
5430 /* Are there drives in the set? */
5431 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5432 ep) == NULL) {
5433 if (! mdisok(ep)) {
5434 return (-1);
5435 }
5436 /* No drives in set -- nothing to sync up */
5437 return (0);
5438 }
5439
5440 /*
5441 * Is master node (which is this node) joined to set?
5442 * If master node isn't joined (which means that no nodes
5443 * are joined to diskset), remove the change log entries
5444 * since no need to replay them - all nodes will have same
5445 * view of mddbs since all nodes are reading in the mddbs
5446 * from disk.
5447 * There is also no need to sync up the master and ondisk mddbs
5448 * since master has no incore knowledge.
5449 * Need to join master to set in order to flush the change
5450 * log entries. Don't need to block I/O during join of master
5451 * to set since no other nodes are joined to set and so no I/O
5452 * can be occurring.
5453 */
5454 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
5455 /* Join master to set */
5456 if (clnt_joinset(mynode(), sp,
5457 MNSET_IN_RECONFIG, ep)) {
5458 if (mdismddberror(ep, MDE_DB_STALE)) {
5459 /*
5460 * If STALE, print message and continue on.
5461 * Don't do any writes or reads to mddbs
5462 * so don't clear change log.
5463 */
5464 mde_perror(ep, dgettext(TEXT_DOMAIN,
5465 "Join of master node to STALE set %s"),
5466 sp->setname);
5467 stale_set = 1;
5468 mdclrerror(ep);
5469 } else if (mdismddberror(ep, MDE_DB_ACCOK)) {
5470 /* ACCOK means mediator provided extra vote */
5471 mdclrerror(ep);
5472 } else {
5473 /*
5474 * If master is unable to join set, print an
5475 * error message. Don't return failure or node
5476 * will panic during cluster reconfig cycle.
5477 * Also, withdraw node from set in order to
5478 * cleanup from failed join attempt.
5479 */
5480 mde_perror(ep, dgettext(TEXT_DOMAIN,
5481 "Join of master node in set %s failed"),
5482 sp->setname);
5483 if (clnt_withdrawset(mynode(), sp, &xep))
5484 mdclrerror(&xep);
5485 return (1);
5486 }
5487 }
5488 /*
5489 * Master node successfully joined.
5490 * Set local copy of flags to OWN and
5491 * send owner flag to rpc.metad. If not stale,
5492 * flush the change log.
5493 */
5494 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
5495 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
5496 MNSET_IN_RECONFIG, ep)) {
5497 mde_perror(ep, dgettext(TEXT_DOMAIN,
5498 "Flag update of master node join in set %s failed"),
5499 sp->setname);
5500 return (-1);
5501 }
5502
5503 if (!stale_set) {
5504 if (mdmn_reset_changelog(sp, ep,
5505 MDMN_CLF_RESETLOG) != 0) {
5506 mde_perror(ep, dgettext(TEXT_DOMAIN,
5507 "Unable to reset changelog."));
5508 return (-1);
5509 }
5510 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5511 "Removed changelog entries for set %s: %s"),
5512 sp->setname,
5513 meta_print_hrtime(gethrtime() - start_time));
5514 }
5515 /* Reset new master flag before return */
5516 (void) memset(&sf, 0, sizeof (sf));
5517 sf.sf_setno = sp->setno;
5518 sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5519 sf.sf_flags = MDDB_NM_RESET;
5520 /* Use magic to help protect ioctl against attack. */
5521 sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5522 /* Ignore failure, failure to reset flag isn't catastrophic */
5523 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5524 &sf.sf_mde, NULL);
5525
5526 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5527 "Reset new master flag for set %s: %s"),
5528 sp->setname, meta_print_hrtime(gethrtime() - start_time));
5529
5530 return (0);
5531 }
5532
5533 /*
5534 * Is master already joined to STALE set (< 50% mddbs avail)?
5535 * If so, can make no config changes to mddbs so don't check or play
5536 * changelog and don't sync master node to ondisk mddbs.
5537 * To get out of the stale state all nodes must be withdrawn
5538 * from set. Then as nodes are re-joined, all nodes will
5539 * have same view of mddbs since all nodes are reading the
5540 * mddbs from disk.
5541 */
5542 (void) memset(&c, 0, sizeof (c));
5543 c.c_id = 0;
5544 c.c_setno = sp->setno;
5545 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
5546 (void) mdstealerror(ep, &c.c_mde);
5547 return (-1);
5548 }
5549 if (c.c_flags & MDDB_C_STALE) {
5550 return (0);
5551 }
5552
5553 /*
5554 * If this node is NOT a newly chosen master, then there's
5555 * nothing else to do since the change log should be empty and
5556 * the ondisk and incore mddbs are already consistent.
5557 *
5558 * A newly chosen master is a node that was not the master
5559 * at the beginning of the reconfig cycle. If a node is a new
5560 * master, then the new master state is reset after the ondisk
5561 * and incore mddbs are consistent and the change log has
5562 * been replayed.
5563 */
5564 (void) memset(&sf, 0, sizeof (sf));
5565 sf.sf_setno = sp->setno;
5566 sf.sf_flags = MDDB_NM_GET;
5567 /* Use magic to help protect ioctl against attack. */
5568 sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5569 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
5570 ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
5571 return (0);
5572 }
5573
5574 /*
5575 * Now, sync up incore master view to ondisk mddbs.
5576 * This is needed in the case where a master node
5577 * had made a change to the mddb, but this change
5578 * may not have been relayed to the slaves yet.
5579 * So, the new master needs to verify that the ondisk
5580 * mddbs match what the new master has incore -
5581 * if different, new master rewrites all of the mddbs.
5582 * Then the new master will replay the changelog and the
5583 * new master will then execute what the old master had
5584 * done.
5585 *
5586 * Block all I/Os to disks in this diskset on all nodes in
5587 * the diskset. This will allow the rewriting of the mddbs
5588 * (if needed), to proceed in a timely manner.
5589 *
5590 * If block of I/Os fail, return a -1.
5591 */
5592
5593 nd = sd->sd_nodelist;
5594 while (nd) {
5595 /* Skip non-alive and non-owner nodes */
5596 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5597 (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5598 nd = nd->nd_next;
5599 continue;
5600 }
5601 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5602 MN_SUSP_IO, ep)) {
5603 mde_perror(ep, dgettext(TEXT_DOMAIN,
5604 "Unable to suspend I/O on node %s in set %s"),
5605 nd->nd_nodename, sp->setname);
5606
5607 /*
5608 * Resume all other nodes that had been suspended.
5609 * (Reconfig return step also resumes I/Os
5610 * for all sets.)
5611 */
5612 nd2 = sd->sd_nodelist;
5613 while (nd2) {
5614 /* Stop when reaching failed node */
5615 if (nd2->nd_nodeid == nd->nd_nodeid)
5616 break;
5617 /* Skip non-alive and non-owner nodes */
5618 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5619 (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5620 nd2 = nd2->nd_next;
5621 continue;
5622 }
5623 (void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5624 sp->setno, MN_RES_IO, &xep));
5625 nd2 = nd2->nd_next;
5626 }
5627
5628 /*
5629 * If an RPC failure on another node, return a 205.
5630 * Otherwise, exit with failure.
5631 */
5632 if ((mdanyrpcerror(ep)) &&
5633 (sd->sd_mn_mynode->nd_nodeid !=
5634 nd->nd_nodeid)) {
5635 return (205);
5636 } else {
5637 return (-1);
5638 }
5639
5640 }
5641 nd = nd->nd_next;
5642 }
5643
5644 (void) memset(&c, 0, sizeof (c));
5645 c.c_id = 0;
5646 c.c_setno = sp->setno;
5647 /* Master can't sync up to ondisk mddbs? Kick it out of cluster */
5648 if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
5649 return (-1);
5650
5651 /*
5652 * Resume I/Os that were suspended above.
5653 */
5654 nd = sd->sd_nodelist;
5655 while (nd) {
5656 /* Skip non-alive and non-owner nodes */
5657 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5658 (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5659 nd = nd->nd_next;
5660 continue;
5661 }
5662 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5663 MN_RES_IO, ep)) {
5664 mde_perror(ep, dgettext(TEXT_DOMAIN,
5665 "Unable to resume I/O on node %s in set %s"),
5666 nd->nd_nodename, sp->setname);
5667
5668 /*
5669 * If an RPC failure then don't do any
5670 * more RPC calls, since one timeout is enough
5671 * to endure. If RPC failure to another node, return
5672 * 205. If RPC failure to my node, return -1.
5673 * If not an RPC failure, continue resuming the
5674 * rest of the nodes and then return -1.
5675 */
5676 if (mdanyrpcerror(ep)) {
5677 if (sd->sd_mn_mynode->nd_nodeid ==
5678 nd->nd_nodeid) {
5679 return (-1);
5680 } else {
5681 return (205);
5682 }
5683 }
5684
5685 /*
5686 * If not an RPC error, continue resuming rest of
5687 * nodes, ignoring any failures except for an
5688 * RPC failure which constitutes an immediate exit.
5689 * Start in middle of list with failing node.
5690 */
5691 nd2 = nd->nd_next;
5692 while (nd2) {
5693 /* Skip non-alive and non-owner nodes */
5694 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5695 (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5696 nd2 = nd2->nd_next;
5697 continue;
5698 }
5699 (void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5700 sp->setno, MN_RES_IO, &xep));
5701 if (mdanyrpcerror(&xep)) {
5702 return (-1);
5703 }
5704 nd2 = nd2->nd_next;
5705 }
5706 }
5707 nd = nd->nd_next;
5708 }
5709
5710 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
5711 "checking/writing the mddb for set %s: %s"), sp->setname,
5712 meta_print_hrtime(gethrtime() - start_time));
5713
5714 /*
5715 * Send (aka replay) all messages we find in the changelog.
5716 * Flag the messages with
5717 * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
5718 * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
5719 */
5720 for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
5721 mdmn_changelog_record_t *lr;
5722 md_error_t xep = mdnullerror;
5723 md_mn_result_t *resultp = NULL;
5724 int ret;
5725
5726 lr = mdmn_get_changelogrec(sp->setno, class);
5727 if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
5728 /* no entry for this class */
5729 continue;
5730 }
5731
5732 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5733 "replaying message ID=(%d, 0x%llx-%d)\n"),
5734 MSGID_ELEMS(lr->lr_msg.msg_msgid));
5735
5736 ret = mdmn_send_message_with_msgid(
5737 lr->lr_msg.msg_setno,
5738 lr->lr_msg.msg_type,
5739 lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG |
5740 MD_MSGF_OVERRIDE_SUSPEND,
5741 lr->lr_msg.msg_recipient,
5742 lr->lr_msg.msg_event_data,
5743 lr->lr_msg.msg_event_size,
5744 &resultp,
5745 &lr->lr_msg.msg_msgid,
5746 &xep);
5747
5748 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5749 "mdmn_send_message returned %d\n"), ret);
5750
5751 if (resultp)
5752 free_result(resultp);
5753 }
5754
5755 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5756 "Playing changelog completed for set %s: %s"),
5757 sp->setname, meta_print_hrtime(gethrtime() - start_time));
5758
5759 /*
5760 * Now that new master has ondisk and incore mddbs in sync, reset
5761 * this node's new master kernel flag (for this set). If this node
5762 * re-enters another reconfig cycle before the completion of this
5763 * reconfig cycle, this master node won't need to check if the ondisk
5764 * and incore mddbs are in sync since this node won't be considered
5765 * a new master (since this flag is being reset here in the middle of
5766 * step2). This will save time during any subsequent reconfig
5767 * cycles as long as this node continues to be master.
5768 */
5769 (void) memset(&sf, 0, sizeof (sf));
5770 sf.sf_setno = sp->setno;
5771 sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5772 sf.sf_flags = MDDB_NM_RESET;
5773 /* Use magic to help protect ioctl against attack. */
5774 sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5775 /* Ignore failure, since failure to reset flag isn't catastrophic */
5776 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
5777
5778 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5779 "Reset new master flag for set %s: %s"),
5780 sp->setname, meta_print_hrtime(gethrtime() - start_time));
5781
5782 return (0);
5783 }
5784
5785 /*
5786 * meta_mnjoin_all will join all starting nodes in the diskset.
5787 * A starting node is considered to be any node that is not
5788 * an owner of the set but is a member of the cluster.
5789 * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
5790 *
5791 * Caller is the Master node.
5792 *
5793 * Returns 0 - Success
5794 * 205 - Failure during RPC to another node
5795 * -1 - Any other failure and ep is filled in.
5796 */
5797 int
meta_mnjoin_all(mdsetname_t * sp,md_error_t * ep)5798 meta_mnjoin_all(
5799 mdsetname_t *sp,
5800 md_error_t *ep
5801 )
5802 {
5803 md_set_desc *sd;
5804 md_mnnode_desc *nd, *nd2;
5805 int rval = 0;
5806 int stale_flag = 0;
5807 mddb_config_t c;
5808 int susp_res_flag = 0;
5809 md_error_t xep = mdnullerror;
5810
5811 /* If setname is there, set desc should exist. */
5812 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5813 mde_perror(ep, dgettext(TEXT_DOMAIN,
5814 "Unable to get set %s desc information"), sp->setname);
5815 return (-1);
5816 }
5817
5818 /* Are there drives in the set? */
5819 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5820 ep) == NULL) {
5821 if (! mdisok(ep)) {
5822 return (-1);
5823 }
5824 /* No drives in set -- nothing to join */
5825 return (0);
5826 }
5827
5828 /*
5829 * Is set currently stale?
5830 */
5831 (void) memset(&c, 0, sizeof (c));
5832 c.c_id = 0;
5833 c.c_setno = sp->setno;
5834 /* Ignore failure since master node may not be joined yet */
5835 (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
5836 if (c.c_flags & MDDB_C_STALE) {
5837 stale_flag = MNSET_IS_STALE;
5838 }
5839
5840 /*
5841 * If any nodes are going to be joined to diskset, then
5842 * suspend I/O to all disks in diskset so that nodes can join
5843 * (read in mddbs) in a reasonable amount of time even under
5844 * high I/O load. Don't need to do this if set is STALE since
5845 * no I/O can be occurring to a STALE set.
5846 */
5847 if (stale_flag != MNSET_IS_STALE) {
5848 nd = sd->sd_nodelist;
5849 while (nd) {
5850 /* Found a node that will be joined to diskset */
5851 if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5852 (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5853 /* Set flag that diskset should be suspended */
5854 susp_res_flag = 1;
5855 break;
5856 }
5857 nd = nd->nd_next;
5858 }
5859 }
5860
5861 if (susp_res_flag) {
5862 /*
5863 * Block all I/Os to disks in this diskset on all joined
5864 * nodes in the diskset.
5865 * If block of I/Os fails due to an RPC failure on another
5866 * node, return 205; otherwise, return -1.
5867 */
5868 nd = sd->sd_nodelist;
5869 while (nd) {
5870 /* Skip non-alive and non-owner nodes */
5871 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5872 (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5873 nd = nd->nd_next;
5874 continue;
5875 }
5876 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5877 MN_SUSP_IO, ep)) {
5878 mde_perror(ep, dgettext(TEXT_DOMAIN,
5879 "Unable to suspend I/O on node %s"
5880 " in set %s"), nd->nd_nodename,
5881 sp->setname);
5882 /*
5883 * Resume other nodes that had been suspended.
5884 * (Reconfig return step also resumes I/Os
5885 * for all sets.)
5886 */
5887 nd2 = sd->sd_nodelist;
5888 while (nd2) {
5889 /* Stop when reaching failed node */
5890 if (nd2->nd_nodeid == nd->nd_nodeid)
5891 break;
5892 /* Skip non-alive/non-owner nodes */
5893 if ((!(nd2->nd_flags &
5894 MD_MN_NODE_ALIVE)) ||
5895 (!(nd2->nd_flags &
5896 MD_MN_NODE_OWN))) {
5897 nd2 = nd2->nd_next;
5898 continue;
5899 }
5900 (void) (clnt_mn_susp_res_io(
5901 nd2->nd_nodename, sp->setno,
5902 MN_RES_IO, &xep));
5903 nd2 = nd2->nd_next;
5904 }
5905
5906 /*
5907 * If the suspend failed due to an
5908 * RPC failure on another node, return
5909 * a 205.
5910 * Otherwise, exit with failure.
5911 * The return reconfig step will resume
5912 * I/Os for all disksets.
5913 */
5914 if ((mdanyrpcerror(ep)) &&
5915 (sd->sd_mn_mynode->nd_nodeid !=
5916 nd->nd_nodeid)) {
5917 return (205);
5918 } else {
5919 return (-1);
5920 }
5921 }
5922 nd = nd->nd_next;
5923 }
5924 }
5925
5926 nd = sd->sd_nodelist;
5927 while (nd) {
5928 /*
5929 * If a node is in the membership list but isn't joined
5930 * to the set, try to join the node.
5931 */
5932 if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5933 (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5934 if (clnt_joinset(nd->nd_nodename, sp,
5935 (MNSET_IN_RECONFIG | stale_flag), ep)) {
5936 /*
5937 * If RPC failure to another node
5938 * then exit without attempting anything else.
5939 * (Reconfig return step will resume I/Os
5940 * for all sets.)
5941 */
5942 if (mdanyrpcerror(ep)) {
5943 mde_perror(ep, "");
5944 return (205);
5945 }
5946 /*
5947 * STALE and ACCOK failures aren't true
5948 * failures. STALE means that <50% mddbs
5949 * are available. ACCOK means that the
5950 * mediator provided the extra vote.
5951 * If a true failure, then print messasge
5952 * and withdraw node from set in order to
5953 * cleanup from failed join attempt.
5954 */
5955 if ((!mdismddberror(ep, MDE_DB_STALE)) &&
5956 (!mdismddberror(ep, MDE_DB_ACCOK))) {
5957 mde_perror(ep,
5958 "WARNING: Unable to join node %s "
5959 "to set %s", nd->nd_nodename,
5960 sp->setname);
5961 mdclrerror(ep);
5962 if (clnt_withdrawset(nd->nd_nodename,
5963 sp, &xep))
5964 mdclrerror(&xep);
5965 nd = nd->nd_next;
5966 continue;
5967 }
5968 }
5969 /* Set owner flag even if STALE or ACCOK */
5970 nd->nd_flags |= MD_MN_NODE_OWN;
5971 }
5972 nd = nd->nd_next;
5973 }
5974 /*
5975 * Resume I/Os if suspended above.
5976 */
5977 if (susp_res_flag) {
5978 nd = sd->sd_nodelist;
5979 while (nd) {
5980 /*
5981 * Skip non-alive and non-owner nodes
5982 * (this list doesn't include any of
5983 * the nodes that were joined).
5984 */
5985 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5986 (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5987 nd = nd->nd_next;
5988 continue;
5989 }
5990 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5991 MN_RES_IO, ep)) {
5992 mde_perror(ep, dgettext(TEXT_DOMAIN,
5993 "Unable to resume I/O on node %s"
5994 " in set %s"), nd->nd_nodename,
5995 sp->setname);
5996
5997 /*
5998 * If an RPC failure then don't do any
5999 * more RPC calls, since one timeout is enough
6000 * to endure. If RPC failure to another node,
6001 * return 205. If RPC failure to my node,
6002 * return -1.
6003 * (Reconfig return step will resume I/Os
6004 * for all sets.)
6005 * If not an RPC failure, continue resuming the
6006 * rest of the nodes and then return -1.
6007 */
6008 if (mdanyrpcerror(ep)) {
6009 if (sd->sd_mn_mynode->nd_nodeid ==
6010 nd->nd_nodeid) {
6011 return (-1);
6012 } else {
6013 return (205);
6014 }
6015 }
6016
6017 /*
6018 * If not an RPC error, continue resuming rest
6019 * of nodes, ignoring any failures except for
6020 * an RPC failure which constitutes an
6021 * immediate exit.
6022 * Start in middle of list with failing node.
6023 */
6024 nd2 = nd->nd_next;
6025 while (nd2) {
6026 /* Skip non-owner nodes */
6027 if ((!(nd2->nd_flags &
6028 MD_MN_NODE_ALIVE)) ||
6029 (!(nd2->nd_flags &
6030 MD_MN_NODE_OWN))) {
6031 nd2 = nd2->nd_next;
6032 continue;
6033 }
6034 (void) (clnt_mn_susp_res_io(
6035 nd2->nd_nodename, sp->setno,
6036 MN_RES_IO, &xep));
6037 if (mdanyrpcerror(&xep)) {
6038 return (-1);
6039 }
6040 nd2 = nd2->nd_next;
6041 }
6042 }
6043 nd = nd->nd_next;
6044 }
6045 }
6046
6047 nd = sd->sd_nodelist;
6048 while (nd) {
6049 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
6050 nd = nd->nd_next;
6051 continue;
6052 }
6053 /*
6054 * If 1 node fails - go ahead and update the rest except
6055 * in the case of an RPC failure, fail immediately.
6056 */
6057 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
6058 sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
6059 /* RPC failure to another node */
6060 if (mdanyrpcerror(ep)) {
6061 return (205);
6062 }
6063 nd = nd->nd_next;
6064 rval = -1;
6065 continue;
6066 }
6067 nd = nd->nd_next;
6068 }
6069
6070 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
6071 "Join of all nodes completed for set %s: %s"),
6072 sp->setname, meta_print_hrtime(gethrtime() - start_time));
6073
6074 return (rval);
6075 }
6076