1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Just in case we're not in a build environment, make sure that 30 * TEXT_DOMAIN gets set to something. 
31 */ 32 #if !defined(TEXT_DOMAIN) 33 #define TEXT_DOMAIN "SYS_TEST" 34 #endif 35 36 /* 37 * Metadevice diskset interfaces 38 */ 39 40 #include "meta_set_prv.h" 41 #include <meta.h> 42 #include <metad.h> 43 #include <mdmn_changelog.h> 44 #include <sys/lvm/md_crc.h> 45 #include <sys/utsname.h> 46 #include <sdssc.h> 47 48 #include <sys/sysevent/eventdefs.h> 49 #include <sys/sysevent/svm.h> 50 extern char *blkname(char *); 51 52 static md_drive_desc * 53 dr2drivedesc( 54 mdsetname_t *sp, 55 side_t sideno, 56 int flags, 57 md_error_t *ep 58 ) 59 { 60 md_set_record *sr; 61 md_drive_record *dr; 62 mddrivename_t *dnp; 63 md_drive_desc *dd_head = NULL; 64 md_set_desc *sd; 65 66 if (flags & MD_BYPASS_DAEMON) { 67 if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL) 68 return (NULL); 69 sd = metaget_setdesc(sp, ep); 70 sideno = getnodeside(mynode(), sd); 71 sp = metafakesetname(sp->setno, sr->sr_setname); 72 } else { 73 if ((sr = getsetbyname(sp->setname, ep)) == NULL) 74 return (NULL); 75 } 76 77 assert(sideno != MD_SIDEWILD); 78 79 /* 80 * WARNING: 81 * The act of getting the dnp from the namespace means that we 82 * will get the devid of the disk as recorded in the namespace. 83 * This devid has the potential to be stale if the disk is being 84 * replaced via a rebind, this means that any code that relies 85 * on any of the dnp information should take the appropriate action 86 * to preserve that information. For example in the rebind code the 87 * devid of the new disk is saved off and then copied back in once 88 * the code that has called this function has completed. 
89 */ 90 for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) { 91 if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key, 92 flags, ep)) == NULL) { 93 if (!(flags & MD_BYPASS_DAEMON)) 94 free_sr(sr); 95 metafreedrivedesc(&dd_head); 96 return (NULL); 97 } 98 99 (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt, 100 dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags); 101 } 102 103 if (!(flags & MD_BYPASS_DAEMON)) { 104 free_sr(sr); 105 } 106 return (dd_head); 107 } 108 109 static int 110 get_sidenmlist( 111 mdsetname_t *sp, 112 mddrivename_t *dnp, 113 md_error_t *ep 114 ) 115 { 116 md_set_desc *sd; 117 mdsidenames_t *sn, **sn_next; 118 int i; 119 120 if ((sd = metaget_setdesc(sp, ep)) == NULL) 121 return (-1); 122 123 metaflushsidenames(dnp); 124 sn_next = &dnp->side_names; 125 if (MD_MNSET_DESC(sd)) { 126 /* 127 * Only get sidenames for this node since 128 * that is the only side information stored in 129 * the local mddb for a multi-node diskset. 130 */ 131 if (sd->sd_mn_mynode) { 132 sn = Zalloc(sizeof (*sn)); 133 sn->sideno = sd->sd_mn_mynode->nd_nodeid; 134 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 135 sn->sideno, dnp->side_names_key, &sn->dname, 136 &sn->mnum, NULL, ep)) == NULL) { 137 if (sn->dname != NULL) 138 Free(sn->dname); 139 Free(sn); 140 return (-1); 141 } 142 143 /* Add to the end of the linked list */ 144 assert(*sn_next == NULL); 145 *sn_next = sn; 146 sn_next = &sn->next; 147 } 148 } else { 149 for (i = 0; i < MD_MAXSIDES; i++) { 150 /* Skip empty slots */ 151 if (sd->sd_nodes[i][0] == '\0') 152 continue; 153 154 sn = Zalloc(sizeof (*sn)); 155 sn->sideno = i; 156 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 157 i+SKEW, dnp->side_names_key, &sn->dname, 158 &sn->mnum, NULL, ep)) == NULL) { 159 /* 160 * It is possible that during the add of a 161 * host to have a 'missing' side as the side 162 * for this disk will be added later. So ignore 163 * the error. 
The 'missing' side will be added 164 * once the addhosts process has completed. 165 */ 166 if (mdissyserror(ep, ENOENT)) { 167 mdclrerror(ep); 168 Free(sn); 169 continue; 170 } 171 172 if (sn->dname != NULL) 173 Free(sn->dname); 174 Free(sn); 175 return (-1); 176 } 177 178 /* Add to the end of the linked list */ 179 assert(*sn_next == NULL); 180 *sn_next = sn; 181 sn_next = &sn->next; 182 } 183 } 184 185 return (0); 186 } 187 188 static md_drive_desc * 189 rl_to_dd( 190 mdsetname_t *sp, 191 md_replicalist_t *rlp, 192 md_error_t *ep 193 ) 194 { 195 md_replicalist_t *rl; 196 md_replica_t *r; 197 md_drive_desc *dd = NULL; 198 md_drive_desc *d; 199 int found; 200 md_set_desc *sd; 201 daddr_t nblks = 0; 202 203 if ((sd = metaget_setdesc(sp, ep)) == NULL) 204 return (NULL); 205 206 /* find the smallest existing replica */ 207 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 208 r = rl->rl_repp; 209 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 210 } 211 212 if (nblks <= 0) 213 nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 214 215 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 216 r = rl->rl_repp; 217 218 found = 0; 219 for (d = dd; d != NULL; d = d->dd_next) { 220 if (strcmp(r->r_namep->drivenamep->cname, 221 d->dd_dnp->cname) == 0) { 222 found = 1; 223 dd->dd_dbcnt++; 224 break; 225 } 226 } 227 228 if (! 
found) 229 (void) metadrivedesc_append(&dd, r->r_namep->drivenamep, 230 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK); 231 } 232 233 return (dd); 234 } 235 236 /* 237 * Exported Entry Points 238 */ 239 240 set_t 241 get_max_sets(md_error_t *ep) 242 { 243 244 static set_t max_sets = 0; 245 246 if (max_sets == 0) 247 if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0) 248 return (0); 249 250 return (max_sets); 251 } 252 253 int 254 get_max_meds(md_error_t *ep) 255 { 256 static int max_meds = 0; 257 258 if (max_meds == 0) 259 if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0) 260 return (0); 261 262 return (max_meds); 263 } 264 265 side_t 266 getmyside(mdsetname_t *sp, md_error_t *ep) 267 { 268 md_set_desc *sd; 269 char *node = NULL; 270 side_t sideno; 271 272 if (sp->setno == 0) 273 return (0); 274 275 if ((sd = metaget_setdesc(sp, ep)) == NULL) 276 return (MD_SIDEWILD); 277 278 node = mynode(); 279 280 assert(node != NULL); 281 282 sideno = getnodeside(node, sd); 283 284 if (sideno != MD_SIDEWILD) 285 return (sideno); 286 287 return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node)); 288 } 289 290 /* 291 * get set info from name 292 */ 293 md_set_record * 294 getsetbyname(char *setname, md_error_t *ep) 295 { 296 md_set_record *sr = NULL; 297 md_mnset_record *mnsr = NULL; 298 char *p; 299 size_t len; 300 301 /* get set info from daemon */ 302 if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1) 303 return (NULL); 304 if (sr != NULL) { 305 /* 306 * Returned record could be for a multi-node set or a 307 * non-multi-node set. 308 */ 309 if (MD_MNSET_REC(sr)) { 310 /* 311 * Record is for a multi-node set. Reissue call 312 * to get mnset information. Need to free 313 * record as if a non-multi-node set record since 314 * that is what clnt_getset gave us. If in 315 * the daemon, don't free since this is a pointer 316 * into the setrecords array. 317 */ 318 if (! 
md_in_daemon) { 319 sr->sr_flags &= ~MD_SR_MN; 320 free_sr(sr); 321 } 322 if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr, 323 ep) == -1) 324 return (NULL); 325 if (mnsr != NULL) 326 return ((struct md_set_record *)mnsr); 327 } else { 328 return (sr); 329 } 330 } 331 332 /* no such set */ 333 len = strlen(setname) + 30; 334 p = Malloc(len); 335 (void) snprintf(p, len, "setname \"%s\"", setname); 336 (void) mderror(ep, MDE_NO_SET, p); 337 Free(p); 338 return (NULL); 339 } 340 341 /* 342 * get set info from number 343 */ 344 md_set_record * 345 getsetbynum(set_t setno, md_error_t *ep) 346 { 347 md_set_record *sr; 348 md_mnset_record *mnsr = NULL; 349 char buf[100]; 350 351 if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1) 352 return (NULL); 353 354 if (sr != NULL) { 355 /* 356 * Record is for a multi-node set. Reissue call 357 * to get mnset information. Need to free 358 * record as if a non-multi-node set record since 359 * that is what clnt_getset gave us. If in 360 * the daemon, don't free since this is a pointer 361 * into the setrecords array. 362 */ 363 if (MD_MNSET_REC(sr)) { 364 /* 365 * Record is for a multi-node set. Reissue call 366 * to get mnset information. 367 */ 368 if (! 
md_in_daemon) { 369 sr->sr_flags &= ~MD_SR_MN; 370 free_sr(sr); 371 } 372 if (clnt_mngetset(mynode(), NULL, setno, &mnsr, 373 ep) == -1) 374 return (NULL); 375 if (mnsr != NULL) 376 return ((struct md_set_record *)mnsr); 377 } else { 378 return (sr); 379 } 380 } 381 382 (void) sprintf(buf, "setno %u", setno); 383 (void) mderror(ep, MDE_NO_SET, buf); 384 return (NULL); 385 } 386 387 int 388 meta_check_drive_inuse( 389 mdsetname_t *sp, 390 mddrivename_t *dnp, 391 int check_db, 392 md_error_t *ep 393 ) 394 { 395 mdnamelist_t *nlp = NULL; 396 mdnamelist_t *p; 397 int rval = 0; 398 399 /* get all underlying partitions */ 400 if (meta_getalldevs(sp, &nlp, check_db, ep) != 0) 401 return (-1); 402 403 /* search for drive */ 404 for (p = nlp; (p != NULL); p = p->next) { 405 mdname_t *np = p->namep; 406 407 if (strcmp(dnp->cname, np->drivenamep->cname) == 0) { 408 rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno, 409 NULL, dnp->cname, sp->setname)); 410 break; 411 } 412 } 413 414 /* cleanup, return success */ 415 metafreenamelist(nlp); 416 return (rval); 417 } 418 419 /* 420 * simple check for ownership 421 */ 422 int 423 meta_check_ownership(mdsetname_t *sp, md_error_t *ep) 424 { 425 int ownset; 426 md_set_desc *sd; 427 md_drive_desc *dd; 428 md_replicalist_t *rlp = NULL; 429 md_error_t xep = mdnullerror; 430 431 if (metaislocalset(sp)) 432 return (0); 433 434 ownset = own_set(sp, NULL, TRUE, ep); 435 if (! mdisok(ep)) 436 return (-1); 437 438 if ((sd = metaget_setdesc(sp, ep)) == NULL) 439 return (-1); 440 441 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 442 if (! 
mdisok(ep)) 443 return (-1); 444 445 /* If we have no drive descriptors, check for no ownership */ 446 if (dd == NULL) { 447 if (ownset == MD_SETOWNER_NONE) 448 return (0); 449 450 /* If ownership somehow has come to exist, we must clean up */ 451 452 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, 453 &xep) < 0) 454 mdclrerror(&xep); 455 456 if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL) 457 if (! mdisok(&xep)) 458 mdclrerror(&xep); 459 460 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { 461 if (rel_own_bydd(sp, dd, TRUE, &xep)) 462 mdclrerror(&xep); 463 } 464 465 if (halt_set(sp, &xep)) 466 mdclrerror(&xep); 467 468 metafreereplicalist(rlp); 469 470 metafreedrivedesc(&dd); 471 472 return (0); 473 } 474 475 metafreedrivedesc(&sd->sd_drvs); 476 477 if (ownset == MD_SETOWNER_YES) 478 return (0); 479 480 return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL, 481 sp->setname)); 482 } 483 484 /* 485 * simple check for ownership 486 */ 487 int 488 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep) 489 { 490 md_set_desc *sd; 491 md_drive_desc *dd; 492 int bool; 493 494 if (metaislocalset(sp)) 495 return (0); 496 497 if ((sd = metaget_setdesc(sp, ep)) == NULL) 498 return (-1); 499 500 if (getnodeside(hostname, sd) == MD_SIDEWILD) 501 return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 502 hostname, NULL, sp->setname)); 503 504 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 505 if (! mdisok(ep)) 506 return (-1); 507 508 if (clnt_ownset(hostname, sp, &bool, ep) == -1) 509 return (-1); 510 511 if (dd == NULL) 512 return (0); 513 514 metafreedrivedesc(&sd->sd_drvs); 515 516 if (bool == TRUE) 517 return (0); 518 519 return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL, 520 sp->setname)); 521 } 522 523 /* 524 * Function that determines if a node is in the multinode diskset 525 * membership list. Calling node passes in node to be checked and 526 * the nodelist as returned from meta_read_nodelist. 
This routine 527 * anticipates being called many times using the same diskset membership 528 * list which is why the alloc and free of the diskset membership list 529 * is left to the calling routine. 530 * Returns: 531 * 1 - if a member 532 * 0 - not a member 533 */ 534 int 535 meta_is_member( 536 char *node_name, 537 md_mn_nodeid_t node_id, 538 mndiskset_membershiplist_t *nl 539 ) 540 { 541 mndiskset_membershiplist_t *nl2; 542 int flag_check_name; 543 544 if (node_id != 0) 545 flag_check_name = 0; 546 else if (node_name != NULL) 547 flag_check_name = 1; 548 else 549 return (0); 550 551 nl2 = nl; 552 while (nl2) { 553 if (flag_check_name) { 554 /* Compare given name against name in member list */ 555 if (strcmp(nl2->msl_node_name, node_name) == 0) 556 break; 557 } else { 558 /* Compare given nodeid against nodeid in member list */ 559 if (nl2->msl_node_id == node_id) 560 break; 561 } 562 nl2 = nl2->next; 563 } 564 /* No match found in member list */ 565 if (nl2 == NULL) { 566 return (0); 567 } 568 /* Return 1 if node is in member list */ 569 return (1); 570 } 571 572 /* 573 * meta_getnext_devinfo should go to the host that 574 * has the device, to return the device name, driver name, minor num. 575 * We can take the big cheat for now, since it is a requirement 576 * that the device names and device numbers are the same, and 577 * just get the info locally. 578 * 579 * This routine is very similar to meta_getnextside_devinfo except 580 * that the specific side to be used is being passed in. 
581 * 582 * Exit status: 583 * 0 - No more side info to return 584 * 1 - More side info's to return 585 * -1 - An error has been detected 586 */ 587 /*ARGSUSED*/ 588 int 589 meta_getside_devinfo( 590 mdsetname_t *sp, /* for this set */ 591 char *bname, /* local block name (myside) */ 592 side_t sideno, /* sideno */ 593 char **ret_bname, /* block device name of returned side */ 594 char **ret_dname, /* driver name of returned side */ 595 minor_t *ret_mnum, /* minor number of returned side */ 596 md_error_t *ep 597 ) 598 { 599 mdname_t *np; 600 601 if (ret_bname != NULL) 602 *ret_bname = NULL; 603 if (ret_dname != NULL) 604 *ret_dname = NULL; 605 if (ret_mnum != NULL) 606 *ret_mnum = NODEV32; 607 608 609 if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL) 610 return (-1); 611 612 /* 613 * NOTE (future) - There will be more work here once devids are integrated 614 * into disksets. Then the side should be used to find the correct 615 * host and the b/d names should be gotten from that host. 616 */ 617 618 /* 619 * Return the side info. 620 */ 621 if (ret_bname != NULL) 622 *ret_bname = Strdup(np->bname); 623 624 if (ret_dname != NULL) { 625 mdcinfo_t *cinfo; 626 627 if ((cinfo = metagetcinfo(np, ep)) == NULL) 628 return (-1); 629 630 *ret_dname = Strdup(cinfo->dname); 631 } 632 633 if (ret_mnum != NULL) 634 *ret_mnum = meta_getminor(np->dev); 635 636 return (1); 637 } 638 639 /* 640 * Get the information on the device from the remote node using the devid 641 * of the disk. 
642 * 643 * Exit status: 644 * 0 - No more side info to return 645 * 1 - More side info's to return 646 * -1 - An error has been detected 647 */ 648 int 649 meta_getnextside_devinfo( 650 mdsetname_t *sp, /* for this set */ 651 char *bname, /* local block name (myside) */ 652 side_t *sideno, /* previous sideno & returned sideno */ 653 char **ret_bname, /* block device name of returned side */ 654 char **ret_dname, /* driver name of returned side */ 655 minor_t *ret_mnum, /* minor number of returned side */ 656 md_error_t *ep 657 ) 658 { 659 md_set_desc *sd; 660 int i; 661 mdname_t *np; 662 mddrivename_t *dnp; 663 char *devidstr = NULL; 664 int devidstrlen; 665 md_dev64_t retdev = NODEV64; 666 char *ret_devname = NULL; 667 char *ret_blkdevname = NULL; 668 char *ret_driver = NULL; 669 char *nodename; 670 int fd; 671 int ret = -1; 672 char *minor_name = NULL; 673 md_mnnode_desc *nd; 674 675 676 if (ret_bname != NULL) 677 *ret_bname = NULL; 678 if (ret_dname != NULL) 679 *ret_dname = NULL; 680 if (ret_mnum != NULL) 681 *ret_mnum = NODEV32; 682 683 if (metaislocalset(sp)) { 684 /* no more sides - we are done */ 685 if (*sideno != MD_SIDEWILD) 686 return (0); 687 688 /* First time through - set up return sideno */ 689 *sideno = 0; 690 } else { 691 692 /* 693 * Find the next sideno, starting after the one given. 694 */ 695 if ((sd = metaget_setdesc(sp, ep)) == NULL) 696 return (-1); 697 698 if (MD_MNSET_DESC(sd)) { 699 nd = sd->sd_nodelist; 700 if ((*sideno == MD_SIDEWILD) && 701 (nd != (struct md_mnnode_desc *)NULL)) { 702 *sideno = nd->nd_nodeid; 703 } else { 704 while (nd) { 705 /* 706 * Found given sideno, now find 707 * next sideno, if there is one. 
708 */ 709 if ((*sideno == nd->nd_nodeid) && 710 (nd->nd_next != 711 (struct md_mnnode_desc *)NULL)) { 712 *sideno = 713 nd->nd_next->nd_nodeid; 714 break; 715 } 716 nd = nd->nd_next; 717 } 718 if (nd == NULL) { 719 return (0); 720 } 721 } 722 if (*sideno == MD_SIDEWILD) 723 return (0); 724 } else { 725 for (i = (*sideno)+1; i < MD_MAXSIDES; i++) 726 /* Find next full slot */ 727 if (sd->sd_nodes[i][0] != '\0') 728 break; 729 730 /* No more sides - we are done */ 731 if (i == MD_MAXSIDES) 732 return (0); 733 734 /* Set up the return sideno */ 735 *sideno = i; 736 nodename = (char *)sd->sd_nodes[i]; 737 } 738 } 739 740 /* 741 * Need to pass the node the devid of the disk and get it to 742 * send back the details of the disk from that side. 743 */ 744 if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL) 745 return (-1); 746 747 dnp = np->drivenamep; 748 749 /* 750 * By default, set up the parameters so that they are copied out. 751 */ 752 if (ret_bname != NULL) 753 *ret_bname = Strdup(np->bname); 754 755 if (ret_dname != NULL) { 756 mdcinfo_t *cinfo; 757 758 if ((cinfo = metagetcinfo(np, ep)) == NULL) 759 return (-1); 760 761 *ret_dname = Strdup(cinfo->dname); 762 } 763 764 if (ret_mnum != NULL) 765 *ret_mnum = meta_getminor(np->dev); 766 767 /* 768 * Try some optimization. If this is the local set or the device 769 * is a metadevice then just copy the information. If the device 770 * does not have a devid (due to not having a minor name) then 771 * fall back to the pre-devid behaviour of copying the information 772 * on the device: this is okay because the sanity checks before this 773 * call would have found any issues with the device. If it's a 774 * multi-node diskset also just return ie. copy. 775 */ 776 if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) || 777 (MD_MNSET_DESC(sd))) 778 return (1); 779 780 if (np->minor_name == (char *)NULL) { 781 /* 782 * Have to get the minor name then. 
The slice should exist 783 * on the disk because it will have already been repartitioned 784 * up prior to getting to this point. 785 */ 786 if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) { 787 (void) mdsyserror(ep, errno, np->bname); 788 return (-1); 789 } 790 (void) devid_get_minor_name(fd, &minor_name); 791 np->minor_name = Strdup(minor_name); 792 devid_str_free(minor_name); 793 (void) close(fd); 794 } 795 796 /* allocate extra space for "/" and NULL hence +2 */ 797 devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2; 798 devidstr = (char *)Malloc(devidstrlen); 799 800 /* 801 * As a minor name is supplied then the ret_devname will be 802 * appropriate to that minor_name and in this case it will be 803 * a block device ie /dev/dsk. 804 */ 805 (void) snprintf(devidstr, devidstrlen, 806 "%s/%s", dnp->devid, np->minor_name); 807 808 ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev, 809 np->bname, &ret_devname, &ret_driver, ep); 810 811 Free(devidstr); 812 813 /* 814 * If the other side is not running device id in disksets, 815 * 'ret' is set to ENOTSUP in which case we fallback to 816 * the existing behaviour 817 */ 818 if (ret == ENOTSUP) 819 return (1); 820 else if (ret == -1) 821 return (-1); 822 823 /* 824 * ret_devname comes from the rpc call and is a 825 * raw device name. We need to make this into a 826 * block device via blkname for further processing. 827 * Unfortunately, when our device id isn't found in 828 * the system, the rpc call will return a " " in 829 * ret_devname in which case we need to fill that in 830 * as ret_blkname because blkname of " " returns NULL. 
831 */ 832 if (ret_bname != NULL && ret_devname != NULL) { 833 ret_blkdevname = blkname(ret_devname); 834 if (ret_blkdevname == NULL) 835 *ret_bname = Strdup(ret_devname); 836 else 837 *ret_bname = Strdup(ret_blkdevname); 838 } 839 840 if (ret_dname != NULL && ret_driver != NULL) 841 *ret_dname = Strdup(ret_driver); 842 843 if (ret_mnum != NULL) 844 *ret_mnum = meta_getminor(retdev); 845 846 return (1); 847 } 848 849 int 850 meta_is_drive_in_anyset( 851 mddrivename_t *dnp, 852 mdsetname_t **spp, 853 int bypass_daemon, 854 md_error_t *ep 855 ) 856 { 857 set_t setno; 858 mdsetname_t *this_sp; 859 int is_it; 860 set_t max_sets; 861 862 if ((max_sets = get_max_sets(ep)) == 0) 863 return (-1); 864 865 assert(spp != NULL); 866 *spp = NULL; 867 868 for (setno = 1; setno < max_sets; setno++) { 869 if (!bypass_daemon) { 870 if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 871 if (mdismddberror(ep, MDE_DB_NODB)) { 872 mdclrerror(ep); 873 return (0); 874 } 875 if (mdiserror(ep, MDE_NO_SET)) { 876 mdclrerror(ep); 877 continue; 878 } 879 return (-1); 880 } 881 } else 882 this_sp = metafakesetname(setno, NULL); 883 884 if ((is_it = meta_is_drive_in_thisset(this_sp, dnp, 885 bypass_daemon, ep)) == -1) { 886 if (mdiserror(ep, MDE_NO_SET)) { 887 mdclrerror(ep); 888 continue; 889 } 890 return (-1); 891 } 892 if (is_it) { 893 *spp = this_sp; 894 return (0); 895 } 896 } 897 return (0); 898 } 899 900 int 901 meta_is_drive_in_thisset( 902 mdsetname_t *sp, 903 mddrivename_t *dnp, 904 int bypass_daemon, 905 md_error_t *ep 906 ) 907 { 908 md_drive_desc *dd, *p; 909 910 if (bypass_daemon) 911 dd = dr2drivedesc(sp, MD_SIDEWILD, 912 (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep); 913 else 914 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 915 916 if (dd == NULL) { 917 if (! 
mdisok(ep)) 918 return (-1); 919 return (0); 920 } 921 922 923 for (p = dd; p != NULL; p = p->dd_next) 924 if (strcmp(p->dd_dnp->cname, dnp->cname) == 0) 925 return (1); 926 return (0); 927 } 928 929 /* 930 * Check to see if devid is in use in any diskset. 931 * This is used in the case when a partial diskset is being imported 932 * to make sure that the unvailable drive isn't already in use in an 933 * already imported partial diskset. Can't check on the cname since the 934 * unavailable disk's cname is from the previous system and may collide 935 * with a cname on this system. 936 * Return values: 937 * 1: devid has been found in a diskset 938 * 0: devid not found in any diskset 939 */ 940 int 941 meta_is_devid_in_anyset( 942 void *devid, 943 mdsetname_t **spp, 944 md_error_t *ep 945 ) 946 { 947 set_t setno; 948 mdsetname_t *this_sp; 949 int is_it; 950 set_t max_sets; 951 952 if ((max_sets = get_max_sets(ep)) == 0) 953 return (-1); 954 955 assert(spp != NULL); 956 *spp = NULL; 957 958 for (setno = 1; setno < max_sets; setno++) { 959 if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 960 if (mdismddberror(ep, MDE_DB_NODB)) { 961 mdclrerror(ep); 962 return (0); 963 } 964 if (mdiserror(ep, MDE_NO_SET)) { 965 mdclrerror(ep); 966 continue; 967 } 968 return (-1); 969 } 970 971 if ((is_it = meta_is_devid_in_thisset(this_sp, 972 devid, ep)) == -1) { 973 if (mdiserror(ep, MDE_NO_SET)) { 974 mdclrerror(ep); 975 continue; 976 } 977 return (-1); 978 } 979 if (is_it) { 980 *spp = this_sp; 981 return (0); 982 } 983 } 984 return (0); 985 } 986 987 int 988 meta_is_devid_in_thisset( 989 mdsetname_t *sp, 990 void *devid, 991 md_error_t *ep 992 ) 993 { 994 md_drive_desc *dd, *p; 995 ddi_devid_t dd_devid; 996 997 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 998 if (dd == NULL) { 999 if (! 
mdisok(ep)) 1000 return (-1); 1001 return (0); 1002 } 1003 1004 for (p = dd; p != NULL; p = p->dd_next) { 1005 if (p->dd_dnp->devid == NULL) 1006 continue; 1007 (void) devid_str_decode(p->dd_dnp->devid, 1008 &dd_devid, NULL); 1009 if (dd_devid == NULL) 1010 continue; 1011 if (devid_compare(devid, dd_devid) == 0) { 1012 devid_free(dd_devid); 1013 return (1); 1014 } 1015 devid_free(dd_devid); 1016 } 1017 return (0); 1018 } 1019 1020 int 1021 meta_set_balance( 1022 mdsetname_t *sp, 1023 md_error_t *ep 1024 ) 1025 { 1026 md_set_desc *sd; 1027 md_drive_desc *dd, *curdd; 1028 daddr_t dbsize; 1029 daddr_t nblks; 1030 int i; 1031 int rval = 0; 1032 sigset_t oldsigs; 1033 md_setkey_t *cl_sk; 1034 md_error_t xep = mdnullerror; 1035 md_mnnode_desc *nd; 1036 int suspend1_flag = 0; 1037 1038 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1039 return (-1); 1040 1041 dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 1042 1043 /* Make sure we own the set */ 1044 if (meta_check_ownership(sp, ep) != 0) 1045 return (-1); 1046 1047 /* END CHECK CODE */ 1048 1049 /* 1050 * Get drive descriptors for the drives that are currently in the set. 1051 */ 1052 curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep); 1053 1054 if (! mdisok(ep)) 1055 return (-1); 1056 1057 /* Find the minimum replica size in use is or use the default */ 1058 if ((nblks = meta_db_minreplica(sp, ep)) < 0) 1059 mdclrerror(ep); 1060 else 1061 dbsize = nblks; /* adjust replica size */ 1062 1063 /* Make sure we are blocking all signals */ 1064 if (procsigs(TRUE, &oldsigs, &xep) < 0) 1065 mdclrerror(&xep); 1066 1067 /* 1068 * Lock the set on current set members. 1069 * For MN diskset lock_set and SUSPEND are used to protect against 1070 * other meta* commands running on the other nodes. 
1071 */ 1072 if (MD_MNSET_DESC(sd)) { 1073 nd = sd->sd_nodelist; 1074 while (nd) { 1075 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1076 nd = nd->nd_next; 1077 continue; 1078 } 1079 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 1080 rval = -1; 1081 goto out; 1082 } 1083 nd = nd->nd_next; 1084 } 1085 /* 1086 * Lock out other meta* commands by suspending 1087 * class 1 messages across the diskset. 1088 */ 1089 nd = sd->sd_nodelist; 1090 while (nd) { 1091 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1092 nd = nd->nd_next; 1093 continue; 1094 } 1095 if (clnt_mdcommdctl(nd->nd_nodename, 1096 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, 1097 MD_MSCF_NO_FLAGS, ep)) { 1098 rval = -1; 1099 goto out; 1100 } 1101 suspend1_flag = 1; 1102 nd = nd->nd_next; 1103 } 1104 } else { 1105 for (i = 0; i < MD_MAXSIDES; i++) { 1106 /* Skip empty slots */ 1107 if (sd->sd_nodes[i][0] == '\0') continue; 1108 1109 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1110 rval = -1; 1111 goto out; 1112 } 1113 } 1114 } 1115 1116 /* We are not adding or deleting any drives, just balancing */ 1117 dd = NULL; 1118 1119 /* 1120 * Balance the DB's according to the list of existing drives and the 1121 * list of added drives. 1122 */ 1123 if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1) 1124 goto out; 1125 1126 out: 1127 /* 1128 * Unlock diskset by resuming class 1 messages across the diskset. 1129 * Just resume all classes so that resume is the same whether 1130 * just one class was locked or all classes were locked. 1131 */ 1132 if (suspend1_flag) { 1133 nd = sd->sd_nodelist; 1134 while (nd) { 1135 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1136 nd = nd->nd_next; 1137 continue; 1138 } 1139 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 1140 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 1141 /* 1142 * We are here because we failed to resume 1143 * rpc.mdcommd. However we potentially have 1144 * an error from the previous call 1145 * (meta_db_balance). 
If the previous call 1146 * did fail, we capture that error and 1147 * generate a perror withthe string, 1148 * "Unable to resume...". 1149 * Setting rval to -1 ensures that in the 1150 * next iteration of the loop, ep is not 1151 * clobbered. 1152 */ 1153 if (rval == 0) 1154 (void) mdstealerror(ep, &xep); 1155 else 1156 mdclrerror(&xep); 1157 rval = -1; 1158 mde_perror(ep, dgettext(TEXT_DOMAIN, 1159 "Unable to resume rpc.mdcommd.")); 1160 } 1161 nd = nd->nd_next; 1162 } 1163 } 1164 1165 /* Unlock the set */ 1166 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1167 if (MD_MNSET_DESC(sd)) { 1168 nd = sd->sd_nodelist; 1169 while (nd) { 1170 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1171 nd = nd->nd_next; 1172 continue; 1173 } 1174 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1175 if (rval == 0) 1176 (void) mdstealerror(ep, &xep); 1177 else 1178 mdclrerror(&xep); 1179 rval = -1; 1180 } 1181 nd = nd->nd_next; 1182 } 1183 } else { 1184 for (i = 0; i < MD_MAXSIDES; i++) { 1185 /* Skip empty slots */ 1186 if (sd->sd_nodes[i][0] == '\0') 1187 continue; 1188 1189 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1190 if (rval == 0) 1191 (void) mdstealerror(ep, &xep); 1192 rval = -1; 1193 } 1194 } 1195 } 1196 1197 /* release signals back to what they were on entry */ 1198 if (procsigs(FALSE, &oldsigs, &xep) < 0) 1199 mdclrerror(&xep); 1200 1201 cl_set_setkey(NULL); 1202 1203 metaflushsetname(sp); 1204 1205 return (rval); 1206 } 1207 1208 int 1209 meta_set_destroy( 1210 mdsetname_t *sp, 1211 int lock_set, 1212 md_error_t *ep 1213 ) 1214 { 1215 int i; 1216 med_rec_t medr; 1217 md_set_desc *sd; 1218 md_drive_desc *dd, *p, *p1; 1219 mddrivename_t *dnp; 1220 mdname_t *np; 1221 mdnamelist_t *nlp = NULL; 1222 int num_users = 0; 1223 int has_set; 1224 side_t mysideno; 1225 sigset_t oldsigs; 1226 md_error_t xep = mdnullerror; 1227 md_setkey_t *cl_sk; 1228 int rval = 0; 1229 int delete_end = 1; 1230 1231 /* Make sure we are blocking all signals */ 1232 if (procsigs(TRUE, 
&oldsigs, ep) < 0) 1233 return (-1); 1234 1235 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1236 if (! mdisok(ep)) 1237 rval = -1; 1238 goto out; 1239 } 1240 1241 /* 1242 * meta_set_destroy should not be called for a MN diskset. 1243 * This routine destroys a set without communicating this information 1244 * to the other nodes which would lead to an inconsistency in 1245 * the MN diskset. 1246 */ 1247 if (MD_MNSET_DESC(sd)) { 1248 rval = -1; 1249 goto out; 1250 } 1251 1252 /* Continue if a traditional diskset */ 1253 1254 /* 1255 * Check to see who has the set. If we are not the last user of the 1256 * set, we will not touch the replicas. 1257 */ 1258 for (i = 0; i < MD_MAXSIDES; i++) { 1259 /* Skip empty slots */ 1260 if (sd->sd_nodes[i][0] == '\0') 1261 continue; 1262 1263 has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ, 1264 ep); 1265 1266 if (has_set < 0) { 1267 mdclrerror(ep); 1268 } else 1269 num_users++; 1270 } 1271 1272 if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) { 1273 if (! mdisok(ep)) { 1274 rval = -1; 1275 goto out; 1276 } 1277 } 1278 1279 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 1280 rval = -1; 1281 goto out; 1282 } 1283 1284 if (lock_set == TRUE) { 1285 /* Lock the set on our side */ 1286 if (clnt_lock_set(mynode(), sp, ep)) { 1287 rval = -1; 1288 goto out; 1289 } 1290 } 1291 1292 /* 1293 * A traditional diskset has no diskset stale information to send 1294 * since there can only be one owner node at a time. 1295 */ 1296 if (snarf_set(sp, FALSE, ep)) 1297 mdclrerror(ep); 1298 1299 if (dd != NULL) { 1300 /* 1301 * Make sure that no drives are in use as parts of metadrives 1302 * or hot spare pools, this is one of the few error conditions 1303 * that will stop this routine, unless the environment has 1304 * META_DESTROY_SET_OK set, in which case, the operation will 1305 * proceed. 
1306 */ 1307 if (getenv("META_DESTROY_SET_OK") == NULL) { 1308 for (p = dd; p != NULL; p = p->dd_next) { 1309 dnp = p->dd_dnp; 1310 1311 i = meta_check_drive_inuse(sp, dnp, FALSE, ep); 1312 if (i == -1) { 1313 /* need xep - wire calls clear error */ 1314 i = metaget_setownership(sp, &xep); 1315 if (i == -1) { 1316 rval = -1; 1317 goto out; 1318 } 1319 1320 mysideno = getmyside(sp, &xep); 1321 1322 if (mysideno == MD_SIDEWILD) { 1323 rval = -1; 1324 goto out; 1325 } 1326 1327 if (sd->sd_isown[mysideno] == FALSE) 1328 if (halt_set(sp, &xep)) { 1329 rval = -1; 1330 goto out; 1331 } 1332 1333 rval = -1; 1334 goto out; 1335 } 1336 } 1337 } 1338 1339 for (i = 0; i < MD_MAXSIDES; i++) { 1340 /* Skip empty slots */ 1341 if (sd->sd_nodes[i][0] == '\0') 1342 continue; 1343 1344 /* Skip non local nodes */ 1345 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1346 continue; 1347 1348 if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep)) 1349 mdclrerror(ep); 1350 } 1351 1352 /* 1353 * Go thru each drive and individually delete the replicas. 1354 * This way we can ignore individual errors. 1355 */ 1356 for (p = dd; p != NULL; p = p->dd_next) { 1357 uint_t rep_slice; 1358 1359 dnp = p->dd_dnp; 1360 if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || 1361 (((np = metaslicename(dnp, rep_slice, ep)) 1362 == NULL) && 1363 ((np = metaslicename(dnp, MD_SLICE0, ep)) 1364 == NULL))) { 1365 rval = -1; 1366 goto out; 1367 } 1368 1369 if ((np = metaslicename(dnp, 1370 rep_slice, ep)) == NULL) { 1371 if ((np = metaslicename(dnp, 1372 MD_SLICE0, ep)) == NULL) { 1373 rval = -1; 1374 goto out; 1375 } 1376 mdclrerror(ep); 1377 } 1378 1379 /* Yes this is UGLY!!! 
*/ 1380 p1 = p->dd_next; 1381 p->dd_next = NULL; 1382 if (rel_own_bydd(sp, p, FALSE, ep)) 1383 mdclrerror(ep); 1384 p->dd_next = p1; 1385 1386 if (p->dd_dbcnt == 0) 1387 continue; 1388 1389 /* 1390 * Skip the replica removal if we are not the last user 1391 */ 1392 if (num_users != 1) 1393 continue; 1394 1395 nlp = NULL; 1396 (void) metanamelist_append(&nlp, np); 1397 if (meta_db_detach(sp, nlp, 1398 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep)) 1399 mdclrerror(ep); 1400 metafreenamelist(nlp); 1401 } 1402 } 1403 1404 if (halt_set(sp, ep)) { 1405 rval = -1; 1406 goto out; 1407 } 1408 1409 /* Setup the mediator record */ 1410 (void) memset(&medr, '\0', sizeof (med_rec_t)); 1411 medr.med_rec_mag = MED_REC_MAGIC; 1412 medr.med_rec_rev = MED_REC_REV; 1413 medr.med_rec_fl = 0; 1414 medr.med_rec_sn = sp->setno; 1415 (void) strcpy(medr.med_rec_snm, sp->setname); 1416 medr.med_rec_meds = sd->sd_med; /* structure assigment */ 1417 (void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t)); 1418 medr.med_rec_foff = 0; 1419 1420 /* 1421 * If we are the last remaining user, then remove the mediator hosts 1422 */ 1423 if (num_users == 1) { 1424 for (i = 0; i < MED_MAX_HOSTS; i++) { 1425 if (medr.med_rec_meds.n_lst[i].a_cnt != 0) 1426 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, 1427 SVM_TAG_MEDIATOR, sp->setno, i); 1428 (void) memset(&medr.med_rec_meds.n_lst[i], '\0', 1429 sizeof (md_h_t)); 1430 } 1431 medr.med_rec_meds.n_cnt = 0; 1432 } else { /* Remove this host from the mediator node list. 
*/ 1433 for (i = 0; i < MD_MAXSIDES; i++) { 1434 /* Skip empty slots */ 1435 if (sd->sd_nodes[i][0] == '\0') 1436 continue; 1437 1438 /* Copy non local node */ 1439 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) { 1440 (void) strcpy(medr.med_rec_nodes[i], 1441 sd->sd_nodes[i]); 1442 continue; 1443 } 1444 1445 /* Clear local node */ 1446 (void) memset(&medr.med_rec_nodes[i], '\0', 1447 sizeof (md_node_nm_t)); 1448 } 1449 } 1450 1451 crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); 1452 1453 /* 1454 * If the client is part of a cluster put the DCS service 1455 * into a deleteing state. 1456 */ 1457 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1458 if (metad_isautotakebyname(sp->setname)) { 1459 delete_end = 0; 1460 } else { 1461 mdclrerror(ep); 1462 goto out; 1463 } 1464 } 1465 1466 /* Inform the mediator hosts of the new information */ 1467 for (i = 0; i < MED_MAX_HOSTS; i++) { 1468 if (sd->sd_med.n_lst[i].a_cnt == 0) 1469 continue; 1470 1471 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep)) 1472 mdclrerror(ep); 1473 } 1474 1475 /* Delete the set locally */ 1476 for (i = 0; i < MD_MAXSIDES; i++) { 1477 /* Skip empty slots */ 1478 if (sd->sd_nodes[i][0] == '\0') 1479 continue; 1480 1481 /* Skip non local nodes */ 1482 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1483 continue; 1484 1485 if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) 1486 mdclrerror(ep); 1487 } 1488 if (delete_end && 1489 sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR) 1490 rval = -1; 1491 1492 out: 1493 /* release signals back to what they were on entry */ 1494 if (procsigs(FALSE, &oldsigs, &xep) < 0) { 1495 if (rval == 0) 1496 (void) mdstealerror(ep, &xep); 1497 rval = -1; 1498 } 1499 1500 if (lock_set == TRUE) { 1501 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1502 if (clnt_unlock_set(mynode(), cl_sk, &xep)) { 1503 if (rval == 0) 1504 (void) mdstealerror(ep, &xep); 1505 rval = -1; 1506 } 1507 cl_set_setkey(NULL); 1508 } 1509 1510 metaflushsetname(sp); 1511 
return (rval); 1512 } 1513 1514 int 1515 meta_set_purge( 1516 mdsetname_t *sp, 1517 int bypass_cluster, 1518 int forceflg, 1519 md_error_t *ep 1520 ) 1521 { 1522 char *thishost = mynode(); 1523 md_set_desc *sd; 1524 md_setkey_t *cl_sk; 1525 md_error_t xep = mdnullerror; 1526 int rval = 0; 1527 int i, num_hosts = 0; 1528 int has_set = 0; 1529 int max_node = 0; 1530 int delete_end = 1; 1531 md_mnnode_desc *nd; 1532 1533 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1534 /* unable to find set description */ 1535 rval = 1; 1536 return (rval); 1537 } 1538 1539 if (MD_MNSET_DESC(sd)) { 1540 /* 1541 * Get a count of the hosts in the set and also lock the set 1542 * on those hosts that know about it. 1543 */ 1544 nd = sd->sd_nodelist; 1545 while (nd) { 1546 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1547 nd = nd->nd_next; 1548 continue; 1549 } 1550 has_set = nodehasset(sp, nd->nd_nodename, 1551 NHS_NST_EQ, ep); 1552 1553 /* 1554 * The host is not aware of this set (has_set < 0) or 1555 * the set does not match (has_set == 0). This check 1556 * prevents the code getting confused by an apparent 1557 * inconsistancy in the set's state, this is in the 1558 * purge code so something is broken in any case and 1559 * this is just trying to fix the brokeness. 1560 */ 1561 if (has_set <= 0) { 1562 mdclrerror(ep); 1563 nd->nd_flags |= MD_MN_NODE_NOSET; 1564 } else { 1565 num_hosts++; 1566 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 1567 /* 1568 * If the force flag is set then 1569 * ignore any RPC failures because we 1570 * are only really interested with 1571 * the set on local node. 1572 */ 1573 if (forceflg && mdanyrpcerror(ep)) { 1574 mdclrerror(ep); 1575 } else { 1576 /* 1577 * set max_node so that in the 1578 * unlock code nodes in the 1579 * set that have not been 1580 * locked are not unlocked. 
1581 */ 1582 max_node = nd->nd_nodeid; 1583 rval = 2; 1584 goto out1; 1585 } 1586 } 1587 1588 } 1589 nd = nd->nd_next; 1590 } 1591 max_node = 0; 1592 } else { 1593 /* 1594 * Get a count of the hosts in the set and also lock the set 1595 * on those hosts that know about it. 1596 */ 1597 for (i = 0; i < MD_MAXSIDES; i++) { 1598 /* Skip empty slots */ 1599 if (sd->sd_nodes[i][0] == '\0') 1600 continue; 1601 1602 has_set = nodehasset(sp, sd->sd_nodes[i], 1603 NHS_NST_EQ, ep); 1604 1605 /* 1606 * The host is not aware of this set (has_set < 0) or 1607 * the set does not match (has_set == 0). This check 1608 * prevents the code getting confused by an apparent 1609 * inconsistancy in the set's state, this is in the 1610 * purge code so something is broken in any case and 1611 * this is just trying to fix the brokeness. 1612 */ 1613 if (has_set <= 0) { 1614 mdclrerror(ep); 1615 /* 1616 * set the node to NULL to prevent further 1617 * requests to this unresponsive node. 1618 */ 1619 sd->sd_nodes[i][0] = '\0'; 1620 } else { 1621 num_hosts++; 1622 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1623 /* 1624 * If the force flag is set then 1625 * ignore any RPC failures because we 1626 * are only really interested with 1627 * the set on local node. 1628 */ 1629 if (forceflg && mdanyrpcerror(ep)) { 1630 mdclrerror(ep); 1631 } else { 1632 rval = 2; 1633 /* 1634 * set max_node so that in the 1635 * unlock code nodes in the 1636 * set that have not been 1637 * locked are not unlocked. 1638 */ 1639 max_node = i; 1640 goto out1; 1641 } 1642 } 1643 } 1644 } 1645 max_node = i; /* now MD_MAXSIDES */ 1646 } 1647 if (!bypass_cluster) { 1648 /* 1649 * If there is only one host associated with the 1650 * set then remove the set from the cluster. 
1651 */ 1652 if (num_hosts == 1) { 1653 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1654 if (metad_isautotakebyname(sp->setname)) { 1655 delete_end = 0; 1656 } else { 1657 mdclrerror(ep); 1658 rval = 3; 1659 goto out1; 1660 } 1661 } 1662 } 1663 } 1664 1665 if (MD_MNSET_DESC(sd)) { 1666 /* 1667 * Get a count of the hosts in the set and also lock the set 1668 * on those hosts that know about it. 1669 */ 1670 nd = sd->sd_nodelist; 1671 while (nd) { 1672 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1673 nd = nd->nd_next; 1674 continue; 1675 } 1676 if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) { 1677 /* 1678 * Tell the remote node to remove this node 1679 */ 1680 if (clnt_delhosts(nd->nd_nodename, sp, 1, 1681 &thishost, ep) == -1) { 1682 /* 1683 * If we fail to delete ourselves 1684 * from the remote host it does not 1685 * really matter because the set is 1686 * being "purged" from this node. The 1687 * set can be purged from the other 1688 * node at a later time. 1689 */ 1690 mdclrerror(ep); 1691 } 1692 nd = nd->nd_next; 1693 continue; 1694 } 1695 /* remove the set from this host */ 1696 if (clnt_delset(nd->nd_nodename, sp, ep) == -1) { 1697 md_perror(dgettext(TEXT_DOMAIN, "delset")); 1698 if (!bypass_cluster && num_hosts == 1) 1699 (void) sdssc_delete_end(sp->setname, 1700 SDSSC_CLEANUP); 1701 mdclrerror(ep); 1702 goto out1; 1703 } 1704 nd = nd->nd_next; 1705 } 1706 } else { 1707 for (i = 0; i < MD_MAXSIDES; i++) { 1708 /* Skip empty slots */ 1709 if (sd->sd_nodes[i][0] == '\0') 1710 continue; 1711 if (strcmp(thishost, sd->sd_nodes[i]) != 0) { 1712 /* 1713 * Tell the remote node to remove this node 1714 */ 1715 if (clnt_delhosts(sd->sd_nodes[i], sp, 1, 1716 &thishost, ep) == -1) { 1717 /* 1718 * If we fail to delete ourselves 1719 * from the remote host it does not 1720 * really matter because the set is 1721 * being "purged" from this node. The 1722 * set can be purged from the other 1723 * node at a later time. 
1724 */ 1725 mdclrerror(ep); 1726 } 1727 continue; 1728 } 1729 1730 /* remove the set from this host */ 1731 if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) { 1732 md_perror(dgettext(TEXT_DOMAIN, "delset")); 1733 if (!bypass_cluster && num_hosts == 1) 1734 (void) sdssc_delete_end(sp->setname, 1735 SDSSC_CLEANUP); 1736 mdclrerror(ep); 1737 goto out1; 1738 } 1739 } 1740 } 1741 1742 if (!bypass_cluster && num_hosts == 1) { 1743 if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) == 1744 SDSSC_ERROR) { 1745 rval = 4; 1746 } 1747 } 1748 1749 out1: 1750 1751 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1752 1753 /* 1754 * Remove the set lock on those nodes that had the set locked 1755 * max_node will either be MD_MAXSIDES or array index of the last 1756 * node contacted (or rather failed to contact) for traditional 1757 * diskset. For a MN diskset, max_node is the node_id of the node 1758 * that failed the lock. 1759 */ 1760 if (MD_MNSET_DESC(sd)) { 1761 nd = sd->sd_nodelist; 1762 while (nd) { 1763 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1764 nd = nd->nd_next; 1765 continue; 1766 } 1767 if (nd->nd_nodeid == max_node) 1768 break; 1769 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1770 if (forceflg && mdanyrpcerror(&xep)) { 1771 mdclrerror(&xep); 1772 nd = nd->nd_next; 1773 continue; 1774 } 1775 if (rval == 0) 1776 (void) mdstealerror(ep, &xep); 1777 rval = 5; 1778 } 1779 nd = nd->nd_next; 1780 } 1781 } else { 1782 for (i = 0; i < max_node; i++) { 1783 /* Skip empty slots */ 1784 if (sd->sd_nodes[i][0] == '\0') 1785 continue; 1786 1787 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1788 if (forceflg && mdanyrpcerror(&xep)) { 1789 mdclrerror(&xep); 1790 continue; 1791 } 1792 if (rval == 0) 1793 (void) mdstealerror(ep, &xep); 1794 rval = 5; 1795 } 1796 } 1797 } 1798 1799 cl_set_setkey(NULL); 1800 1801 return (rval); 1802 } 1803 1804 int 1805 meta_set_query( 1806 mdsetname_t *sp, 1807 mddb_dtag_lst_t **dtlpp, 1808 md_error_t *ep 1809 ) 1810 { 1811 
mddb_dtag_get_parm_t dtgp; 1812 1813 (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t)); 1814 dtgp.dtgp_setno = sp->setno; 1815 1816 /*CONSTCOND*/ 1817 while (1) { 1818 if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0) 1819 if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) || 1820 *dtlpp == NULL) 1821 return (mdstealerror(ep, &dtgp.dtgp_mde)); 1822 else 1823 break; 1824 1825 /* 1826 * Run to the end of the list 1827 */ 1828 for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) 1829 /* void */; 1830 1831 *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t)); 1832 1833 (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt, 1834 sizeof (mddb_dtag_t)); 1835 1836 dtgp.dtgp_dt.dt_id++; 1837 } 1838 return (0); 1839 } 1840 1841 /* 1842 * return drivename get by key 1843 */ 1844 mddrivename_t * 1845 metadrivename_withdrkey( 1846 mdsetname_t *sp, 1847 side_t sideno, 1848 mdkey_t key, 1849 int flags, 1850 md_error_t *ep 1851 ) 1852 { 1853 char *nm; 1854 mdname_t *np; 1855 mddrivename_t *dnp; 1856 ddi_devid_t devidp; 1857 md_set_desc *sd; 1858 1859 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1860 return (NULL); 1861 } 1862 1863 1864 /* 1865 * Get the devid associated with the key. 1866 * 1867 * If a devid was returned, it MUST be valid even in 1868 * the case where a device id has been "updated". The 1869 * "update" of the device id may have occured due to 1870 * a firmware upgrade. 1871 */ 1872 if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep)) 1873 != NULL) { 1874 /* 1875 * Look for the correct dnp using the devid for comparison. 1876 */ 1877 dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); 1878 free(devidp); 1879 dnp->side_names_key = key; 1880 } else { 1881 /* 1882 * We didn't get a devid. We'll try for a dnp using the 1883 * name. If we have a MN diskset or if the dnp is a did 1884 * device, we're done because then we don't have devids. 1885 * Otherwise we'll try to set the devid 1886 * and get the dnp via devid again. 
1887 * We also need to clear the ep structure. When the 1888 * above call to meta_getdidbykey returned a null, it 1889 * also put an error code into ep. In this case, the null 1890 * return is actually OK and any errors can be ignored. The 1891 * reason it is OK is because this could be a MN set or 1892 * we could be running without devids (ex cluster). 1893 */ 1894 mdclrerror(ep); 1895 1896 if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key, 1897 ep)) == NULL) 1898 return (NULL); 1899 /* get device name */ 1900 if (flags & PRINT_FAST) { 1901 if ((np = metaname_fast(&sp, nm, 1902 LOGICAL_DEVICE, ep)) == NULL) { 1903 Free(nm); 1904 return (NULL); 1905 } 1906 } else { 1907 if ((np = metaname(&sp, nm, LOGICAL_DEVICE, 1908 ep)) == NULL) { 1909 Free(nm); 1910 return (NULL); 1911 } 1912 } 1913 Free(nm); 1914 /* make sure it's OK */ 1915 if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, 1916 ep) != 0)) 1917 return (NULL); 1918 1919 /* get drivename */ 1920 dnp = np->drivenamep; 1921 dnp->side_names_key = key; 1922 /* 1923 * Skip the devid set/check for the following cases: 1924 * 1) If MN diskset, there are no devid's 1925 * 2) if dnp is did device 1926 * The device id is disabled for did device due to the 1927 * lack of minor name support in the did driver. The following 1928 * devid code path can set and propagate the error and 1929 * eventually prevent did disks from being added to the 1930 * diskset under SunCluster systems 1931 * 1932 * Note that this code can be called through rpc.mdcommd. 1933 * sdssc_version cannot be used because the library won't 1934 * be bound. 
1935 */ 1936 if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) 1937 == 0) || (MD_MNSET_DESC(sd))) 1938 goto out; 1939 1940 /* 1941 * It is okay if replica is not in devid mode 1942 */ 1943 if (mdissyserror(ep, MDDB_F_NODEVID)) { 1944 mdclrerror(ep); 1945 goto out; 1946 } 1947 1948 /* 1949 * We're not MN or did devices but 1950 * devid is missing so this means that we have 1951 * just upgraded from a configuration where 1952 * devid's were not used so try to add in 1953 * the devid and requery. If the devid still isn't there, 1954 * that's OK. dnp->devid will be null as it is in any 1955 * configuration with no devids. 1956 */ 1957 if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, ep) < 0) 1958 return (NULL); 1959 if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET, 1960 sideno+SKEW, key, ep)) != NULL) { 1961 /* 1962 * Found a devid so look for the dnp using the 1963 * devid as the search mechanism. 1964 */ 1965 dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); 1966 free(devidp); 1967 dnp->side_names_key = key; 1968 } 1969 } 1970 1971 1972 1973 out: 1974 if (flags & MD_BYPASS_DAEMON) 1975 return (dnp); 1976 1977 if (get_sidenmlist(sp, dnp, ep)) 1978 return (NULL); 1979 1980 /* return success */ 1981 return (dnp); 1982 } 1983 1984 void 1985 metafreedrivedesc(md_drive_desc **dd) 1986 { 1987 md_drive_desc *p, *next = NULL; 1988 1989 for (p = *dd; p != NULL; p = next) { 1990 next = p->dd_next; 1991 Free(p); 1992 } 1993 *dd = NULL; 1994 } 1995 1996 md_drive_desc * 1997 metaget_drivedesc( 1998 mdsetname_t *sp, 1999 int flags, 2000 md_error_t *ep 2001 ) 2002 { 2003 side_t sideno = MD_SIDEWILD; 2004 2005 assert(! 
(flags & MD_BYPASS_DAEMON)); 2006 2007 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 2008 return (NULL); 2009 2010 return (metaget_drivedesc_sideno(sp, sideno, flags, ep)); 2011 } 2012 2013 md_drive_desc * 2014 metaget_drivedesc_fromnamelist( 2015 mdsetname_t *sp, 2016 mdnamelist_t *nlp, 2017 md_error_t *ep 2018 ) 2019 { 2020 md_set_desc *sd; 2021 mdnamelist_t *p; 2022 md_drive_desc *dd = NULL; 2023 2024 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2025 return (NULL); 2026 2027 for (p = nlp; p != NULL; p = p->next) 2028 (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0, 2029 sd->sd_ctime, sd->sd_genid, MD_DR_ADD); 2030 2031 return (dd); 2032 } 2033 2034 md_drive_desc * 2035 metaget_drivedesc_sideno( 2036 mdsetname_t *sp, 2037 side_t sideno, 2038 int flags, 2039 md_error_t *ep 2040 ) 2041 { 2042 md_set_desc *sd = NULL; 2043 2044 assert(! (flags & MD_BYPASS_DAEMON)); 2045 2046 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2047 return (NULL); 2048 2049 if (sd->sd_drvs) 2050 return (sd->sd_drvs); 2051 2052 if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL) 2053 return (NULL); 2054 2055 return (sd->sd_drvs); 2056 } 2057 2058 int 2059 metaget_setownership( 2060 mdsetname_t *sp, 2061 md_error_t *ep 2062 ) 2063 { 2064 md_set_desc *sd; 2065 int bool; 2066 int i; 2067 md_mnnode_desc *nd; 2068 2069 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2070 return (-1); 2071 2072 if (MD_MNSET_DESC(sd)) { 2073 nd = sd->sd_nodelist; 2074 while (nd) { 2075 /* If node isn't alive, can't own diskset */ 2076 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2077 nd->nd_flags &= ~MD_MN_NODE_OWN; 2078 nd = nd->nd_next; 2079 continue; 2080 } 2081 /* 2082 * If can't communicate with rpc.metad, then mark 2083 * this node as not an owner. That node may 2084 * in fact, be an owner, but without rpc.metad running 2085 * that node can't do much. 
/*
 * Return TRUE if str is equal (strcmp) to one of the first cnt strings
 * in lst, FALSE otherwise.  Entries are compared in list order.
 */
int
strinlst(char *str, int cnt, char **lst)
{
	int	idx = 0;

	while (idx < cnt) {
		if (strcmp(lst[idx], str) == 0)
			return (TRUE);
		idx++;
	}

	return (FALSE);
}
reserved slice for db in sets, logs) 2146 */ 2147 2148 /*ARGSUSED*/ 2149 int 2150 meta_get_reserved_names( 2151 mdsetname_t *sp, 2152 mdnamelist_t **nlpp, 2153 int options, 2154 md_error_t *ep) 2155 { 2156 int count = 0; 2157 mdname_t *np = NULL; 2158 mdnamelist_t *transnlp = NULL; 2159 mdnamelist_t **tailpp = nlpp; 2160 mdnamelist_t *nlp; 2161 md_drive_desc *dd, *di; 2162 2163 if (metaislocalset(sp)) 2164 goto out; 2165 2166 if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) { 2167 count = -1; 2168 goto out; 2169 } 2170 2171 /* db in for sets on reserved slice */ 2172 for (di = dd; di && count >= 0; di = di->dd_next) { 2173 uint_t rep_slice; 2174 2175 /* 2176 * Add the name struct to the end of the 2177 * namelist but keep a pointer to the last 2178 * element so that we don't incur the overhead 2179 * of traversing the list each time 2180 */ 2181 if (di->dd_dnp && 2182 (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) && 2183 (np = metaslicename(di->dd_dnp, rep_slice, ep)) && 2184 (tailpp = meta_namelist_append_wrapper(tailpp, np))) 2185 count++; 2186 else 2187 count = -1; 2188 } 2189 2190 /* now find logs */ 2191 if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) { 2192 count = -1; 2193 goto out; 2194 } 2195 2196 for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) { 2197 mdname_t *transnp = nlp->namep; 2198 md_trans_t *transp; 2199 2200 if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) { 2201 count = -1; 2202 goto out; 2203 } 2204 if (transp->lognamep) { 2205 /* 2206 * Add the name struct to the end of the 2207 * namelist but keep a pointer to the last 2208 * element so that we don't incur the overhead 2209 * of traversing the list each time 2210 */ 2211 tailpp = meta_namelist_append_wrapper( 2212 tailpp, transp->lognamep); 2213 } 2214 } 2215 out: 2216 metafreenamelist(transnlp); 2217 return (count); 2218 } 2219 2220 /* 2221 * Entry point to join a node to MultiNode diskset. 2222 * 2223 * Validate host in diskset. 
2224 * - Should be in membership list from API 2225 * - Should not already be joined into diskset. 2226 * - Set must have drives 2227 * Assume valid configuration is stored in the set/drive/node records 2228 * in the local mddb since no node or drive can be added to the MNset 2229 * unless all drives and nodes are available. Reconfig steps will 2230 * resync all ALIVE nodes in case of panic in critical areas. 2231 * 2232 * Lock down the set. 2233 * Verify host is a member of this diskset. 2234 * If drives exist in the configuration, load the mddbs. 2235 * Set this node to active by notifying master if one exists. 2236 * If this is the first node active in the diskset, this node 2237 * becomes the master. 2238 * Unlock the set. 2239 * 2240 * Mirror Resync: 2241 * If this node is the last node to join the set and clustering 2242 * isn't running, then start the 'metasync -r' type resync 2243 * on all mirrors in this diskset. 2244 * If clustering is running, this resync operation will 2245 * be handled by the reconfig steps and should NOT 2246 * be handled during a join operation. 2247 * 2248 * There are multiple return values in order to assist 2249 * the join operation of all sets in the metaset command. 2250 * 2251 * Return values: 2252 * 0 - Node successfully joined to set. 2253 * -1 - Join attempted but failed 2254 * - any failure from libmeta calls 2255 * - node not in the member list 2256 * -2 - Join not attempted since 2257 * - this set had no drives in set 2258 * - this node already joined to set 2259 * - set is not a multinode set 2260 * -3 - Node joined to STALE set. 
2261 */ 2262 extern int 2263 meta_set_join( 2264 mdsetname_t *sp, 2265 md_error_t *ep 2266 ) 2267 { 2268 md_set_desc *sd; 2269 md_drive_desc *dd; 2270 md_mnnode_desc *nd, *nd2, my_nd; 2271 int rval = 0; 2272 md_setkey_t *cl_sk; 2273 md_error_t xep = mdnullerror; 2274 md_error_t ep_snarf = mdnullerror; 2275 int master_flag = 0; 2276 md_mnset_record *mas_mnsr = NULL; 2277 int clear_nr_flags = 0; 2278 md_mnnode_record *nr; 2279 int stale_set = 0; 2280 int rb_flags = 0; 2281 int stale_bool = FALSE; 2282 int suspendall_flag = 0; 2283 int suspend1_flag = 0; 2284 sigset_t oldsigs; 2285 int send_reinit = 0; 2286 2287 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2288 return (-1); 2289 } 2290 2291 /* Must be a multinode diskset */ 2292 if (!MD_MNSET_DESC(sd)) { 2293 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2294 return (-2); 2295 } 2296 2297 /* Verify that the node is ALIVE (i.e. is in the API membership list) */ 2298 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) { 2299 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, 2300 sd->sd_mn_mynode->nd_nodename, NULL, 2301 sp->setname); 2302 return (-1); 2303 } 2304 2305 /* Make sure we are blocking all signals */ 2306 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2307 mdclrerror(&xep); 2308 2309 /* 2310 * Lock the set on current set members. 2311 * For MN diskset lock_set and SUSPEND are used to protect against 2312 * other meta* commands running on the other nodes. 2313 */ 2314 nd = sd->sd_nodelist; 2315 while (nd) { 2316 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2317 nd = nd->nd_next; 2318 continue; 2319 } 2320 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2321 rval = -1; 2322 goto out; 2323 } 2324 nd = nd->nd_next; 2325 } 2326 2327 /* 2328 * Lock out other meta* commands by suspending 2329 * class 1 messages across the diskset. 
2330 */ 2331 nd = sd->sd_nodelist; 2332 while (nd) { 2333 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2334 nd = nd->nd_next; 2335 continue; 2336 } 2337 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2338 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2339 rval = -1; 2340 goto out; 2341 } 2342 suspend1_flag = 1; 2343 nd = nd->nd_next; 2344 } 2345 2346 /* 2347 * Verify that this host is a member (in the host list) of the set. 2348 */ 2349 nd = sd->sd_nodelist; 2350 while (nd) { 2351 if (strcmp(mynode(), nd->nd_nodename) == 0) { 2352 break; 2353 } 2354 nd = nd->nd_next; 2355 } 2356 if (!nd) { 2357 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2358 sd->sd_mn_mynode->nd_nodename, NULL, 2359 sp->setname); 2360 rval = -1; 2361 goto out; 2362 } 2363 2364 /* 2365 * Need to return failure if host is already 'joined' 2366 * into the set. This is done so that if later the user 2367 * issues a command to join all sets and a failure is 2368 * encountered - that the resulting cleanup effort 2369 * (withdrawing from all sets that were joined 2370 * during that command) won't withdraw from this set. 2371 */ 2372 if (nd->nd_flags & MD_MN_NODE_OWN) { 2373 rval = -2; 2374 goto out2; 2375 } 2376 2377 /* 2378 * Call metaget_setownership that calls each node in diskset and 2379 * marks in set descriptor if node is an owner of the set or not. 2380 * metaget_setownership checks to see if a node is an owner by 2381 * checking to see if that node's kernel has the mddb loaded. 2382 * If a node had panic'd during a reconfig or an 2383 * add/delete/join/withdraw operation, the other nodes' node 2384 * records may not reflect the current state of the diskset, 2385 * so calling metaget_setownership is the safest thing to do. 2386 */ 2387 if (metaget_setownership(sp, ep) == -1) { 2388 rval = -1; 2389 goto out; 2390 } 2391 2392 /* If first active member of diskset, become the master. 
*/ 2393 nd = sd->sd_nodelist; 2394 while (nd) { 2395 if (nd->nd_flags & MD_MN_NODE_OWN) 2396 break; 2397 nd = nd->nd_next; 2398 } 2399 if (nd == NULL) 2400 master_flag = 1; 2401 2402 /* 2403 * If not first active member of diskset, then get the 2404 * master information from a node that is already joined 2405 * and set the master information for this node. Be sure 2406 * that this node (the already joined node) has its own 2407 * join flag set. If not, then this diskset isn't currently 2408 * consistent and shouldn't allow a node to join. This diskset 2409 * inconsistency should only occur when a node has panic'd in 2410 * the set while doing a metaset operation and the sysadmin is 2411 * attempting to join a node into the set. This inconsistency 2412 * will be fixed during a reconfig cycle which should be occurring 2413 * soon since a node panic'd. 2414 * 2415 * If unable to get this information from an owning node, then 2416 * this diskset isn't currently consistent and shouldn't 2417 * allow a node to join. 2418 */ 2419 if (!master_flag) { 2420 /* get master information from an owner (joined) node */ 2421 if (clnt_mngetset(nd->nd_nodename, sp->setname, 2422 sp->setno, &mas_mnsr, ep) == -1) { 2423 rval = -1; 2424 goto out; 2425 } 2426 2427 /* Verify that owner (joined) node has its own JOIN flag set */ 2428 nr = mas_mnsr->sr_nodechain; 2429 while (nr) { 2430 if ((nd->nd_nodeid == nr->nr_nodeid) && 2431 ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) { 2432 (void) mddserror(ep, MDE_DS_NODENOSET, 2433 sp->setno, nd->nd_nodename, NULL, 2434 nd->nd_nodename); 2435 free_sr((md_set_record *)mas_mnsr); 2436 rval = -1; 2437 goto out; 2438 } 2439 nr = nr->nr_next; 2440 } 2441 2442 /* 2443 * Does master have set marked as STALE? 2444 * If so, need to pass this down to kernel when 2445 * this node snarfs the set. 
2446 */ 2447 if (clnt_mn_is_stale(nd->nd_nodename, sp, 2448 &stale_bool, ep) == -1) { 2449 rval = -1; 2450 goto out; 2451 } 2452 2453 /* set master information in my rpc.metad's set record */ 2454 if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm, 2455 mas_mnsr->sr_master_nodeid, ep)) { 2456 free_sr((md_set_record *)mas_mnsr); 2457 rval = -1; 2458 goto out; 2459 } 2460 2461 /* set master information in my cached set desc */ 2462 (void) strcpy(sd->sd_mn_master_nodenm, 2463 mas_mnsr->sr_master_nodenm); 2464 sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid; 2465 nd2 = sd->sd_nodelist; 2466 while (nd2) { 2467 if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) { 2468 sd->sd_mn_masternode = nd2; 2469 break; 2470 } 2471 nd2 = nd2->nd_next; 2472 } 2473 free_sr((md_set_record *)mas_mnsr); 2474 2475 /* 2476 * Set the node flags in mynode's rpc.metad node records for 2477 * the nodes that are in the diskset. Can use my sd 2478 * since earlier call to metaget_setownership set the 2479 * owner flags based on whether that node had snarfed 2480 * the MN diskset mddb. Reconfig steps guarantee that 2481 * return of metaget_setownership will match the owning 2482 * node's owner list except in the case where a node 2483 * has just panic'd and in this case, a reconfig will 2484 * be starting immediately and the owner lists will 2485 * be sync'd up by the reconfig. 2486 * 2487 * Flag of SET means to take no action except to 2488 * set the node flags as given in the nodelist linked list. 2489 */ 2490 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 2491 MD_NR_SET, NULL, ep)) { 2492 rval = -1; 2493 goto out; 2494 } 2495 } 2496 2497 /* 2498 * Read in the mddb if there are drives in the set. 2499 */ 2500 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2501 ep)) == NULL) { 2502 /* No drives in list */ 2503 if (! 
mdisok(ep)) { 2504 rval = -1; 2505 goto out; 2506 } 2507 rval = -2; 2508 goto out; 2509 } 2510 2511 /* 2512 * Notify rpc.mdcommd on all nodes of a nodelist change. 2513 * Start by suspending rpc.mdcommd (which drains it of all messages), 2514 * then change the nodelist followed by a reinit and resume. 2515 */ 2516 nd = sd->sd_nodelist; 2517 while (nd) { 2518 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2519 nd = nd->nd_next; 2520 continue; 2521 } 2522 2523 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, 2524 MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 2525 rval = -1; 2526 goto out; 2527 } 2528 suspendall_flag = 1; 2529 nd = nd->nd_next; 2530 } 2531 2532 /* Set master in my set record in rpc.metad */ 2533 if (master_flag) { 2534 if (clnt_mnsetmaster(mynode(), sp, 2535 sd->sd_mn_mynode->nd_nodename, 2536 sd->sd_mn_mynode->nd_nodeid, ep)) { 2537 rval = -1; 2538 goto out; 2539 } 2540 } 2541 /* 2542 * Causes mddbs to be loaded into the kernel. 2543 * Set the force flag so that replica locations can be 2544 * loaded into the kernel even if a mediator node was 2545 * unavailable. This allows a node to join an MO 2546 * diskset when there are sufficient replicas available, 2547 * but a mediator node in unavailable. 2548 */ 2549 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 2550 mde_perror(ep, dgettext(TEXT_DOMAIN, 2551 "Host not able to start diskset.")); 2552 rval = -1; 2553 goto out; 2554 } 2555 2556 if (! mdisok(ep)) { 2557 rval = -1; 2558 goto out; 2559 } 2560 2561 /* 2562 * Set rollback flags to 1 so that halt_set is called if a failure 2563 * is seen after this point. If snarf_set fails, still need to 2564 * call halt_set to cleanup the diskset. 2565 */ 2566 rb_flags = 1; 2567 2568 /* Starts the set */ 2569 if (snarf_set(sp, stale_bool, ep) != 0) { 2570 if (mdismddberror(ep, MDE_DB_STALE)) { 2571 /* 2572 * Don't fail join, STALE means that set has 2573 * < 50% mddbs. 
2574 */ 2575 (void) mdstealerror(&ep_snarf, ep); 2576 stale_set = 1; 2577 } else if (mdisok(ep)) { 2578 /* If snarf failed, but no error was set - set it */ 2579 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2580 sp->setno, 0, NULL); 2581 rval = -1; 2582 goto out; 2583 } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) { 2584 /* 2585 * Don't fail join if ACCOK; ACCOK means that mediator 2586 * provided extra vote. 2587 */ 2588 rval = -1; 2589 goto out; 2590 } 2591 } 2592 2593 /* Did set really get snarfed? */ 2594 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) { 2595 if (mdisok(ep)) { 2596 /* If snarf failed, but no error was set - set it */ 2597 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2598 sp->setno, 0, NULL); 2599 } 2600 mde_perror(ep, dgettext(TEXT_DOMAIN, 2601 "Host not able to start diskset.")); 2602 rval = -1; 2603 goto out; 2604 } 2605 2606 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 2607 send_reinit = 1; 2608 2609 /* If first node to enter set, setup master and clear change log */ 2610 if (master_flag) { 2611 /* Set master in my locally cached set descriptor */ 2612 (void) strcpy(sd->sd_mn_master_nodenm, 2613 sd->sd_mn_mynode->nd_nodename); 2614 sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid; 2615 sd->sd_mn_am_i_master = 1; 2616 2617 /* 2618 * If first node to join set, then clear out change log 2619 * entries. Change log entries are only needed when a 2620 * change of master is occurring in a diskset that has 2621 * multiple owners. Since this node is the first owner 2622 * of the diskset, clear the entries. 2623 * 2624 * Only do this if we are in a single node non-SC3.x 2625 * situation. 
2626 */ 2627 if (meta_mn_singlenode() && 2628 mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) { 2629 mde_perror(ep, dgettext(TEXT_DOMAIN, 2630 "Unable to reset changelog.")); 2631 rval = -1; 2632 goto out; 2633 } 2634 } 2635 2636 /* Set my locally cached flag */ 2637 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 2638 2639 /* 2640 * Set this node's own flag on all joined nodes in the set 2641 * (including my node). 2642 */ 2643 clear_nr_flags = 1; 2644 2645 my_nd = *(sd->sd_mn_mynode); 2646 my_nd.nd_next = NULL; 2647 nd = sd->sd_nodelist; 2648 while (nd) { 2649 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2650 nd = nd->nd_next; 2651 continue; 2652 } 2653 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 2654 MD_NR_JOIN, NULL, ep)) { 2655 rval = -1; 2656 goto out; 2657 } 2658 nd = nd->nd_next; 2659 } 2660 2661 out: 2662 if (rval != NULL) { 2663 /* 2664 * If rollback flag is 1, then node was joined to set. 2665 * Since an error occurred, withdraw node from set in 2666 * order to rollback to before command was run. 2667 * Need to preserve ep so that calling function can 2668 * get error information. 2669 */ 2670 if (rb_flags == 1) { 2671 if (halt_set(sp, &xep)) { 2672 mdclrerror(&xep); 2673 } 2674 } 2675 2676 /* 2677 * If error, reset master to INVALID. 2678 * Ignore error since (next) first node to successfully join 2679 * will set master on all nodes. 2680 */ 2681 (void) clnt_mnsetmaster(mynode(), sp, "", 2682 MD_MN_INVALID_NID, &xep); 2683 mdclrerror(&xep); 2684 /* Reset master in my locally cached set descriptor */ 2685 sd->sd_mn_master_nodeid = MD_MN_INVALID_NID; 2686 sd->sd_mn_am_i_master = 0; 2687 2688 /* 2689 * If nr flags set on other nodes, reset them. 
2690 */ 2691 if (clear_nr_flags) { 2692 nd = sd->sd_nodelist; 2693 while (nd) { 2694 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2695 nd = nd->nd_next; 2696 continue; 2697 } 2698 (void) clnt_upd_nr_flags(nd->nd_nodename, sp, 2699 &my_nd, MD_NR_WITHDRAW, NULL, &xep); 2700 mdclrerror(&xep); 2701 nd = nd->nd_next; 2702 } 2703 /* Reset my locally cached flag */ 2704 sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN; 2705 } 2706 } 2707 2708 /* 2709 * Notify rpc.mdcommd on all nodes of a nodelist change. 2710 * Send reinit command to mdcommd which forces it to get 2711 * fresh set description. 2712 */ 2713 if (send_reinit) { 2714 /* Send reinit */ 2715 nd = sd->sd_nodelist; 2716 while (nd) { 2717 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2718 nd = nd->nd_next; 2719 continue; 2720 } 2721 2722 /* Class is ignored for REINIT */ 2723 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 2724 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 2725 /* 2726 * We are here because we failed to resume 2727 * rpc.mdcommd. However we potentially have 2728 * an error from the previous call 2729 * If the previous call did fail, we capture 2730 * that error and generate a perror with 2731 * the string, "Unable to resume...". 2732 * Setting rval to -1 ensures that in the 2733 * next iteration of the loop, ep is not 2734 * clobbered. 2735 */ 2736 if (rval == 0) 2737 (void) mdstealerror(ep, &xep); 2738 else 2739 mdclrerror(&xep); 2740 rval = -1; 2741 mde_perror(ep, dgettext(TEXT_DOMAIN, 2742 "Unable to reinit rpc.mdcommd.")); 2743 } 2744 nd = nd->nd_next; 2745 } 2746 2747 } 2748 2749 out2: 2750 /* 2751 * Unlock diskset by resuming messages across the diskset. 2752 * Just resume all classes so that resume is the same whether 2753 * just one class was locked or all classes were locked. 
2754 */ 2755 if ((suspend1_flag) || (suspendall_flag)) { 2756 nd = sd->sd_nodelist; 2757 while (nd) { 2758 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2759 nd = nd->nd_next; 2760 continue; 2761 } 2762 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 2763 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 2764 /* 2765 * We are here because we failed to resume 2766 * rpc.mdcommd. However we potentially have 2767 * an error from the previous call 2768 * If the previous call did fail, we capture 2769 * that error and generate a perror with 2770 * the string, "Unable to resume...". 2771 * Setting rval to -1 ensures that in the 2772 * next iteration of the loop, ep is not 2773 * clobbered. 2774 */ 2775 if (rval == 0) 2776 (void) mdstealerror(ep, &xep); 2777 else 2778 mdclrerror(&xep); 2779 rval = -1; 2780 mde_perror(ep, dgettext(TEXT_DOMAIN, 2781 "Unable to resume rpc.mdcommd.")); 2782 } 2783 nd = nd->nd_next; 2784 } 2785 meta_ping_mnset(sp->setno); 2786 } 2787 2788 /* 2789 * Unlock set. This flushes the caches on the servers. 2790 */ 2791 cl_sk = cl_get_setkey(sp->setno, sp->setname); 2792 nd = sd->sd_nodelist; 2793 while (nd) { 2794 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2795 nd = nd->nd_next; 2796 continue; 2797 } 2798 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 2799 if (rval == 0) 2800 (void) mdstealerror(ep, &xep); 2801 else 2802 mdclrerror(&xep); 2803 rval = -1; 2804 } 2805 nd = nd->nd_next; 2806 } 2807 2808 /* 2809 * If this node is the last to join the diskset and clustering isn't 2810 * running, then resync the mirrors in the diskset. We have to wait 2811 * until all nodes are joined so that the status gets propagated to 2812 * all of the members of the set. 2813 * Ignore any error from the resync as the join function shouldn't fail 2814 * because the mirror resync had a problem. 2815 * 2816 * Don't start resync if set is stale. 
2817 */ 2818 if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) && 2819 (stale_set != 1)) { 2820 nd = sd->sd_nodelist; 2821 while (nd) { 2822 if (!(nd->nd_flags & MD_MN_NODE_OWN)) 2823 break; 2824 nd = nd->nd_next; 2825 } 2826 /* 2827 * nd set to NULL means that we have no nodes in the set that 2828 * haven't joined. In this case we start the resync. 2829 */ 2830 if (nd == NULL) { 2831 (void) meta_mirror_resync_all(sp, 0, &xep); 2832 mdclrerror(&xep); 2833 } 2834 } 2835 2836 /* Update ABR state for all soft partitions */ 2837 (void) meta_sp_update_abr(sp, &xep); 2838 mdclrerror(&xep); 2839 2840 /* 2841 * call metaflushsetnames to reset local cache for master and 2842 * node information. 2843 */ 2844 metaflushsetname(sp); 2845 2846 /* release signals back to what they were on entry */ 2847 if (procsigs(FALSE, &oldsigs, &xep) < 0) 2848 mdclrerror(&xep); 2849 2850 /* 2851 * If no error and stale_set is set, then set ep back 2852 * to ep from snarf_set call and return -3. If another error 2853 * occurred and rval is not 0, then that error would have 2854 * caused the node to be withdrawn from the set and would 2855 * have set ep to that error information. 2856 */ 2857 if ((rval == 0) && (stale_set)) { 2858 (void) mdstealerror(ep, &ep_snarf); 2859 return (-3); 2860 } 2861 2862 return (rval); 2863 } 2864 2865 /* 2866 * Entry point to withdraw a node from MultiNode diskset. 2867 * 2868 * Validate host in diskset. 2869 * - Should be joined into diskset. 2870 * Assume valid configuration is stored in the set/drive/node records 2871 * in the local mddb since no node or drive can be added to the MNset 2872 * unless all drives and nodes are available. Reconfig steps will 2873 * resync all ALIVE nodes in case of panic in critical areas. 2874 * 2875 * Lock down the set. 2876 * Verify that drives exist in configuration. 2877 * Verify host is a member of this diskset. 2878 * Verify host is an owner of the diskset (host is joined to diskset). 
2879 * Only allow withdrawal of master node if master node is the only joined 2880 * in the diskset. 2881 * Halt the diskset on this node. 2882 * Reset Master on this node. 2883 * Updated node flags that this node with withdrawn. 2884 * Unlock the set. 2885 * 2886 * Return values: 2887 * 0 - Node successfully withdrew from set. 2888 * -1 - Withdrawal attempted but failed 2889 * - any failure from libmeta calls 2890 * - node not in the member list 2891 * -2 - Withdrawal not attempted since 2892 * - this set had no drives in set 2893 * - this node not joined to set 2894 * - set is not a multinode set 2895 */ 2896 extern int 2897 meta_set_withdraw( 2898 mdsetname_t *sp, 2899 md_error_t *ep 2900 ) 2901 { 2902 md_set_desc *sd; 2903 md_drive_desc *dd = 0; 2904 md_mnnode_desc *nd, my_nd; 2905 int rval = 0; 2906 md_setkey_t *cl_sk; 2907 md_error_t xep = mdnullerror; 2908 int set_halted = 0; 2909 int suspendall_flag = 0; 2910 int suspend1_flag = 0; 2911 bool_t stale_bool = FALSE; 2912 mddb_config_t c; 2913 int node_id_list[1]; 2914 sigset_t oldsigs; 2915 int send_reinit = 0; 2916 2917 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2918 return (-1); 2919 } 2920 2921 /* Must be a multinode diskset */ 2922 if (!MD_MNSET_DESC(sd)) { 2923 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2924 return (-1); 2925 } 2926 2927 /* Make sure we are blocking all signals */ 2928 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2929 mdclrerror(&xep); 2930 2931 /* 2932 * Lock the set on current set members. 2933 * For MN diskset lock_set and SUSPEND are used to protect against 2934 * other meta* commands running on the other nodes. 2935 */ 2936 nd = sd->sd_nodelist; 2937 while (nd) { 2938 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2939 nd = nd->nd_next; 2940 continue; 2941 } 2942 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2943 rval = -1; 2944 goto out; 2945 } 2946 nd = nd->nd_next; 2947 } 2948 /* 2949 * Lock out other meta* commands by suspending 2950 * class 1 messages across the diskset. 
2951 */ 2952 nd = sd->sd_nodelist; 2953 while (nd) { 2954 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2955 nd = nd->nd_next; 2956 continue; 2957 } 2958 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2959 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2960 rval = -1; 2961 goto out; 2962 } 2963 suspend1_flag = 1; 2964 nd = nd->nd_next; 2965 } 2966 2967 /* Get list of drives - needed in case of failure */ 2968 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2969 ep)) == NULL) { 2970 /* Error getting drives in list */ 2971 if (! mdisok(ep)) { 2972 rval = -1; 2973 goto out2; 2974 } 2975 /* no drives in list */ 2976 rval = -2; 2977 goto out2; 2978 } 2979 2980 /* 2981 * Verify that this host is a member (in the host list) of the set. 2982 */ 2983 nd = sd->sd_nodelist; 2984 while (nd) { 2985 if (strcmp(mynode(), nd->nd_nodename) == 0) { 2986 break; 2987 } 2988 nd = nd->nd_next; 2989 } 2990 if (!nd) { 2991 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2992 sd->sd_mn_mynode->nd_nodename, NULL, 2993 sp->setname); 2994 rval = -1; 2995 goto out2; 2996 } 2997 2998 /* 2999 * Call metaget_setownership that calls each node in diskset and 3000 * marks in set descriptor if node is an owner of the set or not. 3001 * metaget_setownership checks to see if a node is an owner by 3002 * checking to see if that node's kernel has the mddb loaded. 3003 * If a node had panic'd during a reconfig or an 3004 * add/delete/join/withdraw operation, the other nodes' node 3005 * records may not reflect the current state of the diskset, 3006 * so calling metaget_setownership is the safest thing to do. 3007 */ 3008 if (metaget_setownership(sp, ep) == -1) { 3009 rval = -1; 3010 goto out2; 3011 } 3012 3013 /* 3014 * Verify that this node is joined 3015 * to diskset (i.e. is an owner of the diskset). 
3016 */ 3017 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 3018 rval = -2; 3019 goto out2; 3020 } 3021 3022 /* 3023 * For a MN diskset, only withdraw master if it is 3024 * the only joined node. 3025 */ 3026 if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) { 3027 nd = sd->sd_nodelist; 3028 while (nd) { 3029 /* Skip my node since checking for other owners */ 3030 if (nd->nd_nodeid == sd->sd_mn_master_nodeid) { 3031 nd = nd->nd_next; 3032 continue; 3033 } 3034 /* If another owner node if found, error */ 3035 if (nd->nd_flags & MD_MN_NODE_OWN) { 3036 (void) mddserror(ep, MDE_DS_WITHDRAWMASTER, 3037 sp->setno, 3038 sd->sd_mn_mynode->nd_nodename, NULL, 3039 sp->setname); 3040 rval = -1; 3041 goto out2; 3042 } 3043 nd = nd->nd_next; 3044 } 3045 } 3046 3047 /* 3048 * Is current set STALE? 3049 */ 3050 (void) memset(&c, 0, sizeof (c)); 3051 c.c_id = 0; 3052 c.c_setno = sp->setno; 3053 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 3054 (void) mdstealerror(ep, &c.c_mde); 3055 rval = -1; 3056 goto out; 3057 } 3058 if (c.c_flags & MDDB_C_STALE) { 3059 stale_bool = TRUE; 3060 } 3061 3062 /* 3063 * Notify rpc.mdcommd on all nodes of a nodelist change. 3064 * Start by suspending rpc.mdcommd (which drains it of all messages), 3065 * then change the nodelist followed by a reinit and resume. 3066 */ 3067 nd = sd->sd_nodelist; 3068 while (nd) { 3069 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3070 nd = nd->nd_next; 3071 continue; 3072 } 3073 3074 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 3075 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 3076 rval = -1; 3077 goto out; 3078 } 3079 suspendall_flag = 1; 3080 nd = nd->nd_next; 3081 } 3082 3083 /* 3084 * Withdraw the set - halt set. 3085 * This will fail if any I/O is occuring to any metadevice which 3086 * includes a resync to a mirror metadevice. 3087 */ 3088 set_halted = 1; 3089 if (halt_set(sp, ep)) { 3090 /* Was set actually halted? 
*/ 3091 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) { 3092 set_halted = 0; 3093 } 3094 rval = -1; 3095 goto out; 3096 } 3097 3098 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 3099 send_reinit = 1; 3100 3101 /* Reset master on withdrawn node */ 3102 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "", 3103 MD_MN_INVALID_NID, ep)) { 3104 rval = -1; 3105 goto out; 3106 } 3107 3108 /* Mark my node as withdrawn and send to other nodes */ 3109 nd = sd->sd_nodelist; 3110 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 3111 my_nd.nd_next = NULL; 3112 while (nd) { 3113 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3114 nd = nd->nd_next; 3115 continue; 3116 } 3117 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3118 MD_NR_WITHDRAW, NULL, ep)) { 3119 rval = -1; 3120 goto out; 3121 } 3122 nd = nd->nd_next; 3123 } 3124 3125 /* 3126 * If withdrawn node is a mirror owner, reset mirror owner 3127 * to NULL. If an error occurs, print a warning and continue. 3128 * Don't fail metaset because of mirror owner reset problem since 3129 * next node to grab mirror will resolve this issue. 3130 * Before next node grabs mirrors, metaset will show the withdrawn 3131 * node as owner which is why an attempt to reset the mirror owner 3132 * is made. 
3133 */ 3134 node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */ 3135 nd = sd->sd_nodelist; 3136 while (nd) { 3137 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3138 nd = nd->nd_next; 3139 continue; 3140 } 3141 if (clnt_reset_mirror_owner(nd->nd_nodename, sp, 3142 1, &node_id_list[0], &xep) == 01) { 3143 mde_perror(&xep, dgettext(TEXT_DOMAIN, 3144 "Unable to reset mirror owner on node %s"), 3145 nd->nd_nodename); 3146 mdclrerror(&xep); 3147 } 3148 nd = nd->nd_next; 3149 } 3150 3151 out: 3152 if (rval == -1) { 3153 /* Rejoin node - Mark node as joined and send to other nodes */ 3154 nd = sd->sd_nodelist; 3155 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 3156 my_nd.nd_next = NULL; 3157 while (nd) { 3158 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3159 nd = nd->nd_next; 3160 continue; 3161 } 3162 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3163 MD_NR_JOIN, NULL, &xep)) { 3164 mdclrerror(&xep); 3165 } 3166 nd = nd->nd_next; 3167 } 3168 3169 /* Set master on withdrawn node */ 3170 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, 3171 sd->sd_mn_master_nodenm, 3172 sd->sd_mn_master_nodeid, &xep)) { 3173 mdclrerror(&xep); 3174 } 3175 3176 /* Join set if halt_set had succeeded */ 3177 if (set_halted) { 3178 /* 3179 * Causes mddbs to be loaded into the kernel. 3180 * Set the force flag so that replica locations can be 3181 * loaded into the kernel even if a mediator node was 3182 * unavailable. This allows a node to join an MO 3183 * diskset when there are sufficient replicas available, 3184 * but a mediator node in unavailable. 3185 */ 3186 if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) { 3187 mdclrerror(&xep); 3188 } 3189 /* If set previously stale - make it so at re-join */ 3190 if (snarf_set(sp, stale_bool, &xep) != 0) { 3191 mdclrerror(&xep); 3192 (void) halt_set(sp, &xep); 3193 mdclrerror(&xep); 3194 } 3195 } 3196 } 3197 3198 /* 3199 * Notify rpc.mdcommd on all nodes of a nodelist change. 
3200 * Send reinit command to mdcommd which forces it to get 3201 * fresh set description. 3202 */ 3203 if (send_reinit) { 3204 /* Send reinit */ 3205 nd = sd->sd_nodelist; 3206 while (nd) { 3207 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3208 nd = nd->nd_next; 3209 continue; 3210 } 3211 3212 /* Class is ignored for REINIT */ 3213 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 3214 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 3215 /* 3216 * We are here because we failed to resume 3217 * rpc.mdcommd. However we potentially have 3218 * an error from the previous call. 3219 * If the previous call did fail, we 3220 * capture that error and generate a perror 3221 * withthe string, "Unable to resume...". 3222 * Setting rval to -1 ensures that in the 3223 * next iteration of the loop, ep is not 3224 * clobbered. 3225 */ 3226 if (rval == 0) 3227 (void) mdstealerror(ep, &xep); 3228 else 3229 mdclrerror(&xep); 3230 rval = -1; 3231 mde_perror(ep, dgettext(TEXT_DOMAIN, 3232 "Unable to reinit rpc.mdcommd.")); 3233 } 3234 nd = nd->nd_next; 3235 } 3236 } 3237 3238 out2: 3239 /* 3240 * Unlock diskset by resuming messages across the diskset. 3241 * Just resume all classes so that resume is the same whether 3242 * just one class was locked or all classes were locked. 3243 */ 3244 if ((suspend1_flag) || (suspendall_flag)) { 3245 nd = sd->sd_nodelist; 3246 while (nd) { 3247 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3248 nd = nd->nd_next; 3249 continue; 3250 } 3251 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 3252 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 3253 /* 3254 * We are here because we failed to resume 3255 * rpc.mdcommd. However we potentially have 3256 * an error from the previous call 3257 * If the previous call did fail, we capture 3258 * that error and generate a perror with 3259 * the string, "Unable to resume...". 3260 * Setting rval to -1 ensures that in the 3261 * next iteration of the loop, ep is not 3262 * clobbered. 
3263 */ 3264 if (rval == 0) 3265 (void) mdstealerror(ep, &xep); 3266 else 3267 mdclrerror(&xep); 3268 rval = -1; 3269 mde_perror(ep, dgettext(TEXT_DOMAIN, 3270 "Unable to resume rpc.mdcommd.")); 3271 } 3272 nd = nd->nd_next; 3273 } 3274 meta_ping_mnset(sp->setno); 3275 } 3276 3277 /* 3278 * Unlock set. This flushes the caches on the servers. 3279 */ 3280 cl_sk = cl_get_setkey(sp->setno, sp->setname); 3281 nd = sd->sd_nodelist; 3282 while (nd) { 3283 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3284 nd = nd->nd_next; 3285 continue; 3286 } 3287 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 3288 if (rval == 0) 3289 (void) mdstealerror(ep, &xep); 3290 else 3291 mdclrerror(&xep); 3292 rval = -1; 3293 } 3294 nd = nd->nd_next; 3295 } 3296 3297 /* 3298 * call metaflushsetnames to reset local cache for master and 3299 * node information. 3300 */ 3301 metaflushsetname(sp); 3302 3303 /* release signals back to what they were on entry */ 3304 if (procsigs(FALSE, &oldsigs, &xep) < 0) 3305 mdclrerror(&xep); 3306 3307 return (rval); 3308 3309 } 3310 3311 /* 3312 * Update nodelist with cluster member information. 3313 * A node not in the member list will be marked 3314 * as not ALIVE and not OWN. 3315 * A node in the member list will be marked ALIVE, but 3316 * the OWN bit will not be changed. 3317 * 3318 * If mynode isn't in the membership list, fail causing 3319 * another reconfig cycle to be started since a non-member 3320 * node shouldn't be taking part in the reconfig cycle. 3321 * 3322 * Return values: 3323 * 0 - No problem. 3324 * 1 - Any failure including RPC failure to my node. 3325 */ 3326 int 3327 meta_reconfig_update_nodelist( 3328 mdsetname_t *sp, 3329 mndiskset_membershiplist_t *nl, 3330 md_set_desc *sd, 3331 md_error_t *ep 3332 ) 3333 { 3334 mndiskset_membershiplist_t *nl2; 3335 md_mnnode_desc *nd; 3336 md_error_t xep = mdnullerror; 3337 int rval = 0; 3338 3339 /* 3340 * Walk through nodelist, checking to see if each 3341 * node is in the member list. 
3342 * If node is not a member, reset ALIVE and OWN node flag. 3343 * If node is a member, set ALIVE. 3344 * If mynode's OWN flag gets reset, then halt the diskset on this node. 3345 */ 3346 nd = sd->sd_nodelist; 3347 while (nd) { 3348 nl2 = nl; 3349 while (nl2) { 3350 /* If node is in member list, set ALIVE */ 3351 if (nl2->msl_node_id == nd->nd_nodeid) { 3352 nd->nd_flags |= MD_MN_NODE_ALIVE; 3353 break; 3354 } else { 3355 nl2 = nl2->next; 3356 } 3357 /* node is not in member list, mark !ALIVE and !OWN */ 3358 if (nl2 == NULL) { 3359 /* If node is mynode, then halt set if needed */ 3360 if (strcmp(mynode(), nd->nd_nodename) == 0) { 3361 /* 3362 * This shouldn't happen, but just 3363 * in case... Any node not in the 3364 * membership list should be dead and 3365 * not running reconfig step1. 3366 */ 3367 if (nd->nd_flags & MD_MN_NODE_OWN) { 3368 if (halt_set(sp, &xep)) { 3369 mde_perror(&xep, ""); 3370 mdclrerror(&xep); 3371 } 3372 } 3373 /* 3374 * Return failure since this node 3375 * (mynode) is not in the membership 3376 * list, but process the rest of the 3377 * nodelist first so that rpc.metad 3378 * can be updated with the latest 3379 * membership information. 3380 */ 3381 (void) mddserror(ep, 3382 MDE_DS_NOTINMEMBERLIST, 3383 sp->setno, nd->nd_nodename, NULL, 3384 sp->setname); 3385 rval = 1; 3386 } 3387 nd->nd_flags &= ~MD_MN_NODE_ALIVE; 3388 nd->nd_flags &= ~MD_MN_NODE_OWN; 3389 } 3390 } 3391 nd = nd->nd_next; 3392 } 3393 3394 /* Send this information to rpc.metad */ 3395 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 3396 MD_NR_SET, MNSET_IN_RECONFIG, &xep)) { 3397 /* Return failure if can't send node flags to rpc.metad */ 3398 if (rval == 0) { 3399 (void) mdstealerror(ep, &xep); 3400 rval = 1; 3401 } 3402 } 3403 return (rval); 3404 } 3405 3406 /* 3407 * Choose master determines the master for a diskset. 
3408 * Each node determines the master on its own and 3409 * adds this information to its local rpc.metad nodelist 3410 * and also sends it to the kernel. 3411 * 3412 * Nodelist in set descriptor (sd) is sorted in 3413 * monotonically increasing sequence of nodeid. 3414 * 3415 * Return values: 3416 * 0 - No problem. 3417 * 205 - There was an RPC problem to another node. 3418 * -1 - There was an error. This could be an RPC error to my node. 3419 * This is a catastrophic failure causing node to panic. 3420 */ 3421 int 3422 meta_reconfig_choose_master_for_set( 3423 mdsetname_t *sp, 3424 md_set_desc *sd, 3425 md_error_t *ep 3426 ) 3427 { 3428 int is_owner; 3429 md_mnset_record *mnsr = NULL; 3430 int lowest_alive_nodeid = 0; 3431 uint_t master_nodeid; 3432 md_mnnode_desc *nd, *nd2; 3433 md_mnnode_record *nr; 3434 md_drive_desc *dd; 3435 md_setkey_t *cl_sk; 3436 int rval = 0; 3437 md_error_t xep = mdnullerror; 3438 mddb_setflags_config_t sf; 3439 3440 /* 3441 * Is current node joined to diskset? 3442 * Don't trust flags, really check to see if mddb is snarfed. 3443 */ 3444 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 3445 /* 3446 * If a node is joined to the diskset, this node checks 3447 * to see if the current master of the diskset is valid and 3448 * is still in the membership list (ALIVE) and is 3449 * still joined (OWN). Need to verify if master is 3450 * really joined - don't trust the flags. (Can trust 3451 * ALIVE since set during earlier part of reconfig cycle.) 3452 * If the current master is valid, still in the membership 3453 * list and joined, then master is not changed on this node. 3454 * Just return. 3455 * 3456 * Verify that nodeid is valid before accessing masternode. 
3457 */ 3458 if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) && 3459 (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) { 3460 if (clnt_ownset(sd->sd_mn_master_nodenm, sp, 3461 &is_owner, ep) == -1) { 3462 /* If RPC failure to another node return 205 */ 3463 if ((mdanyrpcerror(ep)) && 3464 (sd->sd_mn_mynode->nd_nodeid != 3465 sd->sd_mn_master_nodeid)) { 3466 return (205); 3467 } else { 3468 /* Any other failure */ 3469 return (-1); 3470 } 3471 } else { 3472 if (is_owner == TRUE) { 3473 3474 meta_mc_log(MC_LOG5, dgettext( 3475 TEXT_DOMAIN, "Set %s previous " 3476 "master chosen %s (%d): %s"), 3477 sp->setname, 3478 sd->sd_mn_master_nodenm, 3479 sd->sd_mn_master_nodeid, 3480 meta_print_hrtime(gethrtime() - 3481 start_time)); 3482 3483 /* Previous master is ok - done */ 3484 return (0); 3485 } 3486 } 3487 } 3488 3489 /* 3490 * If current master is no longer in the membership list or 3491 * is no longer joined, then this node uses the following 3492 * algorithm: 3493 * - node calls RPC routine clnt_ownset to get latest 3494 * information on which nodes are owners of diskset. 3495 * clnt_ownset checks on each node to see if its kernel 3496 * has that diskset snarfed. 3497 */ 3498 nd = sd->sd_nodelist; 3499 while (nd) { 3500 /* Don't consider node that isn't in member list */ 3501 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3502 nd = nd->nd_next; 3503 continue; 3504 } 3505 3506 if (clnt_ownset(nd->nd_nodename, sp, 3507 &is_owner, ep) == -1) { 3508 /* If RPC failure to another node return 205 */ 3509 if ((mdanyrpcerror(ep)) && 3510 (sd->sd_mn_mynode->nd_nodeid != 3511 nd->nd_nodeid)) { 3512 return (205); 3513 } else { 3514 /* Any other failure */ 3515 return (-1); 3516 } 3517 } 3518 3519 /* 3520 * Set owner flag for each node based on whether 3521 * that node really has a diskset mddb snarfed in 3522 * or not. 
3523 */ 3524 if (is_owner == TRUE) 3525 nd->nd_flags |= MD_MN_NODE_OWN; 3526 else 3527 nd->nd_flags &= ~MD_MN_NODE_OWN; 3528 3529 nd = nd->nd_next; 3530 } 3531 3532 /* 3533 * - node walks through nodelist looking for nodes that are 3534 * owners of the diskset that are in the membership list. 3535 * - for each owner, node calls RPC routine clnt_getset to 3536 * see if that node has its node record set to OK. 3537 * - If so, master is chosen to be this owner node. 3538 */ 3539 nd = sd->sd_nodelist; 3540 while (nd) { 3541 /* Don't consider node that isn't in member list */ 3542 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3543 nd = nd->nd_next; 3544 continue; 3545 } 3546 3547 /* Don't consider a node that isn't an owner */ 3548 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3549 nd = nd->nd_next; 3550 continue; 3551 } 3552 3553 /* Does node has its own node record set to OK? */ 3554 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3555 MD_SET_BAD, &mnsr, ep) == -1) { 3556 /* If RPC failure to another node return 205 */ 3557 if ((mdanyrpcerror(ep)) && 3558 (sd->sd_mn_mynode->nd_nodeid != 3559 nd->nd_nodeid)) { 3560 return (205); 3561 } else { 3562 /* Any other failure */ 3563 return (-1); 3564 } 3565 } 3566 nr = mnsr->sr_nodechain; 3567 while (nr) { 3568 if (nd->nd_nodeid == nr->nr_nodeid) { 3569 if (nr->nr_flags & MD_MN_NODE_OK) { 3570 /* Found a master */ 3571 free_sr( 3572 (md_set_record *)mnsr); 3573 goto found_master; 3574 } 3575 } 3576 nr = nr->nr_next; 3577 } 3578 free_sr((md_set_record *)mnsr); 3579 nd = nd->nd_next; 3580 } 3581 3582 /* 3583 * - If no owner node has its own node record on its own node 3584 * set to OK, then this node checks all of the non-owner 3585 * nodes that are in the membership list. 3586 * - for each non-owner, node calls RPC routine clnt_getset to 3587 * see if that node has its node record set to OK. 3588 * - If set doesn't exist, don't choose node for master. 3589 * - If so, master is chosen to be this non-owner node. 
3590 * 3591 */ 3592 nd = sd->sd_nodelist; 3593 while (nd) { 3594 /* Don't consider node that isn't in member list */ 3595 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3596 nd = nd->nd_next; 3597 continue; 3598 } 3599 3600 /* Only checking non-owner nodes this time around */ 3601 if (nd->nd_flags & MD_MN_NODE_OWN) { 3602 nd = nd->nd_next; 3603 continue; 3604 } 3605 3606 /* Does node has its own node record set to OK? */ 3607 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3608 MD_SET_BAD, &mnsr, ep) == -1) { 3609 /* 3610 * If set doesn't exist on non-owner node, 3611 * don't consider this node for master. 3612 */ 3613 if (mdiserror(ep, MDE_NO_SET)) { 3614 nd = nd->nd_next; 3615 continue; 3616 } else if ((mdanyrpcerror(ep)) && 3617 (sd->sd_mn_mynode->nd_nodeid != 3618 nd->nd_nodeid)) { 3619 /* RPC failure to another node */ 3620 return (205); 3621 } else { 3622 /* Any other failure */ 3623 return (-1); 3624 } 3625 } 3626 nr = mnsr->sr_nodechain; 3627 while (nr) { 3628 if (nd->nd_nodeid == nr->nr_nodeid) { 3629 if (nr->nr_flags & MD_MN_NODE_OK) { 3630 /* Found a master */ 3631 free_sr( 3632 (md_set_record *)mnsr); 3633 goto found_master; 3634 } 3635 } 3636 nr = nr->nr_next; 3637 } 3638 free_sr((md_set_record *)mnsr); 3639 nd = nd->nd_next; 3640 } 3641 3642 /* 3643 * - If no node can be found that has its own node record on 3644 * its node to be set to OK, then all alive nodes 3645 * were in the process of being added to or deleted 3646 * from set. Each alive node will remove all 3647 * information pertaining to this set from its node. 3648 * 3649 * If all nodes in set are ALIVE, then call sdssc end routines 3650 * since set was truly being initially created or destroyed. 3651 */ 3652 goto delete_set; 3653 } else { 3654 3655 /* 3656 * If node is not joined to diskset, then this 3657 * node uses the following algorithm: 3658 * - If unjoined node doesn't have a node record for itself, 3659 * just delete the diskset since diskset was in the 3660 * process of being created. 
3661 * - node needs to find master of diskset before 3662 * reconfig cycle, if a master existed. 3663 * - node calls RPC routine clnt_ownset to get latest 3664 * information on which nodes are owners of diskset. 3665 * clnt_ownset checks on each node to see if its 3666 * kernel has that diskset snarfed. 3667 */ 3668 3669 /* 3670 * Is my node in the set description? 3671 * If not, delete the set from this node. 3672 * sr2setdesc sets sd_mn_mynode pointer to the node 3673 * descriptor for this node if there was a node 3674 * record for this node. 3675 * 3676 */ 3677 if (sd->sd_mn_mynode == NULL) { 3678 goto delete_set; 3679 } 3680 3681 nd = sd->sd_nodelist; 3682 while (nd) { 3683 /* Don't consider node that isn't in member list */ 3684 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3685 nd = nd->nd_next; 3686 continue; 3687 } 3688 3689 if (clnt_ownset(nd->nd_nodename, sp, 3690 &is_owner, ep) == -1) { 3691 /* If RPC failure to another node return 205 */ 3692 if ((mdanyrpcerror(ep)) && 3693 (sd->sd_mn_mynode->nd_nodeid != 3694 nd->nd_nodeid)) { 3695 return (205); 3696 } else { 3697 /* Any other failure */ 3698 return (-1); 3699 } 3700 } 3701 3702 /* 3703 * Set owner flag for each node based on whether 3704 * that node really has a diskset mddb snarfed in 3705 * or not. 3706 */ 3707 if (is_owner == TRUE) 3708 nd->nd_flags |= MD_MN_NODE_OWN; 3709 else 3710 nd->nd_flags &= ~MD_MN_NODE_OWN; 3711 3712 nd = nd->nd_next; 3713 } 3714 3715 /* 3716 * - node walks through nodelist looking for nodes that 3717 * are owners of the diskset that are in 3718 * the membership list. 3719 * - for each owner, node calls RPC routine clnt_getset to 3720 * see if that node has a master set and to get the 3721 * diskset description. 3722 * - If the owner node has a set description that doesn't 3723 * include the non-joined node in the nodelist, this node 3724 * removes its set description of that diskset 3725 * (i.e. removes the set from its local mddbs). 
This is 3726 * handling the case of when a node was removed from a 3727 * diskset while it was not in the cluster membership 3728 * list. 3729 * - If that node has a master set and the master is in the 3730 * membership list and is an owner, then either this was 3731 * the master from before the reconfig cycle or this 3732 * node has already chosen a new master - either way, 3733 * the master value is valid as long as it is in the 3734 * membership list and is an owner 3735 * - master is chosen to be owner node's master 3736 */ 3737 nd = sd->sd_nodelist; 3738 while (nd) { 3739 /* Don't consider node that isn't in member list */ 3740 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3741 nd = nd->nd_next; 3742 continue; 3743 } 3744 3745 /* Don't consider a node that isn't an owner */ 3746 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3747 nd = nd->nd_next; 3748 continue; 3749 } 3750 3751 /* Get owner node's set record */ 3752 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3753 MD_SET_BAD, &mnsr, ep) == -1) { 3754 /* If RPC failure to another node return 205 */ 3755 if ((mdanyrpcerror(ep)) && 3756 (sd->sd_mn_mynode->nd_nodeid != 3757 nd->nd_nodeid)) { 3758 return (205); 3759 } else { 3760 /* Any other failure */ 3761 return (-1); 3762 } 3763 } 3764 3765 /* Is this node in the owner node's set record */ 3766 nr = mnsr->sr_nodechain; 3767 while (nr) { 3768 if (sd->sd_mn_mynode->nd_nodeid == 3769 nr->nr_nodeid) { 3770 break; 3771 } 3772 nr = nr->nr_next; 3773 } 3774 if (nr == NULL) { 3775 /* my node not found - delete set */ 3776 free_sr((md_set_record *)mnsr); 3777 goto delete_set; 3778 } 3779 3780 /* Is owner's node's master valid? 
*/ 3781 master_nodeid = mnsr->sr_master_nodeid; 3782 free_sr((md_set_record *)mnsr); 3783 if (master_nodeid == MD_MN_INVALID_NID) { 3784 nd = nd->nd_next; 3785 continue; 3786 } 3787 3788 nd2 = sd->sd_nodelist; 3789 while (nd2) { 3790 if ((nd2->nd_nodeid == master_nodeid) && 3791 (nd2->nd_flags & MD_MN_NODE_ALIVE) && 3792 (nd2->nd_flags & MD_MN_NODE_OWN)) { 3793 nd = nd2; 3794 goto found_master; 3795 } 3796 nd2 = nd2->nd_next; 3797 } 3798 nd = nd->nd_next; 3799 } 3800 3801 /* 3802 * - If no owner node has a valid master, then follow 3803 * algorithm of when a node is joined to the diskset. 3804 * - node walks through nodelist looking for nodes that are 3805 * owners of the diskset that are in the membership list. 3806 * - for each owner, node calls RPC routine clnt_getset to 3807 * see if that node has its node record set to OK. 3808 * - If so, master is chosen to be this owner node. 3809 */ 3810 nd = sd->sd_nodelist; 3811 while (nd) { 3812 /* Don't consider node that isn't in member list */ 3813 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3814 nd = nd->nd_next; 3815 continue; 3816 } 3817 3818 /* Don't consider a node that isn't an owner */ 3819 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3820 nd = nd->nd_next; 3821 continue; 3822 } 3823 3824 /* Does node has its own node record set to OK? 
*/ 3825 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3826 MD_SET_BAD, &mnsr, ep) == -1) { 3827 /* If RPC failure to another node return 205 */ 3828 if ((mdanyrpcerror(ep)) && 3829 (sd->sd_mn_mynode->nd_nodeid != 3830 nd->nd_nodeid)) { 3831 return (205); 3832 } else { 3833 /* Any other failure */ 3834 return (-1); 3835 } 3836 } 3837 nr = mnsr->sr_nodechain; 3838 while (nr) { 3839 if (nd->nd_nodeid == nr->nr_nodeid) { 3840 if (nr->nr_flags & MD_MN_NODE_OK) { 3841 /* Found a master */ 3842 free_sr( 3843 (md_set_record *)mnsr); 3844 goto found_master; 3845 } 3846 } 3847 nr = nr->nr_next; 3848 } 3849 free_sr((md_set_record *)mnsr); 3850 nd = nd->nd_next; 3851 } 3852 3853 /* 3854 * - If no owner node has its own node record on its own node 3855 * set to OK, then this node checks all of the non-owner 3856 * nodes that are in the membership list. 3857 * - for each non-owner, node calls RPC routine clnt_getset to 3858 * see if that node has its node record set to OK. 3859 * - If set doesn't exist, don't choose node for master. 3860 * - If this node doesn't exist in the nodelist on any of the 3861 * non-owner nodes, this node removes its set description 3862 * of that diskset (i.e. removes the set from its local 3863 * mddbs). This is handling the case of when a node was 3864 * removed from a diskset while it was not in the 3865 * cluster membership list. 3866 * - If non-owner node has its node record set to OK and if 3867 * this node hasn't removed this diskset (step directly 3868 * before this one), then the master is chosen to be this 3869 * non-owner node. 
3870 */ 3871 nd = sd->sd_nodelist; 3872 while (nd) { 3873 /* Don't consider node that isn't in member list */ 3874 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3875 nd->nd_flags |= MD_MN_NODE_DEL; 3876 nd = nd->nd_next; 3877 continue; 3878 } 3879 3880 /* Don't consider owner nodes since none are OK */ 3881 if (nd->nd_flags & MD_MN_NODE_OWN) { 3882 nd->nd_flags |= MD_MN_NODE_DEL; 3883 nd = nd->nd_next; 3884 continue; 3885 } 3886 3887 /* 3888 * Don't need to get nodelist from my node since 3889 * this is where sd_nodelist was obtained. 3890 */ 3891 if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 3892 nd = nd->nd_next; 3893 continue; 3894 } 3895 3896 /* 3897 * If node has already been decided against for 3898 * master, then skip it. 3899 */ 3900 if (nd->nd_flags & MD_MN_NODE_DEL) { 3901 nd = nd->nd_next; 3902 continue; 3903 } 3904 3905 /* 3906 * Does node in my nodelist have its own node 3907 * record marked OK on its node? And does node 3908 * in my nodelist exist on all other nodes? 3909 * Don't want to choose a node for master unless 3910 * that node is marked OK on its own node and that 3911 * node exists on all other alive nodes. 3912 * 3913 * This is guarding against the case when several 3914 * nodes are down and one of the downed nodes is 3915 * deleted from the diskset. When the down nodes 3916 * are rebooted into the cluster, you don't want 3917 * any node to pick the deleted node as the master. 3918 */ 3919 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3920 MD_SET_BAD, &mnsr, ep) == -1) { 3921 /* 3922 * If set doesn't exist on non-owner node, 3923 * don't consider this node for master. 3924 */ 3925 if (mdiserror(ep, MDE_NO_SET)) { 3926 nd->nd_flags |= MD_MN_NODE_DEL; 3927 nd = nd->nd_next; 3928 continue; 3929 } else if (mdanyrpcerror(ep)) { 3930 /* RPC failure to another node */ 3931 return (205); 3932 } else { 3933 /* Any other failure */ 3934 return (-1); 3935 } 3936 } 3937 /* 3938 * Is my node in the nodelist gotten from the other 3939 * node? 
If not, then remove the set from my node 3940 * since set was deleted from my node while my node 3941 * was out of the cluster. 3942 */ 3943 nr = mnsr->sr_nodechain; 3944 while (nr) { 3945 if (sd->sd_mn_mynode->nd_nodeid == 3946 nr->nr_nodeid) { 3947 break; 3948 } 3949 nr = nr->nr_next; 3950 } 3951 if (nr == NULL) { 3952 /* my node not found - delete set */ 3953 free_sr((md_set_record *)mnsr); 3954 goto delete_set; 3955 } 3956 3957 /* Is node being checked marked OK on its own node? */ 3958 nr = mnsr->sr_nodechain; 3959 while (nr) { 3960 if (nd->nd_nodeid == nr->nr_nodeid) { 3961 if (!(nr->nr_flags & MD_MN_NODE_OK)) { 3962 nd->nd_flags |= MD_MN_NODE_DEL; 3963 } 3964 break; 3965 } 3966 nr = nr->nr_next; 3967 } 3968 /* 3969 * If node being checked doesn't exist on its 3970 * own node - don't choose it as master. 3971 */ 3972 if (nr == NULL) { 3973 nd->nd_flags |= MD_MN_NODE_DEL; 3974 } 3975 3976 /* 3977 * Check every node in my node's nodelist against 3978 * the nodelist gotten from the other node. 3979 * If a node in my node's nodelist is not found in the 3980 * other node's nodelist, then set the DEL flag. 3981 */ 3982 nd2 = sd->sd_nodelist; 3983 while (nd2) { 3984 nr = mnsr->sr_nodechain; 3985 while (nr) { 3986 if (nd2->nd_nodeid == nr->nr_nodeid) { 3987 break; 3988 } 3989 nr = nr->nr_next; 3990 } 3991 /* nd2 not found in other node's nodelist */ 3992 if (nr == NULL) { 3993 nd2->nd_flags |= MD_MN_NODE_DEL; 3994 } 3995 nd2 = nd2->nd_next; 3996 } 3997 3998 free_sr((md_set_record *)mnsr); 3999 nd = nd->nd_next; 4000 } 4001 4002 /* 4003 * Rescan list look for node that has not been marked DEL. 4004 * First node found is the master. 
4005 */ 4006 nd = sd->sd_nodelist; 4007 while (nd) { 4008 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 4009 break; 4010 } 4011 nd = nd->nd_next; 4012 continue; 4013 } 4014 if (nd) { 4015 /* Found a master */ 4016 goto found_master; 4017 } 4018 4019 /* 4020 * - If no node can be found that has its own node record on 4021 * its node to be set to OK, then all alive nodes 4022 * were in the process of being added to or deleted 4023 * from set. Each alive node will remove all 4024 * information pertaining to this set from its node. 4025 * 4026 * If all nodes in set are ALIVE, then call sdssc end routines 4027 * since set was truly being initially created or destroyed. 4028 */ 4029 goto delete_set; 4030 } 4031 4032 found_master: 4033 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4034 "Set %s master chosen %s (%d): %s"), 4035 sp->setname, nd->nd_nodename, nd->nd_nodeid, 4036 meta_print_hrtime(gethrtime() - start_time)); 4037 4038 if (clnt_lock_set(mynode(), sp, ep) == -1) { 4039 return (-1); 4040 } 4041 4042 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4043 4044 if (clnt_mnsetmaster(mynode(), sp, 4045 nd->nd_nodename, nd->nd_nodeid, ep)) { 4046 rval = -1; 4047 } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 4048 /* If this node is new master, set flag in this node's kernel */ 4049 (void) memset(&sf, 0, sizeof (sf)); 4050 sf.sf_setno = sp->setno; 4051 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 4052 /* Use magic to help protect ioctl against attack. */ 4053 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 4054 sf.sf_flags = MDDB_NM_SET; 4055 4056 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4057 "Setting new master flag for set %s: %s"), 4058 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4059 4060 /* 4061 * Fail reconfig cycle if ioctl fails since it is critical 4062 * to set new master flag. 
4063 */ 4064 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, 4065 NULL) != NULL) { 4066 (void) mdstealerror(ep, &sf.sf_mde); 4067 rval = -1; 4068 } 4069 } 4070 4071 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 4072 if (rval == 0) { 4073 (void) mdstealerror(ep, &xep); 4074 rval = -1; 4075 } 4076 } 4077 4078 cl_set_setkey(NULL); 4079 4080 metaflushsetname(sp); 4081 4082 return (rval); 4083 4084 delete_set: 4085 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4086 "Master not chosen, deleting set %s: %s"), 4087 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4088 4089 /* 4090 * Remove all set information from this node: 4091 * - node records for this set 4092 * - drive records for this set 4093 * - set record for this set 4094 * (Only do this on this node since each node 4095 * will do it for its own local mddb.) 4096 * 4097 * If all nodes in set are ALIVE, then 4098 * the lowest numbered ALIVE nodeid in set 4099 * (irregardless of whether an owner node or not) will 4100 * call the DCS service to cleanup for create/delete of set. 4101 * sdssc_create_end(cleanup) if set was being created or 4102 * sdssc_delete_end(cleanup) if set was being deleted. 4103 * A node record with flag ADD denotes a set being 4104 * created. A node record with flag DEL denotes a 4105 * set being deleted. 4106 */ 4107 nd = sd->sd_nodelist; 4108 while (nd) { 4109 /* Found a node that isn't alive */ 4110 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) 4111 break; 4112 4113 /* Is my node the lowest numbered ALIVE node? */ 4114 if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) { 4115 break; 4116 } 4117 nd = nd->nd_next; 4118 } 4119 if (nd == NULL) { 4120 /* All nodes ALIVE and this is the lowest nodeid */ 4121 lowest_alive_nodeid = 1; 4122 } 4123 4124 if (clnt_lock_set(mynode(), sp, ep) == -1) { 4125 return (-1); 4126 } 4127 4128 4129 /* 4130 * If this node had been joined, withdraw and reset master. 
4131 * 4132 * This could happen if a node was being added to or removed 4133 * from a diskset and the node doing the add/delete operation and 4134 * all other nodes in the diskset have left the cluster. 4135 */ 4136 if (sd->sd_mn_mynode) { 4137 nd = sd->sd_mn_mynode; 4138 if (nd->nd_flags & MD_MN_NODE_OWN) { 4139 if (clnt_withdrawset(mynode(), sp, ep)) { 4140 rval = -1; 4141 goto out; 4142 } 4143 if (clnt_mnsetmaster(mynode(), sp, "", 4144 MD_MN_INVALID_NID, ep)) { 4145 rval = -1; 4146 goto out; 4147 } 4148 } 4149 } 4150 4151 /* 4152 * Remove side records for this node (side) from local mddb 4153 * (clnt_deldrvs does this) if there are drives in the set. 4154 * 4155 * Don't need to mark this node as DEL since already marked as 4156 * ADD or DEL (or this node would have been chosen as master). 4157 * Don't need to mark other node records, drive records or 4158 * set records as DEL. If a panic occurs during clnt_delset, 4159 * these records will be deleted the next time this node 4160 * becomes a member and goes through the reconfig cycle. 4161 */ 4162 /* Get the drive descriptors for this set */ 4163 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4164 ep)) == NULL) { 4165 if (! mdisok(ep)) { 4166 /* 4167 * Ignore and clear out any failures from 4168 * metaget_drivedesc since a panic could have 4169 * occurred when a node was partially added to a set. 4170 */ 4171 mdclrerror(ep); 4172 } 4173 } else { 4174 if (clnt_deldrvs(mynode(), sp, dd, ep)) { 4175 rval = -1; 4176 goto out; 4177 } 4178 } 4179 4180 /* 4181 * Now, delete the set - this removes the node, drive 4182 * and set records from the local mddb. 4183 */ 4184 if (clnt_delset(mynode(), sp, ep)) { 4185 rval = -1; 4186 goto out; 4187 } 4188 4189 out: 4190 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4191 4192 /* 4193 * Ignore errors from unlock of set since set is no longer 4194 * known (if clnt_delset worked). 
4195 */ 4196 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 4197 mdclrerror(&xep); 4198 } 4199 4200 cl_set_setkey(NULL); 4201 4202 metaflushsetname(sp); 4203 4204 /* 4205 * If this node is the lowest numbered nodeid then 4206 * call sdssc_create/delete_end depending on whether 4207 * this node is marked as ADD or DEL in the node record. 4208 */ 4209 if (lowest_alive_nodeid) { 4210 if (nd->nd_flags & MD_MN_NODE_ADD) 4211 sdssc_create_end(sp->setname, SDSSC_CLEANUP); 4212 else if (nd->nd_flags & MD_MN_NODE_DEL) 4213 sdssc_delete_end(sp->setname, SDSSC_CLEANUP); 4214 } 4215 4216 /* Finished with this set -- return */ 4217 return (rval); 4218 } 4219 4220 /* 4221 * Reconfig step to choose a new master for all MN disksets. 4222 * Return values: 4223 * 0 - Everything is great. 4224 * 1 - This node failed to reconfig. 4225 * 205 - Cause another reconfig due to a nodelist problem 4226 * or RPC failure to another node 4227 */ 4228 int 4229 meta_reconfig_choose_master( 4230 long timeout, 4231 md_error_t *ep 4232 ) 4233 { 4234 set_t max_sets, setno; 4235 int nodecnt; 4236 mndiskset_membershiplist_t *nl; 4237 md_set_desc *sd; 4238 mdsetname_t *sp; 4239 int rval = 0; 4240 mddb_setflags_config_t sf; 4241 int start_node_delayed = 0; 4242 4243 if ((max_sets = get_max_sets(ep)) == 0) { 4244 mde_perror(ep, dgettext(TEXT_DOMAIN, 4245 "Unable to get number of sets")); 4246 return (1); 4247 } 4248 4249 /* 4250 * Get membershiplist from API routine. If there's 4251 * an error, return a 205 to cause another reconfig. 4252 */ 4253 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { 4254 mde_perror(ep, ""); 4255 return (205); 4256 } 4257 4258 for (setno = 1; setno < max_sets; setno++) { 4259 if ((sp = metasetnosetname(setno, ep)) == NULL) { 4260 if (mdiserror(ep, MDE_NO_SET)) { 4261 /* No set for this setno - continue */ 4262 mdclrerror(ep); 4263 continue; 4264 } else { 4265 /* 4266 * If encountered an RPC error from my node, 4267 * then immediately fail. 
4268 */ 4269 if (mdanyrpcerror(ep)) { 4270 mde_perror(ep, ""); 4271 return (1); 4272 } 4273 /* Can't get set information */ 4274 mde_perror(ep, dgettext(TEXT_DOMAIN, 4275 "Unable to get information for " 4276 "set number %d"), setno); 4277 mdclrerror(ep); 4278 continue; 4279 } 4280 } 4281 4282 /* If setname is there, set desc should exist. */ 4283 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4284 /* 4285 * If encountered an RPC error from my node, 4286 * then immediately fail. 4287 */ 4288 if (mdanyrpcerror(ep)) { 4289 mde_perror(ep, ""); 4290 return (1); 4291 } 4292 mde_perror(ep, dgettext(TEXT_DOMAIN, 4293 "Unable to get set %s desc information"), 4294 sp->setname); 4295 mdclrerror(ep); 4296 continue; 4297 } 4298 4299 /* Only reconfig MN disksets */ 4300 if (!MD_MNSET_DESC(sd)) { 4301 continue; 4302 } 4303 4304 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4305 "Begin choose master for set %s: %s"), 4306 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4307 4308 /* Update nodelist with member information. */ 4309 if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) { 4310 /* 4311 * If encountered an RPC error from my node, 4312 * then immediately fail. 4313 */ 4314 if (mdanyrpcerror(ep)) { 4315 mde_perror(ep, ""); 4316 return (1); 4317 } 4318 mde_perror(ep, ""); 4319 mdclrerror(ep); 4320 continue; 4321 } 4322 4323 /* 4324 * If all nodes in a cluster are starting, then 4325 * all nodes will attempt to contact all other nodes 4326 * to determine a master node. This can lead to a 4327 * problem where node 1 is trying to contact the rpc.metad 4328 * node 2 and node 2 is trying to contact the rpc.metad 4329 * on node 1 -- and this causes the rpc call to fail 4330 * on both nodes and causes a new reconfig cycle. 4331 * 4332 * In order to break this problem, a newly starting node 4333 * will delay a small amount of time (nodeid mod 4 seconds) 4334 * and will then run the code to choose a master for the 4335 * first set. 
Delay will only be done once regardless of the 4336 * number of sets. 4337 */ 4338 if (start_node_delayed == 0) { 4339 (void) memset(&sf, 0, sizeof (sf)); 4340 sf.sf_setno = sp->setno; 4341 sf.sf_flags = MDDB_NM_GET; 4342 /* Use magic to help protect ioctl against attack. */ 4343 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 4344 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, 4345 &sf.sf_mde, NULL) == 0) && 4346 ((sf.sf_setflags & MD_SET_MN_START_RC) == 4347 MD_SET_MN_START_RC)) { 4348 (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4); 4349 } 4350 start_node_delayed = 1; 4351 } 4352 4353 /* Choose master for this set */ 4354 rval = meta_reconfig_choose_master_for_set(sp, sd, ep); 4355 if (rval == -1) { 4356 mde_perror(ep, ""); 4357 return (1); 4358 } else if (rval == 205) { 4359 mde_perror(ep, ""); 4360 return (205); 4361 } 4362 4363 /* reinit rpc.mdcommd with new nodelist */ 4364 if (mdmn_reinit_set(sp->setno, timeout)) { 4365 md_eprintf(dgettext(TEXT_DOMAIN, 4366 "Could not re-initialise rpc.mdcommd for " 4367 "set %s\n"), sp->setname); 4368 return (1); 4369 } 4370 4371 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4372 "Choose master for set %s completed: %s"), 4373 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4374 } 4375 4376 /* 4377 * Each node turns on I/Os for all MN disksets. 4378 * This is to recover from the situation where the master died 4379 * during a previous reconfig cycle when I/Os were suspended 4380 * for a MN diskset. 4381 * If a failure occurs return a 1 which will force this node to 4382 * panic. Cannot leave node in the situation where I/Os are 4383 * not resumed. 4384 */ 4385 setno = 0; /* 0 means all MN sets */ 4386 if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) { 4387 mde_perror(ep, ""); 4388 return (1); 4389 } 4390 4391 /* Free the nodelist */ 4392 if (nodecnt) 4393 meta_free_nodelist(nl); 4394 4395 return (0); 4396 } 4397 4398 /* 4399 * meta_mnsync_user_records will synchronize the diskset user records across 4400 * all nodes in the diskset. 
The diskset user records are stored in 4401 * each node's local set mddb. 4402 * 4403 * This needs to be done even if there is no master change during the 4404 * reconfig cycle since this routine should clean up any mess left by 4405 * the untimely termination of a metaset or metadb command (due to a 4406 * node panic or to user intervention). 4407 * 4408 * Caller is the Master node. 4409 * 4410 * Returns 0 - Success 4411 * 205 - Failure during RPC to another node 4412 * -1 - Any other failure and ep is filled in. 4413 */ 4414 int 4415 meta_mnsync_user_records( 4416 mdsetname_t *sp, 4417 md_error_t *ep 4418 ) 4419 { 4420 md_set_desc *sd; 4421 md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail; 4422 md_mnset_record *mnsr; 4423 md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL; 4424 md_mnnode_record *nr; 4425 md_drive_record *dr; 4426 int dr_cnt, dd_cnt; 4427 int found_my_nr; 4428 md_drive_desc *dd, *dd_prev, *master_dd, *other_dd; 4429 int all_drives_ok; 4430 int rval = 0; 4431 int max_genid = 0; 4432 int num_alive_nodes, num_alive_nodes_del = 0; 4433 int set_locked = 0; 4434 md_setkey_t *cl_sk; 4435 md_error_t xep = mdnullerror; 4436 char *anode[1]; 4437 mddb_setflags_config_t sf; 4438 4439 /* 4440 * Sync up node records first. 4441 * Construct a master nodelist using the nodelist from this 4442 * node's rpc.metad node records and then setting the state of each 4443 * node following these rules: 4444 * - If a node record is marked OK on its node, mark it OK 4445 * in the master nodelist (and later OK on all nodes) 4446 * If a node record is also marked OWN on its node, 4447 * mark it OWN in the master nodelist. 
4448 * - If a node record is not marked OK on its node, then mark 4449 * it as DEL in the master list (later deleting it) 4450 * - If node record doesn't exist on that node, then mark it DEL 4451 * (later deleting it) 4452 * - If set record doesn't exist on that node, mark node as DEL 4453 * - If a node record doesn't exist on all nodes, then mark it DEL 4454 * - If a node is not ALIVE, then 4455 * - If that node marked DEL on any node - mark it DEL 4456 * in master list but leave in nodelist 4457 * - If that node is marked as ADD on any node, mark it 4458 * ADD in the master list but leave in nodelist 4459 * - When that node returns to the living, the DEL 4460 * node record will be removed and the ADD node 4461 * record may be removed if marked ADD on that 4462 * node. 4463 * The key rule is to not remove a node from the nodelist until 4464 * that node record is removed from its own node. Do not want to 4465 * remove a node's record from all other nodes and then have 4466 * that node have its own record marked OK so that a node will pick 4467 * a different master than the other nodes. 4468 * 4469 * Next, 4470 * If node is ALIVE and node record is marked DEL in master nodelist, 4471 * remove node from set. 4472 * If node is ALIVE and node record is marked OK in master nodelist, 4473 * mark it OK on all other nodes. 4474 * If node is not ALIVE and node record is marked DEL in master 4475 * nodelist, mark it DEL on all other nodes. 4476 * If node is not ALIVE and node record is marked ADD in master, 4477 * nodelist, mark it ADD on all other nodes. 4478 */ 4479 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4480 return (-1); 4481 } 4482 master_nodelist = sd->sd_nodelist; 4483 4484 /* 4485 * Walk through nodelist creating a master nodelist. 
4486 */ 4487 num_alive_nodes = 0; 4488 nd = master_nodelist; 4489 while (nd) { 4490 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4491 nd = nd->nd_next; 4492 continue; 4493 } 4494 num_alive_nodes++; 4495 if (clnt_mngetset(nd->nd_nodename, sp->setname, 4496 MD_SET_BAD, &mnsr, ep) == -1) { 4497 if (mdiserror(ep, MDE_NO_SET)) { 4498 /* set doesn't exist, mark node as DEL */ 4499 nd->nd_flags &= ~MD_MN_NODE_OK; 4500 nd->nd_flags &= ~MD_MN_NODE_ADD; 4501 nd->nd_flags |= MD_MN_NODE_DEL; 4502 nd->nd_flags |= MD_MN_NODE_NOSET; 4503 nd = nd->nd_next; 4504 continue; 4505 } else { 4506 /* If RPC failure to another node return 205 */ 4507 if ((mdanyrpcerror(ep)) && 4508 (sd->sd_mn_mynode->nd_nodeid != 4509 nd->nd_nodeid)) { 4510 rval = 205; 4511 } else { 4512 /* Any other failure */ 4513 rval = -1; 4514 } 4515 goto out; 4516 } 4517 } 4518 /* Find biggest genid in records for this diskset */ 4519 if (mnsr->sr_genid > max_genid) 4520 max_genid = mnsr->sr_genid; 4521 4522 dr = mnsr->sr_drivechain; 4523 while (dr) { 4524 /* Find biggest genid in records for this diskset */ 4525 if (dr->dr_genid > max_genid) { 4526 max_genid = dr->dr_genid; 4527 } 4528 dr = dr->dr_next; 4529 } 4530 4531 found_my_nr = 0; 4532 nr = mnsr->sr_nodechain; 4533 /* nr is the list of node recs from nd_nodename node */ 4534 while (nr) { 4535 /* Find biggest genid in records for this diskset */ 4536 if (nr->nr_genid > max_genid) 4537 max_genid = nr->nr_genid; 4538 nd2 = master_nodelist; 4539 ndtail = NULL; 4540 /* For each node record, is it in master list? */ 4541 while (nd2) { 4542 if (nd2->nd_nodeid == nr->nr_nodeid) 4543 break; 4544 if (nd2->nd_next == NULL) 4545 ndtail = nd2; 4546 nd2 = nd2->nd_next; 4547 } 4548 /* 4549 * Found node record not in master list -- add it 4550 * to list marking it as DEL since node record 4551 * should exist on all nodes unless a panic occurred 4552 * during addition or deletion of host to diskset. 
4553 */ 4554 if (nd2 == NULL) { 4555 nd2 = Zalloc(sizeof (*nd2)); 4556 (void) strcpy(nd2->nd_nodename, 4557 nr->nr_nodename); 4558 nd2->nd_flags = nr->nr_flags; 4559 nd2->nd_flags |= MD_MN_NODE_DEL; 4560 nd2->nd_nodeid = nr->nr_nodeid; 4561 nd2->nd_next = NULL; 4562 ndtail->nd_next = nd2; 4563 nd2 = NULL; 4564 nr = nr->nr_next; 4565 continue; 4566 } 4567 /* 4568 * Is this the node record for the node that 4569 * we requested the set desc from? 4570 * If so, check if node has its own node record 4571 * marked OK. If marked OK, check for the OWN bit. 4572 */ 4573 if (nr->nr_nodeid == nd->nd_nodeid) { 4574 found_my_nr = 1; 4575 if (nr->nr_flags & MD_MN_NODE_OK) { 4576 /* 4577 * If node record is marked OK 4578 * on its own node, then mark it OK 4579 * in the master list. Node record 4580 * would have to exist on all nodes 4581 * in the ADD state before it could 4582 * be put into the OK state. 4583 */ 4584 nd->nd_flags |= MD_MN_NODE_OK; 4585 nd->nd_flags &= 4586 ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL); 4587 /* 4588 * Mark own in master list as marked 4589 * on own node. 4590 */ 4591 if (nr->nr_flags & MD_MN_NODE_OWN) 4592 nd->nd_flags |= MD_MN_NODE_OWN; 4593 else 4594 nd->nd_flags &= ~MD_MN_NODE_OWN; 4595 } else { 4596 /* Otherwise, mark node as DEL */ 4597 nd->nd_flags &= ~MD_MN_NODE_OK; 4598 nd->nd_flags &= ~MD_MN_NODE_ADD; 4599 nd->nd_flags |= MD_MN_NODE_DEL; 4600 } 4601 } 4602 /* 4603 * If node is not ALIVE and marked DEL 4604 * on any node, make it DEL in master list. 4605 * If node is not ALIVE and marked ADD 4606 * on any node, make it ADD in master list 4607 * unless node record has already been marked DEL. 
4608 */ 4609 if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) { 4610 if (nr->nr_flags & MD_MN_NODE_ADD) { 4611 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 4612 /* If not DEL - mark it ADD */ 4613 nd->nd_flags |= MD_MN_NODE_ADD; 4614 nd->nd_flags &= ~MD_MN_NODE_OK; 4615 } 4616 } 4617 if (nr->nr_flags & MD_MN_NODE_DEL) { 4618 nd->nd_flags |= MD_MN_NODE_DEL; 4619 nd->nd_flags &= ~MD_MN_NODE_OK; 4620 /* Could already be ADD - make it DEL */ 4621 nd->nd_flags &= ~MD_MN_NODE_ADD; 4622 } 4623 } 4624 nr = nr->nr_next; 4625 } 4626 /* 4627 * If a node record doesn't exist on its own node, 4628 * then mark node as DEL. 4629 */ 4630 if (found_my_nr == 0) { 4631 nd->nd_flags &= ~MD_MN_NODE_OK; 4632 nd->nd_flags |= MD_MN_NODE_DEL; 4633 } 4634 4635 /* 4636 * If node is OK - put mnsr onto master_mnsr_node list for 4637 * later use when syncing up the drive records in the set. 4638 */ 4639 if (nd->nd_flags & MD_MN_NODE_OK) { 4640 mnsr_node = Zalloc(sizeof (*mnsr_node)); 4641 mnsr_node->mmn_mnsr = mnsr; 4642 (void) strncpy(mnsr_node->mmn_nodename, 4643 nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1); 4644 mnsr_node->mmn_next = master_mnsr_node; 4645 master_mnsr_node = mnsr_node; 4646 } else { 4647 free_sr((struct md_set_record *)mnsr); 4648 } 4649 4650 nd = nd->nd_next; 4651 } 4652 4653 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4654 "Master nodelist created for set %s: %s"), 4655 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4656 4657 /* 4658 * Send master nodelist to the rpc.metad on all nodes (including 4659 * myself) and each node will update itself. This will set the 4660 * ADD and DEL flags on each node as setup in the master nodelist. 4661 * Don't send nodelist to node where set doesn't exist. 
4662 */ 4663 nd = master_nodelist; 4664 while (nd) { 4665 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4666 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4667 nd = nd->nd_next; 4668 continue; 4669 } 4670 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 4671 master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 4672 /* If RPC failure to another node return 205 */ 4673 if ((mdanyrpcerror(ep)) && 4674 (sd->sd_mn_mynode->nd_nodeid != 4675 nd->nd_nodeid)) { 4676 rval = 205; 4677 } else { 4678 /* Any other failure */ 4679 rval = -1; 4680 } 4681 goto out; 4682 } 4683 nd = nd->nd_next; 4684 } 4685 4686 /* 4687 * Now, delete nodes that need to be deleted. 4688 */ 4689 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4690 ep)) == NULL) { 4691 if (! mdisok(ep)) { 4692 rval = -1; 4693 goto out; 4694 } 4695 } 4696 4697 /* 4698 * May be doing lots of RPC commands to the nodes, so lock the 4699 * ALIVE members of the set since most of the rpc.metad routines 4700 * require this for security reasons. 4701 */ 4702 nd = master_nodelist; 4703 while (nd) { 4704 /* Skip non-alive nodes and node without set */ 4705 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4706 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4707 nd = nd->nd_next; 4708 continue; 4709 } 4710 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 4711 /* If RPC failure to another node return 205 */ 4712 if ((mdanyrpcerror(ep)) && 4713 (sd->sd_mn_mynode->nd_nodeid != 4714 nd->nd_nodeid)) { 4715 rval = 205; 4716 } else { 4717 /* Any other failure */ 4718 rval = -1; 4719 } 4720 goto out; 4721 } 4722 set_locked = 1; 4723 nd = nd->nd_next; 4724 } 4725 4726 nd = master_nodelist; 4727 while (nd) { 4728 /* Skip non-alive nodes */ 4729 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4730 nd = nd->nd_next; 4731 continue; 4732 } 4733 if (nd->nd_flags & MD_MN_NODE_DEL) { 4734 num_alive_nodes_del++; 4735 /* 4736 * Delete this node rec from all ALIVE nodes in diskset. 
4737 */ 4738 nd2 = master_nodelist; 4739 while (nd2) { 4740 /* Skip non-alive nodes and node without set */ 4741 if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) || 4742 (nd2->nd_flags & MD_MN_NODE_NOSET)) { 4743 nd2 = nd2->nd_next; 4744 continue; 4745 } 4746 4747 /* This is a node being deleted from set */ 4748 if (nd2->nd_nodeid == nd->nd_nodeid) { 4749 /* Mark set record as DEL */ 4750 if (clnt_upd_sr_flags(nd->nd_nodename, 4751 sp, MD_SR_DEL, ep)) { 4752 /* RPC failure to !my node */ 4753 if ((mdanyrpcerror(ep)) && 4754 (sd->sd_mn_mynode-> 4755 nd_nodeid 4756 != nd->nd_nodeid)) { 4757 rval = 205; 4758 } else { 4759 /* Any other failure */ 4760 rval = -1; 4761 } 4762 goto out; 4763 } 4764 if (clnt_deldrvs(nd->nd_nodename, sp, 4765 dd, ep)) { 4766 /* RPC failure to !my node */ 4767 if ((mdanyrpcerror(ep)) && 4768 (sd->sd_mn_mynode-> 4769 nd_nodeid 4770 != nd->nd_nodeid)) { 4771 rval = 205; 4772 } else { 4773 /* Any other failure */ 4774 rval = -1; 4775 } 4776 goto out; 4777 } 4778 if (clnt_delset(nd->nd_nodename, sp, 4779 ep) == -1) { 4780 /* RPC failure to !my node */ 4781 if ((mdanyrpcerror(ep)) && 4782 (sd->sd_mn_mynode-> 4783 nd_nodeid 4784 != nd->nd_nodeid)) { 4785 rval = 205; 4786 } else { 4787 /* Any other failure */ 4788 rval = -1; 4789 } 4790 goto out; 4791 } 4792 } else { 4793 /* 4794 * Delete host from sets on hosts 4795 * not being deleted. 
4796 */ 4797 anode[0] = Strdup(nd->nd_nodename); 4798 if (clnt_delhosts(nd2->nd_nodename, sp, 4799 1, anode, ep) == -1) { 4800 Free(anode[0]); 4801 /* RPC failure to !my node */ 4802 if ((mdanyrpcerror(ep)) && 4803 (sd->sd_mn_mynode-> 4804 nd_nodeid 4805 != nd2->nd_nodeid)) { 4806 rval = 205; 4807 } else { 4808 /* Any other failure */ 4809 rval = -1; 4810 } 4811 goto out; 4812 } 4813 4814 meta_mc_log(MC_LOG5, 4815 dgettext(TEXT_DOMAIN, 4816 "Deleted node %s (%d) on node %s " 4817 "from set %s: %s"), 4818 nd->nd_nodename, nd->nd_nodeid, 4819 nd2->nd_nodename, 4820 sp->setname, 4821 meta_print_hrtime( 4822 gethrtime() - start_time)); 4823 4824 Free(anode[0]); 4825 } 4826 nd2 = nd2->nd_next; 4827 } 4828 } 4829 nd = nd->nd_next; 4830 } 4831 4832 nd = master_nodelist; 4833 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4834 while (nd) { 4835 /* Skip non-alive nodes and node without set */ 4836 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4837 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4838 nd = nd->nd_next; 4839 continue; 4840 } 4841 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 4842 /* If RPC failure to another node return 205 */ 4843 if ((mdanyrpcerror(ep)) && 4844 (sd->sd_mn_mynode->nd_nodeid != 4845 nd->nd_nodeid)) { 4846 rval = 205; 4847 } else { 4848 /* Any other failure */ 4849 rval = -1; 4850 } 4851 goto out; 4852 } 4853 nd = nd->nd_next; 4854 } 4855 cl_set_setkey(NULL); 4856 set_locked = 0; 4857 4858 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4859 "Nodelist syncronization complete for set %s: %s"), 4860 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4861 4862 metaflushsetname(sp); 4863 4864 /* 4865 * If all alive nodes have been deleted from set, just 4866 * return since nothing else can be done until non-alive 4867 * nodes (if there are any) rejoin the cluster. 4868 */ 4869 if (num_alive_nodes == num_alive_nodes_del) { 4870 rval = 0; 4871 goto out; 4872 } 4873 4874 /* 4875 * Sync up drive records. 
4876 * 4877 * If a node panic'd (or metaset command was killed) during the 4878 * addition or deletion of a drive to the diskset, the nodes 4879 * may have a different view of the drive list. During cleanup 4880 * of the drive list during reconfig, a drive will be deleted 4881 * from the list if the master node sees that the drive has been 4882 * marked in the ADD state on any node or is marked in the DEL state 4883 * on all nodes. 4884 * This cleanup must occur even if all nodes in the cluster are 4885 * not part of the cluster so that all nodes have the same view 4886 * of the drivelist. 4887 * Then if the entire cluster goes down and comes back up, the 4888 * new master node could be a node that wasn't in the cluster when 4889 * the node was deleted. This could lead to a situation where the 4890 * master node thinks that a drive is OK, but this drive isn't 4891 * known to the other nodes. 4892 * This situation can also occur during the addition of a drive 4893 * where a node has the drive marked OK, but the node executing the 4894 * metaset command encountered a failure before marking that drive OK 4895 * on the rest of the nodes. If the node with the OK drive then 4896 * panics, then the rest of the nodes will remove that drive marked ADD 4897 * and when the node with the OK drive rejoins the cluster, it will 4898 * have a drive marked OK that is unknown by the other nodes. 4899 * 4900 * There are 2 situations to consider: 4901 * A) Master knows about a drive that other nodes don't know about. 4902 * B) At least one slave node knows about a drive that the master 4903 * node doesn't know about. 4904 * 4905 * To handle these situations the following steps are followed: 4906 * 1) Count number of drives known by this master node and the 4907 * other slave nodes. 4908 * If all nodes have the same number of drives and the master has 4909 * all drives marked OK, then skip to step4.
4910 * 4911 * 2) If a node has fewer drives listed than the master, the master 4912 * must get the drive descriptor list from that node so that 4913 * master can determine which drive it needs to delete from that 4914 * node. Master must get the drive descriptor list since the 4915 * drive record list does not contain the name of the drive, but 4916 * only a key and the key can only be interpreted on that other 4917 * node. 4918 * 4919 * 3) The master will then create the master drive list by doing: 4920 * - Master starts with drive list known by master. 4921 * - Any drive marked ADD will be removed from the list. 4922 * - Any drive not known by another node (from step2) will be 4923 * removed from the drive list. 4924 * - If a drive is marked DEL on the master, the master must 4925 * verify that the drive record is marked DEL on all nodes. 4926 * If any node has the drive record marked OK, mark it OK 4927 * on the master. (The reason why is described below). 4928 * 4929 * 4) The master sends out the master drive list and the slave 4930 * nodes will force their drive lists to match the master 4931 * drive list by deleting drives, if necessary and by changing 4932 * the drive record states from ADD->OK if master has drive 4933 * marked OK and slave has drive marked ADD. 4934 * 4935 * Interesting scenarios: 4936 * 4937 * 1) System has 4 nodes with node 1 as the master. Node 3 starts 4938 * to delete a drive record (drive record on node 1 is marked DEL), 4939 * but is stopped when node 3 panics. Node 1 also panics. 4940 * During reconfig cycle, node 2 is picked as master and the drive 4941 * record is left alone since all nodes in the cluster have it 4942 * marked OK. User now sees drive as part of diskset. 4943 * Now, entire cluster is rebooted and node 1 rejoins the cluster. 4944 * Node 1 is picked as the master and node 1 has drive record 4945 * marked DEL.
Node 1 contacts all other nodes in the cluster 4946 * and since at least one node has the drive record marked OK, 4947 * the master marks the drive record OK. 4948 * User continues to see the drive as part of the diskset. 4949 */ 4950 4951 /* Reget set descriptor since flushed above */ 4952 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4953 rval = -1; 4954 goto out; 4955 } 4956 4957 /* Has side effect of setting sd->sd_drvs to same as master_dd */ 4958 if ((master_dd = metaget_drivedesc_sideno(sp, 4959 sd->sd_mn_mynode->nd_nodeid, 4960 (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) { 4961 /* No drives in list */ 4962 if (!mdisok(ep)) { 4963 /* 4964 * Can't get drive list for this node, so 4965 * return -1 causing this node to be removed 4966 * cluster config and fixed. 4967 */ 4968 rval = -1; 4969 goto out; 4970 } 4971 } 4972 4973 /* Count the number of drives for all nodes */ 4974 mnsr_node = master_mnsr_node; 4975 while (mnsr_node) { 4976 dr_cnt = 0; 4977 dr = mnsr_node->mmn_mnsr->sr_drivechain; 4978 while (dr) { 4979 dr_cnt++; 4980 dr = dr->dr_next; 4981 } 4982 mnsr_node->mmn_numdrives = dr_cnt; 4983 mnsr_node = mnsr_node->mmn_next; 4984 } 4985 4986 /* Count the number of drives for the master; also check flags */ 4987 all_drives_ok = 1; 4988 dd_cnt = 0; 4989 dd = master_dd; 4990 while (dd) { 4991 dd_cnt++; 4992 if (!(dd->dd_flags & MD_DR_OK)) 4993 all_drives_ok = 0; 4994 dd = dd->dd_next; 4995 } 4996 4997 /* If all drives are ok, do quick check against number of drives */ 4998 if (all_drives_ok) { 4999 /* If all nodes have same number of drives, almost done */ 5000 mnsr_node = master_mnsr_node; 5001 while (mnsr_node) { 5002 if (mnsr_node->mmn_numdrives != dd_cnt) 5003 break; 5004 mnsr_node = mnsr_node->mmn_next; 5005 } 5006 /* All nodes have same number of drives, just send flags */ 5007 if (mnsr_node == NULL) { 5008 goto send_drive_list; 5009 } 5010 } 5011 5012 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5013 "Begin detailed drive synchronization for set %s: 
%s"), 5014 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5015 5016 /* Detailed check required */ 5017 mnsr_node = master_mnsr_node; 5018 while (mnsr_node) { 5019 /* Does slave node have less drives than master? */ 5020 if (mnsr_node->mmn_numdrives < dd_cnt) { 5021 /* Yes - must determine which drive is missing */ 5022 if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp, 5023 &other_dd, ep)) { 5024 /* RPC failure to !my node */ 5025 if ((mdanyrpcerror(ep)) && 5026 (strcmp(mynode(), mnsr_node->mmn_nodename) 5027 != 0)) { 5028 rval = 205; 5029 } else { 5030 /* Any other failure */ 5031 rval = -1; 5032 } 5033 mde_perror(ep, dgettext(TEXT_DOMAIN, 5034 "Master node %s unable to " 5035 "retrieve drive list from node %s"), 5036 mynode(), mnsr_node->mmn_nodename); 5037 goto out; 5038 } 5039 mnsr_node->mmn_dd = other_dd; 5040 dd = master_dd; 5041 while (dd) { 5042 if (!(dd->dd_flags & MD_DR_OK)) { 5043 dd = dd->dd_next; 5044 continue; 5045 } 5046 other_dd = mnsr_node->mmn_dd; 5047 while (other_dd) { 5048 /* Convert to devids, when available */ 5049 if (strcmp(other_dd->dd_dnp->cname, 5050 dd->dd_dnp->cname) == 0) { 5051 break; 5052 } 5053 other_dd = other_dd->dd_next; 5054 } 5055 /* 5056 * dd not found on slave so mark it 5057 * ADD for later deletion (drives in ADD 5058 * state are deleted later in this routine). 
5059 */ 5060 if (other_dd == NULL) { 5061 dd->dd_flags = MD_DR_ADD; 5062 } 5063 dd = dd->dd_next; 5064 } 5065 5066 } 5067 mnsr_node = mnsr_node->mmn_next; 5068 } 5069 5070 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5071 "Drive check completed for set %s: %s"), 5072 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5073 5074 dd = master_dd; 5075 dd_prev = 0; 5076 while (dd) { 5077 /* Remove any ADD drives from list */ 5078 if (dd->dd_flags & MD_DR_ADD) { 5079 if (dd_prev) { 5080 dd_prev->dd_next = dd->dd_next; 5081 dd->dd_next = NULL; 5082 metafreedrivedesc(&dd); 5083 dd = dd_prev->dd_next; 5084 } else { 5085 /* 5086 * If removing drive descriptor from head 5087 * of linked list, also change sd->sd_drvs. 5088 */ 5089 master_dd = sd->sd_drvs = dd->dd_next; 5090 dd->dd_next = NULL; 5091 metafreedrivedesc(&dd); 5092 dd = master_dd; 5093 } 5094 /* dd setup in if/else above */ 5095 continue; 5096 } 5097 /* 5098 * If drive is marked DEL, check all other nodes. 5099 * If drive on another node is marked OK, mark drive OK 5100 * in master list. If drive is marked DEL or doesn't exist 5101 * on all nodes, remove drive from list. 
5102 */ 5103 if (dd->dd_flags & MD_DR_DEL) { 5104 mnsr_node = master_mnsr_node; 5105 while (mnsr_node) { 5106 if (mnsr_node->mmn_dd == NULL) { 5107 if (clnt_getdrivedesc( 5108 mnsr_node->mmn_nodename, sp, 5109 &other_dd, ep)) { 5110 /* RPC failure to !my node */ 5111 if ((mdanyrpcerror(ep)) && 5112 (strcmp(mynode(), 5113 mnsr_node->mmn_nodename) 5114 != 0)) { 5115 rval = 205; 5116 } else { 5117 /* Any other failure */ 5118 rval = -1; 5119 } 5120 mde_perror(ep, dgettext(TEXT_DOMAIN, 5121 "Master node %s unable " 5122 "to retrieve drive list from " 5123 "node %s"), mynode(), 5124 mnsr_node->mmn_nodename); 5125 goto out; 5126 } 5127 mnsr_node->mmn_dd = other_dd; 5128 } 5129 other_dd = mnsr_node->mmn_dd; 5130 while (other_dd) { 5131 /* Found drive (OK) from other node */ 5132 if (strcmp(dd->dd_dnp->cname, 5133 other_dd->dd_dnp->cname) 5134 == 0) { 5135 /* Drive marked OK */ 5136 if (other_dd->dd_flags & 5137 MD_DR_OK) { 5138 dd->dd_flags = MD_DR_OK; 5139 } 5140 break; 5141 } 5142 other_dd = other_dd->dd_next; 5143 } 5144 if (dd->dd_flags == MD_DR_OK) 5145 break; 5146 5147 mnsr_node = mnsr_node->mmn_next; 5148 } 5149 /* 5150 * If no node had this drive marked OK, delete it. 5151 */ 5152 if (dd->dd_flags & MD_DR_DEL) { 5153 if (dd_prev) { 5154 dd_prev->dd_next = dd->dd_next; 5155 dd->dd_next = NULL; 5156 metafreedrivedesc(&dd); 5157 dd = dd_prev->dd_next; 5158 } else { 5159 /* 5160 * If removing drive descriptor from 5161 * head of linked list, also change 5162 * sd->sd_drvs. 
5163 */ 5164 master_dd = sd->sd_drvs = dd->dd_next; 5165 dd->dd_next = NULL; 5166 metafreedrivedesc(&dd); 5167 dd = master_dd; 5168 } 5169 /* dd setup in if/else above */ 5170 continue; 5171 } 5172 } 5173 dd_prev = dd; 5174 dd = dd->dd_next; 5175 } 5176 5177 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5178 "Setting drive states completed for set %s: %s"), 5179 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5180 5181 send_drive_list: 5182 /* 5183 * Set genid on all drives to be the highest value seen. 5184 */ 5185 dd = master_dd; 5186 while (dd) { 5187 dd->dd_genid = max_genid; 5188 dd = dd->dd_next; 5189 } 5190 /* 5191 * Send updated drive list to all alive nodes. 5192 * Will also set genid on set and node records to have same 5193 * as the drive records. 5194 */ 5195 nd = sd->sd_nodelist; 5196 while (nd) { 5197 /* Skip non-alive nodes */ 5198 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5199 nd = nd->nd_next; 5200 continue; 5201 } 5202 if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) { 5203 /* RPC failure to another node */ 5204 if ((mdanyrpcerror(ep)) && 5205 (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) { 5206 rval = 205; 5207 } else { 5208 /* Any other failure */ 5209 rval = -1; 5210 } 5211 goto out; 5212 } 5213 nd = nd->nd_next; 5214 } 5215 5216 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5217 "Sent drive list to all nodes for set %s: %s"), 5218 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5219 5220 /* 5221 * If no drive records left in set and nodes had been joined, 5222 * withdraw the nodes. Always reset the master and mark 5223 * all nodes as withdrawn on all nodes. 5224 */ 5225 if (master_dd == NULL) { 5226 /* Reset new master flag since no longer master */ 5227 (void) memset(&sf, 0, sizeof (sf)); 5228 sf.sf_setno = sp->setno; 5229 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5230 sf.sf_flags = MDDB_NM_RESET; 5231 /* Use magic to help protect ioctl against attack. 
*/ 5232 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5233 /* Ignore failure, failure to reset flag isn't catastrophic */ 5234 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5235 &sf.sf_mde, NULL); 5236 5237 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5238 "Reset new master flag for " "set %s: %s"), 5239 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5240 5241 nd = sd->sd_nodelist; 5242 while (nd) { 5243 /* Skip non-alive nodes */ 5244 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5245 nd = nd->nd_next; 5246 continue; 5247 } 5248 5249 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 5250 /* RPC failure to another node */ 5251 if ((mdanyrpcerror(ep)) && 5252 (sd->sd_mn_mynode->nd_nodeid != 5253 nd->nd_nodeid)) { 5254 rval = 205; 5255 } else { 5256 /* Any other failure */ 5257 rval = -1; 5258 } 5259 goto out; 5260 } 5261 set_locked = 1; 5262 5263 /* Withdraw node from set if owner */ 5264 if ((nd->nd_flags & MD_MN_NODE_OWN) && 5265 (clnt_withdrawset(nd->nd_nodename, sp, ep))) { 5266 /* RPC failure to another node */ 5267 if ((mdanyrpcerror(ep)) && 5268 (sd->sd_mn_mynode->nd_nodeid != 5269 nd->nd_nodeid)) { 5270 rval = 205; 5271 } else { 5272 /* Any other failure */ 5273 rval = -1; 5274 } 5275 goto out; 5276 } 5277 5278 /* Mark all nodes as withdrawn on this node */ 5279 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 5280 sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) { 5281 /* RPC failure to another node */ 5282 if ((mdanyrpcerror(ep)) && 5283 (sd->sd_mn_mynode->nd_nodeid != 5284 nd->nd_nodeid)) { 5285 rval = 205; 5286 } else { 5287 /* Any other failure */ 5288 rval = -1; 5289 } 5290 goto out; 5291 } 5292 5293 /* Resets master to no-master on this node */ 5294 if (clnt_mnsetmaster(nd->nd_nodename, sp, 5295 "", MD_MN_INVALID_NID, ep)) { 5296 /* RPC failure to another node */ 5297 if ((mdanyrpcerror(ep)) && 5298 (sd->sd_mn_mynode->nd_nodeid != 5299 nd->nd_nodeid)) { 5300 rval = 205; 5301 } else { 5302 /* Any other failure */ 5303 rval = -1; 5304 } 5305 goto out; 5306 } 5307 5308 cl_sk = 
cl_get_setkey(sp->setno, sp->setname); 5309 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 5310 /* RPC failure to another node */ 5311 if ((mdanyrpcerror(ep)) && 5312 (sd->sd_mn_mynode->nd_nodeid != 5313 nd->nd_nodeid)) { 5314 rval = 205; 5315 } else { 5316 /* Any other failure */ 5317 rval = -1; 5318 } 5319 goto out; 5320 } 5321 set_locked = 0; 5322 nd = nd->nd_next; 5323 } 5324 } 5325 5326 out: 5327 /* 5328 * If got here and set is still locked, then an error has 5329 * occurred and master_nodelist is still valid. 5330 * If error is not an RPC error, then unlock. 5331 * If error is an RPC error, skip unlocks since this could cause 5332 * yet another RPC timeout if a node has failed. 5333 * Ignore failures in unlock since unlock is just trying to 5334 * clean things up. 5335 */ 5336 if ((set_locked) && !(mdanyrpcerror(ep))) { 5337 nd = master_nodelist; 5338 cl_sk = cl_get_setkey(sp->setno, sp->setname); 5339 while (nd) { 5340 /* Skip non-alive nodes */ 5341 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5342 nd = nd->nd_next; 5343 continue; 5344 } 5345 /* 5346 * If clnt_unlock fails, just break out since next 5347 * reconfig cycle will reset the locks anyway. 5348 */ 5349 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 5350 break; 5351 } 5352 nd = nd->nd_next; 5353 } 5354 cl_set_setkey(NULL); 5355 } 5356 /* Free master_mnsr and drive descs */ 5357 mnsr_node = master_mnsr_node; 5358 while (mnsr_node) { 5359 master_mnsr_node = mnsr_node->mmn_next; 5360 free_sr((md_set_record *)mnsr_node->mmn_mnsr); 5361 free_rem_dd(mnsr_node->mmn_dd); 5362 Free(mnsr_node); 5363 mnsr_node = master_mnsr_node; 5364 } 5365 5366 /* Frees sd->sd_drvs (which is also master_dd) */ 5367 metaflushsetname(sp); 5368 return (rval); 5369 } 5370 5371 /* 5372 * meta_mnsync_diskset_mddbs 5373 * Calling node is guaranteed to be an owner node. 5374 * Calling node is the master node. 5375 * 5376 * Master node verifies that ondisk mddb format matches its incore format. 
5377 * If no nodes are joined to set, remove the change log entries. 5378 * If a node is joined to set, play the change log. 5379 * 5380 * Returns 0 - Success 5381 * 1 - Master unable to join to set. 5382 * 205 - Failure during RPC to another node 5383 * -1 - Any other failure and ep is filled in. 5384 * -1 return will eventually cause node to panic 5385 * in a SunCluster environment. 5386 */ 5387 int 5388 meta_mnsync_diskset_mddbs( 5389 mdsetname_t *sp, 5390 md_error_t *ep 5391 ) 5392 { 5393 md_set_desc *sd; 5394 mddb_config_t c; 5395 md_mn_msgclass_t class; 5396 mddb_setflags_config_t sf; 5397 md_mnnode_desc *nd, *nd2; 5398 md_error_t xep = mdnullerror; 5399 int stale_set = 0; 5400 5401 /* If setname is there, set desc should exist. */ 5402 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5403 mde_perror(ep, dgettext(TEXT_DOMAIN, 5404 "Unable to get set %s desc information"), sp->setname); 5405 return (-1); 5406 } 5407 5408 /* Are there drives in the set? */ 5409 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5410 ep) == NULL) { 5411 if (! mdisok(ep)) { 5412 return (-1); 5413 } 5414 /* No drives in set -- nothing to sync up */ 5415 return (0); 5416 } 5417 5418 /* 5419 * Is master node (which is this node) joined to set? 5420 * If master node isn't joined (which means that no nodes 5421 * are joined to diskset), remove the change log entries 5422 * since no need to replay them - all nodes will have same 5423 * view of mddbs since all nodes are reading in the mddbs 5424 * from disk. 5425 * There is also no need to sync up the master and ondisk mddbs 5426 * since master has no incore knowledge. 5427 * Need to join master to set in order to flush the change 5428 * log entries. Don't need to block I/O during join of master 5429 * to set since no other nodes are joined to set and so no I/O 5430 * can be occurring. 
5431 */ 5432 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 5433 /* Join master to set */ 5434 if (clnt_joinset(mynode(), sp, 5435 MNSET_IN_RECONFIG, ep)) { 5436 if (mdismddberror(ep, MDE_DB_STALE)) { 5437 /* 5438 * If STALE, print message and continue on. 5439 * Don't do any writes or reads to mddbs 5440 * so don't clear change log. 5441 */ 5442 mde_perror(ep, dgettext(TEXT_DOMAIN, 5443 "Join of master node to STALE set %s"), 5444 sp->setname); 5445 stale_set = 1; 5446 mdclrerror(ep); 5447 } else if (mdismddberror(ep, MDE_DB_ACCOK)) { 5448 /* ACCOK means mediator provided extra vote */ 5449 mdclrerror(ep); 5450 } else { 5451 /* 5452 * If master is unable to join set, print an 5453 * error message. Don't return failure or node 5454 * will panic during cluster reconfig cycle. 5455 * Also, withdraw node from set in order to 5456 * cleanup from failed join attempt. 5457 */ 5458 mde_perror(ep, dgettext(TEXT_DOMAIN, 5459 "Join of master node in set %s failed"), 5460 sp->setname); 5461 if (clnt_withdrawset(mynode(), sp, &xep)) 5462 mdclrerror(&xep); 5463 return (1); 5464 } 5465 } 5466 /* 5467 * Master node successfully joined. 5468 * Set local copy of flags to OWN and 5469 * send owner flag to rpc.metad. If not stale, 5470 * flush the change log. 
5471 */ 5472 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 5473 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET, 5474 MNSET_IN_RECONFIG, ep)) { 5475 mde_perror(ep, dgettext(TEXT_DOMAIN, 5476 "Flag update of master node join in set %s failed"), 5477 sp->setname); 5478 return (-1); 5479 } 5480 5481 if (!stale_set) { 5482 if (mdmn_reset_changelog(sp, ep, 5483 MDMN_CLF_RESETLOG) != 0) { 5484 mde_perror(ep, dgettext(TEXT_DOMAIN, 5485 "Unable to reset changelog.")); 5486 return (-1); 5487 } 5488 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5489 "Removed changelog entries for set %s: %s"), 5490 sp->setname, 5491 meta_print_hrtime(gethrtime() - start_time)); 5492 } 5493 /* Reset new master flag before return */ 5494 (void) memset(&sf, 0, sizeof (sf)); 5495 sf.sf_setno = sp->setno; 5496 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5497 sf.sf_flags = MDDB_NM_RESET; 5498 /* Use magic to help protect ioctl against attack. */ 5499 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5500 /* Ignore failure, failure to reset flag isn't catastrophic */ 5501 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5502 &sf.sf_mde, NULL); 5503 5504 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5505 "Reset new master flag for set %s: %s"), 5506 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5507 5508 return (0); 5509 } 5510 5511 /* 5512 * Is master already joined to STALE set (< 50% mddbs avail)? 5513 * If so, can make no config changes to mddbs so don't check or play 5514 * changelog and don't sync master node to ondisk mddbs. 5515 * To get out of the stale state all nodes must be withdrawn 5516 * from set. Then as nodes are re-joined, all nodes will 5517 * have same view of mddbs since all nodes are reading the 5518 * mddbs from disk. 
5519 */ 5520 (void) memset(&c, 0, sizeof (c)); 5521 c.c_id = 0; 5522 c.c_setno = sp->setno; 5523 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 5524 (void) mdstealerror(ep, &c.c_mde); 5525 return (-1); 5526 } 5527 if (c.c_flags & MDDB_C_STALE) { 5528 return (0); 5529 } 5530 5531 /* 5532 * If this node is NOT a newly chosen master, then there's 5533 * nothing else to do since the change log should be empty and 5534 * the ondisk and incore mddbs are already consistent. 5535 * 5536 * A newly chosen master is a node that was not the master 5537 * at the beginning of the reconfig cycle. If a node is a new 5538 * master, then the new master state is reset after the ondisk 5539 * and incore mddbs are consistent and the change log has 5540 * been replayed. 5541 */ 5542 (void) memset(&sf, 0, sizeof (sf)); 5543 sf.sf_setno = sp->setno; 5544 sf.sf_flags = MDDB_NM_GET; 5545 /* Use magic to help protect ioctl against attack. */ 5546 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5547 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) && 5548 ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) { 5549 return (0); 5550 } 5551 5552 /* 5553 * Now, sync up incore master view to ondisk mddbs. 5554 * This is needed in the case where a master node 5555 * had made a change to the mddb, but this change 5556 * may not have been relayed to the slaves yet. 5557 * So, the new master needs to verify that the ondisk 5558 * mddbs match what the new master has incore - 5559 * if different, new master rewrites all of the mddbs. 5560 * Then the new master will replay the changelog and the 5561 * new master will then execute what the old master had 5562 * done. 5563 * 5564 * Block all I/Os to disks in this diskset on all nodes in 5565 * the diskset. This will allow the rewriting of the mddbs 5566 * (if needed), to proceed in a timely manner. 5567 * 5568 * If block of I/Os fail, return a -1. 
5569 */ 5570 5571 nd = sd->sd_nodelist; 5572 while (nd) { 5573 /* Skip non-alive and non-owner nodes */ 5574 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5575 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5576 nd = nd->nd_next; 5577 continue; 5578 } 5579 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5580 MN_SUSP_IO, ep)) { 5581 mde_perror(ep, dgettext(TEXT_DOMAIN, 5582 "Unable to suspend I/O on node %s in set %s"), 5583 nd->nd_nodename, sp->setname); 5584 5585 /* 5586 * Resume all other nodes that had been suspended. 5587 * (Reconfig return step also resumes I/Os 5588 * for all sets.) 5589 */ 5590 nd2 = sd->sd_nodelist; 5591 while (nd2) { 5592 /* Stop when reaching failed node */ 5593 if (nd2->nd_nodeid == nd->nd_nodeid) 5594 break; 5595 /* Skip non-alive and non-owner nodes */ 5596 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5597 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5598 nd2 = nd2->nd_next; 5599 continue; 5600 } 5601 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5602 sp->setno, MN_RES_IO, &xep)); 5603 nd2 = nd2->nd_next; 5604 } 5605 5606 /* 5607 * If an RPC failure on another node, return a 205. 5608 * Otherwise, exit with failure. 5609 */ 5610 if ((mdanyrpcerror(ep)) && 5611 (sd->sd_mn_mynode->nd_nodeid != 5612 nd->nd_nodeid)) { 5613 return (205); 5614 } else { 5615 return (-1); 5616 } 5617 5618 } 5619 nd = nd->nd_next; 5620 } 5621 5622 (void) memset(&c, 0, sizeof (c)); 5623 c.c_id = 0; 5624 c.c_setno = sp->setno; 5625 /* Master can't sync up to ondisk mddbs? Kick it out of cluster */ 5626 if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0) 5627 return (-1); 5628 5629 /* 5630 * Resume I/Os that were suspended above. 
5631 */ 5632 nd = sd->sd_nodelist; 5633 while (nd) { 5634 /* Skip non-alive and non-owner nodes */ 5635 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5636 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5637 nd = nd->nd_next; 5638 continue; 5639 } 5640 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5641 MN_RES_IO, ep)) { 5642 mde_perror(ep, dgettext(TEXT_DOMAIN, 5643 "Unable to resume I/O on node %s in set %s"), 5644 nd->nd_nodename, sp->setname); 5645 5646 /* 5647 * If an RPC failure then don't do any 5648 * more RPC calls, since one timeout is enough 5649 * to endure. If RPC failure to another node, return 5650 * 205. If RPC failure to my node, return -1. 5651 * If not an RPC failure, continue resuming the 5652 * rest of the nodes and then return -1. 5653 */ 5654 if (mdanyrpcerror(ep)) { 5655 if (sd->sd_mn_mynode->nd_nodeid == 5656 nd->nd_nodeid) { 5657 return (-1); 5658 } else { 5659 return (205); 5660 } 5661 } 5662 5663 /* 5664 * If not an RPC error, continue resuming rest of 5665 * nodes, ignoring any failures except for an 5666 * RPC failure which constitutes an immediate exit. 5667 * Start in middle of list with failing node. 5668 */ 5669 nd2 = nd->nd_next; 5670 while (nd2) { 5671 /* Skip non-alive and non-owner nodes */ 5672 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5673 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5674 nd2 = nd2->nd_next; 5675 continue; 5676 } 5677 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5678 sp->setno, MN_RES_IO, &xep)); 5679 if (mdanyrpcerror(&xep)) { 5680 return (-1); 5681 } 5682 nd2 = nd2->nd_next; 5683 } 5684 } 5685 nd = nd->nd_next; 5686 } 5687 5688 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed " 5689 "checking/writing the mddb for set %s: %s"), sp->setname, 5690 meta_print_hrtime(gethrtime() - start_time)); 5691 5692 /* 5693 * Send (aka replay) all messages we find in the changelog. 
5694 * Flag the messages with 5695 * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them 5696 * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd. 5697 */ 5698 for (class = MD_MN_NCLASSES - 1; class > 0; class--) { 5699 mdmn_changelog_record_t *lr; 5700 md_error_t xep = mdnullerror; 5701 md_mn_result_t *resultp = NULL; 5702 int ret; 5703 5704 lr = mdmn_get_changelogrec(sp->setno, class); 5705 if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) { 5706 /* no entry for this class */ 5707 continue; 5708 } 5709 5710 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5711 "replaying message ID=(%d, 0x%llx-%d)\n"), 5712 MSGID_ELEMS(lr->lr_msg.msg_msgid)); 5713 5714 ret = mdmn_send_message_with_msgid( 5715 lr->lr_msg.msg_setno, 5716 lr->lr_msg.msg_type, 5717 lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG | 5718 MD_MSGF_OVERRIDE_SUSPEND, 5719 lr->lr_msg.msg_event_data, 5720 lr->lr_msg.msg_event_size, 5721 &resultp, 5722 &lr->lr_msg.msg_msgid, 5723 &xep); 5724 5725 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5726 "mdmn_send_message returned %d\n"), ret); 5727 5728 if (resultp) 5729 free_result(resultp); 5730 } 5731 5732 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5733 "Playing changelog completed for set %s: %s"), 5734 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5735 5736 /* 5737 * Now that new master has ondisk and incore mddbs in sync, reset 5738 * this node's new master kernel flag (for this set). If this node 5739 * re-enters another reconfig cycle before the completion of this 5740 * reconfig cycle, this master node won't need to check if the ondisk 5741 * and incore mddbs are in sync since this node won't be considered 5742 * a new master (since this flag is being reset here in the middle of 5743 * step2). This will save time during any subsequent reconfig 5744 * cycles as long as this node continues to be master. 
5745 */ 5746 (void) memset(&sf, 0, sizeof (sf)); 5747 sf.sf_setno = sp->setno; 5748 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5749 sf.sf_flags = MDDB_NM_RESET; 5750 /* Use magic to help protect ioctl against attack. */ 5751 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5752 /* Ignore failure, since failure to reset flag isn't catastrophic */ 5753 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL); 5754 5755 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5756 "Reset new master flag for set %s: %s"), 5757 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5758 5759 return (0); 5760 } 5761 5762 /* 5763 * meta_mnjoin_all will join all starting nodes in the diskset. 5764 * A starting node is considered to be any node that is not 5765 * an owner of the set but is a member of the cluster. 5766 * Master node is already joined to set (done in meta_mnsync_diskset_mddbs). 5767 * 5768 * Caller is the Master node. 5769 * 5770 * Returns 0 - Success 5771 * 205 - Failure during RPC to another node 5772 * -1 - Any other failure and ep is filled in. 5773 */ 5774 int 5775 meta_mnjoin_all( 5776 mdsetname_t *sp, 5777 md_error_t *ep 5778 ) 5779 { 5780 md_set_desc *sd; 5781 md_mnnode_desc *nd, *nd2; 5782 int rval = 0; 5783 int stale_flag = 0; 5784 mddb_config_t c; 5785 int susp_res_flag = 0; 5786 md_error_t xep = mdnullerror; 5787 5788 /* If setname is there, set desc should exist. */ 5789 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5790 mde_perror(ep, dgettext(TEXT_DOMAIN, 5791 "Unable to get set %s desc information"), sp->setname); 5792 return (-1); 5793 } 5794 5795 /* Are there drives in the set? */ 5796 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5797 ep) == NULL) { 5798 if (! mdisok(ep)) { 5799 return (-1); 5800 } 5801 /* No drives in set -- nothing to join */ 5802 return (0); 5803 } 5804 5805 /* 5806 * Is set currently stale? 
5807 */ 5808 (void) memset(&c, 0, sizeof (c)); 5809 c.c_id = 0; 5810 c.c_setno = sp->setno; 5811 /* Ignore failure since master node may not be joined yet */ 5812 (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL); 5813 if (c.c_flags & MDDB_C_STALE) { 5814 stale_flag = MNSET_IS_STALE; 5815 } 5816 5817 /* 5818 * If any nodes are going to be joined to diskset, then 5819 * suspend I/O to all disks in diskset so that nodes can join 5820 * (read in mddbs) in a reasonable amount of time even under 5821 * high I/O load. Don't need to do this if set is STALE since 5822 * no I/O can be occurring to a STALE set. 5823 */ 5824 if (stale_flag != MNSET_IS_STALE) { 5825 nd = sd->sd_nodelist; 5826 while (nd) { 5827 /* Found a node that will be joined to diskset */ 5828 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5829 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5830 /* Set flag that diskset should be suspended */ 5831 susp_res_flag = 1; 5832 break; 5833 } 5834 nd = nd->nd_next; 5835 } 5836 } 5837 5838 if (susp_res_flag) { 5839 /* 5840 * Block all I/Os to disks in this diskset on all joined 5841 * nodes in the diskset. 5842 * If block of I/Os fails due to an RPC failure on another 5843 * node, return 205; otherwise, return -1. 5844 */ 5845 nd = sd->sd_nodelist; 5846 while (nd) { 5847 /* Skip non-alive and non-owner nodes */ 5848 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5849 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5850 nd = nd->nd_next; 5851 continue; 5852 } 5853 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5854 MN_SUSP_IO, ep)) { 5855 mde_perror(ep, dgettext(TEXT_DOMAIN, 5856 "Unable to suspend I/O on node %s" 5857 " in set %s"), nd->nd_nodename, 5858 sp->setname); 5859 /* 5860 * Resume other nodes that had been suspended. 5861 * (Reconfig return step also resumes I/Os 5862 * for all sets.) 
5863 */ 5864 nd2 = sd->sd_nodelist; 5865 while (nd2) { 5866 /* Stop when reaching failed node */ 5867 if (nd2->nd_nodeid == nd->nd_nodeid) 5868 break; 5869 /* Skip non-alive/non-owner nodes */ 5870 if ((!(nd2->nd_flags & 5871 MD_MN_NODE_ALIVE)) || 5872 (!(nd2->nd_flags & 5873 MD_MN_NODE_OWN))) { 5874 nd2 = nd2->nd_next; 5875 continue; 5876 } 5877 (void) (clnt_mn_susp_res_io( 5878 nd2->nd_nodename, sp->setno, 5879 MN_RES_IO, &xep)); 5880 nd2 = nd2->nd_next; 5881 } 5882 5883 /* 5884 * If the suspend failed due to an 5885 * RPC failure on another node, return 5886 * a 205. 5887 * Otherwise, exit with failure. 5888 * The return reconfig step will resume 5889 * I/Os for all disksets. 5890 */ 5891 if ((mdanyrpcerror(ep)) && 5892 (sd->sd_mn_mynode->nd_nodeid != 5893 nd->nd_nodeid)) { 5894 return (205); 5895 } else { 5896 return (-1); 5897 } 5898 } 5899 nd = nd->nd_next; 5900 } 5901 } 5902 5903 nd = sd->sd_nodelist; 5904 while (nd) { 5905 /* 5906 * If a node is in the membership list but isn't joined 5907 * to the set, try to join the node. 5908 */ 5909 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5910 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5911 if (clnt_joinset(nd->nd_nodename, sp, 5912 (MNSET_IN_RECONFIG | stale_flag), ep)) { 5913 /* 5914 * If RPC failure to another node 5915 * then exit without attempting anything else. 5916 * (Reconfig return step will resume I/Os 5917 * for all sets.) 5918 */ 5919 if (mdanyrpcerror(ep)) { 5920 mde_perror(ep, ""); 5921 return (205); 5922 } 5923 /* 5924 * STALE and ACCOK failures aren't true 5925 * failures. STALE means that <50% mddbs 5926 * are available. ACCOK means that the 5927 * mediator provided the extra vote. 5928 * If a true failure, then print messasge 5929 * and withdraw node from set in order to 5930 * cleanup from failed join attempt. 
5931 */ 5932 if ((!mdismddberror(ep, MDE_DB_STALE)) && 5933 (!mdismddberror(ep, MDE_DB_ACCOK))) { 5934 mde_perror(ep, 5935 "WARNING: Unable to join node %s " 5936 "to set %s", nd->nd_nodename, 5937 sp->setname); 5938 mdclrerror(ep); 5939 if (clnt_withdrawset(nd->nd_nodename, 5940 sp, &xep)) 5941 mdclrerror(&xep); 5942 nd = nd->nd_next; 5943 continue; 5944 } 5945 } 5946 /* Set owner flag even if STALE or ACCOK */ 5947 nd->nd_flags |= MD_MN_NODE_OWN; 5948 } 5949 nd = nd->nd_next; 5950 } 5951 /* 5952 * Resume I/Os if suspended above. 5953 */ 5954 if (susp_res_flag) { 5955 nd = sd->sd_nodelist; 5956 while (nd) { 5957 /* 5958 * Skip non-alive and non-owner nodes 5959 * (this list doesn't include any of 5960 * the nodes that were joined). 5961 */ 5962 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5963 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5964 nd = nd->nd_next; 5965 continue; 5966 } 5967 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5968 MN_RES_IO, ep)) { 5969 mde_perror(ep, dgettext(TEXT_DOMAIN, 5970 "Unable to resume I/O on node %s" 5971 " in set %s"), nd->nd_nodename, 5972 sp->setname); 5973 5974 /* 5975 * If an RPC failure then don't do any 5976 * more RPC calls, since one timeout is enough 5977 * to endure. If RPC failure to another node, 5978 * return 205. If RPC failure to my node, 5979 * return -1. 5980 * (Reconfig return step will resume I/Os 5981 * for all sets.) 5982 * If not an RPC failure, continue resuming the 5983 * rest of the nodes and then return -1. 5984 */ 5985 if (mdanyrpcerror(ep)) { 5986 if (sd->sd_mn_mynode->nd_nodeid == 5987 nd->nd_nodeid) { 5988 return (-1); 5989 } else { 5990 return (205); 5991 } 5992 } 5993 5994 /* 5995 * If not an RPC error, continue resuming rest 5996 * of nodes, ignoring any failures except for 5997 * an RPC failure which constitutes an 5998 * immediate exit. 5999 * Start in middle of list with failing node. 
6000 */ 6001 nd2 = nd->nd_next; 6002 while (nd2) { 6003 /* Skip non-owner nodes */ 6004 if ((!(nd2->nd_flags & 6005 MD_MN_NODE_ALIVE)) || 6006 (!(nd2->nd_flags & 6007 MD_MN_NODE_OWN))) { 6008 nd2 = nd2->nd_next; 6009 continue; 6010 } 6011 (void) (clnt_mn_susp_res_io( 6012 nd2->nd_nodename, sp->setno, 6013 MN_RES_IO, &xep)); 6014 if (mdanyrpcerror(&xep)) { 6015 return (-1); 6016 } 6017 nd2 = nd2->nd_next; 6018 } 6019 } 6020 nd = nd->nd_next; 6021 } 6022 } 6023 6024 nd = sd->sd_nodelist; 6025 while (nd) { 6026 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 6027 nd = nd->nd_next; 6028 continue; 6029 } 6030 /* 6031 * If 1 node fails - go ahead and update the rest except 6032 * in the case of an RPC failure, fail immediately. 6033 */ 6034 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 6035 sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 6036 /* RPC failure to another node */ 6037 if (mdanyrpcerror(ep)) { 6038 return (205); 6039 } 6040 nd = nd->nd_next; 6041 rval = -1; 6042 continue; 6043 } 6044 nd = nd->nd_next; 6045 } 6046 6047 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 6048 "Join of all nodes completed for set %s: %s"), 6049 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 6050 6051 return (rval); 6052 } 6053