1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Just in case we're not in a build environment, make sure that 30 * TEXT_DOMAIN gets set to something. 
31 */ 32 #if !defined(TEXT_DOMAIN) 33 #define TEXT_DOMAIN "SYS_TEST" 34 #endif 35 36 /* 37 * Metadevice diskset interfaces 38 */ 39 40 #include "meta_set_prv.h" 41 #include <meta.h> 42 #include <metad.h> 43 #include <mdmn_changelog.h> 44 #include <sys/lvm/md_crc.h> 45 #include <sys/utsname.h> 46 #include <sdssc.h> 47 48 #include <sys/sysevent/eventdefs.h> 49 #include <sys/sysevent/svm.h> 50 extern char *blkname(char *); 51 52 static md_drive_desc * 53 dr2drivedesc( 54 mdsetname_t *sp, 55 side_t sideno, 56 int flags, 57 md_error_t *ep 58 ) 59 { 60 md_set_record *sr; 61 md_drive_record *dr; 62 mddrivename_t *dnp; 63 md_drive_desc *dd_head = NULL; 64 md_set_desc *sd; 65 66 if (flags & MD_BYPASS_DAEMON) { 67 if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL) 68 return (NULL); 69 sd = metaget_setdesc(sp, ep); 70 sideno = getnodeside(mynode(), sd); 71 sp = metafakesetname(sp->setno, sr->sr_setname); 72 } else { 73 if ((sr = getsetbyname(sp->setname, ep)) == NULL) 74 return (NULL); 75 } 76 77 assert(sideno != MD_SIDEWILD); 78 79 /* 80 * WARNING: 81 * The act of getting the dnp from the namespace means that we 82 * will get the devid of the disk as recorded in the namespace. 83 * This devid has the potential to be stale if the disk is being 84 * replaced via a rebind, this means that any code that relies 85 * on any of the dnp information should take the appropriate action 86 * to preserve that information. For example in the rebind code the 87 * devid of the new disk is saved off and then copied back in once 88 * the code that has called this function has completed. 
89 */ 90 for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) { 91 if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key, 92 flags, ep)) == NULL) { 93 if (!(flags & MD_BYPASS_DAEMON)) 94 free_sr(sr); 95 metafreedrivedesc(&dd_head); 96 return (NULL); 97 } 98 99 (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt, 100 dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags); 101 } 102 103 if (!(flags & MD_BYPASS_DAEMON)) { 104 free_sr(sr); 105 } 106 return (dd_head); 107 } 108 109 static int 110 get_sidenmlist( 111 mdsetname_t *sp, 112 mddrivename_t *dnp, 113 md_error_t *ep 114 ) 115 { 116 md_set_desc *sd; 117 mdsidenames_t *sn, **sn_next; 118 int i; 119 120 if ((sd = metaget_setdesc(sp, ep)) == NULL) 121 return (-1); 122 123 metaflushsidenames(dnp); 124 sn_next = &dnp->side_names; 125 if (MD_MNSET_DESC(sd)) { 126 /* 127 * Only get sidenames for this node since 128 * that is the only side information stored in 129 * the local mddb for a multi-node diskset. 130 */ 131 if (sd->sd_mn_mynode) { 132 sn = Zalloc(sizeof (*sn)); 133 sn->sideno = sd->sd_mn_mynode->nd_nodeid; 134 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 135 sn->sideno, dnp->side_names_key, &sn->dname, 136 &sn->mnum, NULL, ep)) == NULL) { 137 if (sn->dname != NULL) 138 Free(sn->dname); 139 Free(sn); 140 return (-1); 141 } 142 143 /* Add to the end of the linked list */ 144 assert(*sn_next == NULL); 145 *sn_next = sn; 146 sn_next = &sn->next; 147 } 148 } else { 149 for (i = 0; i < MD_MAXSIDES; i++) { 150 /* Skip empty slots */ 151 if (sd->sd_nodes[i][0] == '\0') 152 continue; 153 154 sn = Zalloc(sizeof (*sn)); 155 sn->sideno = i; 156 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 157 i+SKEW, dnp->side_names_key, &sn->dname, 158 &sn->mnum, NULL, ep)) == NULL) { 159 /* 160 * It is possible that during the add of a 161 * host to have a 'missing' side as the side 162 * for this disk will be added later. So ignore 163 * the error. 
The 'missing' side will be added 164 * once the addhosts process has completed. 165 */ 166 if (mdissyserror(ep, ENOENT)) { 167 mdclrerror(ep); 168 Free(sn); 169 continue; 170 } 171 172 if (sn->dname != NULL) 173 Free(sn->dname); 174 Free(sn); 175 return (-1); 176 } 177 178 /* Add to the end of the linked list */ 179 assert(*sn_next == NULL); 180 *sn_next = sn; 181 sn_next = &sn->next; 182 } 183 } 184 185 return (0); 186 } 187 188 static md_drive_desc * 189 rl_to_dd( 190 mdsetname_t *sp, 191 md_replicalist_t *rlp, 192 md_error_t *ep 193 ) 194 { 195 md_replicalist_t *rl; 196 md_replica_t *r; 197 md_drive_desc *dd = NULL; 198 md_drive_desc *d; 199 int found; 200 md_set_desc *sd; 201 daddr_t nblks = 0; 202 203 if ((sd = metaget_setdesc(sp, ep)) == NULL) 204 return (NULL); 205 206 /* find the smallest existing replica */ 207 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 208 r = rl->rl_repp; 209 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 210 } 211 212 if (nblks <= 0) 213 nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 214 215 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 216 r = rl->rl_repp; 217 218 found = 0; 219 for (d = dd; d != NULL; d = d->dd_next) { 220 if (strcmp(r->r_namep->drivenamep->cname, 221 d->dd_dnp->cname) == 0) { 222 found = 1; 223 dd->dd_dbcnt++; 224 break; 225 } 226 } 227 228 if (! 
found) 229 (void) metadrivedesc_append(&dd, r->r_namep->drivenamep, 230 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK); 231 } 232 233 return (dd); 234 } 235 236 /* 237 * Exported Entry Points 238 */ 239 240 set_t 241 get_max_sets(md_error_t *ep) 242 { 243 244 static set_t max_sets = 0; 245 246 if (max_sets == 0) 247 if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0) 248 return (0); 249 250 return (max_sets); 251 } 252 253 int 254 get_max_meds(md_error_t *ep) 255 { 256 static int max_meds = 0; 257 258 if (max_meds == 0) 259 if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0) 260 return (0); 261 262 return (max_meds); 263 } 264 265 side_t 266 getmyside(mdsetname_t *sp, md_error_t *ep) 267 { 268 md_set_desc *sd; 269 char *node = NULL; 270 side_t sideno; 271 272 if (sp->setno == 0) 273 return (0); 274 275 if ((sd = metaget_setdesc(sp, ep)) == NULL) 276 return (MD_SIDEWILD); 277 278 node = mynode(); 279 280 assert(node != NULL); 281 282 sideno = getnodeside(node, sd); 283 284 if (sideno != MD_SIDEWILD) 285 return (sideno); 286 287 return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node)); 288 } 289 290 /* 291 * get set info from name 292 */ 293 md_set_record * 294 getsetbyname(char *setname, md_error_t *ep) 295 { 296 md_set_record *sr = NULL; 297 md_mnset_record *mnsr = NULL; 298 char *p; 299 size_t len; 300 301 /* get set info from daemon */ 302 if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1) 303 return (NULL); 304 if (sr != NULL) { 305 /* 306 * Returned record could be for a multi-node set or a 307 * non-multi-node set. 308 */ 309 if (MD_MNSET_REC(sr)) { 310 /* 311 * Record is for a multi-node set. Reissue call 312 * to get mnset information. Need to free 313 * record as if a non-multi-node set record since 314 * that is what clnt_getset gave us. If in 315 * the daemon, don't free since this is a pointer 316 * into the setrecords array. 317 */ 318 if (! 
md_in_daemon) { 319 sr->sr_flags &= ~MD_SR_MN; 320 free_sr(sr); 321 } 322 if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr, 323 ep) == -1) 324 return (NULL); 325 if (mnsr != NULL) 326 return ((struct md_set_record *)mnsr); 327 } else { 328 return (sr); 329 } 330 } 331 332 /* no such set */ 333 len = strlen(setname) + 30; 334 p = Malloc(len); 335 (void) snprintf(p, len, "setname \"%s\"", setname); 336 (void) mderror(ep, MDE_NO_SET, p); 337 Free(p); 338 return (NULL); 339 } 340 341 /* 342 * get set info from number 343 */ 344 md_set_record * 345 getsetbynum(set_t setno, md_error_t *ep) 346 { 347 md_set_record *sr; 348 md_mnset_record *mnsr = NULL; 349 char buf[100]; 350 351 if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1) 352 return (NULL); 353 354 if (sr != NULL) { 355 /* 356 * Record is for a multi-node set. Reissue call 357 * to get mnset information. Need to free 358 * record as if a non-multi-node set record since 359 * that is what clnt_getset gave us. If in 360 * the daemon, don't free since this is a pointer 361 * into the setrecords array. 362 */ 363 if (MD_MNSET_REC(sr)) { 364 /* 365 * Record is for a multi-node set. Reissue call 366 * to get mnset information. 367 */ 368 if (! 
md_in_daemon) { 369 sr->sr_flags &= ~MD_SR_MN; 370 free_sr(sr); 371 } 372 if (clnt_mngetset(mynode(), NULL, setno, &mnsr, 373 ep) == -1) 374 return (NULL); 375 if (mnsr != NULL) 376 return ((struct md_set_record *)mnsr); 377 } else { 378 return (sr); 379 } 380 } 381 382 (void) sprintf(buf, "setno %u", setno); 383 (void) mderror(ep, MDE_NO_SET, buf); 384 return (NULL); 385 } 386 387 int 388 meta_check_drive_inuse( 389 mdsetname_t *sp, 390 mddrivename_t *dnp, 391 int check_db, 392 md_error_t *ep 393 ) 394 { 395 mdnamelist_t *nlp = NULL; 396 mdnamelist_t *p; 397 int rval = 0; 398 399 /* get all underlying partitions */ 400 if (meta_getalldevs(sp, &nlp, check_db, ep) != 0) 401 return (-1); 402 403 /* search for drive */ 404 for (p = nlp; (p != NULL); p = p->next) { 405 mdname_t *np = p->namep; 406 407 if (strcmp(dnp->cname, np->drivenamep->cname) == 0) { 408 rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno, 409 NULL, dnp->cname, sp->setname)); 410 break; 411 } 412 } 413 414 /* cleanup, return success */ 415 metafreenamelist(nlp); 416 return (rval); 417 } 418 419 /* 420 * simple check for ownership 421 */ 422 int 423 meta_check_ownership(mdsetname_t *sp, md_error_t *ep) 424 { 425 int ownset; 426 md_set_desc *sd; 427 md_drive_desc *dd; 428 md_replicalist_t *rlp = NULL; 429 md_error_t xep = mdnullerror; 430 431 if (metaislocalset(sp)) 432 return (0); 433 434 ownset = own_set(sp, NULL, TRUE, ep); 435 if (! mdisok(ep)) 436 return (-1); 437 438 if ((sd = metaget_setdesc(sp, ep)) == NULL) 439 return (-1); 440 441 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 442 if (! 
mdisok(ep)) 443 return (-1); 444 445 /* If we have no drive descriptors, check for no ownership */ 446 if (dd == NULL) { 447 if (ownset == MD_SETOWNER_NONE) 448 return (0); 449 450 /* If ownership somehow has come to exist, we must clean up */ 451 452 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, 453 &xep) < 0) 454 mdclrerror(&xep); 455 456 if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL) 457 if (! mdisok(&xep)) 458 mdclrerror(&xep); 459 460 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { 461 if (rel_own_bydd(sp, dd, TRUE, &xep)) 462 mdclrerror(&xep); 463 } 464 465 if (halt_set(sp, &xep)) 466 mdclrerror(&xep); 467 468 metafreereplicalist(rlp); 469 470 metafreedrivedesc(&dd); 471 472 return (0); 473 } 474 475 metafreedrivedesc(&sd->sd_drvs); 476 477 if (ownset == MD_SETOWNER_YES) 478 return (0); 479 480 return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL, 481 sp->setname)); 482 } 483 484 /* 485 * simple check for ownership 486 */ 487 int 488 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep) 489 { 490 md_set_desc *sd; 491 md_drive_desc *dd; 492 int bool; 493 494 if (metaislocalset(sp)) 495 return (0); 496 497 if ((sd = metaget_setdesc(sp, ep)) == NULL) 498 return (-1); 499 500 if (getnodeside(hostname, sd) == MD_SIDEWILD) 501 return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 502 hostname, NULL, sp->setname)); 503 504 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 505 if (! mdisok(ep)) 506 return (-1); 507 508 if (clnt_ownset(hostname, sp, &bool, ep) == -1) 509 return (-1); 510 511 if (dd == NULL) 512 return (0); 513 514 metafreedrivedesc(&sd->sd_drvs); 515 516 if (bool == TRUE) 517 return (0); 518 519 return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL, 520 sp->setname)); 521 } 522 523 /* 524 * Function that determines if a node is in the multinode diskset 525 * membership list. Calling node passes in node to be checked and 526 * the nodelist as returned from meta_read_nodelist. 
This routine 527 * anticipates being called many times using the same diskset membership 528 * list which is why the alloc and free of the diskset membership list 529 * is left to the calling routine. 530 * Returns: 531 * 1 - if a member 532 * 0 - not a member 533 */ 534 int 535 meta_is_member( 536 char *node_name, 537 md_mn_nodeid_t node_id, 538 mndiskset_membershiplist_t *nl 539 ) 540 { 541 mndiskset_membershiplist_t *nl2; 542 int flag_check_name; 543 544 if (node_id != 0) 545 flag_check_name = 0; 546 else if (node_name != NULL) 547 flag_check_name = 1; 548 else 549 return (0); 550 551 nl2 = nl; 552 while (nl2) { 553 if (flag_check_name) { 554 /* Compare given name against name in member list */ 555 if (strcmp(nl2->msl_node_name, node_name) == 0) 556 break; 557 } else { 558 /* Compare given nodeid against nodeid in member list */ 559 if (nl2->msl_node_id == node_id) 560 break; 561 } 562 nl2 = nl2->next; 563 } 564 /* No match found in member list */ 565 if (nl2 == NULL) { 566 return (0); 567 } 568 /* Return 1 if node is in member list */ 569 return (1); 570 } 571 572 /* 573 * meta_getnext_devinfo should go to the host that 574 * has the device, to return the device name, driver name, minor num. 575 * We can take the big cheat for now, since it is a requirement 576 * that the device names and device numbers are the same, and 577 * just get the info locally. 578 * 579 * This routine is very similar to meta_getnextside_devinfo except 580 * that the specific side to be used is being passed in. 
581 * 582 * Exit status: 583 * 0 - No more side info to return 584 * 1 - More side info's to return 585 * -1 - An error has been detected 586 */ 587 /*ARGSUSED*/ 588 int 589 meta_getside_devinfo( 590 mdsetname_t *sp, /* for this set */ 591 char *bname, /* local block name (myside) */ 592 side_t sideno, /* sideno */ 593 char **ret_bname, /* block device name of returned side */ 594 char **ret_dname, /* driver name of returned side */ 595 minor_t *ret_mnum, /* minor number of returned side */ 596 md_error_t *ep 597 ) 598 { 599 mdname_t *np; 600 601 if (ret_bname != NULL) 602 *ret_bname = NULL; 603 if (ret_dname != NULL) 604 *ret_dname = NULL; 605 if (ret_mnum != NULL) 606 *ret_mnum = NODEV32; 607 608 609 if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL) 610 return (-1); 611 612 /* 613 * NOTE (future) - There will be more work here once devids are integrated 614 * into disksets. Then the side should be used to find the correct 615 * host and the b/d names should be gotten from that host. 616 */ 617 618 /* 619 * Return the side info. 620 */ 621 if (ret_bname != NULL) 622 *ret_bname = Strdup(np->bname); 623 624 if (ret_dname != NULL) { 625 mdcinfo_t *cinfo; 626 627 if ((cinfo = metagetcinfo(np, ep)) == NULL) 628 return (-1); 629 630 *ret_dname = Strdup(cinfo->dname); 631 } 632 633 if (ret_mnum != NULL) 634 *ret_mnum = meta_getminor(np->dev); 635 636 return (1); 637 } 638 639 /* 640 * Get the information on the device from the remote node using the devid 641 * of the disk. 
642 * 643 * Exit status: 644 * 0 - No more side info to return 645 * 1 - More side info's to return 646 * -1 - An error has been detected 647 */ 648 int 649 meta_getnextside_devinfo( 650 mdsetname_t *sp, /* for this set */ 651 char *bname, /* local block name (myside) */ 652 side_t *sideno, /* previous sideno & returned sideno */ 653 char **ret_bname, /* block device name of returned side */ 654 char **ret_dname, /* driver name of returned side */ 655 minor_t *ret_mnum, /* minor number of returned side */ 656 md_error_t *ep 657 ) 658 { 659 md_set_desc *sd; 660 int i; 661 mdname_t *np; 662 mddrivename_t *dnp; 663 char *devidstr = NULL; 664 int devidstrlen; 665 md_dev64_t retdev = NODEV64; 666 char *ret_devname = NULL; 667 char *ret_blkdevname = NULL; 668 char *ret_driver = NULL; 669 char *nodename; 670 int fd; 671 int ret = -1; 672 char *minor_name = NULL; 673 md_mnnode_desc *nd; 674 675 676 if (ret_bname != NULL) 677 *ret_bname = NULL; 678 if (ret_dname != NULL) 679 *ret_dname = NULL; 680 if (ret_mnum != NULL) 681 *ret_mnum = NODEV32; 682 683 if (metaislocalset(sp)) { 684 /* no more sides - we are done */ 685 if (*sideno != MD_SIDEWILD) 686 return (0); 687 688 /* First time through - set up return sideno */ 689 *sideno = 0; 690 } else { 691 692 /* 693 * Find the next sideno, starting after the one given. 694 */ 695 if ((sd = metaget_setdesc(sp, ep)) == NULL) 696 return (-1); 697 698 if (MD_MNSET_DESC(sd)) { 699 nd = sd->sd_nodelist; 700 if ((*sideno == MD_SIDEWILD) && 701 (nd != (struct md_mnnode_desc *)NULL)) { 702 *sideno = nd->nd_nodeid; 703 } else { 704 while (nd) { 705 /* 706 * Found given sideno, now find 707 * next sideno, if there is one. 
708 */ 709 if ((*sideno == nd->nd_nodeid) && 710 (nd->nd_next != 711 (struct md_mnnode_desc *)NULL)) { 712 *sideno = 713 nd->nd_next->nd_nodeid; 714 break; 715 } 716 nd = nd->nd_next; 717 } 718 if (nd == NULL) { 719 return (0); 720 } 721 } 722 if (*sideno == MD_SIDEWILD) 723 return (0); 724 } else { 725 for (i = (*sideno)+1; i < MD_MAXSIDES; i++) 726 /* Find next full slot */ 727 if (sd->sd_nodes[i][0] != '\0') 728 break; 729 730 /* No more sides - we are done */ 731 if (i == MD_MAXSIDES) 732 return (0); 733 734 /* Set up the return sideno */ 735 *sideno = i; 736 nodename = (char *)sd->sd_nodes[i]; 737 } 738 } 739 740 /* 741 * Need to pass the node the devid of the disk and get it to 742 * send back the details of the disk from that side. 743 */ 744 if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL) 745 return (-1); 746 747 dnp = np->drivenamep; 748 749 /* 750 * By default, set up the parameters so that they are copied out. 751 */ 752 if (ret_bname != NULL) 753 *ret_bname = Strdup(np->bname); 754 755 if (ret_dname != NULL) { 756 mdcinfo_t *cinfo; 757 758 if ((cinfo = metagetcinfo(np, ep)) == NULL) 759 return (-1); 760 761 *ret_dname = Strdup(cinfo->dname); 762 } 763 764 if (ret_mnum != NULL) 765 *ret_mnum = meta_getminor(np->dev); 766 767 /* 768 * Try some optimization. If this is the local set or the device 769 * is a metadevice then just copy the information. If the device 770 * does not have a devid (due to not having a minor name) then 771 * fall back to the pre-devid behaviour of copying the information 772 * on the device: this is okay because the sanity checks before this 773 * call would have found any issues with the device. If it's a 774 * multi-node diskset also just return ie. copy. 775 */ 776 if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) || 777 (MD_MNSET_DESC(sd))) 778 return (1); 779 780 if (np->minor_name == (char *)NULL) { 781 /* 782 * Have to get the minor name then. 
The slice should exist 783 * on the disk because it will have already been repartitioned 784 * up prior to getting to this point. 785 */ 786 if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) { 787 (void) mdsyserror(ep, errno, np->bname); 788 return (-1); 789 } 790 (void) devid_get_minor_name(fd, &minor_name); 791 np->minor_name = Strdup(minor_name); 792 devid_str_free(minor_name); 793 (void) close(fd); 794 } 795 796 /* allocate extra space for "/" and NULL hence +2 */ 797 devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2; 798 devidstr = (char *)Malloc(devidstrlen); 799 800 /* 801 * As a minor name is supplied then the ret_devname will be 802 * appropriate to that minor_name and in this case it will be 803 * a block device ie /dev/dsk. 804 */ 805 (void) snprintf(devidstr, devidstrlen, 806 "%s/%s", dnp->devid, np->minor_name); 807 808 ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev, 809 np->bname, &ret_devname, &ret_driver, ep); 810 811 Free(devidstr); 812 813 /* 814 * If the other side is not running device id in disksets, 815 * 'ret' is set to ENOTSUP in which case we fallback to 816 * the existing behaviour 817 */ 818 if (ret == ENOTSUP) 819 return (1); 820 else if (ret == -1) 821 return (-1); 822 823 /* 824 * ret_devname comes from the rpc call and is a 825 * raw device name. We need to make this into a 826 * block device via blkname for further processing. 827 * Unfortunately, when our device id isn't found in 828 * the system, the rpc call will return a " " in 829 * ret_devname in which case we need to fill that in 830 * as ret_blkname because blkname of " " returns NULL. 
831 */ 832 if (ret_bname != NULL && ret_devname != NULL) { 833 ret_blkdevname = blkname(ret_devname); 834 if (ret_blkdevname == NULL) 835 *ret_bname = Strdup(ret_devname); 836 else 837 *ret_bname = Strdup(ret_blkdevname); 838 } 839 840 if (ret_dname != NULL && ret_driver != NULL) 841 *ret_dname = Strdup(ret_driver); 842 843 if (ret_mnum != NULL) 844 *ret_mnum = meta_getminor(retdev); 845 846 return (1); 847 } 848 849 int 850 meta_is_drive_in_anyset( 851 mddrivename_t *dnp, 852 mdsetname_t **spp, 853 int bypass_daemon, 854 md_error_t *ep 855 ) 856 { 857 set_t setno; 858 mdsetname_t *this_sp; 859 int is_it; 860 set_t max_sets; 861 862 if ((max_sets = get_max_sets(ep)) == 0) 863 return (-1); 864 865 assert(spp != NULL); 866 *spp = NULL; 867 868 for (setno = 1; setno < max_sets; setno++) { 869 if (!bypass_daemon) { 870 if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 871 if (mdismddberror(ep, MDE_DB_NODB)) { 872 mdclrerror(ep); 873 return (0); 874 } 875 if (mdiserror(ep, MDE_NO_SET)) { 876 mdclrerror(ep); 877 continue; 878 } 879 return (-1); 880 } 881 } else 882 this_sp = metafakesetname(setno, NULL); 883 884 if ((is_it = meta_is_drive_in_thisset(this_sp, dnp, 885 bypass_daemon, ep)) == -1) { 886 if (mdiserror(ep, MDE_NO_SET)) { 887 mdclrerror(ep); 888 continue; 889 } 890 return (-1); 891 } 892 if (is_it) { 893 *spp = this_sp; 894 return (0); 895 } 896 } 897 return (0); 898 } 899 900 int 901 meta_is_drive_in_thisset( 902 mdsetname_t *sp, 903 mddrivename_t *dnp, 904 int bypass_daemon, 905 md_error_t *ep 906 ) 907 { 908 md_drive_desc *dd, *p; 909 910 if (bypass_daemon) 911 dd = dr2drivedesc(sp, MD_SIDEWILD, 912 (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep); 913 else 914 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 915 916 if (dd == NULL) { 917 if (! 
mdisok(ep)) 918 return (-1); 919 return (0); 920 } 921 922 923 for (p = dd; p != NULL; p = p->dd_next) 924 if (strcmp(p->dd_dnp->cname, dnp->cname) == 0) 925 return (1); 926 return (0); 927 } 928 929 /* 930 * Check to see if devid is in use in any diskset. 931 * This is used in the case when a partial diskset is being imported 932 * to make sure that the unvailable drive isn't already in use in an 933 * already imported partial diskset. Can't check on the cname since the 934 * unavailable disk's cname is from the previous system and may collide 935 * with a cname on this system. 936 * Return values: 937 * 1: devid has been found in a diskset 938 * 0: devid not found in any diskset 939 */ 940 int 941 meta_is_devid_in_anyset( 942 void *devid, 943 mdsetname_t **spp, 944 md_error_t *ep 945 ) 946 { 947 set_t setno; 948 mdsetname_t *this_sp; 949 int is_it; 950 set_t max_sets; 951 952 if ((max_sets = get_max_sets(ep)) == 0) 953 return (-1); 954 955 assert(spp != NULL); 956 *spp = NULL; 957 958 for (setno = 1; setno < max_sets; setno++) { 959 if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 960 if (mdismddberror(ep, MDE_DB_NODB)) { 961 mdclrerror(ep); 962 return (0); 963 } 964 if (mdiserror(ep, MDE_NO_SET)) { 965 mdclrerror(ep); 966 continue; 967 } 968 return (-1); 969 } 970 971 if ((is_it = meta_is_devid_in_thisset(this_sp, 972 devid, ep)) == -1) { 973 if (mdiserror(ep, MDE_NO_SET)) { 974 mdclrerror(ep); 975 continue; 976 } 977 return (-1); 978 } 979 if (is_it) { 980 *spp = this_sp; 981 return (0); 982 } 983 } 984 return (0); 985 } 986 987 int 988 meta_is_devid_in_thisset( 989 mdsetname_t *sp, 990 void *devid, 991 md_error_t *ep 992 ) 993 { 994 md_drive_desc *dd, *p; 995 ddi_devid_t dd_devid; 996 997 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 998 if (dd == NULL) { 999 if (! 
mdisok(ep)) 1000 return (-1); 1001 return (0); 1002 } 1003 1004 for (p = dd; p != NULL; p = p->dd_next) { 1005 if (p->dd_dnp->devid == NULL) 1006 continue; 1007 (void) devid_str_decode(p->dd_dnp->devid, 1008 &dd_devid, NULL); 1009 if (dd_devid == NULL) 1010 continue; 1011 if (devid_compare(devid, dd_devid) == 0) { 1012 devid_free(dd_devid); 1013 return (1); 1014 } 1015 devid_free(dd_devid); 1016 } 1017 return (0); 1018 } 1019 1020 int 1021 meta_set_balance( 1022 mdsetname_t *sp, 1023 md_error_t *ep 1024 ) 1025 { 1026 md_set_desc *sd; 1027 md_drive_desc *dd, *curdd; 1028 daddr_t dbsize; 1029 daddr_t nblks; 1030 int i; 1031 int rval = 0; 1032 sigset_t oldsigs; 1033 md_setkey_t *cl_sk; 1034 md_error_t xep = mdnullerror; 1035 md_mnnode_desc *nd; 1036 int suspend1_flag = 0; 1037 1038 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1039 return (-1); 1040 1041 dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 1042 1043 /* Make sure we own the set */ 1044 if (meta_check_ownership(sp, ep) != 0) 1045 return (-1); 1046 1047 /* END CHECK CODE */ 1048 1049 /* 1050 * Get drive descriptors for the drives that are currently in the set. 1051 */ 1052 curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep); 1053 1054 if (! mdisok(ep)) 1055 return (-1); 1056 1057 /* Find the minimum replica size in use is or use the default */ 1058 if ((nblks = meta_db_minreplica(sp, ep)) < 0) 1059 mdclrerror(ep); 1060 else 1061 dbsize = nblks; /* adjust replica size */ 1062 1063 /* Make sure we are blocking all signals */ 1064 if (procsigs(TRUE, &oldsigs, &xep) < 0) 1065 mdclrerror(&xep); 1066 1067 /* 1068 * Lock the set on current set members. 1069 * For MN diskset lock_set and SUSPEND are used to protect against 1070 * other meta* commands running on the other nodes. 
1071 */ 1072 if (MD_MNSET_DESC(sd)) { 1073 nd = sd->sd_nodelist; 1074 while (nd) { 1075 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1076 nd = nd->nd_next; 1077 continue; 1078 } 1079 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 1080 rval = -1; 1081 goto out; 1082 } 1083 nd = nd->nd_next; 1084 } 1085 /* 1086 * Lock out other meta* commands by suspending 1087 * class 1 messages across the diskset. 1088 */ 1089 nd = sd->sd_nodelist; 1090 while (nd) { 1091 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1092 nd = nd->nd_next; 1093 continue; 1094 } 1095 if (clnt_mdcommdctl(nd->nd_nodename, 1096 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, 1097 MD_MSCF_NO_FLAGS, ep)) { 1098 rval = -1; 1099 goto out; 1100 } 1101 suspend1_flag = 1; 1102 nd = nd->nd_next; 1103 } 1104 } else { 1105 for (i = 0; i < MD_MAXSIDES; i++) { 1106 /* Skip empty slots */ 1107 if (sd->sd_nodes[i][0] == '\0') continue; 1108 1109 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1110 rval = -1; 1111 goto out; 1112 } 1113 } 1114 } 1115 1116 /* We are not adding or deleting any drives, just balancing */ 1117 dd = NULL; 1118 1119 /* 1120 * Balance the DB's according to the list of existing drives and the 1121 * list of added drives. 1122 */ 1123 if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1) 1124 goto out; 1125 1126 out: 1127 /* 1128 * Unlock diskset by resuming class 1 messages across the diskset. 1129 * Just resume all classes so that resume is the same whether 1130 * just one class was locked or all classes were locked. 1131 */ 1132 if (suspend1_flag) { 1133 nd = sd->sd_nodelist; 1134 while (nd) { 1135 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1136 nd = nd->nd_next; 1137 continue; 1138 } 1139 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 1140 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 1141 /* 1142 * We are here because we failed to resume 1143 * rpc.mdcommd. However we potentially have 1144 * an error from the previous call 1145 * (meta_db_balance). 
If the previous call 1146 * did fail, we capture that error and 1147 * generate a perror withthe string, 1148 * "Unable to resume...". 1149 * Setting rval to -1 ensures that in the 1150 * next iteration of the loop, ep is not 1151 * clobbered. 1152 */ 1153 if (rval == 0) 1154 (void) mdstealerror(ep, &xep); 1155 else 1156 mdclrerror(&xep); 1157 rval = -1; 1158 mde_perror(ep, dgettext(TEXT_DOMAIN, 1159 "Unable to resume rpc.mdcommd.")); 1160 } 1161 nd = nd->nd_next; 1162 } 1163 } 1164 1165 /* Unlock the set */ 1166 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1167 if (MD_MNSET_DESC(sd)) { 1168 nd = sd->sd_nodelist; 1169 while (nd) { 1170 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1171 nd = nd->nd_next; 1172 continue; 1173 } 1174 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1175 if (rval == 0) 1176 (void) mdstealerror(ep, &xep); 1177 else 1178 mdclrerror(&xep); 1179 rval = -1; 1180 } 1181 nd = nd->nd_next; 1182 } 1183 } else { 1184 for (i = 0; i < MD_MAXSIDES; i++) { 1185 /* Skip empty slots */ 1186 if (sd->sd_nodes[i][0] == '\0') 1187 continue; 1188 1189 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1190 if (rval == 0) 1191 (void) mdstealerror(ep, &xep); 1192 rval = -1; 1193 } 1194 } 1195 } 1196 1197 /* release signals back to what they were on entry */ 1198 if (procsigs(FALSE, &oldsigs, &xep) < 0) 1199 mdclrerror(&xep); 1200 1201 cl_set_setkey(NULL); 1202 1203 metaflushsetname(sp); 1204 1205 return (rval); 1206 } 1207 1208 int 1209 meta_set_destroy( 1210 mdsetname_t *sp, 1211 int lock_set, 1212 md_error_t *ep 1213 ) 1214 { 1215 int i; 1216 med_rec_t medr; 1217 md_set_desc *sd; 1218 md_drive_desc *dd, *p, *p1; 1219 mddrivename_t *dnp; 1220 mdname_t *np; 1221 mdnamelist_t *nlp = NULL; 1222 int num_users = 0; 1223 int has_set; 1224 side_t mysideno; 1225 sigset_t oldsigs; 1226 md_error_t xep = mdnullerror; 1227 md_setkey_t *cl_sk; 1228 int rval = 0; 1229 int delete_end = 1; 1230 1231 /* Make sure we are blocking all signals */ 1232 if (procsigs(TRUE, 
&oldsigs, ep) < 0) 1233 return (-1); 1234 1235 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1236 if (! mdisok(ep)) 1237 rval = -1; 1238 goto out; 1239 } 1240 1241 /* 1242 * meta_set_destroy should not be called for a MN diskset. 1243 * This routine destroys a set without communicating this information 1244 * to the other nodes which would lead to an inconsistency in 1245 * the MN diskset. 1246 */ 1247 if (MD_MNSET_DESC(sd)) { 1248 rval = -1; 1249 goto out; 1250 } 1251 1252 /* Continue if a traditional diskset */ 1253 1254 /* 1255 * Check to see who has the set. If we are not the last user of the 1256 * set, we will not touch the replicas. 1257 */ 1258 for (i = 0; i < MD_MAXSIDES; i++) { 1259 /* Skip empty slots */ 1260 if (sd->sd_nodes[i][0] == '\0') 1261 continue; 1262 1263 has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ, 1264 ep); 1265 1266 if (has_set < 0) { 1267 mdclrerror(ep); 1268 } else 1269 num_users++; 1270 } 1271 1272 if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) { 1273 if (! mdisok(ep)) { 1274 rval = -1; 1275 goto out; 1276 } 1277 } 1278 1279 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 1280 rval = -1; 1281 goto out; 1282 } 1283 1284 if (lock_set == TRUE) { 1285 /* Lock the set on our side */ 1286 if (clnt_lock_set(mynode(), sp, ep)) { 1287 rval = -1; 1288 goto out; 1289 } 1290 } 1291 1292 /* 1293 * A traditional diskset has no diskset stale information to send 1294 * since there can only be one owner node at a time. 1295 */ 1296 if (snarf_set(sp, FALSE, ep)) 1297 mdclrerror(ep); 1298 1299 if (dd != NULL) { 1300 /* 1301 * Make sure that no drives are in use as parts of metadrives 1302 * or hot spare pools, this is one of the few error conditions 1303 * that will stop this routine, unless the environment has 1304 * META_DESTROY_SET_OK set, in which case, the operation will 1305 * proceed. 
1306 */ 1307 if (getenv("META_DESTROY_SET_OK") == NULL) { 1308 for (p = dd; p != NULL; p = p->dd_next) { 1309 dnp = p->dd_dnp; 1310 1311 i = meta_check_drive_inuse(sp, dnp, FALSE, ep); 1312 if (i == -1) { 1313 /* need xep - wire calls clear error */ 1314 i = metaget_setownership(sp, &xep); 1315 if (i == -1) { 1316 rval = -1; 1317 goto out; 1318 } 1319 1320 mysideno = getmyside(sp, &xep); 1321 1322 if (mysideno == MD_SIDEWILD) { 1323 rval = -1; 1324 goto out; 1325 } 1326 1327 if (sd->sd_isown[mysideno] == FALSE) 1328 if (halt_set(sp, &xep)) { 1329 rval = -1; 1330 goto out; 1331 } 1332 1333 rval = -1; 1334 goto out; 1335 } 1336 } 1337 } 1338 1339 for (i = 0; i < MD_MAXSIDES; i++) { 1340 /* Skip empty slots */ 1341 if (sd->sd_nodes[i][0] == '\0') 1342 continue; 1343 1344 /* Skip non local nodes */ 1345 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1346 continue; 1347 1348 if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep)) 1349 mdclrerror(ep); 1350 } 1351 1352 /* 1353 * Go thru each drive and individually delete the replicas. 1354 * This way we can ignore individual errors. 1355 */ 1356 for (p = dd; p != NULL; p = p->dd_next) { 1357 uint_t rep_slice; 1358 1359 dnp = p->dd_dnp; 1360 if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || 1361 (((np = metaslicename(dnp, rep_slice, ep)) 1362 == NULL) && 1363 ((np = metaslicename(dnp, MD_SLICE0, ep)) 1364 == NULL))) { 1365 rval = -1; 1366 goto out; 1367 } 1368 1369 if ((np = metaslicename(dnp, 1370 rep_slice, ep)) == NULL) { 1371 if ((np = metaslicename(dnp, 1372 MD_SLICE0, ep)) == NULL) { 1373 rval = -1; 1374 goto out; 1375 } 1376 mdclrerror(ep); 1377 } 1378 1379 /* Yes this is UGLY!!! 
*/ 1380 p1 = p->dd_next; 1381 p->dd_next = NULL; 1382 if (rel_own_bydd(sp, p, FALSE, ep)) 1383 mdclrerror(ep); 1384 p->dd_next = p1; 1385 1386 if (p->dd_dbcnt == 0) 1387 continue; 1388 1389 /* 1390 * Skip the replica removal if we are not the last user 1391 */ 1392 if (num_users != 1) 1393 continue; 1394 1395 nlp = NULL; 1396 (void) metanamelist_append(&nlp, np); 1397 if (meta_db_detach(sp, nlp, 1398 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep)) 1399 mdclrerror(ep); 1400 metafreenamelist(nlp); 1401 } 1402 } 1403 1404 if (halt_set(sp, ep)) { 1405 rval = -1; 1406 goto out; 1407 } 1408 1409 /* Setup the mediator record */ 1410 (void) memset(&medr, '\0', sizeof (med_rec_t)); 1411 medr.med_rec_mag = MED_REC_MAGIC; 1412 medr.med_rec_rev = MED_REC_REV; 1413 medr.med_rec_fl = 0; 1414 medr.med_rec_sn = sp->setno; 1415 (void) strcpy(medr.med_rec_snm, sp->setname); 1416 medr.med_rec_meds = sd->sd_med; /* structure assigment */ 1417 (void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t)); 1418 medr.med_rec_foff = 0; 1419 1420 /* 1421 * If we are the last remaining user, then remove the mediator hosts 1422 */ 1423 if (num_users == 1) { 1424 for (i = 0; i < MED_MAX_HOSTS; i++) { 1425 if (medr.med_rec_meds.n_lst[i].a_cnt != 0) 1426 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, 1427 SVM_TAG_MEDIATOR, sp->setno, i); 1428 (void) memset(&medr.med_rec_meds.n_lst[i], '\0', 1429 sizeof (md_h_t)); 1430 } 1431 medr.med_rec_meds.n_cnt = 0; 1432 } else { /* Remove this host from the mediator node list. 
*/ 1433 for (i = 0; i < MD_MAXSIDES; i++) { 1434 /* Skip empty slots */ 1435 if (sd->sd_nodes[i][0] == '\0') 1436 continue; 1437 1438 /* Copy non local node */ 1439 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) { 1440 (void) strcpy(medr.med_rec_nodes[i], 1441 sd->sd_nodes[i]); 1442 continue; 1443 } 1444 1445 /* Clear local node */ 1446 (void) memset(&medr.med_rec_nodes[i], '\0', 1447 sizeof (md_node_nm_t)); 1448 } 1449 } 1450 1451 crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); 1452 1453 /* 1454 * If the client is part of a cluster put the DCS service 1455 * into a deleteing state. 1456 */ 1457 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1458 if (metad_isautotakebyname(sp->setname)) { 1459 delete_end = 0; 1460 } else { 1461 mdclrerror(ep); 1462 goto out; 1463 } 1464 } 1465 1466 /* Inform the mediator hosts of the new information */ 1467 for (i = 0; i < MED_MAX_HOSTS; i++) { 1468 if (sd->sd_med.n_lst[i].a_cnt == 0) 1469 continue; 1470 1471 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep)) 1472 mdclrerror(ep); 1473 } 1474 1475 /* Delete the set locally */ 1476 for (i = 0; i < MD_MAXSIDES; i++) { 1477 /* Skip empty slots */ 1478 if (sd->sd_nodes[i][0] == '\0') 1479 continue; 1480 1481 /* Skip non local nodes */ 1482 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1483 continue; 1484 1485 if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) 1486 mdclrerror(ep); 1487 } 1488 if (delete_end && 1489 sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR) 1490 rval = -1; 1491 1492 out: 1493 /* release signals back to what they were on entry */ 1494 if (procsigs(FALSE, &oldsigs, &xep) < 0) { 1495 if (rval == 0) 1496 (void) mdstealerror(ep, &xep); 1497 rval = -1; 1498 } 1499 1500 if (lock_set == TRUE) { 1501 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1502 if (clnt_unlock_set(mynode(), cl_sk, &xep)) { 1503 if (rval == 0) 1504 (void) mdstealerror(ep, &xep); 1505 rval = -1; 1506 } 1507 cl_set_setkey(NULL); 1508 } 1509 1510 metaflushsetname(sp); 1511 
return (rval); 1512 } 1513 1514 int 1515 meta_set_purge( 1516 mdsetname_t *sp, 1517 int bypass_cluster, 1518 int forceflg, 1519 md_error_t *ep 1520 ) 1521 { 1522 char *thishost = mynode(); 1523 md_set_desc *sd; 1524 md_setkey_t *cl_sk; 1525 md_error_t xep = mdnullerror; 1526 int rval = 0; 1527 int i, num_hosts = 0; 1528 int has_set = 0; 1529 int max_node = 0; 1530 int delete_end = 1; 1531 md_mnnode_desc *nd; 1532 1533 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1534 /* unable to find set description */ 1535 rval = 1; 1536 return (rval); 1537 } 1538 1539 if (MD_MNSET_DESC(sd)) { 1540 /* 1541 * Get a count of the hosts in the set and also lock the set 1542 * on those hosts that know about it. 1543 */ 1544 nd = sd->sd_nodelist; 1545 while (nd) { 1546 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1547 nd = nd->nd_next; 1548 continue; 1549 } 1550 has_set = nodehasset(sp, nd->nd_nodename, 1551 NHS_NST_EQ, ep); 1552 1553 /* 1554 * The host is not aware of this set (has_set < 0) or 1555 * the set does not match (has_set == 0). This check 1556 * prevents the code getting confused by an apparent 1557 * inconsistancy in the set's state, this is in the 1558 * purge code so something is broken in any case and 1559 * this is just trying to fix the brokeness. 1560 */ 1561 if (has_set <= 0) { 1562 mdclrerror(ep); 1563 nd->nd_flags |= MD_MN_NODE_NOSET; 1564 } else { 1565 num_hosts++; 1566 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 1567 /* 1568 * If the force flag is set then 1569 * ignore any RPC failures because we 1570 * are only really interested with 1571 * the set on local node. 1572 */ 1573 if (forceflg && mdanyrpcerror(ep)) { 1574 mdclrerror(ep); 1575 } else { 1576 /* 1577 * set max_node so that in the 1578 * unlock code nodes in the 1579 * set that have not been 1580 * locked are not unlocked. 
1581 */ 1582 max_node = nd->nd_nodeid; 1583 rval = 2; 1584 goto out1; 1585 } 1586 } 1587 1588 } 1589 nd = nd->nd_next; 1590 } 1591 max_node = 0; 1592 } else { 1593 /* 1594 * Get a count of the hosts in the set and also lock the set 1595 * on those hosts that know about it. 1596 */ 1597 for (i = 0; i < MD_MAXSIDES; i++) { 1598 /* Skip empty slots */ 1599 if (sd->sd_nodes[i][0] == '\0') 1600 continue; 1601 1602 has_set = nodehasset(sp, sd->sd_nodes[i], 1603 NHS_NST_EQ, ep); 1604 1605 /* 1606 * The host is not aware of this set (has_set < 0) or 1607 * the set does not match (has_set == 0). This check 1608 * prevents the code getting confused by an apparent 1609 * inconsistancy in the set's state, this is in the 1610 * purge code so something is broken in any case and 1611 * this is just trying to fix the brokeness. 1612 */ 1613 if (has_set <= 0) { 1614 mdclrerror(ep); 1615 /* 1616 * set the node to NULL to prevent further 1617 * requests to this unresponsive node. 1618 */ 1619 sd->sd_nodes[i][0] = '\0'; 1620 } else { 1621 num_hosts++; 1622 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1623 /* 1624 * If the force flag is set then 1625 * ignore any RPC failures because we 1626 * are only really interested with 1627 * the set on local node. 1628 */ 1629 if (forceflg && mdanyrpcerror(ep)) { 1630 mdclrerror(ep); 1631 } else { 1632 rval = 2; 1633 /* 1634 * set max_node so that in the 1635 * unlock code nodes in the 1636 * set that have not been 1637 * locked are not unlocked. 1638 */ 1639 max_node = i; 1640 goto out1; 1641 } 1642 } 1643 } 1644 } 1645 max_node = i; /* now MD_MAXSIDES */ 1646 } 1647 if (!bypass_cluster) { 1648 /* 1649 * If there is only one host associated with the 1650 * set then remove the set from the cluster. 
1651 */ 1652 if (num_hosts == 1) { 1653 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1654 if (metad_isautotakebyname(sp->setname)) { 1655 delete_end = 0; 1656 } else { 1657 mdclrerror(ep); 1658 rval = 3; 1659 goto out1; 1660 } 1661 } 1662 } 1663 } 1664 1665 if (MD_MNSET_DESC(sd)) { 1666 /* 1667 * Get a count of the hosts in the set and also lock the set 1668 * on those hosts that know about it. 1669 */ 1670 nd = sd->sd_nodelist; 1671 while (nd) { 1672 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1673 nd = nd->nd_next; 1674 continue; 1675 } 1676 if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) { 1677 /* 1678 * Tell the remote node to remove this node 1679 */ 1680 if (clnt_delhosts(nd->nd_nodename, sp, 1, 1681 &thishost, ep) == -1) { 1682 /* 1683 * If we fail to delete ourselves 1684 * from the remote host it does not 1685 * really matter because the set is 1686 * being "purged" from this node. The 1687 * set can be purged from the other 1688 * node at a later time. 1689 */ 1690 mdclrerror(ep); 1691 } 1692 nd = nd->nd_next; 1693 continue; 1694 } 1695 /* remove the set from this host */ 1696 if (clnt_delset(nd->nd_nodename, sp, ep) == -1) { 1697 md_perror(dgettext(TEXT_DOMAIN, "delset")); 1698 if (!bypass_cluster && num_hosts == 1) 1699 (void) sdssc_delete_end(sp->setname, 1700 SDSSC_CLEANUP); 1701 mdclrerror(ep); 1702 goto out1; 1703 } 1704 nd = nd->nd_next; 1705 } 1706 } else { 1707 for (i = 0; i < MD_MAXSIDES; i++) { 1708 /* Skip empty slots */ 1709 if (sd->sd_nodes[i][0] == '\0') 1710 continue; 1711 if (strcmp(thishost, sd->sd_nodes[i]) != 0) { 1712 /* 1713 * Tell the remote node to remove this node 1714 */ 1715 if (clnt_delhosts(sd->sd_nodes[i], sp, 1, 1716 &thishost, ep) == -1) { 1717 /* 1718 * If we fail to delete ourselves 1719 * from the remote host it does not 1720 * really matter because the set is 1721 * being "purged" from this node. The 1722 * set can be purged from the other 1723 * node at a later time. 
1724 */ 1725 mdclrerror(ep); 1726 } 1727 continue; 1728 } 1729 1730 /* remove the set from this host */ 1731 if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) { 1732 md_perror(dgettext(TEXT_DOMAIN, "delset")); 1733 if (!bypass_cluster && num_hosts == 1) 1734 (void) sdssc_delete_end(sp->setname, 1735 SDSSC_CLEANUP); 1736 mdclrerror(ep); 1737 goto out1; 1738 } 1739 } 1740 } 1741 1742 if (!bypass_cluster && num_hosts == 1) { 1743 if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) == 1744 SDSSC_ERROR) { 1745 rval = 4; 1746 } 1747 } 1748 1749 out1: 1750 1751 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1752 1753 /* 1754 * Remove the set lock on those nodes that had the set locked 1755 * max_node will either be MD_MAXSIDES or array index of the last 1756 * node contacted (or rather failed to contact) for traditional 1757 * diskset. For a MN diskset, max_node is the node_id of the node 1758 * that failed the lock. 1759 */ 1760 if (MD_MNSET_DESC(sd)) { 1761 nd = sd->sd_nodelist; 1762 while (nd) { 1763 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1764 nd = nd->nd_next; 1765 continue; 1766 } 1767 if (nd->nd_nodeid == max_node) 1768 break; 1769 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1770 if (forceflg && mdanyrpcerror(&xep)) { 1771 mdclrerror(&xep); 1772 nd = nd->nd_next; 1773 continue; 1774 } 1775 if (rval == 0) 1776 (void) mdstealerror(ep, &xep); 1777 rval = 5; 1778 } 1779 nd = nd->nd_next; 1780 } 1781 } else { 1782 for (i = 0; i < max_node; i++) { 1783 /* Skip empty slots */ 1784 if (sd->sd_nodes[i][0] == '\0') 1785 continue; 1786 1787 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1788 if (forceflg && mdanyrpcerror(&xep)) { 1789 mdclrerror(&xep); 1790 continue; 1791 } 1792 if (rval == 0) 1793 (void) mdstealerror(ep, &xep); 1794 rval = 5; 1795 } 1796 } 1797 } 1798 1799 cl_set_setkey(NULL); 1800 1801 return (rval); 1802 } 1803 1804 int 1805 meta_set_query( 1806 mdsetname_t *sp, 1807 mddb_dtag_lst_t **dtlpp, 1808 md_error_t *ep 1809 ) 1810 { 1811 
mddb_dtag_get_parm_t dtgp; 1812 1813 (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t)); 1814 dtgp.dtgp_setno = sp->setno; 1815 1816 /*CONSTCOND*/ 1817 while (1) { 1818 if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0) 1819 if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) || 1820 *dtlpp == NULL) 1821 return (mdstealerror(ep, &dtgp.dtgp_mde)); 1822 else 1823 break; 1824 1825 /* 1826 * Run to the end of the list 1827 */ 1828 for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) 1829 /* void */; 1830 1831 *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t)); 1832 1833 (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt, 1834 sizeof (mddb_dtag_t)); 1835 1836 dtgp.dtgp_dt.dt_id++; 1837 } 1838 return (0); 1839 } 1840 1841 /* 1842 * return drivename get by key 1843 */ 1844 mddrivename_t * 1845 metadrivename_withdrkey( 1846 mdsetname_t *sp, 1847 side_t sideno, 1848 mdkey_t key, 1849 int flags, 1850 md_error_t *ep 1851 ) 1852 { 1853 char *nm; 1854 mdname_t *np; 1855 mddrivename_t *dnp; 1856 ddi_devid_t devidp; 1857 md_set_desc *sd; 1858 1859 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1860 return (NULL); 1861 } 1862 1863 1864 /* 1865 * Get the devid associated with the key. 1866 * 1867 * If a devid was returned, it MUST be valid even in 1868 * the case where a device id has been "updated". The 1869 * "update" of the device id may have occured due to 1870 * a firmware upgrade. 1871 */ 1872 if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep)) 1873 != NULL) { 1874 /* 1875 * Look for the correct dnp using the devid for comparison. 1876 */ 1877 dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); 1878 free(devidp); 1879 dnp->side_names_key = key; 1880 } else { 1881 /* 1882 * We didn't get a devid. We'll try for a dnp using the 1883 * name. If we have a MN diskset or if the dnp is a did 1884 * device, we're done because then we don't have devids. 1885 * Otherwise we'll try to set the devid 1886 * and get the dnp via devid again. 
1887 * We also need to clear the ep structure. When the 1888 * above call to meta_getdidbykey returned a null, it 1889 * also put an error code into ep. In this case, the null 1890 * return is actually OK and any errors can be ignored. The 1891 * reason it is OK is because this could be a MN set or 1892 * we could be running without devids (ex cluster). 1893 */ 1894 mdclrerror(ep); 1895 1896 if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key, 1897 ep)) == NULL) 1898 return (NULL); 1899 /* get device name */ 1900 if (flags & PRINT_FAST) { 1901 if ((np = metaname_fast(&sp, nm, 1902 LOGICAL_DEVICE, ep)) == NULL) { 1903 Free(nm); 1904 return (NULL); 1905 } 1906 } else { 1907 if ((np = metaname(&sp, nm, LOGICAL_DEVICE, 1908 ep)) == NULL) { 1909 Free(nm); 1910 return (NULL); 1911 } 1912 } 1913 Free(nm); 1914 /* make sure it's OK */ 1915 if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, 1916 ep) != 0)) 1917 return (NULL); 1918 1919 /* get drivename */ 1920 dnp = np->drivenamep; 1921 dnp->side_names_key = key; 1922 /* 1923 * Skip the devid set/check for the following cases: 1924 * 1) If MN diskset, there are no devid's 1925 * 2) if dnp is did device 1926 * The device id is disabled for did device due to the 1927 * lack of minor name support in the did driver. The following 1928 * devid code path can set and propagate the error and 1929 * eventually prevent did disks from being added to the 1930 * diskset under SunCluster systems 1931 */ 1932 if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) 1933 == 0) || (MD_MNSET_DESC(sd))) 1934 goto out; 1935 1936 /* 1937 * It is okay if replica is not in devid mode 1938 */ 1939 if (mdissyserror(ep, MDDB_F_NODEVID)) { 1940 mdclrerror(ep); 1941 goto out; 1942 } 1943 1944 /* 1945 * We're not MN or did devices but 1946 * devid is missing so this means that we have 1947 * just upgraded from a configuration where 1948 * devid's were not used so try to add in 1949 * the devid and requery. 
If the devid still isn't there, 1950 * that's OK. dnp->devid will be null as it is in any 1951 * configuration with no devids. 1952 */ 1953 if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, 1954 ep) < 0) 1955 return (NULL); 1956 if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET, 1957 sideno+SKEW, key, ep)) != NULL) { 1958 /* 1959 * Found a devid so look for the dnp using the 1960 * devid as the search mechanism. 1961 */ 1962 dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); 1963 free(devidp); 1964 dnp->side_names_key = key; 1965 } 1966 } 1967 1968 1969 1970 out: 1971 if (flags & MD_BYPASS_DAEMON) 1972 return (dnp); 1973 1974 if (get_sidenmlist(sp, dnp, ep)) 1975 return (NULL); 1976 1977 /* return success */ 1978 return (dnp); 1979 } 1980 1981 void 1982 metafreedrivedesc(md_drive_desc **dd) 1983 { 1984 md_drive_desc *p, *next = NULL; 1985 1986 for (p = *dd; p != NULL; p = next) { 1987 next = p->dd_next; 1988 Free(p); 1989 } 1990 *dd = NULL; 1991 } 1992 1993 md_drive_desc * 1994 metaget_drivedesc( 1995 mdsetname_t *sp, 1996 int flags, 1997 md_error_t *ep 1998 ) 1999 { 2000 side_t sideno = MD_SIDEWILD; 2001 2002 assert(! 
(flags & MD_BYPASS_DAEMON)); 2003 2004 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 2005 return (NULL); 2006 2007 return (metaget_drivedesc_sideno(sp, sideno, flags, ep)); 2008 } 2009 2010 md_drive_desc * 2011 metaget_drivedesc_fromnamelist( 2012 mdsetname_t *sp, 2013 mdnamelist_t *nlp, 2014 md_error_t *ep 2015 ) 2016 { 2017 md_set_desc *sd; 2018 mdnamelist_t *p; 2019 md_drive_desc *dd = NULL; 2020 2021 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2022 return (NULL); 2023 2024 for (p = nlp; p != NULL; p = p->next) 2025 (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0, 2026 sd->sd_ctime, sd->sd_genid, MD_DR_ADD); 2027 2028 return (dd); 2029 } 2030 2031 md_drive_desc * 2032 metaget_drivedesc_sideno( 2033 mdsetname_t *sp, 2034 side_t sideno, 2035 int flags, 2036 md_error_t *ep 2037 ) 2038 { 2039 md_set_desc *sd = NULL; 2040 2041 assert(! (flags & MD_BYPASS_DAEMON)); 2042 2043 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2044 return (NULL); 2045 2046 if (sd->sd_drvs) 2047 return (sd->sd_drvs); 2048 2049 if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL) 2050 return (NULL); 2051 2052 return (sd->sd_drvs); 2053 } 2054 2055 int 2056 metaget_setownership( 2057 mdsetname_t *sp, 2058 md_error_t *ep 2059 ) 2060 { 2061 md_set_desc *sd; 2062 int bool; 2063 int i; 2064 md_mnnode_desc *nd; 2065 2066 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2067 return (-1); 2068 2069 if (MD_MNSET_DESC(sd)) { 2070 nd = sd->sd_nodelist; 2071 while (nd) { 2072 /* If node isn't alive, can't own diskset */ 2073 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2074 nd->nd_flags &= ~MD_MN_NODE_OWN; 2075 nd = nd->nd_next; 2076 continue; 2077 } 2078 /* 2079 * If can't communicate with rpc.metad, then mark 2080 * this node as not an owner. That node may 2081 * in fact, be an owner, but without rpc.metad running 2082 * that node can't do much. 
2083 */ 2084 if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) { 2085 nd->nd_flags &= ~MD_MN_NODE_OWN; 2086 } else if (bool == TRUE) { 2087 nd->nd_flags |= MD_MN_NODE_OWN; 2088 } else { 2089 nd->nd_flags &= ~MD_MN_NODE_OWN; 2090 } 2091 nd = nd->nd_next; 2092 } 2093 return (0); 2094 } 2095 2096 /* Rest of code handles traditional disksets */ 2097 2098 for (i = 0; i < MD_MAXSIDES; i++) 2099 sd->sd_isown[i] = 0; 2100 2101 if (clnt_ownset(mynode(), sp, &bool, ep) == -1) 2102 return (-1); 2103 2104 if (bool == TRUE) 2105 sd->sd_isown[getmyside(sp, ep)] = 1; 2106 2107 return (0); 2108 } 2109 2110 char * 2111 mynode(void) 2112 { 2113 static struct utsname myuname; 2114 static int done = 0; 2115 2116 if (! done) { 2117 if (uname(&myuname) == -1) { 2118 md_perror(dgettext(TEXT_DOMAIN, "uname")); 2119 assert(0); 2120 } 2121 done = 1; 2122 } 2123 return (myuname.nodename); 2124 } 2125 2126 int 2127 strinlst(char *str, int cnt, char **lst) 2128 { 2129 int i; 2130 2131 for (i = 0; i < cnt; i++) 2132 if (strcmp(lst[i], str) == 0) 2133 return (TRUE); 2134 2135 return (FALSE); 2136 } 2137 2138 /* 2139 * meta_get_reserved_names 2140 * returns an mdnamelist_t of reserved slices 2141 * reserved slices are those that are used but don't necessarily 2142 * show up as metadevices (ex. 
reserved slice for db in sets, logs) 2143 */ 2144 2145 /*ARGSUSED*/ 2146 int 2147 meta_get_reserved_names( 2148 mdsetname_t *sp, 2149 mdnamelist_t **nlpp, 2150 int options, 2151 md_error_t *ep) 2152 { 2153 int count = 0; 2154 mdname_t *np = NULL; 2155 mdnamelist_t *transnlp = NULL; 2156 mdnamelist_t **tailpp = nlpp; 2157 mdnamelist_t *nlp; 2158 md_drive_desc *dd, *di; 2159 2160 if (metaislocalset(sp)) 2161 goto out; 2162 2163 if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) { 2164 count = -1; 2165 goto out; 2166 } 2167 2168 /* db in for sets on reserved slice */ 2169 for (di = dd; di && count >= 0; di = di->dd_next) { 2170 uint_t rep_slice; 2171 2172 /* 2173 * Add the name struct to the end of the 2174 * namelist but keep a pointer to the last 2175 * element so that we don't incur the overhead 2176 * of traversing the list each time 2177 */ 2178 if (di->dd_dnp && 2179 (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) && 2180 (np = metaslicename(di->dd_dnp, rep_slice, ep)) && 2181 (tailpp = meta_namelist_append_wrapper(tailpp, np))) 2182 count++; 2183 else 2184 count = -1; 2185 } 2186 2187 /* now find logs */ 2188 if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) { 2189 count = -1; 2190 goto out; 2191 } 2192 2193 for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) { 2194 mdname_t *transnp = nlp->namep; 2195 md_trans_t *transp; 2196 2197 if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) { 2198 count = -1; 2199 goto out; 2200 } 2201 if (transp->lognamep) { 2202 /* 2203 * Add the name struct to the end of the 2204 * namelist but keep a pointer to the last 2205 * element so that we don't incur the overhead 2206 * of traversing the list each time 2207 */ 2208 tailpp = meta_namelist_append_wrapper( 2209 tailpp, transp->lognamep); 2210 } 2211 } 2212 out: 2213 metafreenamelist(transnlp); 2214 return (count); 2215 } 2216 2217 /* 2218 * Entry point to join a node to MultiNode diskset. 2219 * 2220 * Validate host in diskset. 
2221 * - Should be in membership list from API 2222 * - Should not already be joined into diskset. 2223 * - Set must have drives 2224 * Assume valid configuration is stored in the set/drive/node records 2225 * in the local mddb since no node or drive can be added to the MNset 2226 * unless all drives and nodes are available. Reconfig steps will 2227 * resync all ALIVE nodes in case of panic in critical areas. 2228 * 2229 * Lock down the set. 2230 * Verify host is a member of this diskset. 2231 * If drives exist in the configuration, load the mddbs. 2232 * Set this node to active by notifying master if one exists. 2233 * If this is the first node active in the diskset, this node 2234 * becomes the master. 2235 * Unlock the set. 2236 * 2237 * Mirror Resync: 2238 * If this node is the last node to join the set and clustering 2239 * isn't running, then start the 'metasync -r' type resync 2240 * on all mirrors in this diskset. 2241 * If clustering is running, this resync operation will 2242 * be handled by the reconfig steps and should NOT 2243 * be handled during a join operation. 2244 * 2245 * There are multiple return values in order to assist 2246 * the join operation of all sets in the metaset command. 2247 * 2248 * Return values: 2249 * 0 - Node successfully joined to set. 2250 * -1 - Join attempted but failed 2251 * - any failure from libmeta calls 2252 * - node not in the member list 2253 * -2 - Join not attempted since 2254 * - this set had no drives in set 2255 * - this node already joined to set 2256 * - set is not a multinode set 2257 * -3 - Node joined to STALE set. 
2258 */ 2259 extern int 2260 meta_set_join( 2261 mdsetname_t *sp, 2262 md_error_t *ep 2263 ) 2264 { 2265 md_set_desc *sd; 2266 md_drive_desc *dd; 2267 md_mnnode_desc *nd, *nd2, my_nd; 2268 int rval = 0; 2269 md_setkey_t *cl_sk; 2270 md_error_t xep = mdnullerror; 2271 md_error_t ep_snarf = mdnullerror; 2272 int master_flag = 0; 2273 md_mnset_record *mas_mnsr = NULL; 2274 int clear_nr_flags = 0; 2275 md_mnnode_record *nr; 2276 int stale_set = 0; 2277 int rb_flags = 0; 2278 int stale_bool = FALSE; 2279 int suspendall_flag = 0; 2280 int suspend1_flag = 0; 2281 sigset_t oldsigs; 2282 int send_reinit = 0; 2283 2284 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2285 return (-1); 2286 } 2287 2288 /* Must be a multinode diskset */ 2289 if (!MD_MNSET_DESC(sd)) { 2290 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2291 return (-2); 2292 } 2293 2294 /* Verify that the node is ALIVE (i.e. is in the API membership list) */ 2295 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) { 2296 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, 2297 sd->sd_mn_mynode->nd_nodename, NULL, 2298 sp->setname); 2299 return (-1); 2300 } 2301 2302 /* Make sure we are blocking all signals */ 2303 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2304 mdclrerror(&xep); 2305 2306 /* 2307 * Lock the set on current set members. 2308 * For MN diskset lock_set and SUSPEND are used to protect against 2309 * other meta* commands running on the other nodes. 2310 */ 2311 nd = sd->sd_nodelist; 2312 while (nd) { 2313 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2314 nd = nd->nd_next; 2315 continue; 2316 } 2317 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2318 rval = -1; 2319 goto out; 2320 } 2321 nd = nd->nd_next; 2322 } 2323 2324 /* 2325 * Lock out other meta* commands by suspending 2326 * class 1 messages across the diskset. 
2327 */ 2328 nd = sd->sd_nodelist; 2329 while (nd) { 2330 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2331 nd = nd->nd_next; 2332 continue; 2333 } 2334 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2335 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2336 rval = -1; 2337 goto out; 2338 } 2339 suspend1_flag = 1; 2340 nd = nd->nd_next; 2341 } 2342 2343 /* 2344 * Verify that this host is a member (in the host list) of the set. 2345 */ 2346 nd = sd->sd_nodelist; 2347 while (nd) { 2348 if (strcmp(mynode(), nd->nd_nodename) == 0) { 2349 break; 2350 } 2351 nd = nd->nd_next; 2352 } 2353 if (!nd) { 2354 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2355 sd->sd_mn_mynode->nd_nodename, NULL, 2356 sp->setname); 2357 rval = -1; 2358 goto out; 2359 } 2360 2361 /* 2362 * Need to return failure if host is already 'joined' 2363 * into the set. This is done so that if later the user 2364 * issues a command to join all sets and a failure is 2365 * encountered - that the resulting cleanup effort 2366 * (withdrawing from all sets that were joined 2367 * during that command) won't withdraw from this set. 2368 */ 2369 if (nd->nd_flags & MD_MN_NODE_OWN) { 2370 rval = -2; 2371 goto out2; 2372 } 2373 2374 /* 2375 * Call metaget_setownership that calls each node in diskset and 2376 * marks in set descriptor if node is an owner of the set or not. 2377 * metaget_setownership checks to see if a node is an owner by 2378 * checking to see if that node's kernel has the mddb loaded. 2379 * If a node had panic'd during a reconfig or an 2380 * add/delete/join/withdraw operation, the other nodes' node 2381 * records may not reflect the current state of the diskset, 2382 * so calling metaget_setownership is the safest thing to do. 2383 */ 2384 if (metaget_setownership(sp, ep) == -1) { 2385 rval = -1; 2386 goto out; 2387 } 2388 2389 /* If first active member of diskset, become the master. 
*/ 2390 nd = sd->sd_nodelist; 2391 while (nd) { 2392 if (nd->nd_flags & MD_MN_NODE_OWN) 2393 break; 2394 nd = nd->nd_next; 2395 } 2396 if (nd == NULL) 2397 master_flag = 1; 2398 2399 /* 2400 * If not first active member of diskset, then get the 2401 * master information from a node that is already joined 2402 * and set the master information for this node. Be sure 2403 * that this node (the already joined node) has its own 2404 * join flag set. If not, then this diskset isn't currently 2405 * consistent and shouldn't allow a node to join. This diskset 2406 * inconsistency should only occur when a node has panic'd in 2407 * the set while doing a metaset operation and the sysadmin is 2408 * attempting to join a node into the set. This inconsistency 2409 * will be fixed during a reconfig cycle which should be occurring 2410 * soon since a node panic'd. 2411 * 2412 * If unable to get this information from an owning node, then 2413 * this diskset isn't currently consistent and shouldn't 2414 * allow a node to join. 2415 */ 2416 if (!master_flag) { 2417 /* get master information from an owner (joined) node */ 2418 if (clnt_mngetset(nd->nd_nodename, sp->setname, 2419 sp->setno, &mas_mnsr, ep) == -1) { 2420 rval = -1; 2421 goto out; 2422 } 2423 2424 /* Verify that owner (joined) node has its own JOIN flag set */ 2425 nr = mas_mnsr->sr_nodechain; 2426 while (nr) { 2427 if ((nd->nd_nodeid == nr->nr_nodeid) && 2428 ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) { 2429 (void) mddserror(ep, MDE_DS_NODENOSET, 2430 sp->setno, nd->nd_nodename, NULL, 2431 nd->nd_nodename); 2432 free_sr((md_set_record *)mas_mnsr); 2433 rval = -1; 2434 goto out; 2435 } 2436 nr = nr->nr_next; 2437 } 2438 2439 /* 2440 * Does master have set marked as STALE? 2441 * If so, need to pass this down to kernel when 2442 * this node snarfs the set. 
2443 */ 2444 if (clnt_mn_is_stale(nd->nd_nodename, sp, 2445 &stale_bool, ep) == -1) { 2446 rval = -1; 2447 goto out; 2448 } 2449 2450 /* set master information in my rpc.metad's set record */ 2451 if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm, 2452 mas_mnsr->sr_master_nodeid, ep)) { 2453 free_sr((md_set_record *)mas_mnsr); 2454 rval = -1; 2455 goto out; 2456 } 2457 2458 /* set master information in my cached set desc */ 2459 (void) strcpy(sd->sd_mn_master_nodenm, 2460 mas_mnsr->sr_master_nodenm); 2461 sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid; 2462 nd2 = sd->sd_nodelist; 2463 while (nd2) { 2464 if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) { 2465 sd->sd_mn_masternode = nd2; 2466 break; 2467 } 2468 nd2 = nd2->nd_next; 2469 } 2470 free_sr((md_set_record *)mas_mnsr); 2471 2472 /* 2473 * Set the node flags in mynode's rpc.metad node records for 2474 * the nodes that are in the diskset. Can use my sd 2475 * since earlier call to metaget_setownership set the 2476 * owner flags based on whether that node had snarfed 2477 * the MN diskset mddb. Reconfig steps guarantee that 2478 * return of metaget_setownership will match the owning 2479 * node's owner list except in the case where a node 2480 * has just panic'd and in this case, a reconfig will 2481 * be starting immediately and the owner lists will 2482 * be sync'd up by the reconfig. 2483 * 2484 * Flag of SET means to take no action except to 2485 * set the node flags as given in the nodelist linked list. 2486 */ 2487 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 2488 MD_NR_SET, NULL, ep)) { 2489 rval = -1; 2490 goto out; 2491 } 2492 } 2493 2494 /* 2495 * Read in the mddb if there are drives in the set. 2496 */ 2497 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2498 ep)) == NULL) { 2499 /* No drives in list */ 2500 if (! 
mdisok(ep)) { 2501 rval = -1; 2502 goto out; 2503 } 2504 rval = -2; 2505 goto out; 2506 } 2507 2508 /* 2509 * Notify rpc.mdcommd on all nodes of a nodelist change. 2510 * Start by suspending rpc.mdcommd (which drains it of all messages), 2511 * then change the nodelist followed by a reinit and resume. 2512 */ 2513 nd = sd->sd_nodelist; 2514 while (nd) { 2515 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2516 nd = nd->nd_next; 2517 continue; 2518 } 2519 2520 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, 2521 MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 2522 rval = -1; 2523 goto out; 2524 } 2525 suspendall_flag = 1; 2526 nd = nd->nd_next; 2527 } 2528 2529 /* Set master in my set record in rpc.metad */ 2530 if (master_flag) { 2531 if (clnt_mnsetmaster(mynode(), sp, 2532 sd->sd_mn_mynode->nd_nodename, 2533 sd->sd_mn_mynode->nd_nodeid, ep)) { 2534 rval = -1; 2535 goto out; 2536 } 2537 } 2538 /* 2539 * Causes mddbs to be loaded into the kernel. 2540 * Set the force flag so that replica locations can be 2541 * loaded into the kernel even if a mediator node was 2542 * unavailable. This allows a node to join an MO 2543 * diskset when there are sufficient replicas available, 2544 * but a mediator node in unavailable. 2545 */ 2546 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 2547 mde_perror(ep, dgettext(TEXT_DOMAIN, 2548 "Host not able to start diskset.")); 2549 rval = -1; 2550 goto out; 2551 } 2552 2553 if (! mdisok(ep)) { 2554 rval = -1; 2555 goto out; 2556 } 2557 2558 /* 2559 * Set rollback flags to 1 so that halt_set is called if a failure 2560 * is seen after this point. If snarf_set fails, still need to 2561 * call halt_set to cleanup the diskset. 2562 */ 2563 rb_flags = 1; 2564 2565 /* Starts the set */ 2566 if (snarf_set(sp, stale_bool, ep) != 0) { 2567 if (mdismddberror(ep, MDE_DB_STALE)) { 2568 /* 2569 * Don't fail join, STALE means that set has 2570 * < 50% mddbs. 
2571 */ 2572 (void) mdstealerror(&ep_snarf, ep); 2573 stale_set = 1; 2574 } else if (mdisok(ep)) { 2575 /* If snarf failed, but no error was set - set it */ 2576 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2577 sp->setno, 0, NULL); 2578 rval = -1; 2579 goto out; 2580 } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) { 2581 /* 2582 * Don't fail join if ACCOK; ACCOK means that mediator 2583 * provided extra vote. 2584 */ 2585 rval = -1; 2586 goto out; 2587 } 2588 } 2589 2590 /* Did set really get snarfed? */ 2591 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) { 2592 if (mdisok(ep)) { 2593 /* If snarf failed, but no error was set - set it */ 2594 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2595 sp->setno, 0, NULL); 2596 } 2597 mde_perror(ep, dgettext(TEXT_DOMAIN, 2598 "Host not able to start diskset.")); 2599 rval = -1; 2600 goto out; 2601 } 2602 2603 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 2604 send_reinit = 1; 2605 2606 /* If first node to enter set, setup master and clear change log */ 2607 if (master_flag) { 2608 /* Set master in my locally cached set descriptor */ 2609 (void) strcpy(sd->sd_mn_master_nodenm, 2610 sd->sd_mn_mynode->nd_nodename); 2611 sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid; 2612 sd->sd_mn_am_i_master = 1; 2613 2614 /* 2615 * If first node to join set, then clear out change log 2616 * entries. Change log entries are only needed when a 2617 * change of master is occurring in a diskset that has 2618 * multiple owners. Since this node is the first owner 2619 * of the diskset, clear the entries. 2620 * 2621 * Only do this if we are in a single node non-SC3.x 2622 * situation. 
2623 */ 2624 if (meta_mn_singlenode() && 2625 mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) { 2626 mde_perror(ep, dgettext(TEXT_DOMAIN, 2627 "Unable to reset changelog.")); 2628 rval = -1; 2629 goto out; 2630 } 2631 } 2632 2633 /* Set my locally cached flag */ 2634 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 2635 2636 /* 2637 * Set this node's own flag on all joined nodes in the set 2638 * (including my node). 2639 */ 2640 clear_nr_flags = 1; 2641 2642 my_nd = *(sd->sd_mn_mynode); 2643 my_nd.nd_next = NULL; 2644 nd = sd->sd_nodelist; 2645 while (nd) { 2646 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2647 nd = nd->nd_next; 2648 continue; 2649 } 2650 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 2651 MD_NR_JOIN, NULL, ep)) { 2652 rval = -1; 2653 goto out; 2654 } 2655 nd = nd->nd_next; 2656 } 2657 2658 out: 2659 if (rval != NULL) { 2660 /* 2661 * If rollback flag is 1, then node was joined to set. 2662 * Since an error occurred, withdraw node from set in 2663 * order to rollback to before command was run. 2664 * Need to preserve ep so that calling function can 2665 * get error information. 2666 */ 2667 if (rb_flags == 1) { 2668 if (halt_set(sp, &xep)) { 2669 mdclrerror(&xep); 2670 } 2671 } 2672 2673 /* 2674 * If error, reset master to INVALID. 2675 * Ignore error since (next) first node to successfully join 2676 * will set master on all nodes. 2677 */ 2678 (void) clnt_mnsetmaster(mynode(), sp, "", 2679 MD_MN_INVALID_NID, &xep); 2680 mdclrerror(&xep); 2681 /* Reset master in my locally cached set descriptor */ 2682 sd->sd_mn_master_nodeid = MD_MN_INVALID_NID; 2683 sd->sd_mn_am_i_master = 0; 2684 2685 /* 2686 * If nr flags set on other nodes, reset them. 
2687 */ 2688 if (clear_nr_flags) { 2689 nd = sd->sd_nodelist; 2690 while (nd) { 2691 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2692 nd = nd->nd_next; 2693 continue; 2694 } 2695 (void) clnt_upd_nr_flags(nd->nd_nodename, sp, 2696 &my_nd, MD_NR_WITHDRAW, NULL, &xep); 2697 mdclrerror(&xep); 2698 nd = nd->nd_next; 2699 } 2700 /* Reset my locally cached flag */ 2701 sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN; 2702 } 2703 } 2704 2705 /* 2706 * Notify rpc.mdcommd on all nodes of a nodelist change. 2707 * Send reinit command to mdcommd which forces it to get 2708 * fresh set description. 2709 */ 2710 if (send_reinit) { 2711 /* Send reinit */ 2712 nd = sd->sd_nodelist; 2713 while (nd) { 2714 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2715 nd = nd->nd_next; 2716 continue; 2717 } 2718 2719 /* Class is ignored for REINIT */ 2720 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 2721 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 2722 /* 2723 * We are here because we failed to resume 2724 * rpc.mdcommd. However we potentially have 2725 * an error from the previous call 2726 * If the previous call did fail, we capture 2727 * that error and generate a perror with 2728 * the string, "Unable to resume...". 2729 * Setting rval to -1 ensures that in the 2730 * next iteration of the loop, ep is not 2731 * clobbered. 2732 */ 2733 if (rval == 0) 2734 (void) mdstealerror(ep, &xep); 2735 else 2736 mdclrerror(&xep); 2737 rval = -1; 2738 mde_perror(ep, dgettext(TEXT_DOMAIN, 2739 "Unable to reinit rpc.mdcommd.")); 2740 } 2741 nd = nd->nd_next; 2742 } 2743 2744 } 2745 2746 out2: 2747 /* 2748 * Unlock diskset by resuming messages across the diskset. 2749 * Just resume all classes so that resume is the same whether 2750 * just one class was locked or all classes were locked. 
2751 */ 2752 if ((suspend1_flag) || (suspendall_flag)) { 2753 nd = sd->sd_nodelist; 2754 while (nd) { 2755 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2756 nd = nd->nd_next; 2757 continue; 2758 } 2759 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 2760 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 2761 /* 2762 * We are here because we failed to resume 2763 * rpc.mdcommd. However we potentially have 2764 * an error from the previous call 2765 * If the previous call did fail, we capture 2766 * that error and generate a perror with 2767 * the string, "Unable to resume...". 2768 * Setting rval to -1 ensures that in the 2769 * next iteration of the loop, ep is not 2770 * clobbered. 2771 */ 2772 if (rval == 0) 2773 (void) mdstealerror(ep, &xep); 2774 else 2775 mdclrerror(&xep); 2776 rval = -1; 2777 mde_perror(ep, dgettext(TEXT_DOMAIN, 2778 "Unable to resume rpc.mdcommd.")); 2779 } 2780 nd = nd->nd_next; 2781 } 2782 meta_ping_mnset(sp->setno); 2783 } 2784 2785 /* 2786 * Unlock set. This flushes the caches on the servers. 2787 */ 2788 cl_sk = cl_get_setkey(sp->setno, sp->setname); 2789 nd = sd->sd_nodelist; 2790 while (nd) { 2791 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2792 nd = nd->nd_next; 2793 continue; 2794 } 2795 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 2796 if (rval == 0) 2797 (void) mdstealerror(ep, &xep); 2798 else 2799 mdclrerror(&xep); 2800 rval = -1; 2801 } 2802 nd = nd->nd_next; 2803 } 2804 2805 /* 2806 * If this node is the last to join the diskset and clustering isn't 2807 * running, then resync the mirrors in the diskset. We have to wait 2808 * until all nodes are joined so that the status gets propagated to 2809 * all of the members of the set. 2810 * Ignore any error from the resync as the join function shouldn't fail 2811 * because the mirror resync had a problem. 2812 * 2813 * Don't start resync if set is stale. 
2814 */ 2815 if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) && 2816 (stale_set != 1)) { 2817 nd = sd->sd_nodelist; 2818 while (nd) { 2819 if (!(nd->nd_flags & MD_MN_NODE_OWN)) 2820 break; 2821 nd = nd->nd_next; 2822 } 2823 /* 2824 * nd set to NULL means that we have no nodes in the set that 2825 * haven't joined. In this case we start the resync. 2826 */ 2827 if (nd == NULL) { 2828 (void) meta_mirror_resync_all(sp, 0, &xep); 2829 mdclrerror(&xep); 2830 } 2831 } 2832 2833 /* Update ABR state for all soft partitions */ 2834 (void) meta_sp_update_abr(sp, &xep); 2835 mdclrerror(&xep); 2836 2837 /* 2838 * call metaflushsetnames to reset local cache for master and 2839 * node information. 2840 */ 2841 metaflushsetname(sp); 2842 2843 /* release signals back to what they were on entry */ 2844 if (procsigs(FALSE, &oldsigs, &xep) < 0) 2845 mdclrerror(&xep); 2846 2847 /* 2848 * If no error and stale_set is set, then set ep back 2849 * to ep from snarf_set call and return -3. If another error 2850 * occurred and rval is not 0, then that error would have 2851 * caused the node to be withdrawn from the set and would 2852 * have set ep to that error information. 2853 */ 2854 if ((rval == 0) && (stale_set)) { 2855 (void) mdstealerror(ep, &ep_snarf); 2856 return (-3); 2857 } 2858 2859 return (rval); 2860 } 2861 2862 /* 2863 * Entry point to withdraw a node from MultiNode diskset. 2864 * 2865 * Validate host in diskset. 2866 * - Should be joined into diskset. 2867 * Assume valid configuration is stored in the set/drive/node records 2868 * in the local mddb since no node or drive can be added to the MNset 2869 * unless all drives and nodes are available. Reconfig steps will 2870 * resync all ALIVE nodes in case of panic in critical areas. 2871 * 2872 * Lock down the set. 2873 * Verify that drives exist in configuration. 2874 * Verify host is a member of this diskset. 2875 * Verify host is an owner of the diskset (host is joined to diskset). 
2876 * Only allow withdrawal of master node if master node is the only joined 2877 * in the diskset. 2878 * Halt the diskset on this node. 2879 * Reset Master on this node. 2880 * Updated node flags that this node with withdrawn. 2881 * Unlock the set. 2882 * 2883 * Return values: 2884 * 0 - Node successfully withdrew from set. 2885 * -1 - Withdrawal attempted but failed 2886 * - any failure from libmeta calls 2887 * - node not in the member list 2888 * -2 - Withdrawal not attempted since 2889 * - this set had no drives in set 2890 * - this node not joined to set 2891 * - set is not a multinode set 2892 */ 2893 extern int 2894 meta_set_withdraw( 2895 mdsetname_t *sp, 2896 md_error_t *ep 2897 ) 2898 { 2899 md_set_desc *sd; 2900 md_drive_desc *dd = 0; 2901 md_mnnode_desc *nd, my_nd; 2902 int rval = 0; 2903 md_setkey_t *cl_sk; 2904 md_error_t xep = mdnullerror; 2905 int set_halted = 0; 2906 int suspendall_flag = 0; 2907 int suspend1_flag = 0; 2908 bool_t stale_bool = FALSE; 2909 mddb_config_t c; 2910 int node_id_list[1]; 2911 sigset_t oldsigs; 2912 int send_reinit = 0; 2913 2914 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2915 return (-1); 2916 } 2917 2918 /* Must be a multinode diskset */ 2919 if (!MD_MNSET_DESC(sd)) { 2920 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2921 return (-1); 2922 } 2923 2924 /* Make sure we are blocking all signals */ 2925 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2926 mdclrerror(&xep); 2927 2928 /* 2929 * Lock the set on current set members. 2930 * For MN diskset lock_set and SUSPEND are used to protect against 2931 * other meta* commands running on the other nodes. 2932 */ 2933 nd = sd->sd_nodelist; 2934 while (nd) { 2935 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2936 nd = nd->nd_next; 2937 continue; 2938 } 2939 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2940 rval = -1; 2941 goto out; 2942 } 2943 nd = nd->nd_next; 2944 } 2945 /* 2946 * Lock out other meta* commands by suspending 2947 * class 1 messages across the diskset. 
2948 */ 2949 nd = sd->sd_nodelist; 2950 while (nd) { 2951 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2952 nd = nd->nd_next; 2953 continue; 2954 } 2955 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2956 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2957 rval = -1; 2958 goto out; 2959 } 2960 suspend1_flag = 1; 2961 nd = nd->nd_next; 2962 } 2963 2964 /* Get list of drives - needed in case of failure */ 2965 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2966 ep)) == NULL) { 2967 /* Error getting drives in list */ 2968 if (! mdisok(ep)) { 2969 rval = -1; 2970 goto out2; 2971 } 2972 /* no drives in list */ 2973 rval = -2; 2974 goto out2; 2975 } 2976 2977 /* 2978 * Verify that this host is a member (in the host list) of the set. 2979 */ 2980 nd = sd->sd_nodelist; 2981 while (nd) { 2982 if (strcmp(mynode(), nd->nd_nodename) == 0) { 2983 break; 2984 } 2985 nd = nd->nd_next; 2986 } 2987 if (!nd) { 2988 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2989 sd->sd_mn_mynode->nd_nodename, NULL, 2990 sp->setname); 2991 rval = -1; 2992 goto out2; 2993 } 2994 2995 /* 2996 * Call metaget_setownership that calls each node in diskset and 2997 * marks in set descriptor if node is an owner of the set or not. 2998 * metaget_setownership checks to see if a node is an owner by 2999 * checking to see if that node's kernel has the mddb loaded. 3000 * If a node had panic'd during a reconfig or an 3001 * add/delete/join/withdraw operation, the other nodes' node 3002 * records may not reflect the current state of the diskset, 3003 * so calling metaget_setownership is the safest thing to do. 3004 */ 3005 if (metaget_setownership(sp, ep) == -1) { 3006 rval = -1; 3007 goto out2; 3008 } 3009 3010 /* 3011 * Verify that this node is joined 3012 * to diskset (i.e. is an owner of the diskset). 
3013 */ 3014 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 3015 rval = -2; 3016 goto out2; 3017 } 3018 3019 /* 3020 * For a MN diskset, only withdraw master if it is 3021 * the only joined node. 3022 */ 3023 if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) { 3024 nd = sd->sd_nodelist; 3025 while (nd) { 3026 /* Skip my node since checking for other owners */ 3027 if (nd->nd_nodeid == sd->sd_mn_master_nodeid) { 3028 nd = nd->nd_next; 3029 continue; 3030 } 3031 /* If another owner node if found, error */ 3032 if (nd->nd_flags & MD_MN_NODE_OWN) { 3033 (void) mddserror(ep, MDE_DS_WITHDRAWMASTER, 3034 sp->setno, 3035 sd->sd_mn_mynode->nd_nodename, NULL, 3036 sp->setname); 3037 rval = -1; 3038 goto out2; 3039 } 3040 nd = nd->nd_next; 3041 } 3042 } 3043 3044 /* 3045 * Is current set STALE? 3046 */ 3047 (void) memset(&c, 0, sizeof (c)); 3048 c.c_id = 0; 3049 c.c_setno = sp->setno; 3050 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 3051 (void) mdstealerror(ep, &c.c_mde); 3052 rval = -1; 3053 goto out; 3054 } 3055 if (c.c_flags & MDDB_C_STALE) { 3056 stale_bool = TRUE; 3057 } 3058 3059 /* 3060 * Notify rpc.mdcommd on all nodes of a nodelist change. 3061 * Start by suspending rpc.mdcommd (which drains it of all messages), 3062 * then change the nodelist followed by a reinit and resume. 3063 */ 3064 nd = sd->sd_nodelist; 3065 while (nd) { 3066 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3067 nd = nd->nd_next; 3068 continue; 3069 } 3070 3071 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 3072 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 3073 rval = -1; 3074 goto out; 3075 } 3076 suspendall_flag = 1; 3077 nd = nd->nd_next; 3078 } 3079 3080 /* 3081 * Withdraw the set - halt set. 3082 * This will fail if any I/O is occuring to any metadevice which 3083 * includes a resync to a mirror metadevice. 3084 */ 3085 set_halted = 1; 3086 if (halt_set(sp, ep)) { 3087 /* Was set actually halted? 
*/ 3088 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) { 3089 set_halted = 0; 3090 } 3091 rval = -1; 3092 goto out; 3093 } 3094 3095 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 3096 send_reinit = 1; 3097 3098 /* Reset master on withdrawn node */ 3099 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "", 3100 MD_MN_INVALID_NID, ep)) { 3101 rval = -1; 3102 goto out; 3103 } 3104 3105 /* Mark my node as withdrawn and send to other nodes */ 3106 nd = sd->sd_nodelist; 3107 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 3108 my_nd.nd_next = NULL; 3109 while (nd) { 3110 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3111 nd = nd->nd_next; 3112 continue; 3113 } 3114 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3115 MD_NR_WITHDRAW, NULL, ep)) { 3116 rval = -1; 3117 goto out; 3118 } 3119 nd = nd->nd_next; 3120 } 3121 3122 /* 3123 * If withdrawn node is a mirror owner, reset mirror owner 3124 * to NULL. If an error occurs, print a warning and continue. 3125 * Don't fail metaset because of mirror owner reset problem since 3126 * next node to grab mirror will resolve this issue. 3127 * Before next node grabs mirrors, metaset will show the withdrawn 3128 * node as owner which is why an attempt to reset the mirror owner 3129 * is made. 
3130 */ 3131 node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */ 3132 nd = sd->sd_nodelist; 3133 while (nd) { 3134 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3135 nd = nd->nd_next; 3136 continue; 3137 } 3138 if (clnt_reset_mirror_owner(nd->nd_nodename, sp, 3139 1, &node_id_list[0], &xep) == 01) { 3140 mde_perror(&xep, dgettext(TEXT_DOMAIN, 3141 "Unable to reset mirror owner on node %s"), 3142 nd->nd_nodename); 3143 mdclrerror(&xep); 3144 } 3145 nd = nd->nd_next; 3146 } 3147 3148 out: 3149 if (rval == -1) { 3150 /* Rejoin node - Mark node as joined and send to other nodes */ 3151 nd = sd->sd_nodelist; 3152 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 3153 my_nd.nd_next = NULL; 3154 while (nd) { 3155 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3156 nd = nd->nd_next; 3157 continue; 3158 } 3159 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3160 MD_NR_JOIN, NULL, &xep)) { 3161 mdclrerror(&xep); 3162 } 3163 nd = nd->nd_next; 3164 } 3165 3166 /* Set master on withdrawn node */ 3167 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, 3168 sd->sd_mn_master_nodenm, 3169 sd->sd_mn_master_nodeid, &xep)) { 3170 mdclrerror(&xep); 3171 } 3172 3173 /* Join set if halt_set had succeeded */ 3174 if (set_halted) { 3175 /* 3176 * Causes mddbs to be loaded into the kernel. 3177 * Set the force flag so that replica locations can be 3178 * loaded into the kernel even if a mediator node was 3179 * unavailable. This allows a node to join an MO 3180 * diskset when there are sufficient replicas available, 3181 * but a mediator node in unavailable. 3182 */ 3183 if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) { 3184 mdclrerror(&xep); 3185 } 3186 /* If set previously stale - make it so at re-join */ 3187 if (snarf_set(sp, stale_bool, &xep) != 0) { 3188 mdclrerror(&xep); 3189 (void) halt_set(sp, &xep); 3190 mdclrerror(&xep); 3191 } 3192 } 3193 } 3194 3195 /* 3196 * Notify rpc.mdcommd on all nodes of a nodelist change. 
3197 * Send reinit command to mdcommd which forces it to get 3198 * fresh set description. 3199 */ 3200 if (send_reinit) { 3201 /* Send reinit */ 3202 nd = sd->sd_nodelist; 3203 while (nd) { 3204 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3205 nd = nd->nd_next; 3206 continue; 3207 } 3208 3209 /* Class is ignored for REINIT */ 3210 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 3211 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 3212 /* 3213 * We are here because we failed to resume 3214 * rpc.mdcommd. However we potentially have 3215 * an error from the previous call. 3216 * If the previous call did fail, we 3217 * capture that error and generate a perror 3218 * withthe string, "Unable to resume...". 3219 * Setting rval to -1 ensures that in the 3220 * next iteration of the loop, ep is not 3221 * clobbered. 3222 */ 3223 if (rval == 0) 3224 (void) mdstealerror(ep, &xep); 3225 else 3226 mdclrerror(&xep); 3227 rval = -1; 3228 mde_perror(ep, dgettext(TEXT_DOMAIN, 3229 "Unable to reinit rpc.mdcommd.")); 3230 } 3231 nd = nd->nd_next; 3232 } 3233 } 3234 3235 out2: 3236 /* 3237 * Unlock diskset by resuming messages across the diskset. 3238 * Just resume all classes so that resume is the same whether 3239 * just one class was locked or all classes were locked. 3240 */ 3241 if ((suspend1_flag) || (suspendall_flag)) { 3242 nd = sd->sd_nodelist; 3243 while (nd) { 3244 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3245 nd = nd->nd_next; 3246 continue; 3247 } 3248 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 3249 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 3250 /* 3251 * We are here because we failed to resume 3252 * rpc.mdcommd. However we potentially have 3253 * an error from the previous call 3254 * If the previous call did fail, we capture 3255 * that error and generate a perror with 3256 * the string, "Unable to resume...". 3257 * Setting rval to -1 ensures that in the 3258 * next iteration of the loop, ep is not 3259 * clobbered. 
3260 */ 3261 if (rval == 0) 3262 (void) mdstealerror(ep, &xep); 3263 else 3264 mdclrerror(&xep); 3265 rval = -1; 3266 mde_perror(ep, dgettext(TEXT_DOMAIN, 3267 "Unable to resume rpc.mdcommd.")); 3268 } 3269 nd = nd->nd_next; 3270 } 3271 meta_ping_mnset(sp->setno); 3272 } 3273 3274 /* 3275 * Unlock set. This flushes the caches on the servers. 3276 */ 3277 cl_sk = cl_get_setkey(sp->setno, sp->setname); 3278 nd = sd->sd_nodelist; 3279 while (nd) { 3280 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3281 nd = nd->nd_next; 3282 continue; 3283 } 3284 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 3285 if (rval == 0) 3286 (void) mdstealerror(ep, &xep); 3287 else 3288 mdclrerror(&xep); 3289 rval = -1; 3290 } 3291 nd = nd->nd_next; 3292 } 3293 3294 /* 3295 * call metaflushsetnames to reset local cache for master and 3296 * node information. 3297 */ 3298 metaflushsetname(sp); 3299 3300 /* release signals back to what they were on entry */ 3301 if (procsigs(FALSE, &oldsigs, &xep) < 0) 3302 mdclrerror(&xep); 3303 3304 return (rval); 3305 3306 } 3307 3308 /* 3309 * Update nodelist with cluster member information. 3310 * A node not in the member list will be marked 3311 * as not ALIVE and not OWN. 3312 * A node in the member list will be marked ALIVE, but 3313 * the OWN bit will not be changed. 3314 * 3315 * If mynode isn't in the membership list, fail causing 3316 * another reconfig cycle to be started since a non-member 3317 * node shouldn't be taking part in the reconfig cycle. 3318 * 3319 * Return values: 3320 * 0 - No problem. 3321 * 1 - Any failure including RPC failure to my node. 3322 */ 3323 int 3324 meta_reconfig_update_nodelist( 3325 mdsetname_t *sp, 3326 mndiskset_membershiplist_t *nl, 3327 md_set_desc *sd, 3328 md_error_t *ep 3329 ) 3330 { 3331 mndiskset_membershiplist_t *nl2; 3332 md_mnnode_desc *nd; 3333 md_error_t xep = mdnullerror; 3334 int rval = 0; 3335 3336 /* 3337 * Walk through nodelist, checking to see if each 3338 * node is in the member list. 
3339 * If node is not a member, reset ALIVE and OWN node flag. 3340 * If node is a member, set ALIVE. 3341 * If mynode's OWN flag gets reset, then halt the diskset on this node. 3342 */ 3343 nd = sd->sd_nodelist; 3344 while (nd) { 3345 nl2 = nl; 3346 while (nl2) { 3347 /* If node is in member list, set ALIVE */ 3348 if (nl2->msl_node_id == nd->nd_nodeid) { 3349 nd->nd_flags |= MD_MN_NODE_ALIVE; 3350 break; 3351 } else { 3352 nl2 = nl2->next; 3353 } 3354 /* node is not in member list, mark !ALIVE and !OWN */ 3355 if (nl2 == NULL) { 3356 /* If node is mynode, then halt set if needed */ 3357 if (strcmp(mynode(), nd->nd_nodename) == 0) { 3358 /* 3359 * This shouldn't happen, but just 3360 * in case... Any node not in the 3361 * membership list should be dead and 3362 * not running reconfig step1. 3363 */ 3364 if (nd->nd_flags & MD_MN_NODE_OWN) { 3365 if (halt_set(sp, &xep)) { 3366 mde_perror(&xep, ""); 3367 mdclrerror(&xep); 3368 } 3369 } 3370 /* 3371 * Return failure since this node 3372 * (mynode) is not in the membership 3373 * list, but process the rest of the 3374 * nodelist first so that rpc.metad 3375 * can be updated with the latest 3376 * membership information. 3377 */ 3378 (void) mddserror(ep, 3379 MDE_DS_NOTINMEMBERLIST, 3380 sp->setno, nd->nd_nodename, NULL, 3381 sp->setname); 3382 rval = 1; 3383 } 3384 nd->nd_flags &= ~MD_MN_NODE_ALIVE; 3385 nd->nd_flags &= ~MD_MN_NODE_OWN; 3386 } 3387 } 3388 nd = nd->nd_next; 3389 } 3390 3391 /* Send this information to rpc.metad */ 3392 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 3393 MD_NR_SET, MNSET_IN_RECONFIG, &xep)) { 3394 /* Return failure if can't send node flags to rpc.metad */ 3395 if (rval == 0) { 3396 (void) mdstealerror(ep, &xep); 3397 rval = 1; 3398 } 3399 } 3400 return (rval); 3401 } 3402 3403 /* 3404 * Choose master determines the master for a diskset. 
3405 * Each node determines the master on its own and 3406 * adds this information to its local rpc.metad nodelist 3407 * and also sends it to the kernel. 3408 * 3409 * Nodelist in set descriptor (sd) is sorted in 3410 * monotonically increasing sequence of nodeid. 3411 * 3412 * Return values: 3413 * 0 - No problem. 3414 * 205 - There was an RPC problem to another node. 3415 * -1 - There was an error. This could be an RPC error to my node. 3416 * This is a catastrophic failure causing node to panic. 3417 */ 3418 int 3419 meta_reconfig_choose_master_for_set( 3420 mdsetname_t *sp, 3421 md_set_desc *sd, 3422 md_error_t *ep 3423 ) 3424 { 3425 int is_owner; 3426 md_mnset_record *mnsr = NULL; 3427 int lowest_alive_nodeid = 0; 3428 uint_t master_nodeid; 3429 md_mnnode_desc *nd, *nd2; 3430 md_mnnode_record *nr; 3431 md_drive_desc *dd; 3432 md_setkey_t *cl_sk; 3433 int rval = 0; 3434 md_error_t xep = mdnullerror; 3435 mddb_setflags_config_t sf; 3436 3437 /* 3438 * Is current node joined to diskset? 3439 * Don't trust flags, really check to see if mddb is snarfed. 3440 */ 3441 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 3442 /* 3443 * If a node is joined to the diskset, this node checks 3444 * to see if the current master of the diskset is valid and 3445 * is still in the membership list (ALIVE) and is 3446 * still joined (OWN). Need to verify if master is 3447 * really joined - don't trust the flags. (Can trust 3448 * ALIVE since set during earlier part of reconfig cycle.) 3449 * If the current master is valid, still in the membership 3450 * list and joined, then master is not changed on this node. 3451 * Just return. 3452 * 3453 * Verify that nodeid is valid before accessing masternode. 
3454 */ 3455 if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) && 3456 (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) { 3457 if (clnt_ownset(sd->sd_mn_master_nodenm, sp, 3458 &is_owner, ep) == -1) { 3459 /* If RPC failure to another node return 205 */ 3460 if ((mdanyrpcerror(ep)) && 3461 (sd->sd_mn_mynode->nd_nodeid != 3462 sd->sd_mn_master_nodeid)) { 3463 return (205); 3464 } else { 3465 /* Any other failure */ 3466 return (-1); 3467 } 3468 } else { 3469 if (is_owner == TRUE) { 3470 3471 meta_mc_log(MC_LOG5, dgettext( 3472 TEXT_DOMAIN, "Set %s previous " 3473 "master chosen %s (%d): %s"), 3474 sp->setname, 3475 sd->sd_mn_master_nodenm, 3476 sd->sd_mn_master_nodeid, 3477 meta_print_hrtime(gethrtime() - 3478 start_time)); 3479 3480 /* Previous master is ok - done */ 3481 return (0); 3482 } 3483 } 3484 } 3485 3486 /* 3487 * If current master is no longer in the membership list or 3488 * is no longer joined, then this node uses the following 3489 * algorithm: 3490 * - node calls RPC routine clnt_ownset to get latest 3491 * information on which nodes are owners of diskset. 3492 * clnt_ownset checks on each node to see if its kernel 3493 * has that diskset snarfed. 3494 */ 3495 nd = sd->sd_nodelist; 3496 while (nd) { 3497 /* Don't consider node that isn't in member list */ 3498 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3499 nd = nd->nd_next; 3500 continue; 3501 } 3502 3503 if (clnt_ownset(nd->nd_nodename, sp, 3504 &is_owner, ep) == -1) { 3505 /* If RPC failure to another node return 205 */ 3506 if ((mdanyrpcerror(ep)) && 3507 (sd->sd_mn_mynode->nd_nodeid != 3508 nd->nd_nodeid)) { 3509 return (205); 3510 } else { 3511 /* Any other failure */ 3512 return (-1); 3513 } 3514 } 3515 3516 /* 3517 * Set owner flag for each node based on whether 3518 * that node really has a diskset mddb snarfed in 3519 * or not. 
3520 */ 3521 if (is_owner == TRUE) 3522 nd->nd_flags |= MD_MN_NODE_OWN; 3523 else 3524 nd->nd_flags &= ~MD_MN_NODE_OWN; 3525 3526 nd = nd->nd_next; 3527 } 3528 3529 /* 3530 * - node walks through nodelist looking for nodes that are 3531 * owners of the diskset that are in the membership list. 3532 * - for each owner, node calls RPC routine clnt_getset to 3533 * see if that node has its node record set to OK. 3534 * - If so, master is chosen to be this owner node. 3535 */ 3536 nd = sd->sd_nodelist; 3537 while (nd) { 3538 /* Don't consider node that isn't in member list */ 3539 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3540 nd = nd->nd_next; 3541 continue; 3542 } 3543 3544 /* Don't consider a node that isn't an owner */ 3545 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3546 nd = nd->nd_next; 3547 continue; 3548 } 3549 3550 /* Does node has its own node record set to OK? */ 3551 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3552 MD_SET_BAD, &mnsr, ep) == -1) { 3553 /* If RPC failure to another node return 205 */ 3554 if ((mdanyrpcerror(ep)) && 3555 (sd->sd_mn_mynode->nd_nodeid != 3556 nd->nd_nodeid)) { 3557 return (205); 3558 } else { 3559 /* Any other failure */ 3560 return (-1); 3561 } 3562 } 3563 nr = mnsr->sr_nodechain; 3564 while (nr) { 3565 if (nd->nd_nodeid == nr->nr_nodeid) { 3566 if (nr->nr_flags & MD_MN_NODE_OK) { 3567 /* Found a master */ 3568 free_sr( 3569 (md_set_record *)mnsr); 3570 goto found_master; 3571 } 3572 } 3573 nr = nr->nr_next; 3574 } 3575 free_sr((md_set_record *)mnsr); 3576 nd = nd->nd_next; 3577 } 3578 3579 /* 3580 * - If no owner node has its own node record on its own node 3581 * set to OK, then this node checks all of the non-owner 3582 * nodes that are in the membership list. 3583 * - for each non-owner, node calls RPC routine clnt_getset to 3584 * see if that node has its node record set to OK. 3585 * - If set doesn't exist, don't choose node for master. 3586 * - If so, master is chosen to be this non-owner node. 
3587 * 3588 */ 3589 nd = sd->sd_nodelist; 3590 while (nd) { 3591 /* Don't consider node that isn't in member list */ 3592 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3593 nd = nd->nd_next; 3594 continue; 3595 } 3596 3597 /* Only checking non-owner nodes this time around */ 3598 if (nd->nd_flags & MD_MN_NODE_OWN) { 3599 nd = nd->nd_next; 3600 continue; 3601 } 3602 3603 /* Does node has its own node record set to OK? */ 3604 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3605 MD_SET_BAD, &mnsr, ep) == -1) { 3606 /* 3607 * If set doesn't exist on non-owner node, 3608 * don't consider this node for master. 3609 */ 3610 if (mdiserror(ep, MDE_NO_SET)) { 3611 nd = nd->nd_next; 3612 continue; 3613 } else if ((mdanyrpcerror(ep)) && 3614 (sd->sd_mn_mynode->nd_nodeid != 3615 nd->nd_nodeid)) { 3616 /* RPC failure to another node */ 3617 return (205); 3618 } else { 3619 /* Any other failure */ 3620 return (-1); 3621 } 3622 } 3623 nr = mnsr->sr_nodechain; 3624 while (nr) { 3625 if (nd->nd_nodeid == nr->nr_nodeid) { 3626 if (nr->nr_flags & MD_MN_NODE_OK) { 3627 /* Found a master */ 3628 free_sr( 3629 (md_set_record *)mnsr); 3630 goto found_master; 3631 } 3632 } 3633 nr = nr->nr_next; 3634 } 3635 free_sr((md_set_record *)mnsr); 3636 nd = nd->nd_next; 3637 } 3638 3639 /* 3640 * - If no node can be found that has its own node record on 3641 * its node to be set to OK, then all alive nodes 3642 * were in the process of being added to or deleted 3643 * from set. Each alive node will remove all 3644 * information pertaining to this set from its node. 3645 * 3646 * If all nodes in set are ALIVE, then call sdssc end routines 3647 * since set was truly being initially created or destroyed. 3648 */ 3649 goto delete_set; 3650 } else { 3651 3652 /* 3653 * If node is not joined to diskset, then this 3654 * node uses the following algorithm: 3655 * - If unjoined node doesn't have a node record for itself, 3656 * just delete the diskset since diskset was in the 3657 * process of being created. 
3658 * - node needs to find master of diskset before 3659 * reconfig cycle, if a master existed. 3660 * - node calls RPC routine clnt_ownset to get latest 3661 * information on which nodes are owners of diskset. 3662 * clnt_ownset checks on each node to see if its 3663 * kernel has that diskset snarfed. 3664 */ 3665 3666 /* 3667 * Is my node in the set description? 3668 * If not, delete the set from this node. 3669 * sr2setdesc sets sd_mn_mynode pointer to the node 3670 * descriptor for this node if there was a node 3671 * record for this node. 3672 * 3673 */ 3674 if (sd->sd_mn_mynode == NULL) { 3675 goto delete_set; 3676 } 3677 3678 nd = sd->sd_nodelist; 3679 while (nd) { 3680 /* Don't consider node that isn't in member list */ 3681 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3682 nd = nd->nd_next; 3683 continue; 3684 } 3685 3686 if (clnt_ownset(nd->nd_nodename, sp, 3687 &is_owner, ep) == -1) { 3688 /* If RPC failure to another node return 205 */ 3689 if ((mdanyrpcerror(ep)) && 3690 (sd->sd_mn_mynode->nd_nodeid != 3691 nd->nd_nodeid)) { 3692 return (205); 3693 } else { 3694 /* Any other failure */ 3695 return (-1); 3696 } 3697 } 3698 3699 /* 3700 * Set owner flag for each node based on whether 3701 * that node really has a diskset mddb snarfed in 3702 * or not. 3703 */ 3704 if (is_owner == TRUE) 3705 nd->nd_flags |= MD_MN_NODE_OWN; 3706 else 3707 nd->nd_flags &= ~MD_MN_NODE_OWN; 3708 3709 nd = nd->nd_next; 3710 } 3711 3712 /* 3713 * - node walks through nodelist looking for nodes that 3714 * are owners of the diskset that are in 3715 * the membership list. 3716 * - for each owner, node calls RPC routine clnt_getset to 3717 * see if that node has a master set and to get the 3718 * diskset description. 3719 * - If the owner node has a set description that doesn't 3720 * include the non-joined node in the nodelist, this node 3721 * removes its set description of that diskset 3722 * (i.e. removes the set from its local mddbs). 
This is 3723 * handling the case of when a node was removed from a 3724 * diskset while it was not in the cluster membership 3725 * list. 3726 * - If that node has a master set and the master is in the 3727 * membership list and is an owner, then either this was 3728 * the master from before the reconfig cycle or this 3729 * node has already chosen a new master - either way, 3730 * the master value is valid as long as it is in the 3731 * membership list and is an owner 3732 * - master is chosen to be owner node's master 3733 */ 3734 nd = sd->sd_nodelist; 3735 while (nd) { 3736 /* Don't consider node that isn't in member list */ 3737 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3738 nd = nd->nd_next; 3739 continue; 3740 } 3741 3742 /* Don't consider a node that isn't an owner */ 3743 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3744 nd = nd->nd_next; 3745 continue; 3746 } 3747 3748 /* Get owner node's set record */ 3749 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3750 MD_SET_BAD, &mnsr, ep) == -1) { 3751 /* If RPC failure to another node return 205 */ 3752 if ((mdanyrpcerror(ep)) && 3753 (sd->sd_mn_mynode->nd_nodeid != 3754 nd->nd_nodeid)) { 3755 return (205); 3756 } else { 3757 /* Any other failure */ 3758 return (-1); 3759 } 3760 } 3761 3762 /* Is this node in the owner node's set record */ 3763 nr = mnsr->sr_nodechain; 3764 while (nr) { 3765 if (sd->sd_mn_mynode->nd_nodeid == 3766 nr->nr_nodeid) { 3767 break; 3768 } 3769 nr = nr->nr_next; 3770 } 3771 if (nr == NULL) { 3772 /* my node not found - delete set */ 3773 free_sr((md_set_record *)mnsr); 3774 goto delete_set; 3775 } 3776 3777 /* Is owner's node's master valid? 
*/ 3778 master_nodeid = mnsr->sr_master_nodeid; 3779 free_sr((md_set_record *)mnsr); 3780 if (master_nodeid == MD_MN_INVALID_NID) { 3781 nd = nd->nd_next; 3782 continue; 3783 } 3784 3785 nd2 = sd->sd_nodelist; 3786 while (nd2) { 3787 if ((nd2->nd_nodeid == master_nodeid) && 3788 (nd2->nd_flags & MD_MN_NODE_ALIVE) && 3789 (nd2->nd_flags & MD_MN_NODE_OWN)) { 3790 nd = nd2; 3791 goto found_master; 3792 } 3793 nd2 = nd2->nd_next; 3794 } 3795 nd = nd->nd_next; 3796 } 3797 3798 /* 3799 * - If no owner node has a valid master, then follow 3800 * algorithm of when a node is joined to the diskset. 3801 * - node walks through nodelist looking for nodes that are 3802 * owners of the diskset that are in the membership list. 3803 * - for each owner, node calls RPC routine clnt_getset to 3804 * see if that node has its node record set to OK. 3805 * - If so, master is chosen to be this owner node. 3806 */ 3807 nd = sd->sd_nodelist; 3808 while (nd) { 3809 /* Don't consider node that isn't in member list */ 3810 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3811 nd = nd->nd_next; 3812 continue; 3813 } 3814 3815 /* Don't consider a node that isn't an owner */ 3816 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3817 nd = nd->nd_next; 3818 continue; 3819 } 3820 3821 /* Does node has its own node record set to OK? 
*/ 3822 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3823 MD_SET_BAD, &mnsr, ep) == -1) { 3824 /* If RPC failure to another node return 205 */ 3825 if ((mdanyrpcerror(ep)) && 3826 (sd->sd_mn_mynode->nd_nodeid != 3827 nd->nd_nodeid)) { 3828 return (205); 3829 } else { 3830 /* Any other failure */ 3831 return (-1); 3832 } 3833 } 3834 nr = mnsr->sr_nodechain; 3835 while (nr) { 3836 if (nd->nd_nodeid == nr->nr_nodeid) { 3837 if (nr->nr_flags & MD_MN_NODE_OK) { 3838 /* Found a master */ 3839 free_sr( 3840 (md_set_record *)mnsr); 3841 goto found_master; 3842 } 3843 } 3844 nr = nr->nr_next; 3845 } 3846 free_sr((md_set_record *)mnsr); 3847 nd = nd->nd_next; 3848 } 3849 3850 /* 3851 * - If no owner node has its own node record on its own node 3852 * set to OK, then this node checks all of the non-owner 3853 * nodes that are in the membership list. 3854 * - for each non-owner, node calls RPC routine clnt_getset to 3855 * see if that node has its node record set to OK. 3856 * - If set doesn't exist, don't choose node for master. 3857 * - If this node doesn't exist in the nodelist on any of the 3858 * non-owner nodes, this node removes its set description 3859 * of that diskset (i.e. removes the set from its local 3860 * mddbs). This is handling the case of when a node was 3861 * removed from a diskset while it was not in the 3862 * cluster membership list. 3863 * - If non-owner node has its node record set to OK and if 3864 * this node hasn't removed this diskset (step directly 3865 * before this one), then the master is chosen to be this 3866 * non-owner node. 
3867 */ 3868 nd = sd->sd_nodelist; 3869 while (nd) { 3870 /* Don't consider node that isn't in member list */ 3871 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3872 nd->nd_flags |= MD_MN_NODE_DEL; 3873 nd = nd->nd_next; 3874 continue; 3875 } 3876 3877 /* Don't consider owner nodes since none are OK */ 3878 if (nd->nd_flags & MD_MN_NODE_OWN) { 3879 nd->nd_flags |= MD_MN_NODE_DEL; 3880 nd = nd->nd_next; 3881 continue; 3882 } 3883 3884 /* 3885 * Don't need to get nodelist from my node since 3886 * this is where sd_nodelist was obtained. 3887 */ 3888 if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 3889 nd = nd->nd_next; 3890 continue; 3891 } 3892 3893 /* 3894 * If node has already been decided against for 3895 * master, then skip it. 3896 */ 3897 if (nd->nd_flags & MD_MN_NODE_DEL) { 3898 nd = nd->nd_next; 3899 continue; 3900 } 3901 3902 /* 3903 * Does node in my nodelist have its own node 3904 * record marked OK on its node? And does node 3905 * in my nodelist exist on all other nodes? 3906 * Don't want to choose a node for master unless 3907 * that node is marked OK on its own node and that 3908 * node exists on all other alive nodes. 3909 * 3910 * This is guarding against the case when several 3911 * nodes are down and one of the downed nodes is 3912 * deleted from the diskset. When the down nodes 3913 * are rebooted into the cluster, you don't want 3914 * any node to pick the deleted node as the master. 3915 */ 3916 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3917 MD_SET_BAD, &mnsr, ep) == -1) { 3918 /* 3919 * If set doesn't exist on non-owner node, 3920 * don't consider this node for master. 3921 */ 3922 if (mdiserror(ep, MDE_NO_SET)) { 3923 nd->nd_flags |= MD_MN_NODE_DEL; 3924 nd = nd->nd_next; 3925 continue; 3926 } else if (mdanyrpcerror(ep)) { 3927 /* RPC failure to another node */ 3928 return (205); 3929 } else { 3930 /* Any other failure */ 3931 return (-1); 3932 } 3933 } 3934 /* 3935 * Is my node in the nodelist gotten from the other 3936 * node? 
If not, then remove the set from my node 3937 * since set was deleted from my node while my node 3938 * was out of the cluster. 3939 */ 3940 nr = mnsr->sr_nodechain; 3941 while (nr) { 3942 if (sd->sd_mn_mynode->nd_nodeid == 3943 nr->nr_nodeid) { 3944 break; 3945 } 3946 nr = nr->nr_next; 3947 } 3948 if (nr == NULL) { 3949 /* my node not found - delete set */ 3950 free_sr((md_set_record *)mnsr); 3951 goto delete_set; 3952 } 3953 3954 /* Is node being checked marked OK on its own node? */ 3955 nr = mnsr->sr_nodechain; 3956 while (nr) { 3957 if (nd->nd_nodeid == nr->nr_nodeid) { 3958 if (!(nr->nr_flags & MD_MN_NODE_OK)) { 3959 nd->nd_flags |= MD_MN_NODE_DEL; 3960 } 3961 break; 3962 } 3963 nr = nr->nr_next; 3964 } 3965 /* 3966 * If node being checked doesn't exist on its 3967 * own node - don't choose it as master. 3968 */ 3969 if (nr == NULL) { 3970 nd->nd_flags |= MD_MN_NODE_DEL; 3971 } 3972 3973 /* 3974 * Check every node in my node's nodelist against 3975 * the nodelist gotten from the other node. 3976 * If a node in my node's nodelist is not found in the 3977 * other node's nodelist, then set the DEL flag. 3978 */ 3979 nd2 = sd->sd_nodelist; 3980 while (nd2) { 3981 nr = mnsr->sr_nodechain; 3982 while (nr) { 3983 if (nd2->nd_nodeid == nr->nr_nodeid) { 3984 break; 3985 } 3986 nr = nr->nr_next; 3987 } 3988 /* nd2 not found in other node's nodelist */ 3989 if (nr == NULL) { 3990 nd2->nd_flags |= MD_MN_NODE_DEL; 3991 } 3992 nd2 = nd2->nd_next; 3993 } 3994 3995 free_sr((md_set_record *)mnsr); 3996 nd = nd->nd_next; 3997 } 3998 3999 /* 4000 * Rescan list look for node that has not been marked DEL. 4001 * First node found is the master. 
4002 */ 4003 nd = sd->sd_nodelist; 4004 while (nd) { 4005 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 4006 break; 4007 } 4008 nd = nd->nd_next; 4009 continue; 4010 } 4011 if (nd) { 4012 /* Found a master */ 4013 goto found_master; 4014 } 4015 4016 /* 4017 * - If no node can be found that has its own node record on 4018 * its node to be set to OK, then all alive nodes 4019 * were in the process of being added to or deleted 4020 * from set. Each alive node will remove all 4021 * information pertaining to this set from its node. 4022 * 4023 * If all nodes in set are ALIVE, then call sdssc end routines 4024 * since set was truly being initially created or destroyed. 4025 */ 4026 goto delete_set; 4027 } 4028 4029 found_master: 4030 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4031 "Set %s master chosen %s (%d): %s"), 4032 sp->setname, nd->nd_nodename, nd->nd_nodeid, 4033 meta_print_hrtime(gethrtime() - start_time)); 4034 4035 if (clnt_lock_set(mynode(), sp, ep) == -1) { 4036 return (-1); 4037 } 4038 4039 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4040 4041 if (clnt_mnsetmaster(mynode(), sp, 4042 nd->nd_nodename, nd->nd_nodeid, ep)) { 4043 rval = -1; 4044 } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 4045 /* If this node is new master, set flag in this node's kernel */ 4046 (void) memset(&sf, 0, sizeof (sf)); 4047 sf.sf_setno = sp->setno; 4048 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 4049 /* Use magic to help protect ioctl against attack. */ 4050 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 4051 sf.sf_flags = MDDB_NM_SET; 4052 4053 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4054 "Setting new master flag for set %s: %s"), 4055 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4056 4057 /* 4058 * Fail reconfig cycle if ioctl fails since it is critical 4059 * to set new master flag. 
4060 */ 4061 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, 4062 NULL) != NULL) { 4063 (void) mdstealerror(ep, &sf.sf_mde); 4064 rval = -1; 4065 } 4066 } 4067 4068 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 4069 if (rval == 0) { 4070 (void) mdstealerror(ep, &xep); 4071 rval = -1; 4072 } 4073 } 4074 4075 cl_set_setkey(NULL); 4076 4077 metaflushsetname(sp); 4078 4079 return (rval); 4080 4081 delete_set: 4082 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4083 "Master not chosen, deleting set %s: %s"), 4084 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4085 4086 /* 4087 * Remove all set information from this node: 4088 * - node records for this set 4089 * - drive records for this set 4090 * - set record for this set 4091 * (Only do this on this node since each node 4092 * will do it for its own local mddb.) 4093 * 4094 * If all nodes in set are ALIVE, then 4095 * the lowest numbered ALIVE nodeid in set 4096 * (irregardless of whether an owner node or not) will 4097 * call the DCS service to cleanup for create/delete of set. 4098 * sdssc_create_end(cleanup) if set was being created or 4099 * sdssc_delete_end(cleanup) if set was being deleted. 4100 * A node record with flag ADD denotes a set being 4101 * created. A node record with flag DEL denotes a 4102 * set being deleted. 4103 */ 4104 nd = sd->sd_nodelist; 4105 while (nd) { 4106 /* Found a node that isn't alive */ 4107 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) 4108 break; 4109 4110 /* Is my node the lowest numbered ALIVE node? */ 4111 if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) { 4112 break; 4113 } 4114 nd = nd->nd_next; 4115 } 4116 if (nd == NULL) { 4117 /* All nodes ALIVE and this is the lowest nodeid */ 4118 lowest_alive_nodeid = 1; 4119 } 4120 4121 if (clnt_lock_set(mynode(), sp, ep) == -1) { 4122 return (-1); 4123 } 4124 4125 4126 /* 4127 * If this node had been joined, withdraw and reset master. 
4128 * 4129 * This could happen if a node was being added to or removed 4130 * from a diskset and the node doing the add/delete operation and 4131 * all other nodes in the diskset have left the cluster. 4132 */ 4133 if (sd->sd_mn_mynode) { 4134 nd = sd->sd_mn_mynode; 4135 if (nd->nd_flags & MD_MN_NODE_OWN) { 4136 if (clnt_withdrawset(mynode(), sp, ep)) { 4137 rval = -1; 4138 goto out; 4139 } 4140 if (clnt_mnsetmaster(mynode(), sp, "", 4141 MD_MN_INVALID_NID, ep)) { 4142 rval = -1; 4143 goto out; 4144 } 4145 } 4146 } 4147 4148 /* 4149 * Remove side records for this node (side) from local mddb 4150 * (clnt_deldrvs does this) if there are drives in the set. 4151 * 4152 * Don't need to mark this node as DEL since already marked as 4153 * ADD or DEL (or this node would have been chosen as master). 4154 * Don't need to mark other node records, drive records or 4155 * set records as DEL. If a panic occurs during clnt_delset, 4156 * these records will be deleted the next time this node 4157 * becomes a member and goes through the reconfig cycle. 4158 */ 4159 /* Get the drive descriptors for this set */ 4160 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4161 ep)) == NULL) { 4162 if (! mdisok(ep)) { 4163 /* 4164 * Ignore and clear out any failures from 4165 * metaget_drivedesc since a panic could have 4166 * occurred when a node was partially added to a set. 4167 */ 4168 mdclrerror(ep); 4169 } 4170 } else { 4171 if (clnt_deldrvs(mynode(), sp, dd, ep)) { 4172 rval = -1; 4173 goto out; 4174 } 4175 } 4176 4177 /* 4178 * Now, delete the set - this removes the node, drive 4179 * and set records from the local mddb. 4180 */ 4181 if (clnt_delset(mynode(), sp, ep)) { 4182 rval = -1; 4183 goto out; 4184 } 4185 4186 out: 4187 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4188 4189 /* 4190 * Ignore errors from unlock of set since set is no longer 4191 * known (if clnt_delset worked). 
4192 */ 4193 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 4194 mdclrerror(&xep); 4195 } 4196 4197 cl_set_setkey(NULL); 4198 4199 metaflushsetname(sp); 4200 4201 /* 4202 * If this node is the lowest numbered nodeid then 4203 * call sdssc_create/delete_end depending on whether 4204 * this node is marked as ADD or DEL in the node record. 4205 */ 4206 if (lowest_alive_nodeid) { 4207 if (nd->nd_flags & MD_MN_NODE_ADD) 4208 sdssc_create_end(sp->setname, SDSSC_CLEANUP); 4209 else if (nd->nd_flags & MD_MN_NODE_DEL) 4210 sdssc_delete_end(sp->setname, SDSSC_CLEANUP); 4211 } 4212 4213 /* Finished with this set -- return */ 4214 return (rval); 4215 } 4216 4217 /* 4218 * Reconfig step to choose a new master for all MN disksets. 4219 * Return values: 4220 * 0 - Everything is great. 4221 * 1 - This node failed to reconfig. 4222 * 205 - Cause another reconfig due to a nodelist problem 4223 * or RPC failure to another node 4224 */ 4225 int 4226 meta_reconfig_choose_master( 4227 md_error_t *ep 4228 ) 4229 { 4230 set_t max_sets, setno; 4231 int nodecnt; 4232 mndiskset_membershiplist_t *nl; 4233 md_set_desc *sd; 4234 mdsetname_t *sp; 4235 int rval = 0; 4236 mddb_setflags_config_t sf; 4237 int start_node_delayed = 0; 4238 4239 if ((max_sets = get_max_sets(ep)) == 0) { 4240 mde_perror(ep, dgettext(TEXT_DOMAIN, 4241 "Unable to get number of sets")); 4242 return (1); 4243 } 4244 4245 /* 4246 * Get membershiplist from API routine. If there's 4247 * an error, return a 205 to cause another reconfig. 4248 */ 4249 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { 4250 mde_perror(ep, ""); 4251 return (205); 4252 } 4253 4254 for (setno = 1; setno < max_sets; setno++) { 4255 if ((sp = metasetnosetname(setno, ep)) == NULL) { 4256 if (mdiserror(ep, MDE_NO_SET)) { 4257 /* No set for this setno - continue */ 4258 mdclrerror(ep); 4259 continue; 4260 } else { 4261 /* 4262 * If encountered an RPC error from my node, 4263 * then immediately fail. 
4264 */ 4265 if (mdanyrpcerror(ep)) { 4266 mde_perror(ep, ""); 4267 return (1); 4268 } 4269 /* Can't get set information */ 4270 mde_perror(ep, dgettext(TEXT_DOMAIN, 4271 "Unable to get information for " 4272 "set number %d"), setno); 4273 mdclrerror(ep); 4274 continue; 4275 } 4276 } 4277 4278 /* If setname is there, set desc should exist. */ 4279 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4280 /* 4281 * If encountered an RPC error from my node, 4282 * then immediately fail. 4283 */ 4284 if (mdanyrpcerror(ep)) { 4285 mde_perror(ep, ""); 4286 return (1); 4287 } 4288 mde_perror(ep, dgettext(TEXT_DOMAIN, 4289 "Unable to get set %s desc information"), 4290 sp->setname); 4291 mdclrerror(ep); 4292 continue; 4293 } 4294 4295 /* Only reconfig MN disksets */ 4296 if (!MD_MNSET_DESC(sd)) { 4297 continue; 4298 } 4299 4300 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4301 "Begin choose master for set %s: %s"), 4302 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4303 4304 /* Update nodelist with member information. */ 4305 if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) { 4306 /* 4307 * If encountered an RPC error from my node, 4308 * then immediately fail. 4309 */ 4310 if (mdanyrpcerror(ep)) { 4311 mde_perror(ep, ""); 4312 return (1); 4313 } 4314 mde_perror(ep, ""); 4315 mdclrerror(ep); 4316 continue; 4317 } 4318 4319 /* 4320 * If all nodes in a cluster are starting, then 4321 * all nodes will attempt to contact all other nodes 4322 * to determine a master node. This can lead to a 4323 * problem where node 1 is trying to contact the rpc.metad 4324 * node 2 and node 2 is trying to contact the rpc.metad 4325 * on node 1 -- and this causes the rpc call to fail 4326 * on both nodes and causes a new reconfig cycle. 4327 * 4328 * In order to break this problem, a newly starting node 4329 * will delay a small amount of time (nodeid mod 4 seconds) 4330 * and will then run the code to choose a master for the 4331 * first set. 
Delay will only be done once regardless of the 4332 * number of sets. 4333 */ 4334 if (start_node_delayed == 0) { 4335 (void) memset(&sf, 0, sizeof (sf)); 4336 sf.sf_setno = sp->setno; 4337 sf.sf_flags = MDDB_NM_GET; 4338 /* Use magic to help protect ioctl against attack. */ 4339 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 4340 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, 4341 &sf.sf_mde, NULL) == 0) && 4342 ((sf.sf_setflags & MD_SET_MN_START_RC) == 4343 MD_SET_MN_START_RC)) { 4344 (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4); 4345 } 4346 start_node_delayed = 1; 4347 } 4348 4349 /* Choose master for this set */ 4350 rval = meta_reconfig_choose_master_for_set(sp, sd, ep); 4351 if (rval == -1) { 4352 mde_perror(ep, ""); 4353 return (1); 4354 } else if (rval == 205) { 4355 mde_perror(ep, ""); 4356 return (205); 4357 } 4358 4359 /* Send new nodelist to rpc.mdcommd */ 4360 (void) mdmn_reinit_set(sp->setno); 4361 4362 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4363 "Choose master for set %s completed: %s"), 4364 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4365 } 4366 4367 /* 4368 * Each node turns on I/Os for all MN disksets. 4369 * This is to recover from the situation where the master died 4370 * during a previous reconfig cycle when I/Os were suspended 4371 * for a MN diskset. 4372 * If a failure occurs return a 1 which will force this node to 4373 * panic. Cannot leave node in the situation where I/Os are 4374 * not resumed. 4375 */ 4376 setno = 0; /* 0 means all MN sets */ 4377 if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) { 4378 mde_perror(ep, ""); 4379 return (1); 4380 } 4381 4382 /* Free the nodelist */ 4383 if (nodecnt) 4384 meta_free_nodelist(nl); 4385 4386 return (0); 4387 } 4388 4389 /* 4390 * meta_mnsync_user_records will synchronize the diskset user records across 4391 * all nodes in the diskset. The diskset user records are stored in 4392 * each node's local set mddb. 
4393 * 4394 * This needs to be done even if there is no master change during the 4395 * reconfig cycle since this routine should clean up any mess left by 4396 * the untimely termination of a metaset or metadb command (due to a 4397 * node panic or to user intervention). 4398 * 4399 * Caller is the Master node. 4400 * 4401 * Returns 0 - Success 4402 * 205 - Failure during RPC to another node 4403 * -1 - Any other failure and ep is filled in. 4404 */ 4405 int 4406 meta_mnsync_user_records( 4407 mdsetname_t *sp, 4408 md_error_t *ep 4409 ) 4410 { 4411 md_set_desc *sd; 4412 md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail; 4413 md_mnset_record *mnsr; 4414 md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL; 4415 md_mnnode_record *nr; 4416 md_drive_record *dr; 4417 int dr_cnt, dd_cnt; 4418 int found_my_nr; 4419 md_drive_desc *dd, *dd_prev, *master_dd, *other_dd; 4420 int all_drives_ok; 4421 int rval = 0; 4422 int max_genid = 0; 4423 int num_alive_nodes, num_alive_nodes_del = 0; 4424 int set_locked = 0; 4425 md_setkey_t *cl_sk; 4426 md_error_t xep = mdnullerror; 4427 char *anode[1]; 4428 mddb_setflags_config_t sf; 4429 4430 /* 4431 * Sync up node records first. 4432 * Construct a master nodelist using the nodelist from this 4433 * node's rpc.metad node records and then setting the state of each 4434 * node following these rules: 4435 * - If a node record is marked OK on its node, mark it OK 4436 * in the master nodelist (and later OK on all nodes) 4437 * If a node record is also marked OWN on its node, 4438 * mark it OWN in the master nodelist. 
4439 * - If a node record is not marked OK on its node, then mark 4440 * it as DEL in the master list (later deleting it) 4441 * - If node record doesn't exist on that node, then mark it DEL 4442 * (later deleting it) 4443 * - If set record doesn't exist on that node, mark node as DEL 4444 * - If a node record doesn't exist on all nodes, then mark it DEL 4445 * - If a node is not ALIVE, then 4446 * - If that node marked DEL on any node - mark it DEL 4447 * in master list but leave in nodelist 4448 * - If that node is marked as ADD on any node, mark it 4449 * ADD in the master list but leave in nodelist 4450 * - When that node returns to the living, the DEL 4451 * node record will be removed and the ADD node 4452 * record may be removed if marked ADD on that 4453 * node. 4454 * The key rule is to not remove a node from the nodelist until 4455 * that node record is removed from its own node. Do not want to 4456 * remove a node's record from all other nodes and then have 4457 * that node have its own record marked OK so that a node will pick 4458 * a different master than the other nodes. 4459 * 4460 * Next, 4461 * If node is ALIVE and node record is marked DEL in master nodelist, 4462 * remove node from set. 4463 * If node is ALIVE and node record is marked OK in master nodelist, 4464 * mark it OK on all other nodes. 4465 * If node is not ALIVE and node record is marked DEL in master 4466 * nodelist, mark it DEL on all other nodes. 4467 * If node is not ALIVE and node record is marked ADD in master, 4468 * nodelist, mark it ADD on all other nodes. 4469 */ 4470 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4471 return (-1); 4472 } 4473 master_nodelist = sd->sd_nodelist; 4474 4475 /* 4476 * Walk through nodelist creating a master nodelist. 
4477 */ 4478 num_alive_nodes = 0; 4479 nd = master_nodelist; 4480 while (nd) { 4481 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4482 nd = nd->nd_next; 4483 continue; 4484 } 4485 num_alive_nodes++; 4486 if (clnt_mngetset(nd->nd_nodename, sp->setname, 4487 MD_SET_BAD, &mnsr, ep) == -1) { 4488 if (mdiserror(ep, MDE_NO_SET)) { 4489 /* set doesn't exist, mark node as DEL */ 4490 nd->nd_flags &= ~MD_MN_NODE_OK; 4491 nd->nd_flags &= ~MD_MN_NODE_ADD; 4492 nd->nd_flags |= MD_MN_NODE_DEL; 4493 nd->nd_flags |= MD_MN_NODE_NOSET; 4494 nd = nd->nd_next; 4495 continue; 4496 } else { 4497 /* If RPC failure to another node return 205 */ 4498 if ((mdanyrpcerror(ep)) && 4499 (sd->sd_mn_mynode->nd_nodeid != 4500 nd->nd_nodeid)) { 4501 rval = 205; 4502 } else { 4503 /* Any other failure */ 4504 rval = -1; 4505 } 4506 goto out; 4507 } 4508 } 4509 /* Find biggest genid in records for this diskset */ 4510 if (mnsr->sr_genid > max_genid) 4511 max_genid = mnsr->sr_genid; 4512 4513 dr = mnsr->sr_drivechain; 4514 while (dr) { 4515 /* Find biggest genid in records for this diskset */ 4516 if (dr->dr_genid > max_genid) { 4517 max_genid = dr->dr_genid; 4518 } 4519 dr = dr->dr_next; 4520 } 4521 4522 found_my_nr = 0; 4523 nr = mnsr->sr_nodechain; 4524 /* nr is the list of node recs from nd_nodename node */ 4525 while (nr) { 4526 /* Find biggest genid in records for this diskset */ 4527 if (nr->nr_genid > max_genid) 4528 max_genid = nr->nr_genid; 4529 nd2 = master_nodelist; 4530 ndtail = NULL; 4531 /* For each node record, is it in master list? */ 4532 while (nd2) { 4533 if (nd2->nd_nodeid == nr->nr_nodeid) 4534 break; 4535 if (nd2->nd_next == NULL) 4536 ndtail = nd2; 4537 nd2 = nd2->nd_next; 4538 } 4539 /* 4540 * Found node record not in master list -- add it 4541 * to list marking it as DEL since node record 4542 * should exist on all nodes unless a panic occurred 4543 * during addition or deletion of host to diskset. 
4544 */ 4545 if (nd2 == NULL) { 4546 nd2 = Zalloc(sizeof (*nd2)); 4547 (void) strcpy(nd2->nd_nodename, 4548 nr->nr_nodename); 4549 nd2->nd_flags = nr->nr_flags; 4550 nd2->nd_flags |= MD_MN_NODE_DEL; 4551 nd2->nd_nodeid = nr->nr_nodeid; 4552 nd2->nd_next = NULL; 4553 ndtail->nd_next = nd2; 4554 nd2 = NULL; 4555 nr = nr->nr_next; 4556 continue; 4557 } 4558 /* 4559 * Is this the node record for the node that 4560 * we requested the set desc from? 4561 * If so, check if node has its own node record 4562 * marked OK. If marked OK, check for the OWN bit. 4563 */ 4564 if (nr->nr_nodeid == nd->nd_nodeid) { 4565 found_my_nr = 1; 4566 if (nr->nr_flags & MD_MN_NODE_OK) { 4567 /* 4568 * If node record is marked OK 4569 * on its own node, then mark it OK 4570 * in the master list. Node record 4571 * would have to exist on all nodes 4572 * in the ADD state before it could 4573 * be put into the OK state. 4574 */ 4575 nd->nd_flags |= MD_MN_NODE_OK; 4576 nd->nd_flags &= 4577 ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL); 4578 /* 4579 * Mark own in master list as marked 4580 * on own node. 4581 */ 4582 if (nr->nr_flags & MD_MN_NODE_OWN) 4583 nd->nd_flags |= MD_MN_NODE_OWN; 4584 else 4585 nd->nd_flags &= ~MD_MN_NODE_OWN; 4586 } else { 4587 /* Otherwise, mark node as DEL */ 4588 nd->nd_flags &= ~MD_MN_NODE_OK; 4589 nd->nd_flags &= ~MD_MN_NODE_ADD; 4590 nd->nd_flags |= MD_MN_NODE_DEL; 4591 } 4592 } 4593 /* 4594 * If node is not ALIVE and marked DEL 4595 * on any node, make it DEL in master list. 4596 * If node is not ALIVE and marked ADD 4597 * on any node, make it ADD in master list 4598 * unless node record has already been marked DEL. 
4599 */ 4600 if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) { 4601 if (nr->nr_flags & MD_MN_NODE_ADD) { 4602 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 4603 /* If not DEL - mark it ADD */ 4604 nd->nd_flags |= MD_MN_NODE_ADD; 4605 nd->nd_flags &= ~MD_MN_NODE_OK; 4606 } 4607 } 4608 if (nr->nr_flags & MD_MN_NODE_DEL) { 4609 nd->nd_flags |= MD_MN_NODE_DEL; 4610 nd->nd_flags &= ~MD_MN_NODE_OK; 4611 /* Could already be ADD - make it DEL */ 4612 nd->nd_flags &= ~MD_MN_NODE_ADD; 4613 } 4614 } 4615 nr = nr->nr_next; 4616 } 4617 /* 4618 * If a node record doesn't exist on its own node, 4619 * then mark node as DEL. 4620 */ 4621 if (found_my_nr == 0) { 4622 nd->nd_flags &= ~MD_MN_NODE_OK; 4623 nd->nd_flags |= MD_MN_NODE_DEL; 4624 } 4625 4626 /* 4627 * If node is OK - put mnsr onto master_mnsr_node list for 4628 * later use when syncing up the drive records in the set. 4629 */ 4630 if (nd->nd_flags & MD_MN_NODE_OK) { 4631 mnsr_node = Zalloc(sizeof (*mnsr_node)); 4632 mnsr_node->mmn_mnsr = mnsr; 4633 (void) strncpy(mnsr_node->mmn_nodename, 4634 nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1); 4635 mnsr_node->mmn_next = master_mnsr_node; 4636 master_mnsr_node = mnsr_node; 4637 } else { 4638 free_sr((struct md_set_record *)mnsr); 4639 } 4640 4641 nd = nd->nd_next; 4642 } 4643 4644 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4645 "Master nodelist created for set %s: %s"), 4646 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4647 4648 /* 4649 * Send master nodelist to the rpc.metad on all nodes (including 4650 * myself) and each node will update itself. This will set the 4651 * ADD and DEL flags on each node as setup in the master nodelist. 4652 * Don't send nodelist to node where set doesn't exist. 
4653 */ 4654 nd = master_nodelist; 4655 while (nd) { 4656 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4657 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4658 nd = nd->nd_next; 4659 continue; 4660 } 4661 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 4662 master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 4663 /* If RPC failure to another node return 205 */ 4664 if ((mdanyrpcerror(ep)) && 4665 (sd->sd_mn_mynode->nd_nodeid != 4666 nd->nd_nodeid)) { 4667 rval = 205; 4668 } else { 4669 /* Any other failure */ 4670 rval = -1; 4671 } 4672 goto out; 4673 } 4674 nd = nd->nd_next; 4675 } 4676 4677 /* 4678 * Now, delete nodes that need to be deleted. 4679 */ 4680 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4681 ep)) == NULL) { 4682 if (! mdisok(ep)) { 4683 rval = -1; 4684 goto out; 4685 } 4686 } 4687 4688 /* 4689 * May be doing lots of RPC commands to the nodes, so lock the 4690 * ALIVE members of the set since most of the rpc.metad routines 4691 * require this for security reasons. 4692 */ 4693 nd = master_nodelist; 4694 while (nd) { 4695 /* Skip non-alive nodes and node without set */ 4696 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4697 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4698 nd = nd->nd_next; 4699 continue; 4700 } 4701 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 4702 /* If RPC failure to another node return 205 */ 4703 if ((mdanyrpcerror(ep)) && 4704 (sd->sd_mn_mynode->nd_nodeid != 4705 nd->nd_nodeid)) { 4706 rval = 205; 4707 } else { 4708 /* Any other failure */ 4709 rval = -1; 4710 } 4711 goto out; 4712 } 4713 set_locked = 1; 4714 nd = nd->nd_next; 4715 } 4716 4717 nd = master_nodelist; 4718 while (nd) { 4719 /* Skip non-alive nodes */ 4720 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4721 nd = nd->nd_next; 4722 continue; 4723 } 4724 if (nd->nd_flags & MD_MN_NODE_DEL) { 4725 num_alive_nodes_del++; 4726 /* 4727 * Delete this node rec from all ALIVE nodes in diskset. 
4728 */ 4729 nd2 = master_nodelist; 4730 while (nd2) { 4731 /* Skip non-alive nodes and node without set */ 4732 if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) || 4733 (nd2->nd_flags & MD_MN_NODE_NOSET)) { 4734 nd2 = nd2->nd_next; 4735 continue; 4736 } 4737 4738 /* This is a node being deleted from set */ 4739 if (nd2->nd_nodeid == nd->nd_nodeid) { 4740 /* Mark set record as DEL */ 4741 if (clnt_upd_sr_flags(nd->nd_nodename, 4742 sp, MD_SR_DEL, ep)) { 4743 /* RPC failure to !my node */ 4744 if ((mdanyrpcerror(ep)) && 4745 (sd->sd_mn_mynode-> 4746 nd_nodeid 4747 != nd->nd_nodeid)) { 4748 rval = 205; 4749 } else { 4750 /* Any other failure */ 4751 rval = -1; 4752 } 4753 goto out; 4754 } 4755 if (clnt_deldrvs(nd->nd_nodename, sp, 4756 dd, ep)) { 4757 /* RPC failure to !my node */ 4758 if ((mdanyrpcerror(ep)) && 4759 (sd->sd_mn_mynode-> 4760 nd_nodeid 4761 != nd->nd_nodeid)) { 4762 rval = 205; 4763 } else { 4764 /* Any other failure */ 4765 rval = -1; 4766 } 4767 goto out; 4768 } 4769 if (clnt_delset(nd->nd_nodename, sp, 4770 ep) == -1) { 4771 /* RPC failure to !my node */ 4772 if ((mdanyrpcerror(ep)) && 4773 (sd->sd_mn_mynode-> 4774 nd_nodeid 4775 != nd->nd_nodeid)) { 4776 rval = 205; 4777 } else { 4778 /* Any other failure */ 4779 rval = -1; 4780 } 4781 goto out; 4782 } 4783 } else { 4784 /* 4785 * Delete host from sets on hosts 4786 * not being deleted. 
4787 */ 4788 anode[0] = Strdup(nd->nd_nodename); 4789 if (clnt_delhosts(nd2->nd_nodename, sp, 4790 1, anode, ep) == -1) { 4791 Free(anode[0]); 4792 /* RPC failure to !my node */ 4793 if ((mdanyrpcerror(ep)) && 4794 (sd->sd_mn_mynode-> 4795 nd_nodeid 4796 != nd2->nd_nodeid)) { 4797 rval = 205; 4798 } else { 4799 /* Any other failure */ 4800 rval = -1; 4801 } 4802 goto out; 4803 } 4804 4805 meta_mc_log(MC_LOG5, 4806 dgettext(TEXT_DOMAIN, 4807 "Deleted node %s (%d) on node %s " 4808 "from set %s: %s"), 4809 nd->nd_nodename, nd->nd_nodeid, 4810 nd2->nd_nodename, 4811 sp->setname, 4812 meta_print_hrtime( 4813 gethrtime() - start_time)); 4814 4815 Free(anode[0]); 4816 } 4817 nd2 = nd2->nd_next; 4818 } 4819 } 4820 nd = nd->nd_next; 4821 } 4822 4823 nd = master_nodelist; 4824 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4825 while (nd) { 4826 /* Skip non-alive nodes and node without set */ 4827 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4828 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4829 nd = nd->nd_next; 4830 continue; 4831 } 4832 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 4833 /* If RPC failure to another node return 205 */ 4834 if ((mdanyrpcerror(ep)) && 4835 (sd->sd_mn_mynode->nd_nodeid != 4836 nd->nd_nodeid)) { 4837 rval = 205; 4838 } else { 4839 /* Any other failure */ 4840 rval = -1; 4841 } 4842 goto out; 4843 } 4844 nd = nd->nd_next; 4845 } 4846 cl_set_setkey(NULL); 4847 set_locked = 0; 4848 4849 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4850 "Nodelist syncronization complete for set %s: %s"), 4851 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4852 4853 metaflushsetname(sp); 4854 4855 /* 4856 * If all alive nodes have been deleted from set, just 4857 * return since nothing else can be done until non-alive 4858 * nodes (if there are any) rejoin the cluster. 4859 */ 4860 if (num_alive_nodes == num_alive_nodes_del) { 4861 rval = 0; 4862 goto out; 4863 } 4864 4865 /* 4866 * Sync up drive records. 
4867 * 4868 * If a node panic'd (or metaset command was killed) during the 4869 * addition or deletion of a drive to the diskset, the nodes 4870 * may have a different view of the drive list. During cleanup 4871 * of the drive list during reconfig, a drive will be deleted 4872 * from the list if the master node sees that the drive has been 4873 * marked in the ADD state on any node or is marked in the DEL state 4874 * on all nodes. 4875 * This cleanup must occur even if all nodes in the cluster are 4876 * not part of the cluster so that all nodes have the same view 4877 * of the drivelist. 4878 * Then if the entire cluster goes down and comes back up, the 4879 * new master node could be a node that wasn't in the cluster when 4880 * the node was deleted. This could lead to a situation where the 4881 * master node thinks that a drive is OK, but this drive isn't 4882 * known to the other nodes. 4883 * This situation can also occur during the addition of a drive 4884 * where a node has the drive marked OK, but the node executing the 4885 * metaset command encountered a failure before marking that drive OK 4886 * on the rest of the nodes. If the node with the OK drive then 4887 * panics, then the rest of the nodes will remove that drive marked ADD 4888 * and when the node with the OK drive rejoins the cluster, it will 4889 * have a drive marked OK that is unknown by the other nodes. 4890 * 4891 * There are 2 situations to consider: 4892 * A) Master knows about a drive that other nodes don't know about. 4893 * B) At least one slave node knows about a drive that the master 4894 * node doesn't know about. 4895 * 4896 * To handle these situations the following steps are followed: 4897 * 1) Count number of drives known by this master node and the 4898 * other slave nodes. 4899 * If all nodes have the same number of drives and the master has 4900 * all drives marked OK, then skip to step4.
4901 * 4902 * 2) If a node has fewer drives listed than the master, the master 4903 * must get the drive descriptor list from that node so that 4904 * master can determine which drive it needs to delete from that 4905 * node. Master must get the drive descriptor list since the 4906 * drive record list does not contain the name of the drive, but 4907 * only a key and the key can only be interpreted on that other 4908 * node. 4909 * 4910 * 3) The master will then create the master drive list by doing: 4911 * - Master starts with drive list known by master. 4912 * - Any drive marked ADD will be removed from the list. 4913 * - Any drive not known by another node (from step2) will be 4914 * removed from the drive list. 4915 * - If a drive is marked DEL on the master, the master must 4916 * verify that the drive record is marked DEL on all nodes. 4917 * If any node has the drive record marked OK, mark it OK 4918 * on the master. (The reason why is described below). 4919 * 4920 * 4) The master sends out the master drive list and the slave 4921 * nodes will force their drive lists to match the master 4922 * drive list by deleting drives, if necessary and by changing 4923 * the drive record states from ADD->OK if master has drive 4924 * marked OK and slave has drive marked ADD. 4925 * 4926 * Interesting scenarios: 4927 * 4928 * 1) System has 4 nodes with node 1 as the master. Node 3 starts 4929 * to delete a drive record (drive record on node 1 is marked DEL), 4930 * but is stopped when node 3 panics. Node 1 also panics. 4931 * During reconfig cycle, node 2 is picked as master and the drive 4932 * record is left alone since all nodes in the cluster have it 4933 * marked OK. User now sees drive as part of diskset. 4934 * Now, entire cluster is rebooted and node 1 rejoins the cluster. 4935 * Node 1 is picked as the master and node 1 has drive record 4936 * marked DEL.
Node 1 contacts all other nodes in the cluster 4937 * and since at least one node has the drive record marked OK, 4938 * the master marks the drive record OK. 4939 * User continues to see the drive as part of the diskset. 4940 */ 4941 4942 /* Reget set descriptor since flushed above */ 4943 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4944 rval = -1; 4945 goto out; 4946 } 4947 4948 /* Has side effect of setting sd->sd_drvs to same as master_dd */ 4949 if ((master_dd = metaget_drivedesc_sideno(sp, 4950 sd->sd_mn_mynode->nd_nodeid, 4951 (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) { 4952 /* No drives in list */ 4953 if (!mdisok(ep)) { 4954 /* 4955 * Can't get drive list for this node, so 4956 * return -1 causing this node to be removed 4957 * cluster config and fixed. 4958 */ 4959 rval = -1; 4960 goto out; 4961 } 4962 } 4963 4964 /* Count the number of drives for all nodes */ 4965 mnsr_node = master_mnsr_node; 4966 while (mnsr_node) { 4967 dr_cnt = 0; 4968 dr = mnsr_node->mmn_mnsr->sr_drivechain; 4969 while (dr) { 4970 dr_cnt++; 4971 dr = dr->dr_next; 4972 } 4973 mnsr_node->mmn_numdrives = dr_cnt; 4974 mnsr_node = mnsr_node->mmn_next; 4975 } 4976 4977 /* Count the number of drives for the master; also check flags */ 4978 all_drives_ok = 1; 4979 dd_cnt = 0; 4980 dd = master_dd; 4981 while (dd) { 4982 dd_cnt++; 4983 if (!(dd->dd_flags & MD_DR_OK)) 4984 all_drives_ok = 0; 4985 dd = dd->dd_next; 4986 } 4987 4988 /* If all drives are ok, do quick check against number of drives */ 4989 if (all_drives_ok) { 4990 /* If all nodes have same number of drives, almost done */ 4991 mnsr_node = master_mnsr_node; 4992 while (mnsr_node) { 4993 if (mnsr_node->mmn_numdrives != dd_cnt) 4994 break; 4995 mnsr_node = mnsr_node->mmn_next; 4996 } 4997 /* All nodes have same number of drives, just send flags */ 4998 if (mnsr_node == NULL) { 4999 goto send_drive_list; 5000 } 5001 } 5002 5003 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5004 "Begin detailed drive synchronization for set %s: 
%s"), 5005 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5006 5007 /* Detailed check required */ 5008 mnsr_node = master_mnsr_node; 5009 while (mnsr_node) { 5010 /* Does slave node have less drives than master? */ 5011 if (mnsr_node->mmn_numdrives < dd_cnt) { 5012 /* Yes - must determine which drive is missing */ 5013 if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp, 5014 &other_dd, ep)) { 5015 /* RPC failure to !my node */ 5016 if ((mdanyrpcerror(ep)) && 5017 (strcmp(mynode(), mnsr_node->mmn_nodename) 5018 != 0)) { 5019 rval = 205; 5020 } else { 5021 /* Any other failure */ 5022 rval = -1; 5023 } 5024 mde_perror(ep, dgettext(TEXT_DOMAIN, 5025 "Master node %s unable to " 5026 "retrieve drive list from node %s"), 5027 mynode(), mnsr_node->mmn_nodename); 5028 goto out; 5029 } 5030 mnsr_node->mmn_dd = other_dd; 5031 dd = master_dd; 5032 while (dd) { 5033 if (!(dd->dd_flags & MD_DR_OK)) { 5034 dd = dd->dd_next; 5035 continue; 5036 } 5037 other_dd = mnsr_node->mmn_dd; 5038 while (other_dd) { 5039 /* Convert to devids, when available */ 5040 if (strcmp(other_dd->dd_dnp->cname, 5041 dd->dd_dnp->cname) == 0) { 5042 break; 5043 } 5044 other_dd = other_dd->dd_next; 5045 } 5046 /* 5047 * dd not found on slave so mark it 5048 * ADD for later deletion (drives in ADD 5049 * state are deleted later in this routine). 
5050 */ 5051 if (other_dd == NULL) { 5052 dd->dd_flags = MD_DR_ADD; 5053 } 5054 dd = dd->dd_next; 5055 } 5056 5057 } 5058 mnsr_node = mnsr_node->mmn_next; 5059 } 5060 5061 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5062 "Drive check completed for set %s: %s"), 5063 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5064 5065 dd = master_dd; 5066 dd_prev = 0; 5067 while (dd) { 5068 /* Remove any ADD drives from list */ 5069 if (dd->dd_flags & MD_DR_ADD) { 5070 if (dd_prev) { 5071 dd_prev->dd_next = dd->dd_next; 5072 dd->dd_next = NULL; 5073 metafreedrivedesc(&dd); 5074 dd = dd_prev->dd_next; 5075 } else { 5076 /* 5077 * If removing drive descriptor from head 5078 * of linked list, also change sd->sd_drvs. 5079 */ 5080 master_dd = sd->sd_drvs = dd->dd_next; 5081 dd->dd_next = NULL; 5082 metafreedrivedesc(&dd); 5083 dd = master_dd; 5084 } 5085 /* dd setup in if/else above */ 5086 continue; 5087 } 5088 /* 5089 * If drive is marked DEL, check all other nodes. 5090 * If drive on another node is marked OK, mark drive OK 5091 * in master list. If drive is marked DEL or doesn't exist 5092 * on all nodes, remove drive from list. 
5093 */ 5094 if (dd->dd_flags & MD_DR_DEL) { 5095 mnsr_node = master_mnsr_node; 5096 while (mnsr_node) { 5097 if (mnsr_node->mmn_dd == NULL) { 5098 if (clnt_getdrivedesc( 5099 mnsr_node->mmn_nodename, sp, 5100 &other_dd, ep)) { 5101 /* RPC failure to !my node */ 5102 if ((mdanyrpcerror(ep)) && 5103 (strcmp(mynode(), 5104 mnsr_node->mmn_nodename) 5105 != 0)) { 5106 rval = 205; 5107 } else { 5108 /* Any other failure */ 5109 rval = -1; 5110 } 5111 mde_perror(ep, dgettext(TEXT_DOMAIN, 5112 "Master node %s unable " 5113 "to retrieve drive list from " 5114 "node %s"), mynode(), 5115 mnsr_node->mmn_nodename); 5116 goto out; 5117 } 5118 mnsr_node->mmn_dd = other_dd; 5119 } 5120 other_dd = mnsr_node->mmn_dd; 5121 while (other_dd) { 5122 /* Found drive (OK) from other node */ 5123 if (strcmp(dd->dd_dnp->cname, 5124 other_dd->dd_dnp->cname) 5125 == 0) { 5126 /* Drive marked OK */ 5127 if (other_dd->dd_flags & 5128 MD_DR_OK) { 5129 dd->dd_flags = MD_DR_OK; 5130 } 5131 break; 5132 } 5133 other_dd = other_dd->dd_next; 5134 } 5135 if (dd->dd_flags == MD_DR_OK) 5136 break; 5137 5138 mnsr_node = mnsr_node->mmn_next; 5139 } 5140 /* 5141 * If no node had this drive marked OK, delete it. 5142 */ 5143 if (dd->dd_flags & MD_DR_DEL) { 5144 if (dd_prev) { 5145 dd_prev->dd_next = dd->dd_next; 5146 dd->dd_next = NULL; 5147 metafreedrivedesc(&dd); 5148 dd = dd_prev->dd_next; 5149 } else { 5150 /* 5151 * If removing drive descriptor from 5152 * head of linked list, also change 5153 * sd->sd_drvs. 
5154 */ 5155 master_dd = sd->sd_drvs = dd->dd_next; 5156 dd->dd_next = NULL; 5157 metafreedrivedesc(&dd); 5158 dd = master_dd; 5159 } 5160 /* dd setup in if/else above */ 5161 continue; 5162 } 5163 } 5164 dd_prev = dd; 5165 dd = dd->dd_next; 5166 } 5167 5168 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5169 "Setting drive states completed for set %s: %s"), 5170 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5171 5172 send_drive_list: 5173 /* 5174 * Set genid on all drives to be the highest value seen. 5175 */ 5176 dd = master_dd; 5177 while (dd) { 5178 dd->dd_genid = max_genid; 5179 dd = dd->dd_next; 5180 } 5181 /* 5182 * Send updated drive list to all alive nodes. 5183 * Will also set genid on set and node records to have same 5184 * as the drive records. 5185 */ 5186 nd = sd->sd_nodelist; 5187 while (nd) { 5188 /* Skip non-alive nodes */ 5189 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5190 nd = nd->nd_next; 5191 continue; 5192 } 5193 if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) { 5194 /* RPC failure to another node */ 5195 if ((mdanyrpcerror(ep)) && 5196 (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) { 5197 rval = 205; 5198 } else { 5199 /* Any other failure */ 5200 rval = -1; 5201 } 5202 goto out; 5203 } 5204 nd = nd->nd_next; 5205 } 5206 5207 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5208 "Sent drive list to all nodes for set %s: %s"), 5209 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5210 5211 /* 5212 * If no drive records left in set and nodes had been joined, 5213 * withdraw the nodes. Always reset the master and mark 5214 * all nodes as withdrawn on all nodes. 5215 */ 5216 if (master_dd == NULL) { 5217 /* Reset new master flag since no longer master */ 5218 (void) memset(&sf, 0, sizeof (sf)); 5219 sf.sf_setno = sp->setno; 5220 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5221 sf.sf_flags = MDDB_NM_RESET; 5222 /* Use magic to help protect ioctl against attack. 
*/ 5223 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5224 /* Ignore failure, failure to reset flag isn't catastrophic */ 5225 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5226 &sf.sf_mde, NULL); 5227 5228 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5229 "Reset new master flag for " "set %s: %s"), 5230 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5231 5232 nd = sd->sd_nodelist; 5233 while (nd) { 5234 /* Skip non-alive nodes */ 5235 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5236 nd = nd->nd_next; 5237 continue; 5238 } 5239 5240 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 5241 /* RPC failure to another node */ 5242 if ((mdanyrpcerror(ep)) && 5243 (sd->sd_mn_mynode->nd_nodeid != 5244 nd->nd_nodeid)) { 5245 rval = 205; 5246 } else { 5247 /* Any other failure */ 5248 rval = -1; 5249 } 5250 goto out; 5251 } 5252 set_locked = 1; 5253 5254 /* Withdraw node from set if owner */ 5255 if ((nd->nd_flags & MD_MN_NODE_OWN) && 5256 (clnt_withdrawset(nd->nd_nodename, sp, ep))) { 5257 /* RPC failure to another node */ 5258 if ((mdanyrpcerror(ep)) && 5259 (sd->sd_mn_mynode->nd_nodeid != 5260 nd->nd_nodeid)) { 5261 rval = 205; 5262 } else { 5263 /* Any other failure */ 5264 rval = -1; 5265 } 5266 goto out; 5267 } 5268 5269 /* Mark all nodes as withdrawn on this node */ 5270 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 5271 sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) { 5272 /* RPC failure to another node */ 5273 if ((mdanyrpcerror(ep)) && 5274 (sd->sd_mn_mynode->nd_nodeid != 5275 nd->nd_nodeid)) { 5276 rval = 205; 5277 } else { 5278 /* Any other failure */ 5279 rval = -1; 5280 } 5281 goto out; 5282 } 5283 5284 /* Resets master to no-master on this node */ 5285 if (clnt_mnsetmaster(nd->nd_nodename, sp, 5286 "", MD_MN_INVALID_NID, ep)) { 5287 /* RPC failure to another node */ 5288 if ((mdanyrpcerror(ep)) && 5289 (sd->sd_mn_mynode->nd_nodeid != 5290 nd->nd_nodeid)) { 5291 rval = 205; 5292 } else { 5293 /* Any other failure */ 5294 rval = -1; 5295 } 5296 goto out; 5297 } 5298 5299 cl_sk = 
cl_get_setkey(sp->setno, sp->setname); 5300 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 5301 /* RPC failure to another node */ 5302 if ((mdanyrpcerror(ep)) && 5303 (sd->sd_mn_mynode->nd_nodeid != 5304 nd->nd_nodeid)) { 5305 rval = 205; 5306 } else { 5307 /* Any other failure */ 5308 rval = -1; 5309 } 5310 goto out; 5311 } 5312 set_locked = 0; 5313 nd = nd->nd_next; 5314 } 5315 } 5316 5317 out: 5318 /* 5319 * If got here and set is still locked, then an error has 5320 * occurred and master_nodelist is still valid. 5321 * If error is not an RPC error, then unlock. 5322 * If error is an RPC error, skip unlocks since this could cause 5323 * yet another RPC timeout if a node has failed. 5324 * Ignore failures in unlock since unlock is just trying to 5325 * clean things up. 5326 */ 5327 if ((set_locked) && !(mdanyrpcerror(ep))) { 5328 nd = master_nodelist; 5329 cl_sk = cl_get_setkey(sp->setno, sp->setname); 5330 while (nd) { 5331 /* Skip non-alive nodes */ 5332 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5333 nd = nd->nd_next; 5334 continue; 5335 } 5336 /* 5337 * If clnt_unlock fails, just break out since next 5338 * reconfig cycle will reset the locks anyway. 5339 */ 5340 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 5341 break; 5342 } 5343 nd = nd->nd_next; 5344 } 5345 cl_set_setkey(NULL); 5346 } 5347 /* Free master_mnsr and drive descs */ 5348 mnsr_node = master_mnsr_node; 5349 while (mnsr_node) { 5350 master_mnsr_node = mnsr_node->mmn_next; 5351 free_sr((md_set_record *)mnsr_node->mmn_mnsr); 5352 free_rem_dd(mnsr_node->mmn_dd); 5353 Free(mnsr_node); 5354 mnsr_node = master_mnsr_node; 5355 } 5356 5357 /* Frees sd->sd_drvs (which is also master_dd) */ 5358 metaflushsetname(sp); 5359 return (rval); 5360 } 5361 5362 /* 5363 * meta_mnsync_diskset_mddbs 5364 * Calling node is guaranteed to be an owner node. 5365 * Calling node is the master node. 5366 * 5367 * Master node verifies that ondisk mddb format matches its incore format. 
5368 * If no nodes are joined to set, remove the change log entries. 5369 * If a node is joined to set, play the change log. 5370 * 5371 * Returns 0 - Success 5372 * 1 - Master unable to join to set. 5373 * 205 - Failure during RPC to another node 5374 * -1 - Any other failure and ep is filled in. 5375 * -1 return will eventually cause node to panic 5376 * in a SunCluster environment. 5377 */ 5378 int 5379 meta_mnsync_diskset_mddbs( 5380 mdsetname_t *sp, 5381 md_error_t *ep 5382 ) 5383 { 5384 md_set_desc *sd; 5385 mddb_config_t c; 5386 md_mn_msgclass_t class; 5387 mddb_setflags_config_t sf; 5388 md_mnnode_desc *nd, *nd2; 5389 md_error_t xep = mdnullerror; 5390 int stale_set = 0; 5391 5392 /* If setname is there, set desc should exist. */ 5393 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5394 mde_perror(ep, dgettext(TEXT_DOMAIN, 5395 "Unable to get set %s desc information"), sp->setname); 5396 return (-1); 5397 } 5398 5399 /* Are there drives in the set? */ 5400 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5401 ep) == NULL) { 5402 if (! mdisok(ep)) { 5403 return (-1); 5404 } 5405 /* No drives in set -- nothing to sync up */ 5406 return (0); 5407 } 5408 5409 /* 5410 * Is master node (which is this node) joined to set? 5411 * If master node isn't joined (which means that no nodes 5412 * are joined to diskset), remove the change log entries 5413 * since no need to replay them - all nodes will have same 5414 * view of mddbs since all nodes are reading in the mddbs 5415 * from disk. 5416 * There is also no need to sync up the master and ondisk mddbs 5417 * since master has no incore knowledge. 5418 * Need to join master to set in order to flush the change 5419 * log entries. Don't need to block I/O during join of master 5420 * to set since no other nodes are joined to set and so no I/O 5421 * can be occurring. 
5422 */ 5423 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 5424 /* Join master to set */ 5425 if (clnt_joinset(mynode(), sp, 5426 MNSET_IN_RECONFIG, ep)) { 5427 if (mdismddberror(ep, MDE_DB_STALE)) { 5428 /* 5429 * If STALE, print message and continue on. 5430 * Don't do any writes or reads to mddbs 5431 * so don't clear change log. 5432 */ 5433 mde_perror(ep, dgettext(TEXT_DOMAIN, 5434 "Join of master node to STALE set %s"), 5435 sp->setname); 5436 stale_set = 1; 5437 mdclrerror(ep); 5438 } else if (mdismddberror(ep, MDE_DB_ACCOK)) { 5439 /* ACCOK means mediator provided extra vote */ 5440 mdclrerror(ep); 5441 } else { 5442 /* 5443 * If master is unable to join set, print an 5444 * error message. Don't return failure or node 5445 * will panic during cluster reconfig cycle. 5446 * Also, withdraw node from set in order to 5447 * cleanup from failed join attempt. 5448 */ 5449 mde_perror(ep, dgettext(TEXT_DOMAIN, 5450 "Join of master node in set %s failed"), 5451 sp->setname); 5452 if (clnt_withdrawset(mynode(), sp, &xep)) 5453 mdclrerror(&xep); 5454 return (1); 5455 } 5456 } 5457 /* 5458 * Master node successfully joined. 5459 * Set local copy of flags to OWN and 5460 * send owner flag to rpc.metad. If not stale, 5461 * flush the change log. 
5462 */ 5463 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 5464 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET, 5465 MNSET_IN_RECONFIG, ep)) { 5466 mde_perror(ep, dgettext(TEXT_DOMAIN, 5467 "Flag update of master node join in set %s failed"), 5468 sp->setname); 5469 return (-1); 5470 } 5471 5472 if (!stale_set) { 5473 if (mdmn_reset_changelog(sp, ep, 5474 MDMN_CLF_RESETLOG) != 0) { 5475 mde_perror(ep, dgettext(TEXT_DOMAIN, 5476 "Unable to reset changelog.")); 5477 return (-1); 5478 } 5479 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5480 "Removed changelog entries for set %s: %s"), 5481 sp->setname, 5482 meta_print_hrtime(gethrtime() - start_time)); 5483 } 5484 /* Reset new master flag before return */ 5485 (void) memset(&sf, 0, sizeof (sf)); 5486 sf.sf_setno = sp->setno; 5487 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5488 sf.sf_flags = MDDB_NM_RESET; 5489 /* Use magic to help protect ioctl against attack. */ 5490 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5491 /* Ignore failure, failure to reset flag isn't catastrophic */ 5492 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5493 &sf.sf_mde, NULL); 5494 5495 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5496 "Reset new master flag for set %s: %s"), 5497 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5498 5499 return (0); 5500 } 5501 5502 /* 5503 * Is master already joined to STALE set (< 50% mddbs avail)? 5504 * If so, can make no config changes to mddbs so don't check or play 5505 * changelog and don't sync master node to ondisk mddbs. 5506 * To get out of the stale state all nodes must be withdrawn 5507 * from set. Then as nodes are re-joined, all nodes will 5508 * have same view of mddbs since all nodes are reading the 5509 * mddbs from disk. 
5510 */ 5511 (void) memset(&c, 0, sizeof (c)); 5512 c.c_id = 0; 5513 c.c_setno = sp->setno; 5514 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 5515 (void) mdstealerror(ep, &c.c_mde); 5516 return (-1); 5517 } 5518 if (c.c_flags & MDDB_C_STALE) { 5519 return (0); 5520 } 5521 5522 /* 5523 * If this node is NOT a newly chosen master, then there's 5524 * nothing else to do since the change log should be empty and 5525 * the ondisk and incore mddbs are already consistent. 5526 * 5527 * A newly chosen master is a node that was not the master 5528 * at the beginning of the reconfig cycle. If a node is a new 5529 * master, then the new master state is reset after the ondisk 5530 * and incore mddbs are consistent and the change log has 5531 * been replayed. 5532 */ 5533 (void) memset(&sf, 0, sizeof (sf)); 5534 sf.sf_setno = sp->setno; 5535 sf.sf_flags = MDDB_NM_GET; 5536 /* Use magic to help protect ioctl against attack. */ 5537 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5538 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) && 5539 ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) { 5540 return (0); 5541 } 5542 5543 /* 5544 * Now, sync up incore master view to ondisk mddbs. 5545 * This is needed in the case where a master node 5546 * had made a change to the mddb, but this change 5547 * may not have been relayed to the slaves yet. 5548 * So, the new master needs to verify that the ondisk 5549 * mddbs match what the new master has incore - 5550 * if different, new master rewrites all of the mddbs. 5551 * Then the new master will replay the changelog and the 5552 * new master will then execute what the old master had 5553 * done. 5554 * 5555 * Block all I/Os to disks in this diskset on all nodes in 5556 * the diskset. This will allow the rewriting of the mddbs 5557 * (if needed), to proceed in a timely manner. 5558 * 5559 * If block of I/Os fail, return a -1. 
5560 */ 5561 5562 nd = sd->sd_nodelist; 5563 while (nd) { 5564 /* Skip non-alive and non-owner nodes */ 5565 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5566 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5567 nd = nd->nd_next; 5568 continue; 5569 } 5570 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5571 MN_SUSP_IO, ep)) { 5572 mde_perror(ep, dgettext(TEXT_DOMAIN, 5573 "Unable to suspend I/O on node %s in set %s"), 5574 nd->nd_nodename, sp->setname); 5575 5576 /* 5577 * Resume all other nodes that had been suspended. 5578 * (Reconfig return step also resumes I/Os 5579 * for all sets.) 5580 */ 5581 nd2 = sd->sd_nodelist; 5582 while (nd2) { 5583 /* Stop when reaching failed node */ 5584 if (nd2->nd_nodeid == nd->nd_nodeid) 5585 break; 5586 /* Skip non-alive and non-owner nodes */ 5587 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5588 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5589 nd2 = nd2->nd_next; 5590 continue; 5591 } 5592 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5593 sp->setno, MN_RES_IO, &xep)); 5594 nd2 = nd2->nd_next; 5595 } 5596 5597 /* 5598 * If an RPC failure on another node, return a 205. 5599 * Otherwise, exit with failure. 5600 */ 5601 if ((mdanyrpcerror(ep)) && 5602 (sd->sd_mn_mynode->nd_nodeid != 5603 nd->nd_nodeid)) { 5604 return (205); 5605 } else { 5606 return (-1); 5607 } 5608 5609 } 5610 nd = nd->nd_next; 5611 } 5612 5613 (void) memset(&c, 0, sizeof (c)); 5614 c.c_id = 0; 5615 c.c_setno = sp->setno; 5616 /* Master can't sync up to ondisk mddbs? Kick it out of cluster */ 5617 if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0) 5618 return (-1); 5619 5620 /* 5621 * Resume I/Os that were suspended above. 
5622 */ 5623 nd = sd->sd_nodelist; 5624 while (nd) { 5625 /* Skip non-alive and non-owner nodes */ 5626 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5627 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5628 nd = nd->nd_next; 5629 continue; 5630 } 5631 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5632 MN_RES_IO, ep)) { 5633 mde_perror(ep, dgettext(TEXT_DOMAIN, 5634 "Unable to resume I/O on node %s in set %s"), 5635 nd->nd_nodename, sp->setname); 5636 5637 /* 5638 * If an RPC failure then don't do any 5639 * more RPC calls, since one timeout is enough 5640 * to endure. If RPC failure to another node, return 5641 * 205. If RPC failure to my node, return -1. 5642 * If not an RPC failure, continue resuming the 5643 * rest of the nodes and then return -1. 5644 */ 5645 if (mdanyrpcerror(ep)) { 5646 if (sd->sd_mn_mynode->nd_nodeid == 5647 nd->nd_nodeid) { 5648 return (-1); 5649 } else { 5650 return (205); 5651 } 5652 } 5653 5654 /* 5655 * If not an RPC error, continue resuming rest of 5656 * nodes, ignoring any failures except for an 5657 * RPC failure which constitutes an immediate exit. 5658 * Start in middle of list with failing node. 5659 */ 5660 nd2 = nd->nd_next; 5661 while (nd2) { 5662 /* Skip non-alive and non-owner nodes */ 5663 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5664 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5665 nd2 = nd2->nd_next; 5666 continue; 5667 } 5668 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5669 sp->setno, MN_RES_IO, &xep)); 5670 if (mdanyrpcerror(&xep)) { 5671 return (-1); 5672 } 5673 nd2 = nd2->nd_next; 5674 } 5675 } 5676 nd = nd->nd_next; 5677 } 5678 5679 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed " 5680 "checking/writing the mddb for set %s: %s"), sp->setname, 5681 meta_print_hrtime(gethrtime() - start_time)); 5682 5683 /* 5684 * Send (aka replay) all messages we find in the changelog. 
5685 * Flag the messages with 5686 * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them 5687 * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd. 5688 */ 5689 for (class = MD_MN_NCLASSES - 1; class > 0; class--) { 5690 mdmn_changelog_record_t *lr; 5691 md_error_t xep = mdnullerror; 5692 md_mn_result_t *resultp = NULL; 5693 int ret; 5694 5695 lr = mdmn_get_changelogrec(sp->setno, class); 5696 if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) { 5697 /* no entry for this class */ 5698 continue; 5699 } 5700 5701 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5702 "replaying message ID=(%d, 0x%llx-%d)\n"), 5703 MSGID_ELEMS(lr->lr_msg.msg_msgid)); 5704 5705 ret = mdmn_send_message_with_msgid( 5706 lr->lr_msg.msg_setno, 5707 lr->lr_msg.msg_type, 5708 lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG | 5709 MD_MSGF_OVERRIDE_SUSPEND, 5710 lr->lr_msg.msg_event_data, 5711 lr->lr_msg.msg_event_size, 5712 &resultp, 5713 &lr->lr_msg.msg_msgid, 5714 &xep); 5715 5716 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5717 "mdmn_send_message returned %d\n"), ret); 5718 5719 if (resultp) 5720 free_result(resultp); 5721 } 5722 5723 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5724 "Playing changelog completed for set %s: %s"), 5725 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5726 5727 /* 5728 * Now that new master has ondisk and incore mddbs in sync, reset 5729 * this node's new master kernel flag (for this set). If this node 5730 * re-enters another reconfig cycle before the completion of this 5731 * reconfig cycle, this master node won't need to check if the ondisk 5732 * and incore mddbs are in sync since this node won't be considered 5733 * a new master (since this flag is being reset here in the middle of 5734 * step2). This will save time during any subsequent reconfig 5735 * cycles as long as this node continues to be master. 
5736 */ 5737 (void) memset(&sf, 0, sizeof (sf)); 5738 sf.sf_setno = sp->setno; 5739 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5740 sf.sf_flags = MDDB_NM_RESET; 5741 /* Use magic to help protect ioctl against attack. */ 5742 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5743 /* Ignore failure, since failure to reset flag isn't catastrophic */ 5744 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL); 5745 5746 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5747 "Reset new master flag for set %s: %s"), 5748 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5749 5750 return (0); 5751 } 5752 5753 /* 5754 * meta_mnjoin_all will join all starting nodes in the diskset. 5755 * A starting node is considered to be any node that is not 5756 * an owner of the set but is a member of the cluster. 5757 * Master node is already joined to set (done in meta_mnsync_diskset_mddbs). 5758 * 5759 * Caller is the Master node. 5760 * 5761 * Returns 0 - Success 5762 * 205 - Failure during RPC to another node 5763 * -1 - Any other failure and ep is filled in. 5764 */ 5765 int 5766 meta_mnjoin_all( 5767 mdsetname_t *sp, 5768 md_error_t *ep 5769 ) 5770 { 5771 md_set_desc *sd; 5772 md_mnnode_desc *nd, *nd2; 5773 int rval = 0; 5774 int stale_flag = 0; 5775 mddb_config_t c; 5776 int susp_res_flag = 0; 5777 md_error_t xep = mdnullerror; 5778 5779 /* If setname is there, set desc should exist. */ 5780 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5781 mde_perror(ep, dgettext(TEXT_DOMAIN, 5782 "Unable to get set %s desc information"), sp->setname); 5783 return (-1); 5784 } 5785 5786 /* Are there drives in the set? */ 5787 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5788 ep) == NULL) { 5789 if (! mdisok(ep)) { 5790 return (-1); 5791 } 5792 /* No drives in set -- nothing to join */ 5793 return (0); 5794 } 5795 5796 /* 5797 * Is set currently stale? 
5798 */ 5799 (void) memset(&c, 0, sizeof (c)); 5800 c.c_id = 0; 5801 c.c_setno = sp->setno; 5802 /* Ignore failure since master node may not be joined yet */ 5803 (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL); 5804 if (c.c_flags & MDDB_C_STALE) { 5805 stale_flag = MNSET_IS_STALE; 5806 } 5807 5808 /* 5809 * If any nodes are going to be joined to diskset, then 5810 * suspend I/O to all disks in diskset so that nodes can join 5811 * (read in mddbs) in a reasonable amount of time even under 5812 * high I/O load. Don't need to do this if set is STALE since 5813 * no I/O can be occurring to a STALE set. 5814 */ 5815 if (stale_flag != MNSET_IS_STALE) { 5816 nd = sd->sd_nodelist; 5817 while (nd) { 5818 /* Found a node that will be joined to diskset */ 5819 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5820 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5821 /* Set flag that diskset should be suspended */ 5822 susp_res_flag = 1; 5823 break; 5824 } 5825 nd = nd->nd_next; 5826 } 5827 } 5828 5829 if (susp_res_flag) { 5830 /* 5831 * Block all I/Os to disks in this diskset on all joined 5832 * nodes in the diskset. 5833 * If block of I/Os fails due to an RPC failure on another 5834 * node, return 205; otherwise, return -1. 5835 */ 5836 nd = sd->sd_nodelist; 5837 while (nd) { 5838 /* Skip non-alive and non-owner nodes */ 5839 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5840 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5841 nd = nd->nd_next; 5842 continue; 5843 } 5844 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5845 MN_SUSP_IO, ep)) { 5846 mde_perror(ep, dgettext(TEXT_DOMAIN, 5847 "Unable to suspend I/O on node %s" 5848 " in set %s"), nd->nd_nodename, 5849 sp->setname); 5850 /* 5851 * Resume other nodes that had been suspended. 5852 * (Reconfig return step also resumes I/Os 5853 * for all sets.) 
5854 */ 5855 nd2 = sd->sd_nodelist; 5856 while (nd2) { 5857 /* Stop when reaching failed node */ 5858 if (nd2->nd_nodeid == nd->nd_nodeid) 5859 break; 5860 /* Skip non-alive/non-owner nodes */ 5861 if ((!(nd2->nd_flags & 5862 MD_MN_NODE_ALIVE)) || 5863 (!(nd2->nd_flags & 5864 MD_MN_NODE_OWN))) { 5865 nd2 = nd2->nd_next; 5866 continue; 5867 } 5868 (void) (clnt_mn_susp_res_io( 5869 nd2->nd_nodename, sp->setno, 5870 MN_RES_IO, &xep)); 5871 nd2 = nd2->nd_next; 5872 } 5873 5874 /* 5875 * If the suspend failed due to an 5876 * RPC failure on another node, return 5877 * a 205. 5878 * Otherwise, exit with failure. 5879 * The return reconfig step will resume 5880 * I/Os for all disksets. 5881 */ 5882 if ((mdanyrpcerror(ep)) && 5883 (sd->sd_mn_mynode->nd_nodeid != 5884 nd->nd_nodeid)) { 5885 return (205); 5886 } else { 5887 return (-1); 5888 } 5889 } 5890 nd = nd->nd_next; 5891 } 5892 } 5893 5894 nd = sd->sd_nodelist; 5895 while (nd) { 5896 /* 5897 * If a node is in the membership list but isn't joined 5898 * to the set, try to join the node. 5899 */ 5900 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5901 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5902 if (clnt_joinset(nd->nd_nodename, sp, 5903 (MNSET_IN_RECONFIG | stale_flag), ep)) { 5904 /* 5905 * If RPC failure to another node 5906 * then exit without attempting anything else. 5907 * (Reconfig return step will resume I/Os 5908 * for all sets.) 5909 */ 5910 if (mdanyrpcerror(ep)) { 5911 mde_perror(ep, ""); 5912 return (205); 5913 } 5914 /* 5915 * STALE and ACCOK failures aren't true 5916 * failures. STALE means that <50% mddbs 5917 * are available. ACCOK means that the 5918 * mediator provided the extra vote. 5919 * If a true failure, then print messasge 5920 * and withdraw node from set in order to 5921 * cleanup from failed join attempt. 
5922 */ 5923 if ((!mdismddberror(ep, MDE_DB_STALE)) && 5924 (!mdismddberror(ep, MDE_DB_ACCOK))) { 5925 mde_perror(ep, 5926 "WARNING: Unable to join node %s " 5927 "to set %s", nd->nd_nodename, 5928 sp->setname); 5929 mdclrerror(ep); 5930 if (clnt_withdrawset(nd->nd_nodename, 5931 sp, &xep)) 5932 mdclrerror(&xep); 5933 nd = nd->nd_next; 5934 continue; 5935 } 5936 } 5937 /* Set owner flag even if STALE or ACCOK */ 5938 nd->nd_flags |= MD_MN_NODE_OWN; 5939 } 5940 nd = nd->nd_next; 5941 } 5942 /* 5943 * Resume I/Os if suspended above. 5944 */ 5945 if (susp_res_flag) { 5946 nd = sd->sd_nodelist; 5947 while (nd) { 5948 /* 5949 * Skip non-alive and non-owner nodes 5950 * (this list doesn't include any of 5951 * the nodes that were joined). 5952 */ 5953 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5954 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5955 nd = nd->nd_next; 5956 continue; 5957 } 5958 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5959 MN_RES_IO, ep)) { 5960 mde_perror(ep, dgettext(TEXT_DOMAIN, 5961 "Unable to resume I/O on node %s" 5962 " in set %s"), nd->nd_nodename, 5963 sp->setname); 5964 5965 /* 5966 * If an RPC failure then don't do any 5967 * more RPC calls, since one timeout is enough 5968 * to endure. If RPC failure to another node, 5969 * return 205. If RPC failure to my node, 5970 * return -1. 5971 * (Reconfig return step will resume I/Os 5972 * for all sets.) 5973 * If not an RPC failure, continue resuming the 5974 * rest of the nodes and then return -1. 5975 */ 5976 if (mdanyrpcerror(ep)) { 5977 if (sd->sd_mn_mynode->nd_nodeid == 5978 nd->nd_nodeid) { 5979 return (-1); 5980 } else { 5981 return (205); 5982 } 5983 } 5984 5985 /* 5986 * If not an RPC error, continue resuming rest 5987 * of nodes, ignoring any failures except for 5988 * an RPC failure which constitutes an 5989 * immediate exit. 5990 * Start in middle of list with failing node. 
5991 */ 5992 nd2 = nd->nd_next; 5993 while (nd2) { 5994 /* Skip non-owner nodes */ 5995 if ((!(nd2->nd_flags & 5996 MD_MN_NODE_ALIVE)) || 5997 (!(nd2->nd_flags & 5998 MD_MN_NODE_OWN))) { 5999 nd2 = nd2->nd_next; 6000 continue; 6001 } 6002 (void) (clnt_mn_susp_res_io( 6003 nd2->nd_nodename, sp->setno, 6004 MN_RES_IO, &xep)); 6005 if (mdanyrpcerror(&xep)) { 6006 return (-1); 6007 } 6008 nd2 = nd2->nd_next; 6009 } 6010 } 6011 nd = nd->nd_next; 6012 } 6013 } 6014 6015 nd = sd->sd_nodelist; 6016 while (nd) { 6017 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 6018 nd = nd->nd_next; 6019 continue; 6020 } 6021 /* 6022 * If 1 node fails - go ahead and update the rest except 6023 * in the case of an RPC failure, fail immediately. 6024 */ 6025 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 6026 sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 6027 /* RPC failure to another node */ 6028 if (mdanyrpcerror(ep)) { 6029 return (205); 6030 } 6031 nd = nd->nd_next; 6032 rval = -1; 6033 continue; 6034 } 6035 nd = nd->nd_next; 6036 } 6037 6038 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 6039 "Join of all nodes completed for set %s: %s"), 6040 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 6041 6042 return (rval); 6043 } 6044